Browse Source

blake512: use a new SWAPDWORDS asm func (0.05ms)

small improvement, do it on pentablake and heavy variants too

based on sp commit (but SWAP32 is already used for 32bit ints)
master
Tanguy Pruvot 10 years ago
parent
commit
a747e4ca0f
  1. 19
      cuda_helper.h
  2. 2
      heavy/cuda_blake512.cu
  3. 2
      pentablake.cu
  4. 14
      quark/cuda_bmw512.cu
  5. 3
      quark/cuda_quark_blake512.cu

19
cuda_helper.h

@ -36,7 +36,7 @@ extern const uint3 threadIdx;
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) #define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#if __CUDA_ARCH__ < 350 #if __CUDA_ARCH__ < 320
// Kepler (Compute 3.0) // Kepler (Compute 3.0)
#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) #define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#else #else
@ -253,7 +253,7 @@ uint64_t shl_t64(uint64_t x, uint32_t n)
#endif #endif
// 64-bit ROTATE RIGHT // 64-bit ROTATE RIGHT
#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 1 #if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1
/* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */ /* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */
__device__ __forceinline__ __device__ __forceinline__
uint64_t ROTR64(const uint64_t value, const int offset) { uint64_t ROTR64(const uint64_t value, const int offset) {
@ -289,7 +289,7 @@ uint64_t ROTR64(const uint64_t x, const int offset)
#endif #endif
// 64-bit ROTATE LEFT // 64-bit ROTATE LEFT
#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 1 #if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1
__device__ __forceinline__ __device__ __forceinline__
uint64_t ROTL64(const uint64_t value, const int offset) { uint64_t ROTL64(const uint64_t value, const int offset) {
uint2 result; uint2 result;
@ -342,4 +342,17 @@ uint64_t ROTL64(const uint64_t x, const int offset)
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) #define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
#endif #endif
__device__ __forceinline__
uint64_t SWAPDWORDS(const uint64_t value)
{
	// Swap the two 32-bit halves of a 64-bit word.
	// Bitwise this is identical to ROTL64(value, 32) / ROTR64(value, 32),
	// but on sm_32+ it compiles to two register moves instead of shifts.
#if __CUDA_ARCH__ >= 320
	uint2 halves;
	uint64_t result;
	// Split the input into its low (halves.x) and high (halves.y) 32-bit parts.
	asm("mov.b64 {%0, %1}, %2; ": "=r"(halves.x), "=r"(halves.y) : "l"(value));
	// Recombine them in swapped order into a local. The original wrote the
	// asm output back into `value` via "=l"(value), which assigns through a
	// const-qualified parameter — ill-formed; a local result avoids that.
	asm("mov.b64 %0, {%1, %2}; ": "=l"(result) : "r"(halves.y), "r"(halves.x));
	return result;
#else
	// Pre-Kepler-GK110B fallback: plain 32-bit rotate gives the same result.
	return ROTL64(value, 32);
#endif
}
#endif // #ifndef CUDA_HELPER_H #endif // #ifndef CUDA_HELPER_H

2
heavy/cuda_blake512.cu

@ -66,7 +66,7 @@ const uint64_t host_u512[16] =
#define G(a,b,c,d,e) \ #define G(a,b,c,d,e) \
v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\ v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\
v[d] = ROTR64( v[d] ^ v[a],32); \ v[d] = SWAPDWORDS( v[d] ^ v[a]); \
v[c] += v[d]; \ v[c] += v[d]; \
v[b] = ROTR64( v[b] ^ v[c],25); \ v[b] = ROTR64( v[b] ^ v[c],25); \
v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b]; \ v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b]; \

2
pentablake.cu

@ -112,7 +112,7 @@ const uint64_t c_u512[16] =
uint32_t idx1 = c_sigma[i][x]; \ uint32_t idx1 = c_sigma[i][x]; \
uint32_t idx2 = c_sigma[i][x+1]; \ uint32_t idx2 = c_sigma[i][x+1]; \
v[a] += (m[idx1] ^ c_u512[idx2]) + v[b]; \ v[a] += (m[idx1] ^ c_u512[idx2]) + v[b]; \
v[d] = ROTR64(v[d] ^ v[a], 32); \ v[d] = SWAPDWORDS(v[d] ^ v[a]); \
v[c] += v[d]; \ v[c] += v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 25); \ v[b] = ROTR64(v[b] ^ v[c], 25); \
v[a] += (m[idx2] ^ c_u512[idx1]) + v[b]; \ v[a] += (m[idx2] ^ c_u512[idx1]) + v[b]; \

14
quark/cuda_bmw512.cu

@ -12,12 +12,14 @@ __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + paddi
#define SHL(x, n) ((x) << (n)) #define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n)) #define SHR(x, n) ((x) >> (n))
#define CONST_EXP2 q[i+0] + ROTL64(q[i+1], 5) + q[i+2] + ROTL64(q[i+3], 11) + \ #define CONST_EXP2 \
q[i+4] + ROTL64(q[i+5], 27) + q[i+6] + ROTL64(q[i+7], 32) + \ q[i+0] + ROTL64(q[i+1], 5) + q[i+2] + ROTL64(q[i+3], 11) + \
q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \ q[i+4] + ROTL64(q[i+5], 27) + q[i+6] + SWAPDWORDS(q[i+7]) + \
q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]) q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \
q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
__device__ void Compression512(uint64_t *msg, uint64_t *hash)
__device__
void Compression512(uint64_t *msg, uint64_t *hash)
{ {
// Compression ref. implementation // Compression ref. implementation
uint64_t tmp; uint64_t tmp;

3
quark/cuda_quark_blake512.cu

@ -7,7 +7,6 @@
#define USE_SHUFFLE 0 #define USE_SHUFFLE 0
// die Message it Padding zur Berechnung auf der GPU
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
// ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------ // ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------
@ -51,7 +50,7 @@ const uint64_t c_u512[16] =
uint32_t idx1 = sigma[i][x]; \ uint32_t idx1 = sigma[i][x]; \
uint32_t idx2 = sigma[i][x+1]; \ uint32_t idx2 = sigma[i][x+1]; \
v[a] += (m[idx1] ^ u512[idx2]) + v[b]; \ v[a] += (m[idx1] ^ u512[idx2]) + v[b]; \
v[d] = ROTR( v[d] ^ v[a], 32); \ v[d] = SWAPDWORDS(v[d] ^ v[a]); \
v[c] += v[d]; \ v[c] += v[d]; \
v[b] = ROTR( v[b] ^ v[c], 25); \ v[b] = ROTR( v[b] ^ v[c], 25); \
v[a] += (m[idx2] ^ u512[idx1]) + v[b]; \ v[a] += (m[idx2] ^ u512[idx1]) + v[b]; \

Loading…
Cancel
Save