From a747e4ca0f329b049a1ba6cbe88dc9c7bc613aa5 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 9 Nov 2014 01:13:28 +0100 Subject: [PATCH] blake512: use a new SWAPDWORDS asm func (0.05ms) small improvement, do it on pentablake and heavy variants too based on sp commit (but SWAP32 is already used for 32bit ints) --- cuda_helper.h | 19 ++++++++++++++++--- heavy/cuda_blake512.cu | 2 +- pentablake.cu | 2 +- quark/cuda_bmw512.cu | 14 ++++++++------ quark/cuda_quark_blake512.cu | 3 +-- 5 files changed, 27 insertions(+), 13 deletions(-) diff --git a/cuda_helper.h b/cuda_helper.h index 03e38cb..21f1b1d 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -36,7 +36,7 @@ extern const uint3 threadIdx; #define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) -#if __CUDA_ARCH__ < 350 +#if __CUDA_ARCH__ < 320 // Kepler (Compute 3.0) #define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) #else @@ -253,7 +253,7 @@ uint64_t shl_t64(uint64_t x, uint32_t n) #endif // 64-bit ROTATE RIGHT -#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 1 +#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1 /* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */ __device__ __forceinline__ uint64_t ROTR64(const uint64_t value, const int offset) { @@ -289,7 +289,7 @@ uint64_t ROTR64(const uint64_t x, const int offset) #endif // 64-bit ROTATE LEFT -#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 1 +#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1 __device__ __forceinline__ uint64_t ROTL64(const uint64_t value, const int offset) { uint2 result; @@ -342,4 +342,17 @@ uint64_t ROTL64(const uint64_t x, const int offset) #define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) #endif +__device__ __forceinline__ +uint64_t SWAPDWORDS(const uint64_t value) +{ +#if __CUDA_ARCH__ >= 320 + uint2 temp; + asm("mov.b64 {%0, %1}, %2; ": "=r"(temp.x), "=r"(temp.y) : "l"(value)); + asm("mov.b64 %0, {%1, %2}; ": "=l"(value) : "r"(temp.y), "r"(temp.x)); + return value; +#else + return ROTL64(value, 32); +#endif +} + #endif // #ifndef CUDA_HELPER_H diff --git a/heavy/cuda_blake512.cu b/heavy/cuda_blake512.cu index ea4b82b..fe58bc0 100644 --- a/heavy/cuda_blake512.cu +++ b/heavy/cuda_blake512.cu @@ -66,7 +66,7 @@ const uint64_t host_u512[16] = #define G(a,b,c,d,e) \ v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\ - v[d] = ROTR64( v[d] ^ v[a],32); \ + v[d] = SWAPDWORDS( v[d] ^ v[a]); \ v[c] += v[d]; \ v[b] = ROTR64( v[b] ^ v[c],25); \ v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b]; \ diff --git a/pentablake.cu b/pentablake.cu index 9958e53..344cbdd 100644 --- a/pentablake.cu +++ b/pentablake.cu @@ -112,7 +112,7 @@ const uint64_t c_u512[16] = uint32_t idx1 = c_sigma[i][x]; \ uint32_t idx2 = c_sigma[i][x+1]; \ v[a] += (m[idx1] ^ c_u512[idx2]) + v[b]; \ - v[d] = ROTR64(v[d] ^ v[a], 32); \ + v[d] = SWAPDWORDS(v[d] ^ v[a]); \ v[c] += v[d]; \ v[b] = ROTR64(v[b] ^ v[c], 25); \ v[a] += (m[idx2] ^ c_u512[idx1]) + v[b]; \ diff --git a/quark/cuda_bmw512.cu b/quark/cuda_bmw512.cu index ea10918..f037b04 100644 --- a/quark/cuda_bmw512.cu +++ b/quark/cuda_bmw512.cu @@ -12,12 +12,14 @@ __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + paddi #define SHL(x, n) ((x) << (n)) #define SHR(x, n) ((x) >> (n)) -#define CONST_EXP2 q[i+0] + ROTL64(q[i+1], 5) + q[i+2] + ROTL64(q[i+3], 11) + \ - q[i+4] + ROTL64(q[i+5], 27) + q[i+6] + ROTL64(q[i+7], 32) + \ - q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \ - q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]) - -__device__ void Compression512(uint64_t *msg, uint64_t *hash) +#define CONST_EXP2 \ + q[i+0] + ROTL64(q[i+1], 5) + q[i+2] + ROTL64(q[i+3], 11) + \ + q[i+4] + ROTL64(q[i+5], 27) + q[i+6] + SWAPDWORDS(q[i+7]) + \ + q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \ + q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]) + +__device__ +void Compression512(uint64_t *msg, uint64_t *hash) { // Compression ref. implementation uint64_t tmp; diff --git a/quark/cuda_quark_blake512.cu b/quark/cuda_quark_blake512.cu index 39e633b..1231f7a 100644 --- a/quark/cuda_quark_blake512.cu +++ b/quark/cuda_quark_blake512.cu @@ -7,7 +7,6 @@ #define USE_SHUFFLE 0 -// die Message it Padding zur Berechnung auf der GPU __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) // ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------ @@ -51,7 +50,7 @@ const uint64_t c_u512[16] = uint32_t idx1 = sigma[i][x]; \ uint32_t idx2 = sigma[i][x+1]; \ v[a] += (m[idx1] ^ u512[idx2]) + v[b]; \ - v[d] = ROTR( v[d] ^ v[a], 32); \ + v[d] = SWAPDWORDS(v[d] ^ v[a]); \ v[c] += v[d]; \ v[b] = ROTR( v[b] ^ v[c], 25); \ v[a] += (m[idx2] ^ u512[idx1]) + v[b]; \