blake512: use a new SWAPDWORDS asm func (0.05ms)

Small improvement; also applied to the pentablake and heavy variants. Based on sp's commit (but SWAP32 is already used for 32-bit ints).
2014-11-09 01:13:28 +01:00 · 2014-11-09 01:13:28 +01:00 · a747e4ca0f
commit a747e4ca0f
parent 2d98d127f8
5 changed files with 26 additions and 12 deletions
--- a/cuda_helper.h
+++ b/cuda_helper.h
@ -36,7 +36,7 @@ extern const uint3 threadIdx;

 #define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))

-#if __CUDA_ARCH__ < 350
+#if __CUDA_ARCH__ < 320
 // Kepler (Compute 3.0)
 #define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
 #else
@ -253,7 +253,7 @@ uint64_t shl_t64(uint64_t x, uint32_t n)
 #endif

 // 64-bit ROTATE RIGHT
-#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 1
+#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1
 /* complicated sm >= 3.5 one (accelerated by the funnel shifter), to bench */
 __device__ __forceinline__
 uint64_t ROTR64(const uint64_t value, const int offset) {
@ -289,7 +289,7 @@ uint64_t ROTR64(const uint64_t x, const int offset)
 #endif

 // 64-bit ROTATE LEFT
-#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 1
+#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1
 __device__ __forceinline__
 uint64_t ROTL64(const uint64_t value, const int offset) {
 	uint2 result;
@ -342,4 +342,17 @@ uint64_t ROTL64(const uint64_t x, const int offset)
 #define ROTL64(x, n)  (((x) << (n)) | ((x) >> (64 - (n))))
 #endif

+/**
+ * Exchange the upper and lower 32-bit halves of a 64-bit word.
+ *
+ * The result is the same as rotating the word by 32 bits, which is what the
+ * fallback path does.
+ *
+ * @param value 64-bit input word (not modified)
+ * @return value with its two 32-bit halves swapped
+ */
+__device__ __forceinline__
+uint64_t SWAPDWORDS(const uint64_t value)
+{
+#if __CUDA_ARCH__ >= 320
+	uint2 halves;
+	// Separate result register: `value` is const and must not be used as an
+	// asm output operand.
+	uint64_t swapped;
+	// Unpack: halves.x receives the low 32 bits, halves.y the high 32 bits.
+	asm("mov.b64 {%0, %1}, %2; ": "=r"(halves.x), "=r"(halves.y) : "l"(value));
+	// Repack in the opposite order: old high half becomes the new low half.
+	asm("mov.b64 %0, {%1, %2}; ": "=l"(swapped) : "r"(halves.y), "r"(halves.x));
+	return swapped;
+#else
+	return ROTL64(value, 32);
+#endif
+}
+
 #endif // #ifndef CUDA_HELPER_H
--- a/heavy/cuda_blake512.cu
+++ b/heavy/cuda_blake512.cu
@ -66,7 +66,7 @@ const uint64_t host_u512[16] =

 #define G(a,b,c,d,e)          \
    v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\
-    v[d] = ROTR64( v[d] ^ v[a],32);        \
+    v[d] = SWAPDWORDS( v[d] ^ v[a]);        \
    v[c] += v[d];           \
    v[b] = ROTR64( v[b] ^ v[c],25);        \
    v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b];  \
--- a/pentablake.cu
+++ b/pentablake.cu
@ -112,7 +112,7 @@ const uint64_t c_u512[16] =
 	uint32_t idx1 = c_sigma[i][x]; \
 	uint32_t idx2 = c_sigma[i][x+1]; \
 	v[a] += (m[idx1] ^ c_u512[idx2]) + v[b]; \
-	v[d] = ROTR64(v[d] ^ v[a], 32); \
+	v[d] = SWAPDWORDS(v[d] ^ v[a]); \
 	v[c] += v[d]; \
 	v[b] = ROTR64(v[b] ^ v[c], 25); \
 	v[a] += (m[idx2] ^ c_u512[idx1]) + v[b]; \
--- a/quark/cuda_bmw512.cu
+++ b/quark/cuda_bmw512.cu
@ -12,12 +12,14 @@ __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + paddi
 #define SHL(x, n)            ((x) << (n))
 #define SHR(x, n)            ((x) >> (n))

-#define CONST_EXP2    q[i+0] + ROTL64(q[i+1], 5)  + q[i+2] + ROTL64(q[i+3], 11) + \
-                    q[i+4] + ROTL64(q[i+5], 27) + q[i+6] + ROTL64(q[i+7], 32) + \
-                    q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \
-                    q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
+#define CONST_EXP2 \
+    q[i+0] + ROTL64(q[i+1], 5)  + q[i+2] + ROTL64(q[i+3], 11) + \
+    q[i+4] + ROTL64(q[i+5], 27) + q[i+6] + SWAPDWORDS(q[i+7]) + \
+    q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \
+    q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])

-__device__ void Compression512(uint64_t *msg, uint64_t *hash)
+__device__
+void Compression512(uint64_t *msg, uint64_t *hash)
 {
    // Compression ref. implementation
    uint64_t tmp;
--- a/quark/cuda_quark_blake512.cu
+++ b/quark/cuda_quark_blake512.cu
@ -7,7 +7,6 @@

 #define USE_SHUFFLE 0

-// die Message it Padding zur Berechnung auf der GPU
 __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)

 // ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------
@ -51,7 +50,7 @@ const uint64_t c_u512[16] =
 	uint32_t idx1 = sigma[i][x]; \
 	uint32_t idx2 = sigma[i][x+1]; \
 	v[a] += (m[idx1] ^ u512[idx2]) + v[b]; \
-	v[d] = ROTR( v[d] ^ v[a], 32); \
+	v[d] = SWAPDWORDS(v[d] ^ v[a]); \
 	v[c] += v[d]; \
 	v[b] = ROTR( v[b] ^ v[c], 25); \
 	v[a] += (m[idx2] ^ u512[idx1]) + v[b]; \