From a747e4ca0f329b049a1ba6cbe88dc9c7bc613aa5 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Sun, 9 Nov 2014 01:13:28 +0100
Subject: [PATCH] blake512: use a new SWAPDWORDS asm func (0.05ms)

small improvement, do it on pentablake and heavy variants too

based on sp commit (but SWAP32 is already used for 32bit ints)
---
 cuda_helper.h                | 19 ++++++++++++++++---
 heavy/cuda_blake512.cu       |  2 +-
 pentablake.cu                |  2 +-
 quark/cuda_bmw512.cu         | 14 ++++++++------
 quark/cuda_quark_blake512.cu |  3 +--
 5 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/cuda_helper.h b/cuda_helper.h
index 03e38cb..21f1b1d 100644
--- a/cuda_helper.h
+++ b/cuda_helper.h
@@ -36,7 +36,7 @@ extern const uint3 threadIdx;
 
 #define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
 
-#if __CUDA_ARCH__ < 350
+#if __CUDA_ARCH__ < 320
 // Kepler (Compute 3.0)
 #define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
 #else
@@ -253,7 +253,7 @@ uint64_t shl_t64(uint64_t x, uint32_t n)
 #endif
 
 // 64-bit ROTATE RIGHT
-#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 1
+#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1
 /* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */
 __device__ __forceinline__
 uint64_t ROTR64(const uint64_t value, const int offset) {
@@ -289,7 +289,7 @@ uint64_t ROTR64(const uint64_t x, const int offset)
 #endif
 
 // 64-bit ROTATE LEFT
-#if __CUDA_ARCH__ >= 350 && USE_ROT_ASM_OPT == 1
+#if __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 1
 __device__ __forceinline__
 uint64_t ROTL64(const uint64_t value, const int offset) {
 	uint2 result;
@@ -342,4 +342,17 @@ uint64_t ROTL64(const uint64_t x, const int offset)
 #define ROTL64(x, n)  (((x) << (n)) | ((x) >> (64 - (n))))
 #endif
 
+__device__ __forceinline__
+uint64_t SWAPDWORDS(const uint64_t value)
+{
+#if __CUDA_ARCH__ >= 320
+	uint2 temp;
+	asm("mov.b64 {%0, %1}, %2; ": "=r"(temp.x), "=r"(temp.y) : "l"(value));
+	asm("mov.b64 %0, {%1, %2}; ": "=l"(value) : "r"(temp.y), "r"(temp.x));
+	return value;
+#else
+	return ROTL64(value, 32);
+#endif
+}
+
 #endif // #ifndef CUDA_HELPER_H
diff --git a/heavy/cuda_blake512.cu b/heavy/cuda_blake512.cu
index ea4b82b..fe58bc0 100644
--- a/heavy/cuda_blake512.cu
+++ b/heavy/cuda_blake512.cu
@@ -66,7 +66,7 @@ const uint64_t host_u512[16] =
 
 #define G(a,b,c,d,e)          \
     v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\
-    v[d] = ROTR64( v[d] ^ v[a],32);        \
+    v[d] = SWAPDWORDS( v[d] ^ v[a]);        \
     v[c] += v[d];           \
     v[b] = ROTR64( v[b] ^ v[c],25);        \
     v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b];  \
diff --git a/pentablake.cu b/pentablake.cu
index 9958e53..344cbdd 100644
--- a/pentablake.cu
+++ b/pentablake.cu
@@ -112,7 +112,7 @@ const uint64_t c_u512[16] =
 	uint32_t idx1 = c_sigma[i][x]; \
 	uint32_t idx2 = c_sigma[i][x+1]; \
 	v[a] += (m[idx1] ^ c_u512[idx2]) + v[b]; \
-	v[d] = ROTR64(v[d] ^ v[a], 32); \
+	v[d] = SWAPDWORDS(v[d] ^ v[a]); \
 	v[c] += v[d]; \
 	v[b] = ROTR64(v[b] ^ v[c], 25); \
 	v[a] += (m[idx2] ^ c_u512[idx1]) + v[b]; \
diff --git a/quark/cuda_bmw512.cu b/quark/cuda_bmw512.cu
index ea10918..f037b04 100644
--- a/quark/cuda_bmw512.cu
+++ b/quark/cuda_bmw512.cu
@@ -12,12 +12,14 @@ __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + paddi
 #define SHL(x, n)            ((x) << (n))
 #define SHR(x, n)            ((x) >> (n))
 
-#define CONST_EXP2    q[i+0] + ROTL64(q[i+1], 5)  + q[i+2] + ROTL64(q[i+3], 11) + \
-                    q[i+4] + ROTL64(q[i+5], 27) + q[i+6] + ROTL64(q[i+7], 32) + \
-                    q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \
-                    q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
-
-__device__ void Compression512(uint64_t *msg, uint64_t *hash)
+#define CONST_EXP2 \
+    q[i+0] + ROTL64(q[i+1], 5)  + q[i+2] + ROTL64(q[i+3], 11) + \
+    q[i+4] + ROTL64(q[i+5], 27) + q[i+6] + SWAPDWORDS(q[i+7]) + \
+    q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \
+    q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
+
+__device__
+void Compression512(uint64_t *msg, uint64_t *hash)
 {
     // Compression ref. implementation
     uint64_t tmp;
diff --git a/quark/cuda_quark_blake512.cu b/quark/cuda_quark_blake512.cu
index 39e633b..1231f7a 100644
--- a/quark/cuda_quark_blake512.cu
+++ b/quark/cuda_quark_blake512.cu
@@ -7,7 +7,6 @@
 
 #define USE_SHUFFLE 0
 
-// die Message it Padding zur Berechnung auf der GPU
 __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
 
 // ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------
@@ -51,7 +50,7 @@ const uint64_t c_u512[16] =
 	uint32_t idx1 = sigma[i][x]; \
 	uint32_t idx2 = sigma[i][x+1]; \
 	v[a] += (m[idx1] ^ u512[idx2]) + v[b]; \
-	v[d] = ROTR( v[d] ^ v[a], 32); \
+	v[d] = SWAPDWORDS(v[d] ^ v[a]); \
 	v[c] += v[d]; \
 	v[b] = ROTR( v[b] ^ v[c], 25); \
 	v[a] += (m[idx2] ^ u512[idx1]) + v[b]; \