From 1b241df5c04f8eb3d1ef9c7377306f64fb663459 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Mon, 20 Oct 2014 19:01:00 +0200
Subject: [PATCH] cubehash and luffa funnel shift (from klaus)

No gain... but I like this define, it is more readable in luffa ;)
---
 x11/cuda_x11_cubehash512.cu | 17 +++++++++++++----
 x11/cuda_x11_luffa512.cu    | 25 ++++++++++++++-----------
 2 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/x11/cuda_x11_cubehash512.cu b/x11/cuda_x11_cubehash512.cu
index 76ad2e9..5f77b0b 100644
--- a/x11/cuda_x11_cubehash512.cu
+++ b/x11/cuda_x11_cubehash512.cu
@@ -8,8 +8,15 @@ typedef unsigned char BitSequence;
 #define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */
 #define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */
 
-#define ROTATEUPWARDS7(a) (((a) << 7) | ((a) >> 25))
-#define ROTATEUPWARDS11(a) (((a) << 11) | ((a) >> 21))
+#if __CUDA_ARCH__ < 350 /* also taken when __CUDA_ARCH__ is undefined (evaluates to 0 in #if) */
+#define LROT(x,bits) (((x) << (bits)) | ((x) >> (32 - (bits)))) /* rotate left; expects a 32-bit unsigned x and bits in 1..31; args parenthesised so expressions expand safely */
+#else /* compute capability 3.5 and newer: use the funnel-shift intrinsic */
+#define LROT(x, bits) __funnelshift_l(x, x, bits)
+#endif
+
+#define ROTATEUPWARDS7(a)  LROT(a,7)
+#define ROTATEUPWARDS11(a) LROT(a,11)
+
 #define SWAP(a,b) { uint32_t u = a; a = b; b = u; }
 
 __device__ __constant__
@@ -278,11 +285,13 @@ __global__ void x11_cubehash512_gpu_hash_64(int threads, uint32_t startNounce, u
 
 
 // Setup-Funktionen
-__host__ void x11_cubehash512_cpu_init(int thr_id, int threads)
+__host__ /* host-side setup function for the cubehash512 step */
+void x11_cubehash512_cpu_init(int thr_id, int threads) /* currently a no-op: the body is empty and neither argument is used */
 {
 }
 
-__host__ void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
+__host__
+void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
     const int threadsperblock = 256;
 
diff --git a/x11/cuda_x11_luffa512.cu b/x11/cuda_x11_luffa512.cu
index 72deb5f..c6dd1dc 100644
--- a/x11/cuda_x11_luffa512.cu
+++ b/x11/cuda_x11_luffa512.cu
@@ -20,9 +20,6 @@
 
 #include "cuda_helper.h"
 
-// aus heavy.cu
-extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-
 typedef unsigned char BitSequence;
 
 typedef struct {
@@ -41,11 +38,17 @@ typedef struct {
     a[1+(8*j)] = a[0+(8*j)] ^ tmp;\
     a[0+(8*j)] = tmp;
 
+#if __CUDA_ARCH__ < 350 /* also taken when __CUDA_ARCH__ is undefined (evaluates to 0 in #if) */
+#define LROT(x,bits) (((x) << (bits)) | ((x) >> (32 - (bits)))) /* rotate left; expects a 32-bit unsigned x and bits in 1..31; args parenthesised so expressions expand safely */
+#else /* compute capability 3.5 and newer: use the funnel-shift intrinsic */
+#define LROT(x, bits) __funnelshift_l(x, x, bits)
+#endif
+
 #define TWEAK(a0,a1,a2,a3,j)\
-    a0 = (a0<<(j))|(a0>>(32-j));\
-    a1 = (a1<<(j))|(a1>>(32-j));\
-    a2 = (a2<<(j))|(a2>>(32-j));\
-    a3 = (a3<<(j))|(a3>>(32-j));
+    a0 = LROT(a0,j); /* TWEAK rotates each of the four words left by j bits, in place */\
+    a1 = LROT(a1,j);\
+    a2 = LROT(a2,j);\
+    a3 = LROT(a3,j);
 
 #define STEP(c0,c1)\
     SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp);\
@@ -77,13 +80,13 @@ typedef struct {
 
 #define MIXWORD(a0,a4)\
     a4 ^= a0;\
-    a0  = (a0<<2) | (a0>>(30));\
+    a0  = LROT(a0,2); /* MIXWORD alternates XOR with left rotations by 2, 14, 10 and 1 bits, updating a0 and a4 in place */\
     a0 ^= a4;\
-    a4  = (a4<<14) | (a4>>(18));\
+    a4  = LROT(a4,14);\
     a4 ^= a0;\
-    a0  = (a0<<10) | (a0>>(22));\
+    a0  = LROT(a0,10);\
     a0 ^= a4;\
-    a4  = (a4<<1) | (a4>>(31));
+    a4  = LROT(a4,1);
 
 #define ADD_CONSTANT(a0,b0,c0,c1)\
     a0 ^= c0;\