From 957d919a6a7b22910a7ff4945a2ab993f84a10cb Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Sat, 24 Oct 2015 07:20:39 +0200
Subject: [PATCH] bmw512: save a few KBs, ifdef 80-bytes kernel

was only used by animecoin

Also ifdef SM 3.0 compat. code to be ignored on recent archs
---
 quark/cuda_bmw512.cu    | 41 +++++++++++++++++++++++++----------------
 quark/cuda_bmw512_30.cu | 14 +++++++++++++-
 2 files changed, 38 insertions(+), 17 deletions(-)

diff --git a/quark/cuda_bmw512.cu b/quark/cuda_bmw512.cu
index bcf3c46..1c7180c 100644
--- a/quark/cuda_bmw512.cu
+++ b/quark/cuda_bmw512.cu
@@ -1,9 +1,13 @@
 #include <stdio.h>
 #include <memory.h>
 
+#undef WANT_BMW512_80
+
 #include "cuda_helper.h"
 
+#ifdef WANT_BMW512_80
 __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
+#endif
 
 #include "cuda_bmw512_30.cu"
 
@@ -21,7 +25,8 @@ __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + paddi
 	q[i+8] + ROL(q[i+9], 37) + q[i+10] + ROL(q[i+11], 43) + \
 	q[i+12] + ROL(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
 
-__device__ void Compression512_64_first(uint2 *msg, uint2 *hash)
+__device__
+void Compression512_64_first(uint2 *msg, uint2 *hash)
 {
 	// Compression ref. implementation
 	uint2 q[32];
@@ -380,6 +385,8 @@ void quark_bmw512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *
 	}
 }
 
+#ifdef WANT_BMW512_80
+
 __global__ __launch_bounds__(256, 2)
 void quark_bmw512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
 {
@@ -436,12 +443,6 @@ void quark_bmw512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *
 	}
 }
 
-__host__
-void quark_bmw512_cpu_init(int thr_id, uint32_t threads)
-{
-	cuda_get_arch(thr_id);
-}
-
 __host__
 void quark_bmw512_cpu_setBlock_80(void *pdata)
 {
@@ -455,29 +456,37 @@ void quark_bmw512_cpu_setBlock_80(void *pdata)
 }
 
 __host__
-void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
+void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order)
 {
-	const uint32_t threadsperblock = 32;
+	const uint32_t threadsperblock = 128;
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
-
 	int dev_id = device_map[thr_id];
+
 	if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300)
-		quark_bmw512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
+		quark_bmw512_gpu_hash_80<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash);
 	else
-		quark_bmw512_gpu_hash_64_30<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
+		quark_bmw512_gpu_hash_80_30<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash);
 }
 
+#endif
+
 __host__
-void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order)
+void quark_bmw512_cpu_init(int thr_id, uint32_t threads)
 {
-	const uint32_t threadsperblock = 128;
+	cuda_get_arch(thr_id);
+}
+
+__host__
+void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
+{
+	const uint32_t threadsperblock = 32;
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
 	int dev_id = device_map[thr_id];
 	if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300)
-		quark_bmw512_gpu_hash_80<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash);
+		quark_bmw512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 	else
-		quark_bmw512_gpu_hash_80_30<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash);
+		quark_bmw512_gpu_hash_64_30<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 }
diff --git a/quark/cuda_bmw512_30.cu b/quark/cuda_bmw512_30.cu
index e0edf81..5b204e3 100644
--- a/quark/cuda_bmw512_30.cu
+++ b/quark/cuda_bmw512_30.cu
@@ -12,7 +12,9 @@
 	q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \
 	q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
 
-static __constant__ uint64_t d_constMem[16] = {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 500
+
+__constant__ uint64_t d_constMem[16] = {
 	SPH_C64(0x8081828384858687),
 	SPH_C64(0x88898A8B8C8D8E8F),
 	SPH_C64(0x9091929394959697),
@@ -213,6 +215,8 @@ void quark_bmw512_gpu_hash_64_30(uint32_t threads, uint32_t startNounce, uint64_
 	}
 }
 
+#ifdef WANT_BMW512_80
+
 __global__
 void quark_bmw512_gpu_hash_80_30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
 {
@@ -250,3 +254,11 @@ void quark_bmw512_gpu_hash_80_30(uint32_t threads, uint32_t startNounce, uint64_
 			outpHash[i] = ((uint2*)message)[i+8];
 	}
 }
+
+#endif
+
+#else /* stripped stubs for other archs */
+__global__ void quark_bmw512_gpu_hash_64_30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) {}
+__global__ void quark_bmw512_gpu_hash_80_30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {}
+#endif
+