From 957d919a6a7b22910a7ff4945a2ab993f84a10cb Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 24 Oct 2015 07:20:39 +0200 Subject: [PATCH] bmw512: save a few KBs, ifdef 80-bytes kernel was only used by animecoin Also ifdef SM 3.0 compat. code to be ignored on recent archs --- quark/cuda_bmw512.cu | 41 +++++++++++++++++++++++++---------------- quark/cuda_bmw512_30.cu | 14 +++++++++++++- 2 files changed, 38 insertions(+), 17 deletions(-) diff --git a/quark/cuda_bmw512.cu b/quark/cuda_bmw512.cu index bcf3c46..1c7180c 100644 --- a/quark/cuda_bmw512.cu +++ b/quark/cuda_bmw512.cu @@ -1,9 +1,13 @@ #include #include +#undef WANT_BMW512_80 + #include "cuda_helper.h" +#ifdef WANT_BMW512_80 __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) +#endif #include "cuda_bmw512_30.cu" @@ -21,7 +25,8 @@ __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + paddi q[i+8] + ROL(q[i+9], 37) + q[i+10] + ROL(q[i+11], 43) + \ q[i+12] + ROL(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]) -__device__ void Compression512_64_first(uint2 *msg, uint2 *hash) +__device__ +void Compression512_64_first(uint2 *msg, uint2 *hash) { // Compression ref. implementation uint2 q[32]; @@ -380,6 +385,8 @@ void quark_bmw512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t * } } +#ifdef WANT_BMW512_80 + __global__ __launch_bounds__(256, 2) void quark_bmw512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) { @@ -436,12 +443,6 @@ void quark_bmw512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t * } } -__host__ -void quark_bmw512_cpu_init(int thr_id, uint32_t threads) -{ - cuda_get_arch(thr_id); -} - __host__ void quark_bmw512_cpu_setBlock_80(void *pdata) { @@ -455,29 +456,37 @@ void quark_bmw512_cpu_setBlock_80(void *pdata) } __host__ -void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order) { - const uint32_t threadsperblock = 32; + const uint32_t threadsperblock = 128; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - int dev_id = device_map[thr_id]; + if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) - quark_bmw512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + quark_bmw512_gpu_hash_80<<>>(threads, startNounce, (uint64_t*)d_hash); else - quark_bmw512_gpu_hash_64_30<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + quark_bmw512_gpu_hash_80_30<<>>(threads, startNounce, (uint64_t*)d_hash); } +#endif + __host__ -void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order) +void quark_bmw512_cpu_init(int thr_id, uint32_t threads) { - const uint32_t threadsperblock = 128; + cuda_get_arch(thr_id); +} + +__host__ +void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +{ + const uint32_t threadsperblock = 32; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); int dev_id = device_map[thr_id]; if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) - quark_bmw512_gpu_hash_80<<>>(threads, startNounce, (uint64_t*)d_hash); + quark_bmw512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); else - quark_bmw512_gpu_hash_80_30<<>>(threads, startNounce, (uint64_t*)d_hash); + quark_bmw512_gpu_hash_64_30<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); } diff --git a/quark/cuda_bmw512_30.cu b/quark/cuda_bmw512_30.cu index e0edf81..5b204e3 100644 --- a/quark/cuda_bmw512_30.cu +++ b/quark/cuda_bmw512_30.cu @@ -12,7 +12,9 @@ q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \ q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]) -static __constant__ uint64_t d_constMem[16] = { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 500 + +__constant__ uint64_t d_constMem[16] = { SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F), SPH_C64(0x9091929394959697), @@ -213,6 +215,8 @@ void quark_bmw512_gpu_hash_64_30(uint32_t threads, uint32_t startNounce, uint64_ } } +#ifdef WANT_BMW512_80 + __global__ void quark_bmw512_gpu_hash_80_30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) { @@ -250,3 +254,11 @@ void quark_bmw512_gpu_hash_80_30(uint32_t threads, uint32_t startNounce, uint64_ outpHash[i] = ((uint2*)message)[i+8]; } } + +#endif + +#else /* stripped stubs for other archs */ +__global__ void quark_bmw512_gpu_hash_64_30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) {} +__global__ void quark_bmw512_gpu_hash_80_30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {} +#endif +