From 38e6672d7032ddd8dff326f8d21f084b0c487811 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 28 Mar 2015 10:09:55 +0100 Subject: [PATCH] Allow test of SM 2.1/3.0 binaries on newer cards Implementation based on klausT work.. a bit different This code must be placed in a common .cu file, cuda.cpp is not compiled with nvcc and doesnt allow cuda code... --- Algo256/cuda_skein256.cu | 4 ++-- cuda_checkhash.cu | 27 ++++++++++++++++++++++++++- cuda_helper.h | 3 +++ quark/cuda_quark_groestl512.cu | 28 +++++++++++++++------------- quark/cuda_quark_keccak512.cu | 4 +++- quark/cuda_skein512.cu | 12 ++++++++---- 6 files changed, 57 insertions(+), 21 deletions(-) diff --git a/Algo256/cuda_skein256.cu b/Algo256/cuda_skein256.cu index 71687dc..bc9acef 100644 --- a/Algo256/cuda_skein256.cu +++ b/Algo256/cuda_skein256.cu @@ -295,7 +295,7 @@ void skein256_gpu_hash_32_v30(uint32_t threads, uint32_t startNounce, uint64_t * __host__ void skein256_cpu_init(int thr_id, uint32_t threads) { - //empty + cuda_get_arch(thr_id); } __host__ @@ -306,7 +306,7 @@ void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, ui dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - if (device_sm[device_map[thr_id]] >= 320) + if (device_sm[device_map[thr_id]] > 300 && cuda_arch[device_map[thr_id]] > 300) skein256_gpu_hash_32<<>>(threads, startNounce, d_outputHash); else skein256_gpu_hash_32_v30<<>>(threads, startNounce, d_outputHash); diff --git a/cuda_checkhash.cu b/cuda_checkhash.cu index 85c2c04..4bbef59 100644 --- a/cuda_checkhash.cu +++ b/cuda_checkhash.cu @@ -195,4 +195,29 @@ uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNoun result = *h_resNonces[thr_id]; return result; -} \ No newline at end of file +} + +/* Function to get the compiled Shader Model version */ +int cuda_arch[MAX_GPUS] = { 0 }; +__global__ +void nvcc_get_arch(int *d_version) +{ +#ifdef __CUDA_ARCH__ + *d_version = __CUDA_ARCH__; +#endif +} + +__host__ +int cuda_get_arch(int thr_id) +{ + int *d_version; + int dev_id = device_map[thr_id]; + if (cuda_arch[dev_id] == 0) { + // only do it once... + cudaMalloc(&d_version, sizeof(int)); + nvcc_get_arch <<< 1, 1 >>> (d_version); + cudaMemcpy(&cuda_arch[dev_id], d_version, sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(d_version); + } + return cuda_arch[dev_id]; +} diff --git a/cuda_helper.h b/cuda_helper.h index 6b1ce24..355c098 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -20,7 +20,10 @@ extern "C" short device_map[MAX_GPUS]; extern "C" long device_sm[MAX_GPUS]; +extern int cuda_arch[MAX_GPUS]; + // common functions +extern int cuda_get_arch(int thr_id); extern void cuda_check_cpu_init(int thr_id, uint32_t threads); extern void cuda_check_cpu_setTarget(const void *ptarget); extern uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash); diff --git a/quark/cuda_quark_groestl512.cu b/quark/cuda_quark_groestl512.cu index ba69214..3ec7f9b 100644 --- a/quark/cuda_quark_groestl512.cu +++ b/quark/cuda_quark_groestl512.cu @@ -133,15 +133,17 @@ __global__ void __launch_bounds__(TPB, THF) #endif } - - -__host__ void quark_groestl512_cpu_init(int thr_id, uint32_t threads) +__host__ +void quark_groestl512_cpu_init(int thr_id, uint32_t threads) { - if (device_sm[device_map[thr_id]] < 300) + int dev_id = device_map[thr_id]; + cuda_get_arch(thr_id); + if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) quark_groestl512_sm20_init(thr_id, threads); } -__host__ void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ +void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { int threadsperblock = TPB; @@ -153,11 +155,10 @@ __host__ void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_ dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock)); dim3 block(threadsperblock); - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; + int dev_id = device_map[thr_id]; - if (device_sm[device_map[thr_id]] >= 300) - quark_groestl512_gpu_hash_64_quad<<>>(threads, startNounce, d_hash, d_nonceVector); + if (device_sm[dev_id] >= 300 && cuda_arch[dev_id] >= 300) + quark_groestl512_gpu_hash_64_quad<<>>(threads, startNounce, d_hash, d_nonceVector); else quark_groestl512_sm20_hash_64(thr_id, threads, startNounce, d_nonceVector, d_hash, order); @@ -165,7 +166,8 @@ __host__ void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_ MyStreamSynchronize(NULL, order, thr_id); } -__host__ void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ +void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { const int factor = THF; int threadsperblock = TPB; @@ -173,10 +175,10 @@ __host__ void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, u dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock)); dim3 block(threadsperblock); - size_t shared_size = 0; + int dev_id = device_map[thr_id]; - if (device_sm[device_map[thr_id]] >= 300) - quark_doublegroestl512_gpu_hash_64_quad<<>>(threads, startNounce, d_hash, d_nonceVector); + if (device_sm[dev_id] >= 300 && cuda_arch[dev_id] >= 300) + quark_doublegroestl512_gpu_hash_64_quad<<>>(threads, startNounce, d_hash, d_nonceVector); else quark_doublegroestl512_sm20_hash_64(thr_id, threads, startNounce, d_nonceVector, d_hash, order); diff --git a/quark/cuda_quark_keccak512.cu b/quark/cuda_quark_keccak512.cu index 5d7dcd7..21ee856 100644 --- a/quark/cuda_quark_keccak512.cu +++ b/quark/cuda_quark_keccak512.cu @@ -248,7 +248,9 @@ void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNou dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - if (device_sm[device_map[thr_id]] >= 320) + int dev_id = device_map[thr_id]; + + if (device_sm[dev_id] >= 320) quark_keccak512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); else quark_keccak512_gpu_hash_64_v30<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); diff --git a/quark/cuda_skein512.cu b/quark/cuda_skein512.cu index 3de4309..f472087 100644 --- a/quark/cuda_skein512.cu +++ b/quark/cuda_skein512.cu @@ -11,7 +11,7 @@ static __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes #define SHL(x, n) ((x) << (n)) #define SHR(x, n) ((x) >> (n)) -#if __CUDA_ARCH__ >= 320 +#if __CUDA_ARCH__ > 300 __device__ uint64_t skein_rotl64(const uint64_t x, const int offset) { @@ -443,11 +443,11 @@ void quark_skein512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); // fertig - uint64_t *outpHash = &g_hash[8 * hashPosition]; + uint2 *outpHash = (uint2*) (&g_hash[hashPosition * 8]); #pragma unroll 8 for(int i=0; i<8; i++) - outpHash[i] = devectorize(p[i]); + outpHash[i] = p[i]; } } @@ -724,6 +724,8 @@ void skein512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *outp __host__ void quark_skein512_cpu_init(int thr_id, uint32_t threads) { + // store the binary SM version + cuda_get_arch(thr_id); } __host__ @@ -734,8 +736,10 @@ void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNoun dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); + int dev_id = device_map[thr_id]; + // uint2 uint64 variants for SM 3.2+ - if (device_sm[device_map[thr_id]] >= 320) + if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) quark_skein512_gpu_hash_64 <<>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector); else quark_skein512_gpu_hash_64_v30 <<>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector);