From e50556b63749e8b68be96f295b5a8703dc1e59ed Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot
Date: Wed, 4 Nov 2015 13:31:15 +0100
Subject: [PATCH] various changes, cleanup for the release

small fixes to better handle multiple threads per GPU
explicitly report that quark is not compatible with SM 2.1 (compact shuffle)
---
 Algo256/cuda_fugue256.cu           | 18 ++-----
 Algo256/cuda_groestl256.cu         |  9 ----
 README.txt                         |  2 +-
 ccminer.cpp                        | 17 +++++--
 cuda_checkhash.cu                  | 14 ++++--
 cuda_fugue256.h                    |  6 +--
 fuguecoin.cpp                      | 13 +----
 lyra2/lyra2REv2.cu                 |  3 +-
 quark/cuda_quark_compactionTest.cu | 76 ++++++++++++++++--------------
 quark/quarkcoin.cu                 |  9 ++++
 x13/cuda_x13_fugue512.cu           |  5 --
 11 files changed, 84 insertions(+), 88 deletions(-)

diff --git a/Algo256/cuda_fugue256.cu b/Algo256/cuda_fugue256.cu
index a5b6628..5c9d687 100644
--- a/Algo256/cuda_fugue256.cu
+++ b/Algo256/cuda_fugue256.cu
@@ -724,14 +724,13 @@ fugue256_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outp
 __host__
 void fugue256_cpu_init(int thr_id, uint32_t threads)
 {
-	// Copy the hash tables into GPU memory
+	// Link the hash tables in the GPU
 	texDef(0, mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
 	texDef(1, mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256);
 	texDef(2, mixTab2Tex, mixTab2m, mixtab2_cpu, sizeof(uint32_t)*256);
 	texDef(3, mixTab3Tex, mixTab3m, mixtab3_cpu, sizeof(uint32_t)*256);
 
-	// Allocate memory for all results
-	cudaMalloc(&d_fugue256_hashoutput[thr_id], (size_t) 32 * threads);
+	CUDA_SAFE_CALL(cudaMalloc(&d_fugue256_hashoutput[thr_id], (size_t) 32 * threads));
 	cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
 }
 
@@ -741,11 +740,6 @@ void fugue256_cpu_free(int thr_id)
 {
 	cudaFree(d_fugue256_hashoutput[thr_id]);
 	cudaFree(d_resultNonce[thr_id]);
-	cudaUnbindTexture(mixTab0Tex);
-	cudaUnbindTexture(mixTab1Tex);
-	cudaUnbindTexture(mixTab2Tex);
-	cudaUnbindTexture(mixTab3Tex);
-
 	for (int i=0; i<4; i++)
 		cudaFree(d_textures[thr_id][i]);
 }
 
@@ -753,20 +747,18 @@ __host__
 void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
 {
-	// Make the CPU-side preparations
 	sph_fugue256_context ctx_fugue_const;
 	sph_fugue256_init(&ctx_fugue_const);
-	sph_fugue256 (&ctx_fugue_const, data, 80); // save the state
-
+	sph_fugue256 (&ctx_fugue_const, data, 80);
 	cudaMemcpyToSymbol(GPUstate, ctx_fugue_const.S, sizeof(uint32_t) * 30);
 
-	cudaMemcpyToSymbol(pTarget, pTargetIn, sizeof(uint32_t) * 8);
+	cudaMemcpyToSymbol(pTarget, pTargetIn, 32);
 
 	cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
 }
 
 __host__
-void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, void *outputHashes, uint32_t *nounce)
+void fugue256_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
 {
 #if USE_SHARED
 	const uint32_t threadsperblock = 256; // Alignment with mixtab size. DO NOT CHANGE
diff --git a/Algo256/cuda_groestl256.cu b/Algo256/cuda_groestl256.cu
index 604302a..5d796e2 100644
--- a/Algo256/cuda_groestl256.cu
+++ b/Algo256/cuda_groestl256.cu
@@ -283,15 +283,6 @@ void groestl256_cpu_init(int thr_id, uint32_t threads)
 __host__
 void groestl256_cpu_free(int thr_id)
 {
-	cudaUnbindTexture(t0up2);
-	cudaUnbindTexture(t0dn2);
-	cudaUnbindTexture(t1up2);
-	cudaUnbindTexture(t1dn2);
-	cudaUnbindTexture(t2up2);
-	cudaUnbindTexture(t2dn2);
-	cudaUnbindTexture(t3up2);
-	cudaUnbindTexture(t3dn2);
-
 	for (int i=0; i<8; i++)
 		cudaFree(d_textures[thr_id][i]);
diff --git a/README.txt b/README.txt
index 107248e..0723f05 100644
--- a/README.txt
+++ b/README.txt
@@ -228,7 +228,7 @@ features.
 
 >>> RELEASE HISTORY <<<
 
-	Nov. 02nd 2015	v1.7
+	Nov. 05th 2015	v1.7
 	Improve old devices compatibility (x11, lyra2, qubit...)
 	Add windows support for SM 2.1 and drop SM 3.5 (x86)
 	Improve lyra2 (v1/v2) cuda implementations
diff --git a/ccminer.cpp b/ccminer.cpp
index fc9bbd2..0bfe53c 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -2793,12 +2793,13 @@ void parse_arg(int key, char *arg)
 		if (p) d *= 1e9;
 		opt_max_rate = d;
 		break;
-	case 'd': // CB
+	case 'd': // --device
 		{
+			int device_thr[MAX_GPUS] = { 0 };
 			int ngpus = cuda_num_devices();
 			char * pch = strtok (arg,",");
 			opt_n_threads = 0;
-			while (pch != NULL) {
+			while (pch != NULL && opt_n_threads < MAX_GPUS) {
 				if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0')
 				{
 					if (atoi(pch) < ngpus)
@@ -2818,6 +2819,14 @@ void parse_arg(int key, char *arg)
 				}
 				pch = strtok (NULL, ",");
 			}
+			// count threads per gpu
+			for (int n=0; n < opt_n_threads; n++) {
+				int device = device_map[n];
+				device_thr[device]++;
+			}
+			for (int n=0; n < ngpus; n++) {
+				gpu_threads = max(gpu_threads, device_thr[n]);
+			}
 		}
 		break;
@@ -3177,8 +3186,8 @@ int main(int argc, char *argv[])
 	else if (active_gpus > opt_n_threads)
 		active_gpus = opt_n_threads;
 
-	// generally doesn't work... let 1
-	gpu_threads = opt_n_threads / active_gpus;
+	// generally doesn't work well...
+	gpu_threads = max(gpu_threads, opt_n_threads / active_gpus);
 
 	if (opt_benchmark && opt_algo == ALGO_AUTO) {
 		bench_init(opt_n_threads);
diff --git a/cuda_checkhash.cu b/cuda_checkhash.cu
index 7fdbc41..76a94f6 100644
--- a/cuda_checkhash.cu
+++ b/cuda_checkhash.cu
@@ -11,23 +11,27 @@
 __constant__ uint32_t pTarget[8]; // 32 bytes
 
 // store MAX_GPUS device arrays of 8 nonces
-static uint32_t* h_resNonces[MAX_GPUS];
-static uint32_t* d_resNonces[MAX_GPUS];
-static bool init_done = false;
+static uint32_t* h_resNonces[MAX_GPUS] = { NULL };
+static uint32_t* d_resNonces[MAX_GPUS] = { NULL };
+static __thread bool init_done = false;
 
 __host__
 void cuda_check_cpu_init(int thr_id, uint32_t threads)
 {
-	CUDA_CALL_OR_RET(cudaMallocHost(&h_resNonces[thr_id], 32));
 	CUDA_CALL_OR_RET(cudaMalloc(&d_resNonces[thr_id], 32));
+	CUDA_SAFE_CALL(cudaMallocHost(&h_resNonces[thr_id], 32));
 	init_done = true;
 }
 
 __host__
 void cuda_check_cpu_free(int thr_id)
 {
+	if (!init_done) return;
 	cudaFree(d_resNonces[thr_id]);
 	cudaFreeHost(h_resNonces[thr_id]);
+	d_resNonces[thr_id] = NULL;
+	h_resNonces[thr_id] = NULL;
+	init_done = false;
 }
 
 // Target Difficulty
@@ -198,7 +202,7 @@ uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounc
 	cuda_checkhash_64_suppl <<<grid, block>>> (startNounce, d_inputHash, d_resNonces[thr_id]);
 	cudaThreadSynchronize();
 
-	cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 32, cudaMemcpyDeviceToHost);
 	rescnt = h_resNonces[thr_id][0];
 	if (rescnt > numNonce) {
 		if (numNonce <= rescnt) {
diff --git a/cuda_fugue256.h b/cuda_fugue256.h
index a4852b4..44f3fd0 100644
--- a/cuda_fugue256.h
+++ b/cuda_fugue256.h
@@ -1,7 +1,7 @@
-#ifndef _CUDA_FUGUE512_H
-#define _CUDA_FUGUE512_H
+#ifndef _CUDA_FUGUE256_H
+#define _CUDA_FUGUE256_H
 
-void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, void *outputHashes, uint32_t *nounce);
+void fugue256_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce);
 void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn);
 void fugue256_cpu_init(int thr_id, uint32_t threads);
 void fugue256_cpu_free(int thr_id);
diff --git a/fuguecoin.cpp b/fuguecoin.cpp
index 7c703ce..9166774 100644
--- a/fuguecoin.cpp
+++ b/fuguecoin.cpp
@@ -8,14 +8,6 @@
 
 #include "cuda_fugue256.h"
 
-extern "C" void my_fugue256_init(void *cc);
-extern "C" void my_fugue256(void *cc, const void *data, size_t len);
-extern "C" void my_fugue256_close(void *cc, void *dst);
-extern "C" void my_fugue256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst);
-
-// contexts prepared after the first 80 bytes
-// sph_fugue256_context ctx_fugue_const[MAX_GPUS];
-
 #define SWAP32(x) \
 	((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \
 	 (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
@@ -38,11 +30,11 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne
 	uint32_t *ptarget = work->target;
 	uint32_t start_nonce = pdata[19]++;
 	int intensity = (device_sm[device_map[thr_id]] > 500) ? 22 : 19;
-	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 256*256*8
+	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
 	if (init[thr_id]) throughput = min(throughput, max_nonce - start_nonce);
 
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0xf;
+		ptarget[7] = 0xf;
 
 	// init
 	if(!init[thr_id])
@@ -57,7 +49,6 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne
 	for (int kk=0; kk < 20; kk++)
 		be32enc(&endiandata[kk], pdata[kk]);
 
-	// Prepare the context with the endian-swapped block header (the nonce gets replaced later)
 	fugue256_cpu_setBlock(thr_id, endiandata, (void*)ptarget);
 
 	do {
diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu
index bcc39d6..418c000 100644
--- a/lyra2/lyra2REv2.cu
+++ b/lyra2/lyra2REv2.cu
@@ -114,11 +114,12 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));
 
 		if (device_sm[dev_id] < 300) {
-			applog(LOG_ERR, "Device SM 3.0 or more recent required!");
+			gpulog(LOG_ERR, thr_id, "Device SM 3.0 or more recent required!");
 			proper_exit(1);
 			return -1;
 		}
 
+		api_set_throughput(thr_id, throughput);
 		init[thr_id] = true;
 	}
diff --git a/quark/cuda_quark_compactionTest.cu b/quark/cuda_quark_compactionTest.cu
index 47af463..2da167c 100644
--- a/quark/cuda_quark_compactionTest.cu
+++ b/quark/cuda_quark_compactionTest.cu
@@ -1,3 +1,7 @@
+/*
+ * REQUIRE SM 3.0 arch!
+ */
+
 #include <stdio.h>
 #include <memory.h>
 
@@ -10,6 +14,14 @@
 static uint32_t *h_numValid[MAX_GPUS];
 static uint32_t *d_partSum[2][MAX_GPUS]; // for up to four partial sums
 
+#if __CUDA_ARCH__ < 300
+/**
+ * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
+ */
+#undef __shfl_up
+#define __shfl_up(var, delta, width) (0)
+#endif
+
 // True/False tester
 typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash);
 
@@ -28,7 +40,8 @@ __device__ cuda_compactTestFunction_t d_QuarkTrueFunction = QuarkTrueTest, d_Qua
 cuda_compactTestFunction_t h_QuarkTrueFunction[MAX_GPUS], h_QuarkFalseFunction[MAX_GPUS];
 
 // Setup/Alloc Function
-__host__ void quark_compactTest_cpu_init(int thr_id, uint32_t threads)
+__host__
+void quark_compactTest_cpu_init(int thr_id, uint32_t threads)
 {
 	cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t));
 	cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t));
@@ -46,7 +59,8 @@ __host__ void quark_compactTest_cpu_init(int thr_id, uint32_t threads)
 }
 
 // Because all alloc should have a free...
-__host__ void quark_compactTest_cpu_free(int thr_id)
+__host__
+void quark_compactTest_cpu_free(int thr_id)
 {
 	cudaFree(d_tempBranch1Nonces[thr_id]);
 	cudaFree(d_numValid[thr_id]);
@@ -57,16 +71,9 @@ __host__ void quark_compactTest_cpu_free(int thr_id)
 	cudaFreeHost(h_numValid[thr_id]);
 }
 
-#if __CUDA_ARCH__ < 300
-/**
- * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
- */
-#undef __shfl_up
-#define __shfl_up(var, delta, width) (0)
-#endif
-
-// The sum function (from the NVIDIA SDK)
-__global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
+__global__
+void quark_compactTest_gpu_SCAN(uint32_t *data, const int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL,
+	uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
 {
 	extern __shared__ uint32_t sums[];
 	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
@@ -91,19 +98,16 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *
 			{
 				// no nonce list
 				inpHash = &inpHashes[id<<4];
-			}else
-			{
+			} else {
 				// nonce list available
 				int nonce = d_validNonceTable[id] - startNounce;
 				inpHash = &inpHashes[nonce<<4];
 			}
 			value = (*testFunc)(inpHash);
-		}else
-		{
+		} else {
 			value = 0;
 		}
-	}else
-	{
+	} else {
 		value = data[id];
 	}
@@ -115,8 +119,8 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *
 	// those threads where the thread 'i' away would have
 	// been out of bounds of the warp are unaffected. This
 	// creates the scan sum.
-#pragma unroll
+	#pragma unroll
 	for (int i=1; i<=width; i*=2)
 	{
 		uint32_t n = __shfl_up((int)value, i, width);
@@ -147,8 +151,7 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *
 		for (int i=1; i<=width; i*=2)
 		{
 			uint32_t n = __shfl_up((int)warp_sum, i, width);
-
-			if (lane_id >= i) warp_sum += n;
+			if (lane_id >= i) warp_sum += n;
 		}
 
 		sums[lane_id] = warp_sum;
@@ -178,7 +181,8 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *
 }
 
 // Uniform add: add partial sums array
-__global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len)
+__global__
+void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len)
 {
 	__shared__ uint32_t buf;
 	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
@@ -195,7 +199,8 @@ __global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums
 }
 
 // The scatter
-__global__ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
+__global__
+void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
 {
 	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
 	uint32_t actNounce = id;
@@ -244,9 +249,9 @@ __host__ static uint32_t quark_compactTest_roundUpExp(uint32_t val)
 	return mask;
 }
 
-__host__ void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32_t *nrm,
-	uint32_t *d_nonces1, cuda_compactTestFunction_t function,
-	uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
+__host__
+void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32_t *nrm, uint32_t *d_nonces1,
+	cuda_compactTestFunction_t function, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
 {
 	int orgThreads = threads;
 	threads = (int)quark_compactTest_roundUpExp((uint32_t)threads);
@@ -300,9 +305,9 @@ __host__ void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t thread
 }
 
 ////// WARNING: this function currently only works with threads > 65536 (ideally 256 * 1024 or 256 * 2048)
-__host__ void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, uint32_t *nrm,
-	uint32_t *d_nonces1, uint32_t *d_nonces2,
-	uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
+__host__
+void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, uint32_t *nrm, uint32_t *d_nonces1,
+	uint32_t *d_nonces2, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
 {
 	quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
 	quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
@@ -339,10 +344,9 @@ __host__ void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads,
 	*/
 }
 
-__host__ void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
-	uint32_t *d_nonces1, uint32_t *nrm1,
-	uint32_t *d_nonces2, uint32_t *nrm2,
-	int order)
+__host__
+void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes,
+	uint32_t *d_validNonceTable, uint32_t *d_nonces1, uint32_t *nrm1, uint32_t *d_nonces2, uint32_t *nrm2, int order)
 {
 	// If validNonceTable is used, only the nonces it contains are considered.
 	// "threads" must then be set to the length of that array!
@@ -356,9 +360,9 @@ __host__ void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32
 	*nrm2 = h_numValid[thr_id][1];
 }
 
-__host__ void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
-	uint32_t *d_nonces1, uint32_t *nrm1,
-	int order)
+__host__
+void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes,
+	uint32_t *d_validNonceTable, uint32_t *d_nonces1, uint32_t *nrm1, int order)
 {
 	// If validNonceTable is used, only the nonces it contains are considered.
 	// "threads" must then be set to the length of that array!
diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu
index 3549cd6..8de252b 100644
--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@@ -121,6 +121,7 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
+	int dev_id = device_map[thr_id];
 	uint32_t throughput = cuda_default_throughput(thr_id, 1U << 20); // 256*4096
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
 
@@ -132,6 +133,7 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
 	{
 		cudaSetDevice(device_map[thr_id]);
+		cudaGetLastError();
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
 
 		quark_blake512_cpu_init(thr_id, throughput);
@@ -146,6 +148,13 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
 		cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
 		cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
 		cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
+		CUDA_SAFE_CALL(cudaGetLastError());
+
+		if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) {
+			gpulog(LOG_ERR, thr_id, "Device SM 3.0 or more recent required!");
+			proper_exit(1);
+			return -1;
+		}
 
 		init[thr_id] = true;
 	}
diff --git a/x13/cuda_x13_fugue512.cu b/x13/cuda_x13_fugue512.cu
index 1f3cbcb..7a68949 100644
--- a/x13/cuda_x13_fugue512.cu
+++ b/x13/cuda_x13_fugue512.cu
@@ -685,11 +685,6 @@ void x13_fugue512_cpu_init(int thr_id, uint32_t threads)
 __host__
 void x13_fugue512_cpu_free(int thr_id)
 {
-	cudaUnbindTexture(mixTab0Tex);
-	cudaUnbindTexture(mixTab1Tex);
-	cudaUnbindTexture(mixTab2Tex);
-	cudaUnbindTexture(mixTab3Tex);
-
 	for (int i=0; i<4; i++)
 		cudaFree(d_textures[thr_id][i]);
 }
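
Note on the SM 3.0 requirement: quark's compaction builds its prefix sums on the
Kepler __shfl_up() warp shuffle, which SM 2.x hardware cannot execute; that is
why the patch stubs __shfl_up() to 0 under __CUDA_ARCH__ < 300 (only to keep the
compiler happy) and adds the runtime refusal in scanhash_quark(). For reference,
here is a minimal self-contained sketch of the same inclusive warp-scan pattern.
It assumes SM 3.0+ and the pre-CUDA-9 shuffle intrinsics this code base uses;
the kernel and variable names are illustrative, not part of the patch.

// warp_scan_sketch.cu -- illustrative only, not part of the patch
#include <cstdio>
#include <cstdint>

// Inclusive prefix sum across one 32-lane warp, same pattern as
// quark_compactTest_gpu_SCAN: each step pulls the value held 'i'
// lanes below; lanes with no source lane that far down keep theirs.
__global__ void warp_inclusive_scan(uint32_t *data)
{
	const int lane_id = threadIdx.x & 31;
	uint32_t value = data[threadIdx.x];

	#pragma unroll
	for (int i = 1; i < 32; i *= 2) {
		uint32_t n = __shfl_up((int)value, i, 32);
		if (lane_id >= i) value += n;
	}
	data[threadIdx.x] = value; // data[k] now holds the sum of data[0..k]
}

int main()
{
	uint32_t h[32], *d = NULL;
	for (int i = 0; i < 32; i++) h[i] = 1; // all ones -> 1,2,...,32
	cudaMalloc(&d, sizeof(h));
	cudaMemcpy(d, h, sizeof(h), cudaMemcpyHostToDevice);
	warp_inclusive_scan <<<1, 32>>> (d);
	cudaMemcpy(h, d, sizeof(h), cudaMemcpyDeviceToHost);
	printf("last prefix sum: %u (expected 32)\n", h[31]);
	cudaFree(d);
	return 0;
}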
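
Note on the -d parsing change: gpu_threads now ends up as the highest number of
threads mapped to any single device (e.g. "-d 0,0,1" runs two threads on GPU 0
and one on GPU 1, so gpu_threads becomes 2) rather than the old plain division
opt_n_threads / active_gpus. A host-side sketch of that counting follows;
device_map, opt_n_threads and gpu_threads mirror the patch, while main() and
the sample mapping are made up for demonstration.

// gpu_threads_sketch.cpp -- illustrative only, not part of the patch
#include <algorithm>
#include <cstdio>

#define MAX_GPUS 16

int main()
{
	int device_map[MAX_GPUS] = { 0, 0, 1 }; // as if "-d 0,0,1" was parsed
	int opt_n_threads = 3;
	int gpu_threads = 1;

	int device_thr[MAX_GPUS] = { 0 };
	for (int n = 0; n < opt_n_threads; n++)
		device_thr[device_map[n]]++;        // threads mapped to each device

	for (int n = 0; n < MAX_GPUS; n++)      // the busiest device wins
		gpu_threads = std::max(gpu_threads, device_thr[n]);

	printf("gpu_threads = %d\n", gpu_threads); // prints 2
	return 0;
}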