From 5bf1f9820003b7d0be75af3351d0eb424deb916a Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Wed, 14 Oct 2015 02:04:13 +0000 Subject: [PATCH] various fixes for SM 2.1 and the benchmark X11+ algos and quark are not compatible for the moment but these ones are : Benchmark results for Gigabyte GTX 460 (SM 2.1 / 1 GB): blakecoin : 159090.5 kH/s, 1 MB, 1048576 thr. blake : 70208.9 kH/s, 1 MB, 1048576 thr. bmw : 122802.6 kH/s, 65 MB, 2097152 thr. deep : 3533.6 kH/s, 33 MB, 524288 thr. fugue256 : 43177.9 kH/s, 17 MB, 524288 thr. heavy : 4118.2 kH/s, 147 MB, 524032 thr. keccak : 18673.1 kH/s, 129 MB, 2097152 thr. luffa : 28816.0 kH/s, 257 MB, 4194304 thr. lyra2 : 213.7 kH/s, 570 MB, 65536 thr. mjollnir : 3895.6 kH/s, 147 MB, 524032 thr. nist5 : 1101.4 kH/s, 67 MB, 1048576 thr. penta : 501.6 kH/s, 21 MB, 327680 thr. skein : 5432.4 kH/s, 65 MB, 1048576 thr. skein2 : 6788.9 kH/s, 33 MB, 524288 thr. whirlpool : 688.5 kH/s, 33 MB, 524288 thr. zr5 : 122.5 kH/s, 86 MB, 262144 thr. --- Algo256/blake256.cu | 10 ++++++---- Algo256/bmw.cu | 9 ++++----- Algo256/cuda_blake256.cu | 2 +- Algo256/keccak256.cu | 8 ++++---- JHA/jackpotcoin.cu | 15 +++++++++++---- bench.cpp | 15 +++++++++++++++ cuda_groestlcoin.cu | 2 +- lyra2/cuda_lyra2.cu | 1 + pentablake.cu | 11 +++++------ quark/quarkcoin.cu | 5 ++--- qubit/luffa.cu | 10 ++++++---- util.cpp | 2 +- x15/whirlpoolx.cu | 8 ++++---- zr5.cu | 8 ++++++-- 14 files changed, 67 insertions(+), 39 deletions(-) diff --git a/Algo256/blake256.cu b/Algo256/blake256.cu index 0b5dd87..ad2a493 100644 --- a/Algo256/blake256.cu +++ b/Algo256/blake256.cu @@ -10,9 +10,11 @@ extern "C" { #include "sph/sph_blake.h" +//extern int blake256_rounds; +} + #include #include -} /* threads per block and throughput (intensity) */ #define TPB 128 @@ -467,10 +469,10 @@ extern "C" int scanhash_blake256(int thr_id, struct work* work, uint32_t max_non #endif return rc; } - else if (opt_debug) { + else if (vhashcpu[7] > ptarget[7] && opt_debug) { applog_hash((uchar*)ptarget); applog_compare_hash((uchar*)vhashcpu, (uchar*)ptarget); - applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNonce); + gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", foundNonce); } } @@ -493,7 +495,7 @@ extern "C" void free_blake256(int thr_id) if (!init[thr_id]) return; - cudaSetDevice(device_map[thr_id]); + cudaThreadSynchronize(); cudaFreeHost(h_resNonce[thr_id]); cudaFree(d_resNonce[thr_id]); diff --git a/Algo256/bmw.cu b/Algo256/bmw.cu index f4bbb4c..1cf06ea 100644 --- a/Algo256/bmw.cu +++ b/Algo256/bmw.cu @@ -87,7 +87,7 @@ extern "C" int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, u return 1; } else { - applog(LOG_DEBUG, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce); + gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", foundNonce); } } @@ -110,13 +110,12 @@ extern "C" void free_bmw(int thr_id) if (!init[thr_id]) return; - cudaSetDevice(device_map[thr_id]); + cudaThreadSynchronize(); cudaFree(d_hash[thr_id]); bmw256_midstate_free(thr_id); cuda_check_cpu_free(thr_id); - init[thr_id] = false; - cudaDeviceSynchronize(); -} \ No newline at end of file + init[thr_id] = false; +} diff --git a/Algo256/cuda_blake256.cu b/Algo256/cuda_blake256.cu index afa9b3b..c3326e6 100644 --- a/Algo256/cuda_blake256.cu +++ b/Algo256/cuda_blake256.cu @@ -207,7 +207,7 @@ void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uin input[3] = startNonce + thread; blake256_compress2nd(h, input, 640); - #pragma unroll + #pragma unroll for (int i = 0; i<4; i++) { Hash[i*threads + thread] = cuda_swab32ll(MAKE_ULONGLONG(h[2 * i], h[2*i+1])); } diff --git a/Algo256/keccak256.cu b/Algo256/keccak256.cu index e2234d3..7922912 100644 --- a/Algo256/keccak256.cu +++ b/Algo256/keccak256.cu @@ -80,7 +80,7 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no return 1; } else { - applog(LOG_WARNING, "GPU #%d: result for nounce %08x does not validate on CPU!", device_map[thr_id], foundNonce); + gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", foundNonce); } } @@ -101,12 +101,12 @@ extern "C" void free_keccak256(int thr_id) if (!init[thr_id]) return; - cudaSetDevice(device_map[thr_id]); + cudaThreadSynchronize(); cudaFree(d_hash[thr_id]); keccak256_cpu_free(thr_id); - init[thr_id] = false; cudaDeviceSynchronize(); -} \ No newline at end of file + init[thr_id] = false; +} diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu index 6d605e4..b441362 100644 --- a/JHA/jackpotcoin.cu +++ b/JHA/jackpotcoin.cu @@ -95,6 +95,7 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; + int dev_id = device_map[thr_id]; uint32_t throughput = cuda_default_throughput(thr_id, 1U << 20); if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); @@ -104,7 +105,13 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); + cudaSetDevice(dev_id); + cuda_get_arch(thr_id); + if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) { + gpulog(LOG_ERR, thr_id, "Sorry, This algo is not supported by this GPU arch (SM 3.0 required)"); + proper_exit(EXIT_CODE_CUDA_ERROR); + } + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); @@ -214,6 +221,7 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc CUDA_LOG_ERROR(); uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + if (foundNonce != UINT32_MAX) { uint32_t vhash64[8]; @@ -239,15 +247,14 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc pdata[19] = foundNonce; return res; } else { - applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", - device_map[thr_id], foundNonce); + gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", foundNonce); } } if ((uint64_t) pdata[19] + throughput > max_nonce) { *hashes_done = pdata[19] - first_nonce; pdata[19] = max_nonce; - break; + return 0; } pdata[19] += throughput; diff --git a/bench.cpp b/bench.cpp index 8c3f1a2..0a45a41 100644 --- a/bench.cpp +++ b/bench.cpp @@ -97,6 +97,21 @@ bool bench_algo_switch_next(int thr_id) if (algo == ALGO_C11) algo++; // same as x11 if (algo == ALGO_DMD_GR) algo++; // same as groestl if (algo == ALGO_WHIRLCOIN) algo++; // same as whirlpool + + if (device_sm[dev_id] && device_sm[dev_id] < 300) { + // incompatible SM 2.1 kernels... + if (algo == ALGO_FRESH) algo++; + if (algo == ALGO_GROESTL) algo++; + if (algo == ALGO_MYR_GR) algo++; + if (algo == ALGO_JACKPOT) algo++; + if (algo == ALGO_LYRA2v2) algo++; + if (algo == ALGO_NEOSCRYPT) algo++; + if (algo == ALGO_QUARK) algo++; + if (algo == ALGO_QUBIT) algo++; + if (algo == ALGO_S3) algo++; // to check... + while (algo >= ALGO_X11 && algo <= ALGO_X17) algo++; + if (algo == ALGO_WHIRLPOOLX) algo++; + } // and unwanted ones... if (algo == ALGO_SCRYPT) algo++; if (algo == ALGO_SCRYPT_JANE) algo++; diff --git a/cuda_groestlcoin.cu b/cuda_groestlcoin.cu index fcef970..4e4b728 100644 --- a/cuda_groestlcoin.cu +++ b/cuda_groestlcoin.cu @@ -153,7 +153,7 @@ void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, vo int dev_id = device_map[thr_id]; if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) { - printf("Sorry, This algo is not supported by this GPU arch (SM 3.0 required)"); + gpulog(LOG_ERR, thr_id, "Sorry, This algo is not supported by this GPU arch (SM 3.0 required)"); proper_exit(EXIT_CODE_CUDA_ERROR); } diff --git a/lyra2/cuda_lyra2.cu b/lyra2/cuda_lyra2.cu index e2757ea..82c31a8 100644 --- a/lyra2/cuda_lyra2.cu +++ b/lyra2/cuda_lyra2.cu @@ -261,6 +261,7 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint6 int dev_id = device_map[thr_id % MAX_GPUS]; uint32_t tpb = TPB52; if (device_sm[dev_id] == 500) tpb = TPB50; + if (device_sm[dev_id] == 350) tpb = TPB30; // to enhance (or not) if (device_sm[dev_id] <= 300) tpb = TPB30; dim3 grid((threads + tpb - 1) / tpb); diff --git a/pentablake.cu b/pentablake.cu index c54eac3..b0d8132 100644 --- a/pentablake.cu +++ b/pentablake.cu @@ -423,7 +423,6 @@ extern "C" int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_n pentablakehash(vhash, endiandata); if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio) work_set_target_ratio(work, vhash); - applog(LOG_NOTICE, "GPU found more than one result yippee!"); pdata[21] = extra_results[0]; extra_results[0] = UINT32_MAX; rc++; @@ -431,7 +430,7 @@ extern "C" int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_n pdata[19] = foundNonce; return rc; } else { - applog(LOG_WARNING, "GPU #%d: result for nounce %08x does not validate on CPU!", device_map[thr_id], foundNonce); + gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", foundNonce); } } @@ -449,13 +448,13 @@ void free_pentablake(int thr_id) if (!init[thr_id]) return; - cudaSetDevice(device_map[thr_id]); + cudaThreadSynchronize(); cudaFree(d_hash[thr_id]); cudaFreeHost(h_resNounce[thr_id]); cudaFree(d_resNounce[thr_id]); - init[thr_id] = false; - cudaDeviceSynchronize(); -} \ No newline at end of file + + init[thr_id] = false; +} diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu index c1a5d3f..f615444 100644 --- a/quark/quarkcoin.cu +++ b/quark/quarkcoin.cu @@ -262,8 +262,7 @@ extern "C" void free_quark(int thr_id) if (!init[thr_id]) return; - cudaSetDevice(device_map[thr_id]); - cudaDeviceSynchronize(); + cudaThreadSynchronize(); cudaFree(d_hash[thr_id]); @@ -278,4 +277,4 @@ extern "C" void free_quark(int thr_id) init[thr_id] = false; cudaDeviceSynchronize(); -} \ No newline at end of file +} diff --git a/qubit/luffa.cu b/qubit/luffa.cu index 2ea6b20..f03efaf 100644 --- a/qubit/luffa.cu +++ b/qubit/luffa.cu @@ -45,8 +45,10 @@ extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, if (!init[thr_id]) { cudaSetDevice(device_map[thr_id]); - if (opt_cudaschedule == -1) // to reduce cpu usage... - cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + //if (opt_cudaschedule == -1) // to reduce cpu usage... + // cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); @@ -103,12 +105,12 @@ extern "C" void free_luffa(int thr_id) if (!init[thr_id]) return; - cudaDeviceSynchronize(); + cudaThreadSynchronize(); cudaFree(d_hash[thr_id]); cuda_check_cpu_free(thr_id); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); init[thr_id] = false; } diff --git a/util.cpp b/util.cpp index 8ed5505..dbee5c9 100644 --- a/util.cpp +++ b/util.cpp @@ -1856,7 +1856,7 @@ static uint32_t zrtest[20] = { void do_gpu_tests(void) { -#ifdef _DEBUG +#if 1 //def _DEBUG unsigned long done; char s[128] = { '\0' }; uchar buf[160]; diff --git a/x15/whirlpoolx.cu b/x15/whirlpoolx.cu index ec8db42..b7b356f 100644 --- a/x15/whirlpoolx.cu +++ b/x15/whirlpoolx.cu @@ -9,7 +9,7 @@ extern "C" { #include "miner.h" #include "cuda_helper.h" -static uint32_t *d_hash[MAX_GPUS]; +static uint32_t *d_hash[MAX_GPUS] = { 0 }; extern void whirlpoolx_cpu_init(int thr_id, uint32_t threads); extern void whirlpoolx_cpu_free(int thr_id); @@ -54,7 +54,7 @@ extern "C" int scanhash_whirlx(int thr_id, struct work* work, uint32_t max_nonc if (!init[thr_id]) { cudaSetDevice(device_map[thr_id]); - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 64 * throughput), 0); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); whirlpoolx_cpu_init(thr_id, throughput); @@ -84,7 +84,7 @@ extern "C" int scanhash_whirlx(int thr_id, struct work* work, uint32_t max_nonc pdata[19] = foundNonce; return 1; } else { - applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce); + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce); } } @@ -115,4 +115,4 @@ extern "C" void free_whirlx(int thr_id) init[thr_id] = false; cudaDeviceSynchronize(); -} \ No newline at end of file +} diff --git a/zr5.cu b/zr5.cu index 5d05b01..d591f1d 100644 --- a/zr5.cu +++ b/zr5.cu @@ -419,6 +419,10 @@ extern "C" int scanhash_zr5(int thr_id, struct work *work, } zr5_final_round(thr_id, throughput); + // do not scan results on interuption + if (work_restart[thr_id].restart) + return -1; + uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); if (foundNonce != UINT32_MAX) { @@ -455,7 +459,7 @@ extern "C" int scanhash_zr5(int thr_id, struct work *work, } return res; } else { - applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce); + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce); pdata[19]++; pdata[0] = oldp0; @@ -497,4 +501,4 @@ extern "C" void free_zr5(int thr_id) init[thr_id] = false; cudaDeviceSynchronize(); -} \ No newline at end of file +}