diff --git a/Algo256/blake256.cu b/Algo256/blake256.cu index 4a08410..09184ef 100644 --- a/Algo256/blake256.cu +++ b/Algo256/blake256.cu @@ -439,7 +439,7 @@ extern "C" int scanhash_blake256(int thr_id, struct work* work, uint32_t max_non #endif *hashes_done = pdata[19] - first_nonce + throughput; - if (foundNonce != UINT32_MAX) + if (foundNonce != UINT32_MAX && bench_algo == -1) { uint32_t vhashcpu[8]; uint32_t Htarg = (uint32_t)targetHigh; @@ -478,7 +478,8 @@ extern "C" int scanhash_blake256(int thr_id, struct work* work, uint32_t max_non } } - if ((uint64_t) pdata[19] + throughput > (uint64_t) max_nonce) { + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; break; } diff --git a/Algo256/bmw.cu b/Algo256/bmw.cu index c578d98..fc05db2 100644 --- a/Algo256/bmw.cu +++ b/Algo256/bmw.cu @@ -91,7 +91,7 @@ extern "C" int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, u } } - if ((uint64_t) throughput + pdata[19] > max_nonce) { + if ((uint64_t) throughput + pdata[19] >= max_nonce) { pdata[19] = max_nonce; break; } diff --git a/Algo256/keccak256.cu b/Algo256/keccak256.cu index d2e3ec1..4d5f315 100644 --- a/Algo256/keccak256.cu +++ b/Algo256/keccak256.cu @@ -46,7 +46,7 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) - ptarget[7] = 0x00ff; + ptarget[7] = 0x000f; if (!init[thr_id]) { cudaSetDevice(device_map[thr_id]); @@ -68,7 +68,7 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no *hashes_done = pdata[19] - first_nonce + throughput; uint32_t foundNonce = keccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - if (foundNonce != UINT32_MAX) + if (foundNonce != UINT32_MAX && bench_algo < 0) { uint32_t _ALIGN(64) vhash64[8]; be32enc(&endiandata[19], foundNonce); @@ -84,7 +84,8 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no } } - if ((uint64_t) pdata[19] + throughput > max_nonce) { + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; break; } @@ -92,6 +93,7 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no } while (!work_restart[thr_id].restart); + *hashes_done = pdata[19] - first_nonce; return 0; } diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu index 64d7aab..495057f 100644 --- a/JHA/jackpotcoin.cu +++ b/JHA/jackpotcoin.cu @@ -238,16 +238,17 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc } } - if ((uint64_t) pdata[19] + throughput > max_nonce) { - *hashes_done = pdata[19] - first_nonce; + if ((uint64_t) throughput + pdata[19] >= max_nonce) { pdata[19] = max_nonce; - return 0; + break; } pdata[19] += throughput; } while (!work_restart[thr_id].restart); + *hashes_done = pdata[19] - first_nonce; + CUDA_LOG_ERROR(); return 0; diff --git a/Makefile.am b/Makefile.am index e53fb46..d1af8f9 100644 --- a/Makefile.am +++ b/Makefile.am @@ -81,7 +81,7 @@ ccminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ @PCIFLAGS@ @OPENMP_CFLAGS@ $(CPPFLAGS) $(P nvcc_ARCH = -gencode=arch=compute_50,code=\"sm_50,compute_50\" -#nvcc_ARCH += -gencode=arch=compute_52,code=\"sm_52,compute_52\" +nvcc_ARCH += -gencode=arch=compute_52,code=\"sm_52,compute_52\" #nvcc_ARCH += -gencode=arch=compute_35,code=\"sm_35,compute_35\" #nvcc_ARCH += -gencode=arch=compute_30,code=\"sm_30,compute_30\" #nvcc_ARCH += -gencode=arch=compute_20,code=\"sm_21,compute_20\" diff --git a/bench.cpp 
b/bench.cpp index 6e94e31..0a3fdf3 100644 --- a/bench.cpp +++ b/bench.cpp @@ -89,6 +89,9 @@ bool bench_algo_switch_next(int thr_id) int prev_algo = algo; int dev_id = device_map[thr_id % MAX_GPUS]; int mfree, mused; + // doesnt seems enough to prevent device slow down + // after some algo switchs + bool need_reset = (gpu_threads == 1); algo++; @@ -143,6 +146,7 @@ bool bench_algo_switch_next(int thr_id) gpulog(LOG_WARNING, thr_id, "possible %d MB memory leak in %s! %d MB free", (device_mem_free[thr_id] - mfree), algo_names[prev_algo], mfree); cuda_reset_device(thr_id, NULL); // force to free the leak + need_reset = false; mfree = cuda_available_memory(thr_id); } // store used memory per algo @@ -152,14 +156,13 @@ bool bench_algo_switch_next(int thr_id) // store to dump a table per gpu later algo_hashrates[thr_id][prev_algo] = hashrate; - // wait the other threads to display logs correctly if (opt_n_threads > 1) { pthread_barrier_wait(&algo_barr); } if (algo == ALGO_AUTO) - return false; + return false; // all algos done // mutex primary used for the stats purge pthread_mutex_lock(&bench_lock); @@ -170,6 +173,9 @@ bool bench_algo_switch_next(int thr_id) thr_hashrates[thr_id] = 0; // reset for minmax64 pthread_mutex_unlock(&bench_lock); + if (need_reset) + cuda_reset_device(thr_id, NULL); + if (thr_id == 0) applog(LOG_BLUE, "Benchmark algo %s...", algo_names[algo]); diff --git a/ccminer.cpp b/ccminer.cpp index 1ba68c1..b995d33 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -479,9 +479,11 @@ void proper_exit(int reason) reason = app_exit_code; } + pthread_mutex_lock(&stats_lock); if (check_dups) hashlog_purge_all(); stats_purge_all(); + pthread_mutex_unlock(&stats_lock); #ifdef WIN32 timeEndPeriod(1); // else never executed @@ -496,7 +498,7 @@ void proper_exit(int reason) #endif free(opt_syslog_pfx); free(opt_api_allow); - free(work_restart); + //free(work_restart); //free(thr_info); exit(reason); } @@ -1709,18 +1711,22 @@ static void *miner_thread(void *userdata) if (max64 < minmax) { switch (opt_algo) { case ALGO_BLAKECOIN: - case ALGO_BLAKE: - case ALGO_WHIRLPOOLX: minmax = 0x80000000U; break; + case ALGO_BLAKE: case ALGO_BMW: + case ALGO_WHIRLPOOLX: minmax = 0x40000000U; break; + case ALGO_KECCAK: case ALGO_LUFFA: - minmax = 0x2000000; + case ALGO_SKEIN: + case ALGO_SKEIN2: + minmax = 0x1000000; break; case ALGO_C11: case ALGO_DEEP: + case ALGO_HEAVY: case ALGO_LYRA2v2: case ALGO_S3: case ALGO_X11: @@ -1729,7 +1735,6 @@ static void *miner_thread(void *userdata) case ALGO_WHIRLPOOL: minmax = 0x400000; break; - case ALGO_KECCAK: case ALGO_JACKPOT: case ALGO_X14: case ALGO_X15: diff --git a/cuda.cpp b/cuda.cpp index fb32cfa..a9ba63b 100644 --- a/cuda.cpp +++ b/cuda.cpp @@ -176,9 +176,11 @@ void cuda_reset_device(int thr_id, bool *init) } cudaDeviceReset(); if (opt_cudaschedule >= 0) { - cudaSetDevice(dev_id); cudaSetDeviceFlags((unsigned)(opt_cudaschedule & cudaDeviceScheduleMask)); + } else { + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); } + cudaDeviceSynchronize(); } // return free memory in megabytes diff --git a/cuda_checkhash.cu b/cuda_checkhash.cu index 20d9e45..7fdbc41 100644 --- a/cuda_checkhash.cu +++ b/cuda_checkhash.cu @@ -121,6 +121,9 @@ uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uin dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); + if (bench_algo >= 0) // dont interrupt the global benchmark + return UINT32_MAX; + if (!init_done) { applog(LOG_ERR, "missing call to cuda_check_cpu_init"); return 
UINT32_MAX; @@ -143,6 +146,9 @@ uint32_t cuda_check_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); + if (bench_algo >= 0) // dont interrupt the global benchmark + return UINT32_MAX; + if (!init_done) { applog(LOG_ERR, "missing call to cuda_check_cpu_init"); return UINT32_MAX; @@ -237,9 +243,12 @@ uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNoun uint32_t result = UINT32_MAX; + if (bench_algo >= 0) // dont interrupt the global benchmark + return result; + if (!init_done) { applog(LOG_ERR, "missing call to cuda_check_cpu_init"); - return UINT32_MAX; + return result; } cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t)); diff --git a/cuda_nist5.cu b/cuda_nist5.cu index 744f844..bcc8b3a 100644 --- a/cuda_nist5.cu +++ b/cuda_nist5.cu @@ -135,11 +135,17 @@ extern "C" int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce, } } + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); out: + *hashes_done = pdata[19] - first_nonce; #ifdef USE_STREAMS for (int i = 0; i < 5; i++) cudaStreamDestroy(stream[i]); diff --git a/fuguecoin.cpp b/fuguecoin.cpp index d7681c8..7c703ce 100644 --- a/fuguecoin.cpp +++ b/fuguecoin.cpp @@ -65,7 +65,9 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne uint32_t foundNounce = UINT32_MAX; fugue256_cpu_hash(thr_id, throughput, pdata[19], NULL, &foundNounce); - if (foundNounce < UINT32_MAX) + *hashes_done = pdata[19] - start_nonce + throughput; + + if (foundNounce < UINT32_MAX && bench_algo < 0) { uint32_t vhash[8]; sph_fugue256_context ctx_fugue; @@ -79,14 +81,13 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne { work_set_target_ratio(work, vhash); pdata[19] = foundNounce; - *hashes_done = foundNounce - start_nonce + 1; return 1; } else { gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNounce); } } - if ((uint64_t) pdata[19] + throughput > (uint64_t) max_nonce) { + if ((uint64_t) throughput + pdata[19] >= max_nonce) { pdata[19] = max_nonce; break; } @@ -95,7 +96,7 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne } while (!work_restart[thr_id].restart); - *hashes_done = pdata[19] - start_nonce + 1; + *hashes_done = pdata[19] - start_nonce; return 0; } diff --git a/groestlcoin.cpp b/groestlcoin.cpp index c3f803a..89f178a 100644 --- a/groestlcoin.cpp +++ b/groestlcoin.cpp @@ -39,7 +39,7 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi uint32_t *outputHash = (uint32_t*)malloc((size_t) 64* throughput); if (opt_benchmark) - ptarget[7] = 0x000ff; + ptarget[7] = 0x001f; if (!init[thr_id]) { @@ -62,7 +62,7 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi // GPU hash groestlcoin_cpu_hash(thr_id, throughput, pdata[19], outputHash, &foundNounce); - if (foundNounce < UINT32_MAX) + if (foundNounce < UINT32_MAX && bench_algo < 0) { uint32_t _ALIGN(64) vhash[8]; endiandata[19] = swab32(foundNounce); @@ -78,14 +78,15 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi } } - if ((uint64_t) pdata[19] + throughput > max_nonce) { - *hashes_done = pdata[19] - start_nonce + 1; + if ((uint64_t) throughput + pdata[19] >= max_nonce) { pdata[19] 
= max_nonce; break; } pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - start_nonce; free(outputHash); return 0; diff --git a/heavy/heavy.cu b/heavy/heavy.cu index 936006a..40fb0c9 100644 --- a/heavy/heavy.cu +++ b/heavy/heavy.cu @@ -43,99 +43,99 @@ extern uint32_t *heavy_heftyHashes[MAX_GPUS]; /* Combines top 64-bits from each hash into a single hash */ static void combine_hashes(uint32_t *out, const uint32_t *hash1, const uint32_t *hash2, const uint32_t *hash3, const uint32_t *hash4) { - const uint32_t *hash[4] = { hash1, hash2, hash3, hash4 }; - int bits; - unsigned int i; - uint32_t mask; - unsigned int k; - - /* Transpose first 64 bits of each hash into out */ - memset(out, 0, 32); - bits = 0; - for (i = 7; i >= 6; i--) { - for (mask = 0x80000000; mask; mask >>= 1) { - for (k = 0; k < 4; k++) { - out[(255 - bits)/32] <<= 1; - if ((hash[k][i] & mask) != 0) - out[(255 - bits)/32] |= 1; - bits++; - } - } - } + const uint32_t *hash[4] = { hash1, hash2, hash3, hash4 }; + int bits; + unsigned int i; + uint32_t mask; + unsigned int k; + + /* Transpose first 64 bits of each hash into out */ + memset(out, 0, 32); + bits = 0; + for (i = 7; i >= 6; i--) { + for (mask = 0x80000000; mask; mask >>= 1) { + for (k = 0; k < 4; k++) { + out[(255 - bits)/32] <<= 1; + if ((hash[k][i] & mask) != 0) + out[(255 - bits)/32] |= 1; + bits++; + } + } + } } #ifdef _MSC_VER #include static uint32_t __inline bitsset( uint32_t x ) { - DWORD r = 0; - _BitScanReverse(&r, x); - return r; + DWORD r = 0; + _BitScanReverse(&r, x); + return r; } #else static uint32_t bitsset( uint32_t x ) { - return 31-__builtin_clz(x); + return 31-__builtin_clz(x); } #endif // Finde das high bit in einem Multiword-Integer. static int findhighbit(const uint32_t *ptarget, int words) { - int i; - int highbit = 0; - for (i=words-1; i >= 0; --i) - { - if (ptarget[i] != 0) { - highbit = i*32 + bitsset(ptarget[i])+1; - break; - } - } - return highbit; + int i; + int highbit = 0; + for (i=words-1; i >= 0; --i) + { + if (ptarget[i] != 0) { + highbit = i*32 + bitsset(ptarget[i])+1; + break; + } + } + return highbit; } // Generiere ein Multiword-Integer das die Zahl // (2 << highbit) - 1 repräsentiert. static void genmask(uint32_t *ptarget, int words, int highbit) { - int i; - for (i=words-1; i >= 0; --i) - { - if ((i+1)*32 <= highbit) - ptarget[i] = UINT32_MAX; - else if (i*32 > highbit) - ptarget[i] = 0x00000000; - else - ptarget[i] = (1 << (highbit-i*32)) - 1; - } + int i; + for (i=words-1; i >= 0; --i) + { + if ((i+1)*32 <= highbit) + ptarget[i] = UINT32_MAX; + else if (i*32 > highbit) + ptarget[i] = 0x00000000; + else + ptarget[i] = (1 << (highbit-i*32)) - 1; + } } struct check_nonce_for_remove { - check_nonce_for_remove(uint64_t target, uint32_t *hashes, uint32_t hashlen, uint32_t startNonce) : - m_target(target), - m_hashes(hashes), - m_hashlen(hashlen), - m_startNonce(startNonce) { } - - uint64_t m_target; - uint32_t *m_hashes; - uint32_t m_hashlen; - uint32_t m_startNonce; - - __device__ - bool operator()(const uint32_t x) - { - // Position im Hash Buffer - uint32_t hashIndex = x - m_startNonce; - // Wert des Hashes (als uint64_t) auslesen. - // Steht im 6. und 7. 
Wort des Hashes (jeder dieser Hashes hat 512 Bits) - uint64_t hashValue = *((uint64_t*)(&m_hashes[m_hashlen*hashIndex + 6])); - bool res = (hashValue & m_target) != hashValue; - //printf("ndx=%x val=%08x target=%lx\n", hashIndex, hashValue, m_target); - // gegen das Target prüfen. Es dürfen nur Bits aus dem Target gesetzt sein. - return res; - } + check_nonce_for_remove(uint64_t target, uint32_t *hashes, uint32_t hashlen, uint32_t startNonce) : + m_target(target), + m_hashes(hashes), + m_hashlen(hashlen), + m_startNonce(startNonce) { } + + uint64_t m_target; + uint32_t *m_hashes; + uint32_t m_hashlen; + uint32_t m_startNonce; + + __device__ + bool operator()(const uint32_t x) + { + // Position im Hash Buffer + uint32_t hashIndex = x - m_startNonce; + // Wert des Hashes (als uint64_t) auslesen. + // Steht im 6. und 7. Wort des Hashes (jeder dieser Hashes hat 512 Bits) + uint64_t hashValue = *((uint64_t*)(&m_hashes[m_hashlen*hashIndex + 6])); + bool res = (hashValue & m_target) != hashValue; + //printf("ndx=%x val=%08x target=%lx\n", hashIndex, hashValue, m_target); + // gegen das Target prüfen. Es dürfen nur Bits aus dem Target gesetzt sein. + return res; + } }; static bool init[MAX_GPUS] = { 0 }; @@ -143,245 +143,252 @@ static bool init[MAX_GPUS] = { 0 }; __host__ int scanhash_heavy(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done, uint32_t maxvote, int blocklen) { - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - // CUDA will process thousands of threads. - uint32_t throughput = cuda_default_throughput(thr_id, (1U << 19) - 256); - if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); - - int rc = 0; - uint32_t *hash = NULL; - uint32_t *cpu_nonceVector = NULL; - CUDA_SAFE_CALL(cudaMallocHost(&hash, throughput*8*sizeof(uint32_t))); - CUDA_SAFE_CALL(cudaMallocHost(&cpu_nonceVector, throughput*sizeof(uint32_t))); - - int nrmCalls[6]; - memset(nrmCalls, 0, sizeof(int) * 6); - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x00ff; - - // für jeden Hash ein individuelles Target erstellen basierend - // auf dem höchsten Bit, das in ptarget gesetzt ist. 
- int highbit = findhighbit(ptarget, 8); - uint32_t target2[2], target3[2], target4[2], target5[2]; - genmask(target2, 2, highbit/4+(((highbit%4)>3)?1:0) ); // SHA256 - genmask(target3, 2, highbit/4+(((highbit%4)>2)?1:0) ); // keccak512 - genmask(target4, 2, highbit/4+(((highbit%4)>1)?1:0) ); // groestl512 - genmask(target5, 2, highbit/4+(((highbit%4)>0)?1:0) ); // blake512 - - if (!init[thr_id]) - { - hefty_cpu_init(thr_id, throughput); - sha256_cpu_init(thr_id, throughput); - keccak512_cpu_init(thr_id, throughput); - groestl512_cpu_init(thr_id, throughput); - blake512_cpu_init(thr_id, throughput); - combine_cpu_init(thr_id, throughput); - - CUDA_SAFE_CALL(cudaMalloc(&heavy_nonceVector[thr_id], sizeof(uint32_t) * throughput)); - - init[thr_id] = true; - } - - if (blocklen == HEAVYCOIN_BLKHDR_SZ) - { - uint16_t *ext = (uint16_t *)&pdata[20]; - - if (opt_vote > maxvote && !opt_benchmark) { - applog(LOG_WARNING, "Your block reward vote (%hu) exceeds " - "the maxvote reported by the pool (%hu).", - opt_vote, maxvote); - } - - if (opt_trust_pool && opt_vote > maxvote) { - applog(LOG_WARNING, "Capping block reward vote to maxvote reported by pool."); - ext[0] = maxvote; - } - else - ext[0] = opt_vote; - } - - // Setze die Blockdaten - hefty_cpu_setBlock(thr_id, throughput, pdata, blocklen); - sha256_cpu_setBlock(pdata, blocklen); - keccak512_cpu_setBlock(pdata, blocklen); - groestl512_cpu_setBlock(pdata, blocklen); - blake512_cpu_setBlock(pdata, blocklen); - - do { - uint32_t actualNumberOfValuesInNonceVectorGPU = throughput; - - ////// Compaction init - - hefty_cpu_hash(thr_id, throughput, pdata[19]); - sha256_cpu_hash(thr_id, throughput, pdata[19]); - - // Hier ist die längste CPU Wartephase. Deshalb ein strategisches MyStreamSynchronize() hier. - MyStreamSynchronize(NULL, 1, thr_id); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + // CUDA will process thousands of threads. + uint32_t throughput = cuda_default_throughput(thr_id, (1U << 19) - 256); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + int rc = 0; + uint32_t *hash = NULL; + uint32_t *cpu_nonceVector = NULL; + + int nrmCalls[6]; + memset(nrmCalls, 0, sizeof(int) * 6); + + if (opt_benchmark) + ptarget[7] = 0x000f; + + // für jeden Hash ein individuelles Target erstellen basierend + // auf dem höchsten Bit, das in ptarget gesetzt ist. 
+ int highbit = findhighbit(ptarget, 8); + uint32_t target2[2], target3[2], target4[2], target5[2]; + genmask(target2, 2, highbit/4+(((highbit%4)>3)?1:0) ); // SHA256 + genmask(target3, 2, highbit/4+(((highbit%4)>2)?1:0) ); // keccak512 + genmask(target4, 2, highbit/4+(((highbit%4)>1)?1:0) ); // groestl512 + genmask(target5, 2, highbit/4+(((highbit%4)>0)?1:0) ); // blake512 + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + + hefty_cpu_init(thr_id, throughput); + sha256_cpu_init(thr_id, throughput); + keccak512_cpu_init(thr_id, throughput); + groestl512_cpu_init(thr_id, throughput); + blake512_cpu_init(thr_id, throughput); + combine_cpu_init(thr_id, throughput); + + CUDA_SAFE_CALL(cudaMalloc(&heavy_nonceVector[thr_id], sizeof(uint32_t) * throughput)); + + init[thr_id] = true; + } + + // weird but require at least one cudaSetDevice first + CUDA_SAFE_CALL(cudaMallocHost(&hash, (size_t) 32 * throughput)); + CUDA_SAFE_CALL(cudaMallocHost(&cpu_nonceVector, sizeof(uint32_t) * throughput)); + + if (blocklen == HEAVYCOIN_BLKHDR_SZ) + { + uint16_t *ext = (uint16_t*) &pdata[20]; + + if (opt_vote > maxvote && !opt_benchmark) { + applog(LOG_WARNING, "Your block reward vote (%hu) exceeds the maxvote reported by the pool (%hu).", + opt_vote, maxvote); + } + + if (opt_trust_pool && opt_vote > maxvote) { + applog(LOG_WARNING, "Capping block reward vote to maxvote reported by pool."); + ext[0] = maxvote; + } + else + ext[0] = opt_vote; + } + + // Setze die Blockdaten + hefty_cpu_setBlock(thr_id, throughput, pdata, blocklen); + sha256_cpu_setBlock(pdata, blocklen); + keccak512_cpu_setBlock(pdata, blocklen); + groestl512_cpu_setBlock(pdata, blocklen); + blake512_cpu_setBlock(pdata, blocklen); + + do { + uint32_t actualNumberOfValuesInNonceVectorGPU = throughput; + + ////// Compaction init + + hefty_cpu_hash(thr_id, throughput, pdata[19]); + sha256_cpu_hash(thr_id, throughput, pdata[19]); + + // Hier ist die längste CPU Wartephase. Deshalb ein strategisches MyStreamSynchronize() hier. 
+ MyStreamSynchronize(NULL, 1, thr_id); #if USE_THRUST - thrust::device_ptr devNoncePtr(heavy_nonceVector[thr_id]); - thrust::device_ptr devNoncePtrEnd((heavy_nonceVector[thr_id]) + throughput); - - ////// Compaction - uint64_t *t = (uint64_t*) target2; - devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash2output[thr_id], 8, pdata[19])); - actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); - if(actualNumberOfValuesInNonceVectorGPU == 0) - goto emptyNonceVector; - - keccak512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); - - ////// Compaction - t = (uint64_t*) target3; - devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash3output[thr_id], 16, pdata[19])); - actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); - if(actualNumberOfValuesInNonceVectorGPU == 0) - goto emptyNonceVector; - - blake512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); - - ////// Compaction - t = (uint64_t*) target5; - devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash5output[thr_id], 16, pdata[19])); - actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); - if(actualNumberOfValuesInNonceVectorGPU == 0) - goto emptyNonceVector; - - groestl512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); - - ////// Compaction - t = (uint64_t*) target4; - devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash4output[thr_id], 16, pdata[19])); - actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); + thrust::device_ptr devNoncePtr(heavy_nonceVector[thr_id]); + thrust::device_ptr devNoncePtrEnd((heavy_nonceVector[thr_id]) + throughput); + + ////// Compaction + uint64_t *t = (uint64_t*) target2; + devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash2output[thr_id], 8, pdata[19])); + actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); + if(actualNumberOfValuesInNonceVectorGPU == 0) + goto emptyNonceVector; + + keccak512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); + + ////// Compaction + t = (uint64_t*) target3; + devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash3output[thr_id], 16, pdata[19])); + actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); + if(actualNumberOfValuesInNonceVectorGPU == 0) + goto emptyNonceVector; + + blake512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); + + ////// Compaction + t = (uint64_t*) target5; + devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash5output[thr_id], 16, pdata[19])); + actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); + if(actualNumberOfValuesInNonceVectorGPU == 0) + goto emptyNonceVector; + + groestl512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); + + ////// Compaction + t = (uint64_t*) target4; + devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash4output[thr_id], 16, pdata[19])); + actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); #else - // todo - actualNumberOfValuesInNonceVectorGPU = 0; + // todo (nvlabs cub ?) 
+ actualNumberOfValuesInNonceVectorGPU = 0; #endif - if(actualNumberOfValuesInNonceVectorGPU == 0) - goto emptyNonceVector; - - // combine - combine_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19], hash); - - if (opt_tracegpu) { - applog(LOG_BLUE, "heavy GPU hash:"); - applog_hash((uchar*)hash); - } - - // Ergebnisse kopieren - if(actualNumberOfValuesInNonceVectorGPU > 0) - { - size_t size = sizeof(uint32_t) * actualNumberOfValuesInNonceVectorGPU; - CUDA_SAFE_CALL(cudaMemcpy(cpu_nonceVector, heavy_nonceVector[thr_id], size, cudaMemcpyDeviceToHost)); - cudaThreadSynchronize(); - - for (uint32_t i=0; i < actualNumberOfValuesInNonceVectorGPU; i++) - { - uint32_t nonce = cpu_nonceVector[i]; - uint32_t *foundhash = &hash[8*i]; - if (foundhash[7] <= ptarget[7] && fulltest(foundhash, ptarget)) { - uint32_t vhash[8]; - pdata[19] += nonce - pdata[19]; - heavycoin_hash((uchar*)vhash, (uchar*)pdata, blocklen); - if (memcmp(vhash, foundhash, 32)) { - gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", nonce); - } else { - *hashes_done = pdata[19] - first_nonce; - work_set_target_ratio(work, vhash); - rc = 1; - goto exit; - } - } - } - } + if(actualNumberOfValuesInNonceVectorGPU == 0) + goto emptyNonceVector; + + // combine + combine_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19], hash); + + if (opt_tracegpu) { + applog(LOG_BLUE, "heavy GPU hash:"); + applog_hash((uchar*)hash); + } + + // Ergebnisse kopieren + if(actualNumberOfValuesInNonceVectorGPU > 0) + { + size_t size = sizeof(uint32_t) * actualNumberOfValuesInNonceVectorGPU; + cudaMemcpy(cpu_nonceVector, heavy_nonceVector[thr_id], size, cudaMemcpyDeviceToHost); + + for (uint32_t i=0; i < actualNumberOfValuesInNonceVectorGPU; i++) + { + uint32_t nonce = cpu_nonceVector[i]; + uint32_t *foundhash = &hash[8*i]; + if (foundhash[7] <= ptarget[7] && fulltest(foundhash, ptarget)) { + uint32_t vhash[8]; + pdata[19] += nonce - pdata[19]; + heavycoin_hash((uchar*)vhash, (uchar*)pdata, blocklen); + if (memcmp(vhash, foundhash, 32)) { + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", nonce); + } else { + work_set_target_ratio(work, vhash); + rc = 1; + goto exit; + } + } + } + } emptyNonceVector: + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; - pdata[19] += throughput; - - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce; + } while (!work_restart[thr_id].restart); exit: - cudaFreeHost(cpu_nonceVector); - cudaFreeHost(hash); - return rc; + *hashes_done = pdata[19] - first_nonce; + + cudaFreeHost(cpu_nonceVector); + cudaFreeHost(hash); + CUDA_LOG_ERROR(); + + return rc; } // cleanup extern "C" void free_heavy(int thr_id) { - if (!init[thr_id]) - return; + if (!init[thr_id]) + return; - cudaThreadSynchronize(); + cudaThreadSynchronize(); - cudaFree(heavy_nonceVector[thr_id]); + cudaFree(heavy_nonceVector[thr_id]); - blake512_cpu_free(thr_id); - groestl512_cpu_free(thr_id); - hefty_cpu_free(thr_id); - keccak512_cpu_free(thr_id); - sha256_cpu_free(thr_id); - combine_cpu_free(thr_id); + blake512_cpu_free(thr_id); + groestl512_cpu_free(thr_id); + hefty_cpu_free(thr_id); + keccak512_cpu_free(thr_id); + sha256_cpu_free(thr_id); + combine_cpu_free(thr_id); - init[thr_id] = false; + init[thr_id] = false; - cudaDeviceSynchronize(); + cudaDeviceSynchronize(); } __host__ void heavycoin_hash(uchar* output, const uchar* input, int len) { - unsigned char hash1[32]; - 
unsigned char hash2[32]; - uint32_t hash3[16]; - uint32_t hash4[16]; - uint32_t hash5[16]; - uint32_t *final; - SHA256_CTX ctx; - sph_keccak512_context keccakCtx; - sph_groestl512_context groestlCtx; - sph_blake512_context blakeCtx; - - HEFTY1(input, len, hash1); - - /* HEFTY1 is new, so take an extra security measure to eliminate - * the possiblity of collisions: - * - * Hash(x) = SHA256(x + HEFTY1(x)) - * - * N.B. '+' is concatenation. - */ - SHA256_Init(&ctx); - SHA256_Update(&ctx, input, len); - SHA256_Update(&ctx, hash1, sizeof(hash1)); - SHA256_Final(hash2, &ctx); - - /* Additional security: Do not rely on a single cryptographic hash - * function. Instead, combine the outputs of 4 of the most secure - * cryptographic hash functions-- SHA256, KECCAK512, GROESTL512 - * and BLAKE512. - */ - - sph_keccak512_init(&keccakCtx); - sph_keccak512(&keccakCtx, input, len); - sph_keccak512(&keccakCtx, hash1, sizeof(hash1)); - sph_keccak512_close(&keccakCtx, (void *)&hash3); - - sph_groestl512_init(&groestlCtx); - sph_groestl512(&groestlCtx, input, len); - sph_groestl512(&groestlCtx, hash1, sizeof(hash1)); - sph_groestl512_close(&groestlCtx, (void *)&hash4); - - sph_blake512_init(&blakeCtx); - sph_blake512(&blakeCtx, input, len); - sph_blake512(&blakeCtx, (unsigned char *)&hash1, sizeof(hash1)); - sph_blake512_close(&blakeCtx, (void *)&hash5); - - final = (uint32_t *)output; - combine_hashes(final, (uint32_t *)hash2, hash3, hash4, hash5); + unsigned char hash1[32]; + unsigned char hash2[32]; + uint32_t hash3[16]; + uint32_t hash4[16]; + uint32_t hash5[16]; + uint32_t *final; + SHA256_CTX ctx; + sph_keccak512_context keccakCtx; + sph_groestl512_context groestlCtx; + sph_blake512_context blakeCtx; + + HEFTY1(input, len, hash1); + + /* HEFTY1 is new, so take an extra security measure to eliminate + * the possiblity of collisions: + * + * Hash(x) = SHA256(x + HEFTY1(x)) + * + * N.B. '+' is concatenation. + */ + SHA256_Init(&ctx); + SHA256_Update(&ctx, input, len); + SHA256_Update(&ctx, hash1, sizeof(hash1)); + SHA256_Final(hash2, &ctx); + + /* Additional security: Do not rely on a single cryptographic hash + * function. Instead, combine the outputs of 4 of the most secure + * cryptographic hash functions-- SHA256, KECCAK512, GROESTL512 + * and BLAKE512. 
+ */ + + sph_keccak512_init(&keccakCtx); + sph_keccak512(&keccakCtx, input, len); + sph_keccak512(&keccakCtx, hash1, sizeof(hash1)); + sph_keccak512_close(&keccakCtx, (void *)&hash3); + + sph_groestl512_init(&groestlCtx); + sph_groestl512(&groestlCtx, input, len); + sph_groestl512(&groestlCtx, hash1, sizeof(hash1)); + sph_groestl512_close(&groestlCtx, (void *)&hash4); + + sph_blake512_init(&blakeCtx); + sph_blake512(&blakeCtx, input, len); + sph_blake512(&blakeCtx, (unsigned char *)&hash1, sizeof(hash1)); + sph_blake512_close(&blakeCtx, (void *)&hash5); + + final = (uint32_t *)output; + combine_hashes(final, (uint32_t *)hash2, hash3, hash4, hash5); } diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu index 05ffb52..d74bb16 100644 --- a/lyra2/lyra2RE.cu +++ b/lyra2/lyra2RE.cu @@ -122,14 +122,14 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, int order = 0; uint32_t foundNonce; - *hashes_done = pdata[19] - first_nonce + throughput; - blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); TRACE("S") + *hashes_done = pdata[19] - first_nonce + throughput; + foundNonce = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); if (foundNonce != UINT32_MAX) { @@ -162,10 +162,15 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, } } + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); + *hashes_done = pdata[19] - first_nonce; return 0; } diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu index 67e2dc9..bcc39d6 100644 --- a/lyra2/lyra2REv2.cu +++ b/lyra2/lyra2REv2.cu @@ -96,14 +96,13 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc cudaDeviceReset(); // reduce cpu usage cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); } - CUDA_LOG_ERROR(); blake256_cpu_init(thr_id, throughput); keccak256_cpu_init(thr_id,throughput); skein256_cpu_init(thr_id, throughput); bmw256_cpu_init(thr_id, throughput); - CUDA_LOG_ERROR(); // SM 3 implentation requires a bit more memory if (device_sm[dev_id] < 500 || cuda_arch[dev_id] < 500) @@ -175,11 +174,15 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc } } + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } pdata[19] += throughput; - } while (!work_restart[thr_id].restart && (max_nonce > ((uint64_t)(pdata[19]) + throughput))); + } while (!work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce; return 0; } diff --git a/myriadgroestl.cpp b/myriadgroestl.cpp index f27d060..4311120 100644 --- a/myriadgroestl.cpp +++ b/myriadgroestl.cpp @@ -43,7 +43,7 @@ int scanhash_myriad(int thr_id, struct work *work, uint32_t max_nonce, unsigned uint32_t *outputHash = (uint32_t*)malloc(throughput * 64); if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; + ptarget[7] = 0x0000ff; // init if(!init[thr_id]) @@ -63,11 +63,11 @@ int scanhash_myriad(int thr_id, struct work *work, uint32_t max_nonce, unsigned // GPU uint32_t foundNounce = UINT32_MAX; - *hashes_done = pdata[19] - 
start_nonce + throughput; - myriadgroestl_cpu_hash(thr_id, throughput, pdata[19], outputHash, &foundNounce); - if (foundNounce < UINT32_MAX) + *hashes_done = pdata[19] - start_nonce + throughput; + + if (foundNounce < UINT32_MAX && bench_algo < 0) { uint32_t _ALIGN(64) vhash[8]; endiandata[19] = swab32(foundNounce); @@ -82,8 +82,7 @@ int scanhash_myriad(int thr_id, struct work *work, uint32_t max_nonce, unsigned } } - if ((uint64_t) pdata[19] + throughput > max_nonce) { - *hashes_done = pdata[19] - start_nonce; + if ((uint64_t) throughput + pdata[19] >= max_nonce) { pdata[19] = max_nonce; break; } @@ -91,6 +90,8 @@ int scanhash_myriad(int thr_id, struct work *work, uint32_t max_nonce, unsigned } while (!work_restart[thr_id].restart); + *hashes_done = max_nonce - start_nonce; + free(outputHash); return 0; } diff --git a/neoscrypt/neoscrypt.cpp b/neoscrypt/neoscrypt.cpp index 0e07845..b0cb1cb 100644 --- a/neoscrypt/neoscrypt.cpp +++ b/neoscrypt/neoscrypt.cpp @@ -79,11 +79,16 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign } } + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; - } while (!work_restart[thr_id].restart && (max_nonce > ((uint64_t)(pdata[19]) + throughput))); + } while (!work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce; return 0; } diff --git a/pentablake.cu b/pentablake.cu index 96eb2a1..2abdeb7 100644 --- a/pentablake.cu +++ b/pentablake.cu @@ -110,9 +110,14 @@ extern "C" int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_n } } + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); return rc; } diff --git a/qubit/deep.cu b/qubit/deep.cu index afbe9fb..a5d4c63 100644 --- a/qubit/deep.cu +++ b/qubit/deep.cu @@ -117,9 +117,14 @@ extern "C" int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, } } + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); *hashes_done = pdata[19] - first_nonce + 1; return 0; diff --git a/qubit/luffa.cu b/qubit/luffa.cu index afbf545..db283cb 100644 --- a/qubit/luffa.cu +++ b/qubit/luffa.cu @@ -36,19 +36,21 @@ extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; - uint32_t throughput = cuda_default_throughput(thr_id, 1U << 22); // 256*256*8*8 + uint32_t throughput = cuda_default_throughput(thr_id, 1U << 21); if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000f; + ptarget[7] = 0x0000f; if (!init[thr_id]) { cudaSetDevice(device_map[thr_id]); - CUDA_LOG_ERROR(); - //if (opt_cudaschedule == -1) // to reduce cpu usage... 
- // cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); - //CUDA_LOG_ERROR(); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); @@ -65,10 +67,9 @@ extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, cuda_check_cpu_setTarget(ptarget); do { - int order = 0; - *hashes_done = pdata[19] - first_nonce + throughput; + qubit_luffa512_cpu_hash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], 0); - qubit_luffa512_cpu_hash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], order++); + *hashes_done = pdata[19] - first_nonce + throughput; uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); if (foundNonce != UINT32_MAX) @@ -86,8 +87,8 @@ extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, } } - if ((uint64_t) throughput + pdata[19] > max_nonce) { - // pdata[19] = max_nonce; + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; break; } @@ -95,7 +96,7 @@ extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, } while (!work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce; return 0; } diff --git a/skein.cu b/skein.cu index f178a78..97a7c0b 100644 --- a/skein.cu +++ b/skein.cu @@ -453,9 +453,7 @@ extern "C" int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_no } } - if ((uint64_t) throughput + pdata[19] > max_nonce) { - //applog(LOG_DEBUG, "done... max=%u", max_nonce); - *hashes_done = pdata[19] - first_nonce; + if ((uint64_t) throughput + pdata[19] >= max_nonce) { pdata[19] = max_nonce; break; } @@ -464,6 +462,8 @@ extern "C" int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_no } while (!work_restart[thr_id].restart); + *hashes_done = pdata[19] - first_nonce; + return 0; } diff --git a/skein2.cpp b/skein2.cpp index b05ccb3..38c0a5b 100644 --- a/skein2.cpp +++ b/skein2.cpp @@ -110,8 +110,7 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned } } - if ((uint64_t) throughput + pdata[19] > max_nonce) { - *hashes_done = pdata[19] - first_nonce; + if ((uint64_t) throughput + pdata[19] >= max_nonce) { pdata[19] = max_nonce; break; } @@ -120,6 +119,8 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned } while (!work_restart[thr_id].restart); + *hashes_done = pdata[19] - first_nonce; + return 0; } diff --git a/x11/c11.cu b/x11/c11.cu index 0c7e25d..b48e85e 100644 --- a/x11/c11.cu +++ b/x11/c11.cu @@ -200,14 +200,20 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u } else { gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce); pdata[19] = foundNonce + 1; + continue; } } + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce; return 0; } @@ -228,4 +234,4 @@ extern "C" void free_c11(int thr_id) init[thr_id] = false; cudaDeviceSynchronize(); -} \ No newline at end of file +} diff --git a/x11/fresh.cu b/x11/fresh.cu index 8e2b42a..5e3bf38 100644 --- a/x11/fresh.cu +++ b/x11/fresh.cu @@ -78,18 
+78,19 @@ extern "C" int scanhash_fresh(int thr_id, struct work* work, uint32_t max_nonce, if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x00ff; + ptarget[7] = 0x00ff; if (!init[thr_id]) { cudaSetDevice(device_map[thr_id]); + CUDA_LOG_ERROR(); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput + 4), -1); x11_shavite512_cpu_init(thr_id, throughput); x11_simd512_cpu_init(thr_id, throughput); x11_echo512_cpu_init(thr_id, throughput); - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput + 4), 0); - cuda_check_cpu_init(thr_id, throughput); init[thr_id] = true; @@ -101,8 +102,6 @@ extern "C" int scanhash_fresh(int thr_id, struct work* work, uint32_t max_nonce, x11_shavite512_setBlock_80((void*)endiandata); cuda_check_cpu_setTarget(ptarget); do { - uint32_t Htarg = ptarget[7]; - uint32_t foundNonce; int order = 0; @@ -128,7 +127,7 @@ extern "C" int scanhash_fresh(int thr_id, struct work* work, uint32_t max_nonce, be32enc(&endiandata[19], foundNonce); fresh_hash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { + if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) { int res = 1; uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); work_set_target_ratio(work, vhash64); diff --git a/x11/s3.cu b/x11/s3.cu index 45609c8..97e6068 100644 --- a/x11/s3.cu +++ b/x11/s3.cu @@ -147,11 +147,16 @@ extern "C" int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, un } } + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce; return 0; } diff --git a/x11/x11.cu b/x11/x11.cu index d2e93ca..367aa00 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -138,7 +138,7 @@ extern "C" int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, u if (x11_simd512_cpu_init(thr_id, throughput) != 0) { return 0; } - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 64 * throughput), 0); // why 64 ? 
+ CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); cuda_check_cpu_init(thr_id, throughput); @@ -205,14 +205,19 @@ extern "C" int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, u } else { gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce); pdata[19] = foundNonce + 1; + continue; } } + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce; return 0; } diff --git a/x13/x13.cu b/x13/x13.cu index faa73ab..39e0fe2 100644 --- a/x13/x13.cu +++ b/x13/x13.cu @@ -206,7 +206,7 @@ extern "C" int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, u } } - if ((uint64_t)pdata[19] + throughput > (uint64_t)max_nonce) { + if ((uint64_t)throughput + pdata[19] >= max_nonce) { pdata[19] = max_nonce; break; } @@ -214,7 +214,7 @@ extern "C" int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, u } while (!work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce; CUDA_LOG_ERROR(); diff --git a/x15/whirlpool.cu b/x15/whirlpool.cu index 31f6466..4fac950 100644 --- a/x15/whirlpool.cu +++ b/x15/whirlpool.cu @@ -83,12 +83,14 @@ extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, uint32_t foundNonce; int order = 0; + *hashes_done = pdata[19] - first_nonce + throughput; + whirlpool512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); foundNonce = whirlpool512_cpu_finalhash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - if (foundNonce != UINT32_MAX) + if (foundNonce != UINT32_MAX && bench_algo < 0) { const uint32_t Htarg = ptarget[7]; uint32_t vhash[8]; @@ -97,7 +99,6 @@ extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { int res = 1; - *hashes_done = pdata[19] - first_nonce + throughput; work_set_target_ratio(work, vhash); #if 0 uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); @@ -112,9 +113,13 @@ extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce); } } + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); *hashes_done = pdata[19] - first_nonce; return 0; diff --git a/x15/whirlpoolx.cu b/x15/whirlpoolx.cu index 4416512..517d5ca 100644 --- a/x15/whirlpoolx.cu +++ b/x15/whirlpoolx.cu @@ -49,12 +49,12 @@ extern "C" int scanhash_whirlx(int thr_id, struct work* work, uint32_t max_nonc if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; + ptarget[7] = 0x000f; if (!init[thr_id]) { cudaSetDevice(device_map[thr_id]); - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * 
throughput), -1); whirlpoolx_cpu_init(thr_id, throughput); @@ -70,15 +70,16 @@ extern "C" int scanhash_whirlx(int thr_id, struct work* work, uint32_t max_nonc do { uint32_t foundNonce = whirlpoolx_cpu_hash(thr_id, throughput, pdata[19]); - if (foundNonce != UINT32_MAX) + + *(hashes_done) = pdata[19] - first_nonce + throughput; + + if (foundNonce != UINT32_MAX && bench_algo < 0) { const uint32_t Htarg = ptarget[7]; uint32_t vhash64[8]; be32enc(&endiandata[19], foundNonce); whirlxHash(vhash64, endiandata); - *hashes_done = pdata[19] - first_nonce + throughput; - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { work_set_target_ratio(work, vhash64); pdata[19] = foundNonce; @@ -88,15 +89,16 @@ extern "C" int scanhash_whirlx(int thr_id, struct work* work, uint32_t max_nonc } } - pdata[19] += throughput; - - if (((uint64_t)pdata[19]+throughput) >= max_nonce) { + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; break; } + pdata[19] += throughput; + } while (!work_restart[thr_id].restart); - *(hashes_done) = pdata[19] - first_nonce + 1; + *(hashes_done) = pdata[19] - first_nonce; return 0; } diff --git a/x15/x14.cu b/x15/x14.cu index feaf6d4..d020f5c 100644 --- a/x15/x14.cu +++ b/x15/x14.cu @@ -219,13 +219,19 @@ extern "C" int scanhash_x14(int thr_id, struct work* work, uint32_t max_nonce, gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce); } } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); CUDA_LOG_ERROR(); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce; return 0; } diff --git a/x15/x15.cu b/x15/x15.cu index d74dfa6..6d4799d 100644 --- a/x15/x15.cu +++ b/x15/x15.cu @@ -226,11 +226,16 @@ extern "C" int scanhash_x15(int thr_id, struct work* work, uint32_t max_nonce, } } + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce; return 0; } diff --git a/x17/x17.cu b/x17/x17.cu index ebf2b4f..3be51d1 100644 --- a/x17/x17.cu +++ b/x17/x17.cu @@ -251,11 +251,16 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u } } + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce; return 0; }
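Reviewer note (not part of the patch): the change repeated across these scanhash loops replaces the old "(uint64_t) pdata[19] + throughput > max_nonce" exit test with an overflow-safe "(uint64_t) throughput + pdata[19] >= max_nonce" check that clamps pdata[19] to max_nonce before breaking, drops the "+ 1" off-by-one from *hashes_done, and reports *hashes_done once after the loop so restarts and range exhaustion are accounted for the same way. The added "bench_algo < 0" guards (and the early returns in cuda_checkhash.cu) simply skip CPU validation of found nonces while the built-in multi-algo benchmark is running. Below is a minimal, self-contained C sketch of the loop shape the patch converges on; scan_range_stub() and work_restarted_stub() are hypothetical stand-ins for the per-algo GPU kernel call and the work_restart flag, not ccminer APIs.

#include <stdint.h>
#include <stdbool.h>

/* Hypothetical stand-ins so the sketch compiles; the real code calls the
 * per-algo CUDA kernels and checks work_restart[thr_id].restart. */
static uint32_t scan_range_stub(uint32_t start_nonce, uint32_t throughput)
{
	(void) start_nonce; (void) throughput;
	return UINT32_MAX;               /* pretend nothing was found */
}
static bool work_restarted_stub(void) { return false; }

/* Sketch of the loop shape this patch applies in every scanhash_*(). */
int scanhash_sketch(uint32_t *pnonce, uint32_t max_nonce,
                    uint32_t throughput, int bench_algo,
                    unsigned long *hashes_done)
{
	const uint32_t first_nonce = *pnonce;

	do {
		uint32_t found = scan_range_stub(*pnonce, throughput);

		/* progress is counted per scanned chunk, before validation */
		*hashes_done = *pnonce - first_nonce + throughput;

		if (found != UINT32_MAX && bench_algo < 0) {
			/* CPU re-hash and fulltest() would go here; skipped while
			 * the benchmark is cycling algos (bench_algo >= 0) */
		}

		/* 64-bit sum avoids uint32 wrap-around; ">=" clamps the last chunk */
		if ((uint64_t) throughput + *pnonce >= max_nonce) {
			*pnonce = max_nonce;
			break;
		}
		*pnonce += throughput;
	} while (!work_restarted_stub());

	/* single final report, without the old "+ 1" */
	*hashes_done = *pnonce - first_nonce;
	return 0;
}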