diff --git a/Algo256/blake256.cu b/Algo256/blake256.cu index 4ecec43..6cbd917 100644 --- a/Algo256/blake256.cu +++ b/Algo256/blake256.cu @@ -257,9 +257,8 @@ uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const ui return result; blake256_gpu_hash_80<<>>(threads, startNonce, d_resNonce[thr_id], highTarget, crcsum, (int) rounds); - cudaDeviceSynchronize(); + MyStreamSynchronize(NULL, 0, thr_id); if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { - //cudaThreadSynchronize(); /* seems no more required */ result = h_resNonce[thr_id][0]; for (int n=0; n < (NBN-1); n++) extra_results[n] = h_resNonce[thr_id][n+1]; @@ -343,9 +342,8 @@ static uint32_t blake256_cpu_hash_16(const int thr_id, const uint32_t threads, c return result; blake256_gpu_hash_16 <<>> (threads, startNonce, d_resNonce[thr_id], highTarget, (int) rounds, opt_tracegpu); - cudaDeviceSynchronize(); + MyStreamSynchronize(NULL, 0, thr_id); if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { - //cudaThreadSynchronize(); /* seems no more required */ result = h_resNonce[thr_id][0]; for (int n=0; n < (NBN-1); n++) extra_results[n] = h_resNonce[thr_id][n+1]; diff --git a/lyra2/cuda_lyra2.cu b/lyra2/cuda_lyra2.cu index 3afd225..d8244ec 100644 --- a/lyra2/cuda_lyra2.cu +++ b/lyra2/cuda_lyra2.cu @@ -456,7 +456,7 @@ void lyra2_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t * lyra2_gpu_hash_32_v30 <<>> (threads, startNounce, d_outputHash); } - cudaDeviceSynchronize(); - //MyStreamSynchronize(NULL, order, thr_id); + MyStreamSynchronize(NULL, order, thr_id); + //cudaThreadSynchronize(); } diff --git a/pentablake.cu b/pentablake.cu index 49f49e3..a971910 100644 --- a/pentablake.cu +++ b/pentablake.cu @@ -316,8 +316,7 @@ void pentablake_cpu_hash_80(int thr_id, int threads, const uint32_t startNounce, pentablake_gpu_hash_80 <<>> (threads, startNounce, d_outputHash); - //MyStreamSynchronize(NULL, order, thr_id); - cudaDeviceSynchronize(); + MyStreamSynchronize(NULL, order, thr_id); } @@ -375,8 +374,7 @@ void pentablake_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint3 pentablake_gpu_hash_64 <<>> (threads, startNounce, (uint64_t*)d_outputHash); - //MyStreamSynchronize(NULL, order, thr_id); - cudaDeviceSynchronize(); + MyStreamSynchronize(NULL, order, thr_id); } #if 0 @@ -456,7 +454,7 @@ uint32_t pentablake_check_hash(int thr_id, uint32_t threads, uint32_t startNounc pentablake_gpu_check_hash <<>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]); - CUDA_SAFE_CALL(cudaDeviceSynchronize()); + CUDA_SAFE_CALL(cudaThreadSynchronize()); if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { cudaThreadSynchronize(); result = h_resNounce[thr_id][0]; @@ -560,6 +558,5 @@ extern "C" int scanhash_pentablake(int thr_id, uint32_t *pdata, const uint32_t * } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); *hashes_done = pdata[19] - first_nonce + 1; - cudaDeviceSynchronize(); return rc; } diff --git a/qubit/qubit_luffa512.cu b/qubit/qubit_luffa512.cu index a4a0ef2..c655538 100644 --- a/qubit/qubit_luffa512.cu +++ b/qubit/qubit_luffa512.cu @@ -466,7 +466,7 @@ uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, int threads, uint32_t start size_t shared_size = 0; qubit_luffa512_gpu_finalhash_80 <<>> (threads, startNounce, d_outputHash, d_resNounce[thr_id]); - cudaDeviceSynchronize(); + cudaThreadSynchronize(); if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], NBN * sizeof(uint32_t), cudaMemcpyDeviceToHost)) { //cudaThreadSynchronize(); result = h_resNounce[thr_id][0];