Reduce lyra2, blake and pentablake CPU load

This commit is contained in:
Tanguy Pruvot 2014-12-18 16:56:21 +01:00
parent 010eba1760
commit a66d78e692
4 changed files with 8 additions and 13 deletions

View File

@ -257,9 +257,8 @@ uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const ui
return result; return result;
blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNonce, d_resNonce[thr_id], highTarget, crcsum, (int) rounds); blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNonce, d_resNonce[thr_id], highTarget, crcsum, (int) rounds);
cudaDeviceSynchronize(); MyStreamSynchronize(NULL, 0, thr_id);
if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
//cudaThreadSynchronize(); /* seems no more required */
result = h_resNonce[thr_id][0]; result = h_resNonce[thr_id][0];
for (int n=0; n < (NBN-1); n++) for (int n=0; n < (NBN-1); n++)
extra_results[n] = h_resNonce[thr_id][n+1]; extra_results[n] = h_resNonce[thr_id][n+1];
@ -343,9 +342,8 @@ static uint32_t blake256_cpu_hash_16(const int thr_id, const uint32_t threads, c
return result; return result;
blake256_gpu_hash_16 <<<grid, block>>> (threads, startNonce, d_resNonce[thr_id], highTarget, (int) rounds, opt_tracegpu); blake256_gpu_hash_16 <<<grid, block>>> (threads, startNonce, d_resNonce[thr_id], highTarget, (int) rounds, opt_tracegpu);
cudaDeviceSynchronize(); MyStreamSynchronize(NULL, 0, thr_id);
if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
//cudaThreadSynchronize(); /* seems no more required */
result = h_resNonce[thr_id][0]; result = h_resNonce[thr_id][0];
for (int n=0; n < (NBN-1); n++) for (int n=0; n < (NBN-1); n++)
extra_results[n] = h_resNonce[thr_id][n+1]; extra_results[n] = h_resNonce[thr_id][n+1];

View File

@ -456,7 +456,7 @@ void lyra2_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *
lyra2_gpu_hash_32_v30 <<<grid, block >>> (threads, startNounce, d_outputHash); lyra2_gpu_hash_32_v30 <<<grid, block >>> (threads, startNounce, d_outputHash);
} }
cudaDeviceSynchronize(); MyStreamSynchronize(NULL, order, thr_id);
//MyStreamSynchronize(NULL, order, thr_id); //cudaThreadSynchronize();
} }

View File

@ -316,8 +316,7 @@ void pentablake_cpu_hash_80(int thr_id, int threads, const uint32_t startNounce,
pentablake_gpu_hash_80 <<<grid, block, shared_size>>> (threads, startNounce, d_outputHash); pentablake_gpu_hash_80 <<<grid, block, shared_size>>> (threads, startNounce, d_outputHash);
//MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
cudaDeviceSynchronize();
} }
@ -375,8 +374,7 @@ void pentablake_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint3
pentablake_gpu_hash_64 <<<grid, block, shared_size>>> (threads, startNounce, (uint64_t*)d_outputHash); pentablake_gpu_hash_64 <<<grid, block, shared_size>>> (threads, startNounce, (uint64_t*)d_outputHash);
//MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
cudaDeviceSynchronize();
} }
#if 0 #if 0
@ -456,7 +454,7 @@ uint32_t pentablake_check_hash(int thr_id, uint32_t threads, uint32_t startNounc
pentablake_gpu_check_hash <<<grid, block, shared_size>>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]); pentablake_gpu_check_hash <<<grid, block, shared_size>>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]);
CUDA_SAFE_CALL(cudaDeviceSynchronize()); CUDA_SAFE_CALL(cudaThreadSynchronize());
if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
cudaThreadSynchronize(); cudaThreadSynchronize();
result = h_resNounce[thr_id][0]; result = h_resNounce[thr_id][0];
@ -560,6 +558,5 @@ extern "C" int scanhash_pentablake(int thr_id, uint32_t *pdata, const uint32_t *
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart); } while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce + 1; *hashes_done = pdata[19] - first_nonce + 1;
cudaDeviceSynchronize();
return rc; return rc;
} }

View File

@ -466,7 +466,7 @@ uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, int threads, uint32_t start
size_t shared_size = 0; size_t shared_size = 0;
qubit_luffa512_gpu_finalhash_80 <<<grid, block, shared_size>>> (threads, startNounce, d_outputHash, d_resNounce[thr_id]); qubit_luffa512_gpu_finalhash_80 <<<grid, block, shared_size>>> (threads, startNounce, d_outputHash, d_resNounce[thr_id]);
cudaDeviceSynchronize(); cudaThreadSynchronize();
if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], NBN * sizeof(uint32_t), cudaMemcpyDeviceToHost)) { if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], NBN * sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
//cudaThreadSynchronize(); //cudaThreadSynchronize();
result = h_resNounce[thr_id][0]; result = h_resNounce[thr_id][0];