Browse Source

reduce lyra2 blake and pentablake cpu load

2upstream
Tanguy Pruvot 10 years ago
parent
commit
a66d78e692
  1. 6
      Algo256/blake256.cu
  2. 4
      lyra2/cuda_lyra2.cu
  3. 9
      pentablake.cu
  4. 2
      qubit/qubit_luffa512.cu

6
Algo256/blake256.cu

@ -257,9 +257,8 @@ uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const ui @@ -257,9 +257,8 @@ uint32_t blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const ui
return result;
blake256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNonce, d_resNonce[thr_id], highTarget, crcsum, (int) rounds);
cudaDeviceSynchronize();
MyStreamSynchronize(NULL, 0, thr_id);
if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
//cudaThreadSynchronize(); /* seems no more required */
result = h_resNonce[thr_id][0];
for (int n=0; n < (NBN-1); n++)
extra_results[n] = h_resNonce[thr_id][n+1];
@ -343,9 +342,8 @@ static uint32_t blake256_cpu_hash_16(const int thr_id, const uint32_t threads, c @@ -343,9 +342,8 @@ static uint32_t blake256_cpu_hash_16(const int thr_id, const uint32_t threads, c
return result;
blake256_gpu_hash_16 <<<grid, block>>> (threads, startNonce, d_resNonce[thr_id], highTarget, (int) rounds, opt_tracegpu);
cudaDeviceSynchronize();
MyStreamSynchronize(NULL, 0, thr_id);
if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
//cudaThreadSynchronize(); /* seems no more required */
result = h_resNonce[thr_id][0];
for (int n=0; n < (NBN-1); n++)
extra_results[n] = h_resNonce[thr_id][n+1];

4
lyra2/cuda_lyra2.cu

@ -456,7 +456,7 @@ void lyra2_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t * @@ -456,7 +456,7 @@ void lyra2_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *
lyra2_gpu_hash_32_v30 <<<grid, block >>> (threads, startNounce, d_outputHash);
}
cudaDeviceSynchronize();
//MyStreamSynchronize(NULL, order, thr_id);
MyStreamSynchronize(NULL, order, thr_id);
//cudaThreadSynchronize();
}

9
pentablake.cu

@ -316,8 +316,7 @@ void pentablake_cpu_hash_80(int thr_id, int threads, const uint32_t startNounce, @@ -316,8 +316,7 @@ void pentablake_cpu_hash_80(int thr_id, int threads, const uint32_t startNounce,
pentablake_gpu_hash_80 <<<grid, block, shared_size>>> (threads, startNounce, d_outputHash);
//MyStreamSynchronize(NULL, order, thr_id);
cudaDeviceSynchronize();
MyStreamSynchronize(NULL, order, thr_id);
}
@ -375,8 +374,7 @@ void pentablake_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint3 @@ -375,8 +374,7 @@ void pentablake_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint3
pentablake_gpu_hash_64 <<<grid, block, shared_size>>> (threads, startNounce, (uint64_t*)d_outputHash);
//MyStreamSynchronize(NULL, order, thr_id);
cudaDeviceSynchronize();
MyStreamSynchronize(NULL, order, thr_id);
}
#if 0
@ -456,7 +454,7 @@ uint32_t pentablake_check_hash(int thr_id, uint32_t threads, uint32_t startNounc @@ -456,7 +454,7 @@ uint32_t pentablake_check_hash(int thr_id, uint32_t threads, uint32_t startNounc
pentablake_gpu_check_hash <<<grid, block, shared_size>>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]);
CUDA_SAFE_CALL(cudaDeviceSynchronize());
CUDA_SAFE_CALL(cudaThreadSynchronize());
if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
cudaThreadSynchronize();
result = h_resNounce[thr_id][0];
@ -560,6 +558,5 @@ extern "C" int scanhash_pentablake(int thr_id, uint32_t *pdata, const uint32_t * @@ -560,6 +558,5 @@ extern "C" int scanhash_pentablake(int thr_id, uint32_t *pdata, const uint32_t *
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce + 1;
cudaDeviceSynchronize();
return rc;
}

2
qubit/qubit_luffa512.cu

@ -466,7 +466,7 @@ uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, int threads, uint32_t start @@ -466,7 +466,7 @@ uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, int threads, uint32_t start
size_t shared_size = 0;
qubit_luffa512_gpu_finalhash_80 <<<grid, block, shared_size>>> (threads, startNounce, d_outputHash, d_resNounce[thr_id]);
cudaDeviceSynchronize();
cudaThreadSynchronize();
if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], NBN * sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
//cudaThreadSynchronize();
result = h_resNounce[thr_id][0];

Loading…
Cancel
Save