diff --git a/Algo256/cuda_groestl256.cu b/Algo256/cuda_groestl256.cu index 888d29c..865767e 100644 --- a/Algo256/cuda_groestl256.cu +++ b/Algo256/cuda_groestl256.cu @@ -5,8 +5,8 @@ #include "cuda_helper.h" -uint32_t *d_gnounce[MAX_GPUS]; -uint32_t *d_GNonce[MAX_GPUS]; +static uint32_t *h_GNonces[MAX_GPUS]; +static uint32_t *d_GNonces[MAX_GPUS]; __constant__ uint32_t pTarget[8]; @@ -175,7 +175,7 @@ void groestl256_perm_Q(uint32_t thread, uint32_t *a, char *mixtabs) } __global__ __launch_bounds__(256,1) -void groestl256_gpu_hash32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash, uint32_t *nonceVector) +void groestl256_gpu_hash32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash, uint32_t *resNonces) { #if USE_SHARED extern __shared__ char mixtabs[]; @@ -243,7 +243,8 @@ void groestl256_gpu_hash32(uint32_t threads, uint32_t startNounce, uint64_t *out uint32_t nonce = startNounce + thread; if (state[15] <= pTarget[7]) { - nonceVector[0] = nonce; + atomicMin(&resNonces[1], resNonces[0]); + atomicMin(&resNonces[0], nonce); } } } @@ -272,15 +273,15 @@ void groestl256_cpu_init(int thr_id, uint32_t threads) texDef(t3up2, d_T3up, T3up_cpu, sizeof(uint32_t) * 256); texDef(t3dn2, d_T3dn, T3dn_cpu, sizeof(uint32_t) * 256); - cudaMalloc(&d_GNonce[thr_id], sizeof(uint32_t)); - cudaMallocHost(&d_gnounce[thr_id], 1*sizeof(uint32_t)); + cudaMalloc(&d_GNonces[thr_id], 2*sizeof(uint32_t)); + cudaMallocHost(&h_GNonces[thr_id], 2*sizeof(uint32_t)); } __host__ uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order) { uint32_t result = 0xffffffff; - cudaMemset(d_GNonce[thr_id], 0xff, sizeof(uint32_t)); + cudaMemset(d_GNonces[thr_id], 0xff, sizeof(uint32_t)); const uint32_t threadsperblock = 256; // berechne wie viele Thread Blocks wir brauchen @@ -292,18 +293,27 @@ uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNoun #else size_t shared_size = 0; #endif - groestl256_gpu_hash32<<>>(threads, startNounce, d_outputHash, d_GNonce[thr_id]); + groestl256_gpu_hash32<<>>(threads, startNounce, d_outputHash, d_GNonces[thr_id]); MyStreamSynchronize(NULL, order, thr_id); - cudaMemcpy(d_gnounce[thr_id], d_GNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); - cudaThreadSynchronize(); - result = *d_gnounce[thr_id]; + + // get first found nonce + cudaMemcpy(h_GNonces[thr_id], d_GNonces[thr_id], 1*sizeof(uint32_t), cudaMemcpyDeviceToHost); + result = *h_GNonces[thr_id]; return result; } +__host__ +uint32_t groestl256_getSecNonce(int thr_id, int num) +{ + uint32_t results[2]; + cudaMemcpy(results, d_GNonces[thr_id], sizeof(results), cudaMemcpyDeviceToHost); + return results[num]; +} + __host__ void groestl256_setTarget(const void *pTargetIn) { - cudaMemcpyToSymbol(pTarget, pTargetIn, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(pTarget, pTargetIn, 32, 0, cudaMemcpyHostToDevice); } diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu index dbeab7a..bbc5131 100644 --- a/lyra2/lyra2RE.cu +++ b/lyra2/lyra2RE.cu @@ -9,7 +9,7 @@ extern "C" { #include "miner.h" #include "cuda_helper.h" -static _ALIGN(64) uint64_t *d_hash[MAX_GPUS]; +static uint64_t* d_hash[MAX_GPUS]; extern void blake256_cpu_init(int thr_id, uint32_t threads); extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); @@ -21,9 +21,10 @@ extern void skein256_cpu_init(int thr_id, uint32_t threads); extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); +extern void groestl256_cpu_init(int thr_id, uint32_t threads); extern void groestl256_setTarget(const void *ptarget); extern uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order); -extern void groestl256_cpu_init(int thr_id, uint32_t threads); +extern uint32_t groestl256_getSecNonce(int thr_id, int num); extern "C" void lyra2_hash(void *state, const void *input) { @@ -99,18 +100,33 @@ extern "C" int scanhash_lyra2(int thr_id, uint32_t *pdata, lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + *hashes_done = pdata[19] - first_nonce + throughput; + foundNonce = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); if (foundNonce != UINT32_MAX) { + uint32_t _ALIGN(64) vhash64[8]; const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; + be32enc(&endiandata[19], foundNonce); lyra2_hash(vhash64, endiandata); if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { - *hashes_done = pdata[19] - first_nonce + throughput; + int res = 1; + uint32_t secNonce = groestl256_getSecNonce(thr_id, 1); + if (secNonce != UINT32_MAX) + { + be32enc(&endiandata[19], secNonce); + lyra2_hash(vhash64, endiandata); + if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) { + if (opt_debug) + applog(LOG_BLUE, "GPU #%d: found second nonce %08x", device_map[thr_id], secNonce); + pdata[21] = secNonce; + res++; + } + } pdata[19] = foundNonce; - return 1; + return res; } else { applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce); }