From 2247605d23bfcd35a8fd351861319e58d4f33229 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Fri, 6 Nov 2015 03:40:32 +0100 Subject: [PATCH] quark: add support for SM 2 devices todo: use nonce vectors for the second branch GPU #0: Gigabyte GTX 460, 261.26 kH/s accepted: 2/2 (diff 0.046), 254.36 kH/s yay!!! Signed-off-by: Tanguy Pruvot --- bench.cpp | 1 - ccminer.cpp | 4 +- quark/cuda_bmw512_sm3.cuh | 4 +- quark/cuda_quark_compactionTest.cu | 156 ++++++++++++++++------------ quark/cuda_quark_groestl512_sm20.cu | 4 +- quark/quarkcoin.cu | 141 ++++++++++++++++--------- util.cpp | 3 +- 7 files changed, 191 insertions(+), 122 deletions(-) diff --git a/bench.cpp b/bench.cpp index 0256151..bd71fd3 100644 --- a/bench.cpp +++ b/bench.cpp @@ -105,7 +105,6 @@ bool bench_algo_switch_next(int thr_id) if (algo == ALGO_JACKPOT) algo++; // compact shuffle if (algo == ALGO_LYRA2v2) algo++; if (algo == ALGO_NEOSCRYPT) algo++; - if (algo == ALGO_QUARK) algo++; // todo if (algo == ALGO_WHIRLPOOLX) algo++; } // and unwanted ones... diff --git a/ccminer.cpp b/ccminer.cpp index bf09cc1..62ef352 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -333,8 +333,8 @@ struct option options[] = { { "no-stratum", 0, NULL, 1007 }, { "no-autotune", 0, NULL, 1004 }, // scrypt { "interactive", 1, NULL, 1050 }, // scrypt - { "launch-config", 0, NULL, 'l' }, // scrypt - { "lookup-gap", 0, NULL, 'L' }, // scrypt + { "launch-config", 1, NULL, 'l' }, // scrypt + { "lookup-gap", 1, NULL, 'L' }, // scrypt { "texture-cache", 1, NULL, 1051 },// scrypt { "max-temp", 1, NULL, 1060 }, { "max-diff", 1, NULL, 1061 }, diff --git a/quark/cuda_bmw512_sm3.cuh b/quark/cuda_bmw512_sm3.cuh index 247c9f0..c0f4694 100644 --- a/quark/cuda_bmw512_sm3.cuh +++ b/quark/cuda_bmw512_sm3.cuh @@ -17,7 +17,7 @@ q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \ q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]) -#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 200 && __CUDA_ARCH__ < 500) +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 200 && __CUDA_ARCH__ < 500) || defined(_DEBUG) __constant__ uint64_t d_constMem[16] = { SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F), @@ -42,7 +42,7 @@ __constant__ uint64_t d_constMem[16] = { # endif #endif -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 500 +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 500 || defined(_DEBUG)) __device__ void Compression512_30(uint64_t *msg, uint64_t *hash) diff --git a/quark/cuda_quark_compactionTest.cu b/quark/cuda_quark_compactionTest.cu index 2da167c..2f19753 100644 --- a/quark/cuda_quark_compactionTest.cu +++ b/quark/cuda_quark_compactionTest.cu @@ -8,10 +8,8 @@ #include "cuda_helper.h" #include -static uint32_t *d_tempBranch1Nonces[MAX_GPUS]; -static uint32_t *d_numValid[MAX_GPUS]; static uint32_t *h_numValid[MAX_GPUS]; - +static uint32_t *d_tempBranch1Nonces[MAX_GPUS]; static uint32_t *d_partSum[2][MAX_GPUS]; // für bis zu vier partielle Summen #if __CUDA_ARCH__ < 300 @@ -43,32 +41,39 @@ cuda_compactTestFunction_t h_QuarkTrueFunction[MAX_GPUS], h_QuarkFalseFunction[M __host__ void quark_compactTest_cpu_init(int thr_id, uint32_t threads) { + int dev_id = device_map[thr_id]; + cuda_get_arch(thr_id); + cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t)); cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t)); - // wir brauchen auch Speicherplatz auf dem Device - cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2); - cudaMalloc(&d_numValid[thr_id], 2*sizeof(uint32_t)); - cudaMallocHost(&h_numValid[thr_id], 2*sizeof(uint32_t)); - - uint32_t s1; - s1 = (threads / 256) * 2; + if (cuda_arch[dev_id] >= 300) { + uint32_t s1 = (threads / 256) * 2; + CUDA_SAFE_CALL(cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2)); + CUDA_SAFE_CALL(cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1)); // BLOCKSIZE (Threads/Block) + CUDA_SAFE_CALL(cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1)); // BLOCKSIZE (Threads/Block) + } else { + CUDA_SAFE_CALL(cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads)); + } - cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) - cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) + cudaMallocHost(&h_numValid[thr_id], 2*sizeof(uint32_t)); } // Because all alloc should have a free... __host__ void quark_compactTest_cpu_free(int thr_id) { - cudaFree(d_tempBranch1Nonces[thr_id]); - cudaFree(d_numValid[thr_id]); - - cudaFree(d_partSum[0][thr_id]); - cudaFree(d_partSum[1][thr_id]); + int dev_id = device_map[thr_id]; cudaFreeHost(h_numValid[thr_id]); + + if (cuda_arch[dev_id] >= 300) { + cudaFree(d_tempBranch1Nonces[thr_id]); + cudaFree(d_partSum[0][thr_id]); + cudaFree(d_partSum[1][thr_id]); + } else { + cudaFree(d_tempBranch1Nonces[thr_id]); + } } __global__ @@ -124,7 +129,6 @@ void quark_compactTest_gpu_SCAN(uint32_t *data, const int width, uint32_t *parti for (int i=1; i<=width; i*=2) { uint32_t n = __shfl_up((int)value, i, width); - if (lane_id >= i) value += n; } @@ -207,14 +211,12 @@ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTe uint32_t value; if (id < threads) { -// uint32_t nounce = startNounce + id; uint32_t *inpHash; if(d_validNonceTable == NULL) { // keine Nonce-Liste inpHash = &inpHashes[id<<4]; - }else - { + } else { // Nonce-Liste verfügbar int nonce = d_validNonceTable[id] - startNounce; actNounce = nonce; @@ -222,13 +224,11 @@ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTe } value = (*testFunc)(inpHash); - }else - { + } else { value = 0; } - if( value ) - { + if (value) { int idx = sum[id]; if(idx > 0) outp[idx-1] = startNounce + actNounce; @@ -271,12 +271,10 @@ void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32 d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable); // weitere Scans - if(callThrid) - { + if(callThrid) { quark_compactTest_gpu_SCAN<<>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]); quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2); - }else - { + } else { quark_compactTest_gpu_SCAN<<>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2); } @@ -290,8 +288,7 @@ void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32 // Addieren - if(callThrid) - { + if(callThrid) { quark_compactTest_gpu_ADD<<>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2); } quark_compactTest_gpu_ADD<<>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads); @@ -304,6 +301,68 @@ void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32 cudaStreamSynchronize(NULL); } +#ifdef __INTELLISENSE__ +#define atomicAdd(x,n) ( *(x)+=n ) +#endif + +__global__ __launch_bounds__(128, 8) +void quark_filter_gpu_sm2(const uint32_t threads, const uint32_t* d_hash, uint32_t* d_branch2, uint32_t* d_NonceBranch, uint32_t &count) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t offset = thread * 16U; // 64U / sizeof(uint32_t); + uint4 *psrc = (uint4*) (&d_hash[offset]); + d_NonceBranch[thread] = ((uint8_t*)psrc)[0] & 0x8; + if (d_NonceBranch[thread]) return; + //uint32_t off_br = atomicAdd(&count, 1) * 16U; + // uint4 = 4x uint32_t = 16 bytes + uint4 *pdst = (uint4*) (&d_branch2[offset]); + pdst[0] = psrc[0]; + pdst[1] = psrc[1]; + pdst[2] = psrc[2]; + pdst[3] = psrc[3]; + } +} + +__global__ __launch_bounds__(128, 8) +void quark_merge_gpu_sm2(const uint32_t threads, uint32_t* d_hash, uint32_t* d_branch2, uint32_t* const d_NonceBranch) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads && !d_NonceBranch[thread]) + { + const uint32_t offset = thread * 16U; // 64U / sizeof(uint32_t); + uint4 *pdst = (uint4*) (&d_hash[offset]); + uint4 *psrc = (uint4*) (&d_branch2[offset]); + pdst[0] = psrc[0]; + pdst[1] = psrc[1]; + pdst[2] = psrc[2]; + pdst[3] = psrc[3]; + } +} + +__host__ +uint32_t quark_filter_cpu_sm2(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_branch2) +{ + uint32_t branch2_nonces = 0; + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + // copy all hashes in the right branch buffer + quark_filter_gpu_sm2 <<>> (threads, inpHashes, d_branch2, d_tempBranch1Nonces[thr_id], branch2_nonces); + return branch2_nonces; +} + +__host__ +void quark_merge_cpu_sm2(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_branch2) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + // copy second branch hashes to d_hash + quark_merge_gpu_sm2 <<>> (threads, outpHashes, d_branch2, d_tempBranch1Nonces[thr_id]); +} + ////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048) __host__ void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, uint32_t *nrm, uint32_t *d_nonces1, @@ -311,37 +370,6 @@ void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, uint32_t { quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable); quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable); - - /* - // threadsPerBlock ausrechnen - int blockSize = 256; - int thr1 = threads / blockSize; - int thr2 = threads / (blockSize*blockSize); - - // 1 - quark_compactTest_gpu_SCAN<<>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes); - quark_compactTest_gpu_SCAN<<>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]); - quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2); - cudaStreamSynchronize(NULL); - cudaMemcpy(&nrm[0], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); - quark_compactTest_gpu_ADD<<>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2); - quark_compactTest_gpu_ADD<<>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads); - - // 2 - quark_compactTest_gpu_SCAN<<>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes); - quark_compactTest_gpu_SCAN<<>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]); - quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2); - cudaStreamSynchronize(NULL); - cudaMemcpy(&nrm[1], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); - quark_compactTest_gpu_ADD<<>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2); - quark_compactTest_gpu_ADD<<>>(d_tempBranch2Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads); - - // Hier ist noch eine Besonderheit: in d_tempBranch1Nonces sind die element von 1...nrm1 die Interessanten - // Schritt 3: Scatter - quark_compactTest_gpu_SCATTER<<>>(d_tempBranch1Nonces[thr_id], d_nonces1, h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes); - quark_compactTest_gpu_SCATTER<<>>(d_tempBranch2Nonces[thr_id], d_nonces2, h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes); - cudaStreamSynchronize(NULL); - */ } __host__ @@ -369,6 +397,6 @@ void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, ui quark_compactTest_cpu_singleCompaction(thr_id, threads, h_numValid[thr_id], d_nonces1, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable); - cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser + cudaStreamSynchronize(NULL); *nrm1 = h_numValid[thr_id][0]; -} +} \ No newline at end of file diff --git a/quark/cuda_quark_groestl512_sm20.cu b/quark/cuda_quark_groestl512_sm20.cu index f368594..d7b5f8d 100644 --- a/quark/cuda_quark_groestl512_sm20.cu +++ b/quark/cuda_quark_groestl512_sm20.cu @@ -52,7 +52,7 @@ extern uint32_t T2dn_cpu[]; extern uint32_t T3up_cpu[]; extern uint32_t T3dn_cpu[]; -#if __CUDA_ARCH__ < 300 +#if __CUDA_ARCH__ < 300 || defined(_DEBUG) __device__ __forceinline__ void quark_groestl512_perm_P(uint32_t *a, char *mixtabs) @@ -206,7 +206,7 @@ void quark_groestl512_perm_Q(uint32_t *a, char *mixtabs) __global__ void quark_groestl512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector) { -#if __CUDA_ARCH__ < 300 +#if __CUDA_ARCH__ < 300 || defined(_DEBUG) extern __shared__ char mixtabs[]; if (threadIdx.x < 256) diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu index 8de252b..4241d0b 100644 --- a/quark/quarkcoin.cu +++ b/quark/quarkcoin.cu @@ -13,7 +13,13 @@ extern "C" #include "cuda_helper.h" #include "cuda_quark.h" +#include + +extern uint32_t quark_filter_cpu_sm2(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_branch2); +extern void quark_merge_cpu_sm2(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_branch2); + static uint32_t *d_hash[MAX_GPUS]; +static uint32_t* d_hash_br2[MAX_GPUS]; // SM 2 // Speicher zur Generierung der Noncevektoren für die bedingten Hashes static uint32_t *d_branch1Nonces[MAX_GPUS]; @@ -102,10 +108,10 @@ extern "C" void quarkhash(void *state, const void *input) #define TRACE(algo) { \ if (max_nonce == 1 && pdata[19] <= 1) { \ uint32_t* debugbuf = NULL; \ - cudaMallocHost(&debugbuf, 8*sizeof(uint32_t)); \ - cudaMemcpy(debugbuf, d_hash[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost); \ - printf("quark %s %08x %08x %08x %08x...\n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \ - swab32(debugbuf[2]), swab32(debugbuf[3])); \ + cudaMallocHost(&debugbuf, 32); \ + cudaMemcpy(debugbuf, d_hash[thr_id], 32, cudaMemcpyDeviceToHost); \ + printf("quark %s %08x %08x %08x %08x...%08x... \n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \ + swab32(debugbuf[2]), swab32(debugbuf[3]), swab32(debugbuf[7])); \ cudaFreeHost(debugbuf); \ } \ } @@ -121,9 +127,10 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce, uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; - int dev_id = device_map[thr_id]; - uint32_t throughput = cuda_default_throughput(thr_id, 1U << 20); // 256*4096 + int dev_id = device_map[thr_id]; + uint32_t def_thr = 1U << 20; // 256*4096 + uint32_t throughput = cuda_default_throughput(thr_id, def_thr); if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) @@ -131,7 +138,7 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce, if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); + cudaSetDevice(dev_id); cudaGetLastError(); CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); @@ -142,20 +149,19 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce, quark_bmw512_cpu_init(thr_id, throughput); quark_keccak512_cpu_init(thr_id, throughput); quark_jh512_cpu_init(thr_id, throughput); - cuda_check_cpu_init(thr_id, throughput); quark_compactTest_cpu_init(thr_id, throughput); - cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput); - cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput); - cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput); - CUDA_SAFE_CALL(cudaGetLastError()); - - if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) { - gpulog(LOG_ERR, thr_id, "Device SM 3.0 or more recent required!"); - proper_exit(1); - return -1; + if (cuda_arch[dev_id] >= 300) { + cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput); + cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput); + cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput); + } else { + cudaMalloc(&d_hash_br2[thr_id], (size_t) 64 * throughput); } + cuda_check_cpu_init(thr_id, throughput); + CUDA_SAFE_CALL(cudaGetLastError()); + init[thr_id] = true; } @@ -167,58 +173,95 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce, do { int order = 0; + uint32_t foundNonce; uint32_t nrm1=0, nrm2=0, nrm3=0; quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; TRACE("blake :"); - - // das ist der unbedingte Branch für BMW512 quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("bmw :"); - quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL, - d_branch3Nonces[thr_id], &nrm3, + if (cuda_arch[dev_id] >= 300) { + + quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL, + d_branch3Nonces[thr_id], &nrm3, order++); + + // nur den Skein Branch weiterverfolgen + quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + + // das ist der unbedingte Branch für Groestl512 + quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + + // das ist der unbedingte Branch für JH512 + quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + + // quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) + quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], + d_branch1Nonces[thr_id], &nrm1, + d_branch2Nonces[thr_id], &nrm2, order++); - // nur den Skein Branch weiterverfolgen - quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + // das ist der bedingte Branch für Blake512 + quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); - // das ist der unbedingte Branch für Groestl512 - quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + // das ist der bedingte Branch für Bmw512 + quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); - // das ist der unbedingte Branch für JH512 - quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + // das ist der unbedingte Branch für Keccak512 + quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - // quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) - quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], - d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); + // das ist der unbedingte Branch für Skein512 + quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - // das ist der bedingte Branch für Blake512 - quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); + // quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) + quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], + d_branch1Nonces[thr_id], &nrm1, + d_branch2Nonces[thr_id], &nrm2, + order++); - // das ist der bedingte Branch für Bmw512 - quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); - // das ist der unbedingte Branch für Keccak512 - quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - // das ist der unbedingte Branch für Skein512 - quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); + } else { + /* algo permutations are made with 2 different buffers */ - // quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8) - quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], - d_branch1Nonces[thr_id], &nrm1, - d_branch2Nonces[thr_id], &nrm2, - order++); + quark_filter_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++); + quark_merge_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + TRACE("perm1 :"); - quark_keccak512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++); - quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("groestl:"); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("jh512 :"); + + quark_filter_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++); + quark_merge_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + TRACE("perm2 :"); + + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("keccak :"); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("skein :"); + + quark_filter_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++); + quark_merge_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + TRACE("perm3 :"); + + CUDA_LOG_ERROR(); + foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + } *hashes_done = pdata[19] - first_nonce + 1; - uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); - if (foundNonce != UINT32_MAX) + if (foundNonce != UINT32_MAX) { uint32_t vhash[8]; be32enc(&endiandata[19], foundNonce); diff --git a/util.cpp b/util.cpp index 9bba3bd..3b4c218 100644 --- a/util.cpp +++ b/util.cpp @@ -1852,7 +1852,6 @@ void do_gpu_tests(void) #ifdef _DEBUG unsigned long done; char s[128] = { '\0' }; - uchar buf[160]; struct work work; memset(&work, 0, sizeof(work)); @@ -1866,7 +1865,7 @@ void do_gpu_tests(void) //scanhash_scrypt_jane(0, &work, NULL, 1, &done, &tv, &tv); memset(work.data, 0, sizeof(work.data)); - scanhash_lyra2(0, &work, 1, &done); + scanhash_quark(0, &work, 1, &done); free(work_restart); work_restart = NULL;