From be044f31a1df3baa05857018ba8525700743159f Mon Sep 17 00:00:00 2001
From: Christian Buchner
Date: Mon, 5 May 2014 21:15:48 +0200
Subject: [PATCH] Speed optimization: do MUCH LESS Groestl in Jackpot, throw
 away 75% of the hashes. More speed ;-)

---
 JHA/cuda_jha_compactionTest.cu | 257 +++++++++++++++++++++++----------
 JHA/jackpotcoin.cu             | 103 +++++++++----
 Makefile.am                    |   5 +-
 ccminer.vcxproj                | 130 ++++++++++++++---
 4 files changed, 363 insertions(+), 132 deletions(-)

diff --git a/JHA/cuda_jha_compactionTest.cu b/JHA/cuda_jha_compactionTest.cu
index bccbe12..e043e2c 100644
--- a/JHA/cuda_jha_compactionTest.cu
+++ b/JHA/cuda_jha_compactionTest.cu
@@ -14,69 +14,54 @@
 extern int device_map[8];
 
 static cudaDeviceProp props[8];
 static uint32_t *d_tempBranch1Nonces[8];
-static uint32_t *d_tempBranch2Nonces[8];
-static size_t *d_numValid[8];
-static size_t *h_numValid[8];
+static uint32_t *d_numValid[8];
+static uint32_t *h_numValid[8];
 
-static uint32_t *d_partSum1[8], *d_partSum2[8]; // 2x partial sums
-static uint32_t *d_validTemp1[8], *d_validTemp2[8];
-
-// scratch buffer
-static uint32_t *d_tempBranchAllNonces[8];
+static uint32_t *d_partSum[2][8]; // for up to four partial sums
 
 // from heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 
+// True/False tester
+typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash);
+
+__device__ uint32_t JackpotTrueTest(uint32_t *inpHash)
+{
+	uint32_t tmp = inpHash[0] & 0x01;
+	return (tmp == 1);
+}
+
+__device__ uint32_t JackpotFalseTest(uint32_t *inpHash)
+{
+	uint32_t tmp = inpHash[0] & 0x01;
+	return (tmp == 0);
+}
+
+__device__ cuda_compactTestFunction_t d_JackpotTrueFunction = JackpotTrueTest, d_JackpotFalseFunction = JackpotFalseTest;
+cuda_compactTestFunction_t h_JackpotTrueFunction[8], h_JackpotFalseFunction[8];
 
 // setup functions
 __host__ void jackpot_compactTest_cpu_init(int thr_id, int threads)
 {
 	cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
 
+	cudaMemcpyFromSymbol(&h_JackpotTrueFunction[thr_id], d_JackpotTrueFunction, sizeof(cuda_compactTestFunction_t));
+	cudaMemcpyFromSymbol(&h_JackpotFalseFunction[thr_id], d_JackpotFalseFunction, sizeof(cuda_compactTestFunction_t));
+
 	// we also need memory on the device
-	cudaMalloc(&d_tempBranchAllNonces[thr_id], sizeof(uint32_t) * threads);
-	cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads);
-	cudaMalloc(&d_tempBranch2Nonces[thr_id], sizeof(uint32_t) * threads);
-	cudaMalloc(&d_numValid[thr_id], 2*sizeof(size_t));
-	cudaMallocHost(&h_numValid[thr_id], 2*sizeof(size_t));
+	cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2);
+	cudaMalloc(&d_numValid[thr_id], 2*sizeof(uint32_t));
+	cudaMallocHost(&h_numValid[thr_id], 2*sizeof(uint32_t));
 
 	uint32_t s1;
-	s1 = threads / 256;
+	s1 = (threads / 256) * 2;
 
-	cudaMalloc(&d_partSum1[thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (threads/block)
-	cudaMalloc(&d_partSum2[thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (threads/block)
-
-	cudaMalloc(&d_validTemp1[thr_id], sizeof(uint32_t) * threads); // BLOCKSIZE (threads/block)
-	cudaMalloc(&d_validTemp2[thr_id], sizeof(uint32_t) * threads); // BLOCKSIZE (threads/block)
-}
-
-// the test kernel (builds the test map)
-__global__ void jackpot_compactTest_gpu_TEST_64(int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_noncesFull,
-	uint32_t *d_nonces1, uint32_t *d_nonces2,
-	uint32_t *d_validT1, uint32_t *d_validT2)
-{
-	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if (thread < threads)
-	{
-		// determine the current counter
-		uint32_t nounce = startNounce + thread;
-		uint32_t *inpHash = &inpHashes[16 * thread];
-
-		uint32_t tmp = inpHash[0] & 0x01;
-		uint32_t val1 = (tmp == 1);
-		uint32_t val2 = (tmp == 0);
-
-		d_nonces1[thread] = val1;
-		d_validT1[thread] = val1;
-		d_nonces2[thread] = val2;
-		d_validT2[thread] = val2;
-		d_noncesFull[thread] = nounce;
-	}
+	cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (threads/block)
+	cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (threads/block)
 }
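A note on the pattern introduced above: the dedicated TEST_64 kernel is replaced by kernels that take the predicate as a `cuda_compactTestFunction_t` argument, obtained on the host via `cudaMemcpyFromSymbol`. A minimal, self-contained sketch of that device-function-pointer idiom (all names below are illustrative, not from the patch; needing a real ABI for indirect calls is also why this module's build flags change to `-abi=yes` later in the patch):

    #include <cstdint>
    #include <cstdio>

    typedef uint32_t (*testFunc_t)(uint32_t *inpHash);

    __device__ uint32_t isOddHash(uint32_t *inpHash)
    {
        return (inpHash[0] & 0x01) == 1;   // same predicate as JackpotTrueTest
    }

    // device-side pointer to the device function; only dereferenceable on the GPU
    __device__ testFunc_t d_isOddHash = isOddHash;

    __global__ void countMatches(testFunc_t f, uint32_t *hashes, int n, uint32_t *count)
    {
        int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < n && (*f)(&hashes[i * 16]))   // 16 x uint32_t = one 512-bit hash
            atomicAdd(count, 1u);
    }

    int main()
    {
        testFunc_t h_f;   // host-side copy of the device pointer value
        cudaMemcpyFromSymbol(&h_f, d_isOddHash, sizeof(testFunc_t));
        // h_f can now be passed as an ordinary kernel argument, e.g.
        // countMatches<<<blocks, 256>>>(h_f, d_hashes, n, d_count);
        return 0;
    }

This is how `h_JackpotTrueFunction`/`h_JackpotFalseFunction` end up usable as arguments to the SCAN and SCATTER kernels below.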
 
 // the prefix-sum kernel (from the NVIDIA SDK)
-__global__ void jackpot_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL)
+__global__ void jackpot_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
 {
 	extern __shared__ uint32_t sums[];
 	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
@@ -86,10 +71,38 @@ __global__ void jackpot_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t
 	//int warp_id = threadIdx.x / warpSize;
 	int warp_id = threadIdx.x / width;
 
+	sums[lane_id] = 0;
+
 	// Below is the basic structure of using a shfl instruction
 	// for a scan.
 	// Record "value" as a variable - we accumulate it along the way
-	uint32_t value = data[id];
+	uint32_t value;
+	if(testFunc != NULL)
+	{
+		if (id < threads)
+		{
+			uint32_t *inpHash;
+			if(d_validNonceTable == NULL)
+			{
+				// no nonce list
+				inpHash = &inpHashes[id<<4];
+			}else
+			{
+				// nonce list available
+				int nonce = d_validNonceTable[id] - startNounce;
+				inpHash = &inpHashes[nonce<<4];
+			}
+			value = (*testFunc)(inpHash);
+		}else
+		{
+			value = 0;
+		}
+	}else
+	{
+		value = data[id];
+	}
+
+	__syncthreads();
 
 	// Now accumulate in log steps up the chain
 	// compute sums, with another thread's value who is
@@ -177,39 +190,137 @@ __global__ void jackpot_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_su
 }
 
 // the scatter kernel
-__global__ void jackpot_compactTest_gpu_SCATTER(uint32_t *data, uint32_t *valid, uint32_t *sum, uint32_t *outp)
+__global__ void jackpot_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
 {
 	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
-	if( valid[id] )
+	uint32_t actNounce = id;
+	uint32_t value;
+	if (id < threads)
+	{
+//		uint32_t nounce = startNounce + id;
+		uint32_t *inpHash;
+		if(d_validNonceTable == NULL)
+		{
+			// no nonce list
+			inpHash = &inpHashes[id<<4];
+		}else
+		{
+			// nonce list available
+			int nonce = d_validNonceTable[id] - startNounce;
+			actNounce = nonce;
+			inpHash = &inpHashes[nonce<<4];
+		}
+
+		value = (*testFunc)(inpHash);
+	}else
+	{
+		value = 0;
+	}
+
+	if( value )
 	{
 		int idx = sum[id];
 		if(idx > 0)
-			outp[idx-1] = data[id];
+			outp[idx-1] = startNounce + actNounce;
 	}
 }
+
+__host__ static uint32_t jackpot_compactTest_roundUpExp(uint32_t val)
+{
+	if(val == 0)
+		return 0;
+
+	uint32_t mask = 0x80000000;
+	while( (val & mask) == 0 ) mask = mask >> 1;
+
+	if( (val & (~mask)) != 0 )
+		return mask << 1;
+
+	return mask;
+}
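The SCAN kernel's core is the shuffle-based scan from the NVIDIA SDK that its comments refer to. Reduced to a single warp, the idiom looks like this (an illustrative sketch, not the patch's kernel; `__shfl_up` is the Kepler-era sm_30+ intrinsic, spelled `__shfl_up_sync` in current CUDA):

    // one warp, 32 elements, inclusive scan
    __global__ void warpInclusiveScan(uint32_t *data)
    {
        int lane = threadIdx.x & 31;
        uint32_t value = data[threadIdx.x];

        // log2(32) = 5 steps: each lane adds the value held `offset` lanes below it
        for (int offset = 1; offset < 32; offset *= 2) {
            uint32_t n = (uint32_t)__shfl_up((int)value, offset);
            if (lane >= offset)
                value += n;
        }

        data[threadIdx.x] = value;   // data[i] = data[0] + ... + data[i]
    }

The real kernel layers this per-warp scan with shared-memory `sums[]` and the `partial_sums` buffers to scan whole blocks and, via the cascade below, the whole nonce range.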
+
+__host__ void jackpot_compactTest_cpu_singleCompaction(int thr_id, int threads, uint32_t *nrm,
+						uint32_t *d_nonces1, cuda_compactTestFunction_t function,
+						uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
+{
+	int orgThreads = threads;
+	threads = (int)jackpot_compactTest_roundUpExp((uint32_t)threads);
+	// compute threads per block
+	int blockSize = 256;
+	int nSummen = threads / blockSize;
+
+	int thr1 = (threads+blockSize-1) / blockSize;
+	int thr2 = threads / (blockSize*blockSize);
+	int blockSize2 = (nSummen < blockSize) ? nSummen : blockSize;
+	int thr3 = (nSummen + blockSize2-1) / blockSize2;
+
+	bool callThrid = (thr2 > 0) ? true : false;
+
+	// first initial scan
+	jackpot_compactTest_gpu_SCAN<<<thr1, blockSize, 32*sizeof(uint32_t)>>>(
+		d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable);
+
+	// further scans
+	if(callThrid)
+	{
+		jackpot_compactTest_gpu_SCAN<<<thr2, blockSize, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]);
+		jackpot_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2);
+	}else
+	{
+		jackpot_compactTest_gpu_SCAN<<<thr3, blockSize2, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2);
+	}
+
+	// sync + record the count
+	cudaStreamSynchronize(NULL);
+
+	if(callThrid)
+		cudaMemcpy(nrm, &(d_partSum[1][thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	else
+		cudaMemcpy(nrm, &(d_partSum[0][thr_id])[nSummen-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+
+	// add the partial sums back in
+	if(callThrid)
+	{
+		jackpot_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2);
+	}
+	jackpot_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads);
+
+	// scatter
+	jackpot_compactTest_gpu_SCATTER<<<thr1, blockSize, 0>>>(d_tempBranch1Nonces[thr_id], d_nonces1,
+		function, orgThreads, startNounce, inpHashes, d_validNonceTable);
+
+	// sync
+	cudaStreamSynchronize(NULL);
+}
 
 ////// NOTE: this function currently only works for threads > 65536 (ideally 256*1024 or 256*2048)
-__host__ void jackpot_compactTest_cpu_dualCompaction(int thr_id, int threads, size_t *nrm,
-									uint32_t *d_nonces1, uint32_t *d_nonces2)
+__host__ void jackpot_compactTest_cpu_dualCompaction(int thr_id, int threads, uint32_t *nrm,
+									uint32_t *d_nonces1, uint32_t *d_nonces2,
+									uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
 {
+	jackpot_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_JackpotTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
+	jackpot_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_JackpotFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
+
+	/*
 	// compute threads per block
 	int blockSize = 256;
 	int thr1 = threads / blockSize;
 	int thr2 = threads / (blockSize*blockSize);
 
 	// 1
-	jackpot_compactTest_gpu_SCAN<<<thr1, blockSize, 8*sizeof(uint32_t)>>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id]);
-	jackpot_compactTest_gpu_SCAN<<<thr2, blockSize, 8*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
-	jackpot_compactTest_gpu_SCAN<<<1, thr2, 8*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
+	jackpot_compactTest_gpu_SCAN<<<thr1, blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id], h_JackpotTrueFunction[thr_id], threads, startNounce, inpHashes);
+	jackpot_compactTest_gpu_SCAN<<<thr2, blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
+	jackpot_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
 	cudaStreamSynchronize(NULL);
 	cudaMemcpy(&nrm[0], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	jackpot_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
 	jackpot_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);
 
 	// 2
-	jackpot_compactTest_gpu_SCAN<<<thr1, blockSize, 8*sizeof(uint32_t)>>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id]);
-	jackpot_compactTest_gpu_SCAN<<<thr2, blockSize, 8*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
-	jackpot_compactTest_gpu_SCAN<<<1, thr2, 8*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
+	jackpot_compactTest_gpu_SCAN<<<thr1, blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id], h_JackpotFalseFunction[thr_id], threads, startNounce, inpHashes);
+	jackpot_compactTest_gpu_SCAN<<<thr2, blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
+	jackpot_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
 	cudaStreamSynchronize(NULL);
 	cudaMemcpy(&nrm[1], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	jackpot_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
@@ -217,39 +328,25 @@ __host__ void jackpot_compactTest_cpu_dualCompaction(int thr_id, int threads, si
 
 	// one peculiarity here: in d_tempBranch1Nonces, elements 1...nrm1 are the interesting ones
 	// step 3: scatter
-	jackpot_compactTest_gpu_SCATTER<<<thr1, blockSize, 0>>>(d_tempBranchAllNonces[thr_id], d_validTemp1[thr_id], d_tempBranch1Nonces[thr_id], d_nonces1);
-	jackpot_compactTest_gpu_SCATTER<<<thr1, blockSize, 0>>>(d_tempBranchAllNonces[thr_id], d_validTemp2[thr_id], d_tempBranch2Nonces[thr_id], d_nonces2);
+	jackpot_compactTest_gpu_SCATTER<<<thr1, blockSize, 0>>>(d_tempBranch1Nonces[thr_id], d_nonces1, h_JackpotTrueFunction[thr_id], threads, startNounce, inpHashes);
+	jackpot_compactTest_gpu_SCATTER<<<thr1, blockSize, 0>>>(d_tempBranch2Nonces[thr_id], d_nonces2, h_JackpotFalseFunction[thr_id], threads, startNounce, inpHashes);
 
 	cudaStreamSynchronize(NULL);
+	*/
 }
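Taken together, SCAN, ADD and SCATTER implement a plain stream compaction: an inclusive prefix sum over the predicate results yields each survivor's output slot and, as its last element, the survivor count `nrm`. The same arithmetic on the host, shrunk to eight elements (illustrative only, not part of the miner):

    #include <stdint.h>
    #include <stdio.h>

    int main()
    {
        uint32_t pass[8] = { 1, 0, 0, 1, 1, 0, 1, 0 };  /* predicate results, e.g. hash[0] & 1 */
        uint32_t sum[8], out[8], acc = 0;

        /* inclusive prefix sum over the predicate (what the SCAN/ADD cascade computes) */
        for (int i = 0; i < 8; i++) { acc += pass[i]; sum[i] = acc; }

        /* scatter: a passing element i lands at slot sum[i]-1 (what SCATTER computes) */
        for (int i = 0; i < 8; i++)
            if (pass[i]) out[sum[i] - 1] = (uint32_t)i;

        /* out = { 0, 3, 4, 6 }; the last prefix sum is the survivor count (nrm) */
        printf("%u survivors, first: %u\n", sum[7], out[0]);
        return 0;
    }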
 
-__host__ void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
+__host__ void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
 							uint32_t *d_nonces1, size_t *nrm1,
 							uint32_t *d_nonces2, size_t *nrm2,
 							int order)
 {
-	// Compute 3.x and 5.x devices run best with 768 threads,
-	// all others with 512 threads.
-	//int threadsperblock = (props[thr_id].major >= 3) ? 768 : 512;
-	int threadsperblock = 256;
-
-	// compute how many thread blocks we need
-	dim3 grid((threads + threadsperblock-1)/threadsperblock);
-	dim3 block(threadsperblock);
-
-	size_t shared_size = 0;
-
-//	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
-
-	// step 1: evaluate the condition and store the result in d_tempBranch1/2Nonces
-	jackpot_compactTest_gpu_TEST_64<<<grid, block, shared_size>>>(threads, startNounce, inpHashes, d_tempBranchAllNonces[thr_id],
-		d_tempBranch1Nonces[thr_id], d_tempBranch2Nonces[thr_id],
-		d_validTemp1[thr_id], d_validTemp2[thr_id]);
+	// If validNonceTable is used, only the nonces contained in it are considered.
+	// In that case "threads" must be set to the length of that array!
 
-	// strategic sleep command to lower the CPU load
 	jackpot_compactTest_cpu_dualCompaction(thr_id, threads,
-					h_numValid[thr_id], d_nonces1, d_nonces2);
+					h_numValid[thr_id], d_nonces1, d_nonces2,
+					startNounce, inpHashes, d_validNonceTable);
 
 	cudaStreamSynchronize(NULL); // the original costs a bit of CPU load, but may be the better option here
-	*nrm1 = h_numValid[thr_id][0];
-	*nrm2 = h_numValid[thr_id][1];
+	*nrm1 = (size_t)h_numValid[thr_id][0];
+	*nrm2 = (size_t)h_numValid[thr_id][1];
 }
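Both kernels above resolve an entry to its hash by the same two-step rule; only the `d_validNonceTable` indirection distinguishes the dense first pass from the chained passes over survivors. Restated as a standalone helper (a sketch of the logic above; `hashForEntry` is a hypothetical name, not a function in the patch):

    __device__ uint32_t *hashForEntry(uint32_t *inpHashes, uint32_t *d_validNonceTable,
                                      uint32_t startNounce, int id)
    {
        // dense pass: entry id belongs to nonce startNounce + id
        if (d_validNonceTable == NULL)
            return &inpHashes[id << 4];               // << 4: 16 x uint32_t per 512-bit hash

        // sparse pass: entry id names a surviving nonce; map it back to its buffer slot
        int nonce = d_validNonceTable[id] - startNounce;
        return &inpHashes[nonce << 4];
    }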
diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu
index 3249ccd..e676613 100644
--- a/JHA/jackpotcoin.cu
+++ b/JHA/jackpotcoin.cu
@@ -13,7 +13,6 @@ extern "C"
 
 // from cpu-miner.c
 extern int device_map[8];
-extern bool opt_benchmark;
 
 // memory for the input/output of the chained hash functions
 static uint32_t *d_hash[8];
@@ -39,7 +38,7 @@ extern void quark_check_cpu_setTarget(const void *ptarget);
 extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
 
 extern void jackpot_compactTest_cpu_init(int thr_id, int threads);
-extern void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
+extern void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
 							uint32_t *d_nonces1, size_t *nrm1,
 							uint32_t *d_nonces2, size_t *nrm2,
 							int order);
@@ -48,6 +47,7 @@ extern void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t st
 static uint32_t *d_jackpotNonces[8];
 static uint32_t *d_branch1Nonces[8];
 static uint32_t *d_branch2Nonces[8];
+static uint32_t *d_branch3Nonces[8];
 
 // original jackpothash function from a miner source
 inline unsigned int jackpothash(void *state, const void *input)
@@ -93,6 +93,8 @@
 }
 
+extern bool opt_benchmark;
+
 extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done)
@@ -105,7 +107,8 @@
 
 	const uint32_t Htarg = ptarget[7];
 
-	const int throughput = 256*4096; // 100;
+	const int throughput = 256*4096*4; // 100;
+	//const int throughput = 256*256*2+100; // 100;
 
 	static bool init[8] = {0,0,0,0,0,0,0,0};
 	if (!init[thr_id])
@@ -121,9 +124,10 @@
 		quark_jh512_cpu_init(thr_id, throughput);
 		quark_skein512_cpu_init(thr_id, throughput);
 		quark_check_cpu_init(thr_id, throughput);
-		cudaMalloc(&d_jackpotNonces[thr_id], sizeof(uint32_t)*throughput);
-		cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
-		cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
+		cudaMalloc(&d_jackpotNonces[thr_id], sizeof(uint32_t)*throughput*2);
+		cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput*2);
+		cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput*2);
+		cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput*2);
 		init[thr_id] = true;
 	}
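The quadrupled `throughput` here and the `/4` corrections to `hashes_done` further down follow from the branch statistics: a nonce survives a Groestl-free round only when `hash[0] & 0x01` sends it down the Skein path, which happens for about half of the hashes per round. A back-of-envelope check (not part of the miner):

    #include <stdio.h>

    int main()
    {
        double p_skein = 0.5;                /* P(hash[0] & 0x01 selects the Skein branch) */
        double survive = p_skein * p_skein;  /* must dodge Groestl in round 1 AND round 2  */

        printf("%.0f%% of nonces reach round 3\n", survive * 100.0);      /* 25% */
        printf("%.0f%% are discarded early\n", (1.0 - survive) * 100.0);  /* 75% */
        return 0;
    }

Hence the commit subject: roughly 75% of the hashes are thrown away before any Groestl work is done, so a scanned nonce only counts as a quarter of a full hash.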
@@ -140,35 +144,77 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 		// first Keccak512 hash with CUDA
 		jackpot_keccak512_cpu_hash(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 
-		for (int round=0; round < 3; round++)
-		{
-			size_t nrm1, nrm2;
+		size_t nrm1, nrm2, nrm3;
+
+		// round 1 (without Groestl)
 
-			// split jackpotNonces into branch1/2 according to if (hash[0] & 0x01)
-			jackpot_compactTest_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id],
+		jackpot_compactTest_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL,
 				d_branch1Nonces[thr_id], &nrm1,
-				d_branch2Nonces[thr_id], &nrm2,
+				d_branch3Nonces[thr_id], &nrm3,
 				order++);
 
-			if (nrm1+nrm2 == throughput) {
-				quark_groestl512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
-				quark_skein512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
-			}
+		// follow the Skein path further
+		quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
 
-			// split jackpotNonces into branch1/2 according to if (hash[0] & 0x01)
-			jackpot_compactTest_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id],
-				d_branch1Nonces[thr_id], &nrm1,
-				d_branch2Nonces[thr_id], &nrm2,
-				order++);
+		// then quickly Blake & JH
+		jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
+			d_branch1Nonces[thr_id], &nrm1,
+			d_branch2Nonces[thr_id], &nrm2,
+			order++);
 
-			if (nrm1+nrm2 == throughput) {
-				quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
-				quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
-			}
+		if (nrm1+nrm2 == nrm3) {
+			quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
+			quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
+		}
+
+		// round 2 (without Groestl)
+
+		// split jackpotNonces into branch1/2 according to if (hash[0] & 0x01)
+		jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
+			d_branch1Nonces[thr_id], &nrm1,
+			d_branch3Nonces[thr_id], &nrm3,
+			order++);
+
+		// follow the Skein path further
+		quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+
+		// split jackpotNonces into branch1/2 according to if (hash[0] & 0x01)
+		jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
+			d_branch1Nonces[thr_id], &nrm1,
+			d_branch2Nonces[thr_id], &nrm2,
+			order++);
+
+		if (nrm1+nrm2 == nrm3) {
+			quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
+			quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
+		}
+
+		// round 3 (complete)
+
+		// split jackpotNonces into branch1/2 according to if (hash[0] & 0x01)
+		jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
+			d_branch1Nonces[thr_id], &nrm1,
+			d_branch2Nonces[thr_id], &nrm2,
+			order++);
+
+		if (nrm1+nrm2 == nrm3) {
+			quark_groestl512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
+			quark_skein512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
+		}
+
+		// split jackpotNonces into branch1/2 according to if (hash[0] & 0x01)
+		jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
+			d_branch1Nonces[thr_id], &nrm1,
+			d_branch2Nonces[thr_id], &nrm2,
+			order++);
+
+		if (nrm1+nrm2 == nrm3) {
+			quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
+			quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
 		}
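Only the `nrm3` nonces compacted into `d_branch3Nonces` still carry fully computed hashes after round 3, so the final target check below receives that buffer as a nonce vector instead of scanning the whole range. What the indirection amounts to, as an illustrative kernel (assumed behavior, not the real `quark_check_cpu_hash_64` implementation):

    __global__ void checkCompacted(uint32_t *d_nonceVector, int nrm, uint32_t startNounce,
                                   uint32_t *d_hash, uint32_t highTarget, uint32_t *resNonce)
    {
        int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < nrm) {
            uint32_t nonce = d_nonceVector[i] - startNounce;   // slot in the hash buffer
            uint32_t *h = &d_hash[nonce << 4];
            if (h[7] <= highTarget)            // coarse GPU test; the CPU re-validates below
                atomicMin(resNonce, d_nonceVector[i]);
        }
    }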
 
 		// scan for winning hashes on the GPU
-		uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
 		if (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
@@ -180,7 +226,8 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 			if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {
 
 				pdata[19] = foundNonce;
-				*hashes_done = (foundNonce - first_nonce + 1);
+				*hashes_done = (foundNonce - first_nonce + 1)/4;
+				//applog(LOG_INFO, "GPU #%d: result for nonce $%08X does validate on CPU (%d rounds)!", thr_id, foundNonce, rounds);
 				return 1;
 			} else {
 				applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU (%d rounds)!", thr_id, foundNonce, rounds);
@@ -191,6 +238,6 @@
 	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
 
-	*hashes_done = (pdata[19] - first_nonce + 1);
+	*hashes_done = (pdata[19] - first_nonce + 1)/4;
 	return 0;
 }
diff --git a/Makefile.am b/Makefile.am
index 390e525..ec36cf6 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -40,6 +40,7 @@ ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -f
 .cu.o:
 	$(NVCC) @CFLAGS@ -I . -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
 
+# ABI requiring code modules
 # this module doesn't compile with Compute 2.0 unfortunately
-JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu
-	$(NVCC) @CFLAGS@ -I . -Xptxas "-abi=no -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
+JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu
+	$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 32094af..5b3ad5c 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -103,7 +103,6 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"
       <CodeGeneration>compute_35,sm_35</CodeGeneration>
-      <AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@@ -135,7 +134,6 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"
       <CodeGeneration>compute_35,sm_35</CodeGeneration>
-      <AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@@ -171,7 +169,6 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"
      <CodeGeneration>compute_35,sm_35</CodeGeneration>
-      <AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@@ -207,7 +204,6 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"
       <CodeGeneration>compute_35,sm_35</CodeGeneration>
-      <AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@@ -291,24 +287,114 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"
-    <CudaCompile Include="…" />
-    <CudaCompile Include="…" />
-    <CudaCompile Include="…" />
-    <CudaCompile Include="…" />
-    <CudaCompile Include="…" />
-    <CudaCompile Include="…" />
-    <CudaCompile Include="…" />
-    <CudaCompile Include="…" />
-    <CudaCompile Include="…" />
-    <CudaCompile Include="…" />
-    <CudaCompile Include="…" />
-    <CudaCompile Include="…" />
-    <CudaCompile Include="…" />
-    <CudaCompile Include="…" />
-    <CudaCompile Include="…" />
-    <CudaCompile Include="…" />
-    <CudaCompile Include="…" />
-    <CudaCompile Include="…" />
+    <CudaCompile Include="…">
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+    </CudaCompile>
+    <CudaCompile Include="…">
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+    </CudaCompile>
+    <CudaCompile Include="…">
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+    </CudaCompile>
+    <CudaCompile Include="…">
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+    </CudaCompile>
+    <CudaCompile Include="…">
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+    </CudaCompile>
+    <CudaCompile Include="…">
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+    </CudaCompile>
+    <CudaCompile Include="…">
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+    </CudaCompile>
+    <CudaCompile Include="…">
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+    </CudaCompile>
+    <CudaCompile Include="…">
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+    </CudaCompile>
+    <CudaCompile Include="…">
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+    </CudaCompile>
+    <CudaCompile Include="…">
+      <AdditionalOptions Condition="…">-Xptxas "-abi=yes -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=yes -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=yes -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=yes -v" %(AdditionalOptions)</AdditionalOptions>
+    </CudaCompile>
+    <CudaCompile Include="…">
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+    </CudaCompile>
+    <CudaCompile Include="…">
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+    </CudaCompile>
+    <CudaCompile Include="…">
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+    </CudaCompile>
+    <CudaCompile Include="…">
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+    </CudaCompile>
+    <CudaCompile Include="…">
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+    </CudaCompile>
+    <CudaCompile Include="…">
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+    </CudaCompile>
+    <CudaCompile Include="…">
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="…">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
+    </CudaCompile>