|
|
@ -14,69 +14,54 @@ extern int device_map[8]; |
|
|
|
static cudaDeviceProp props[8]; |
|
|
|
static cudaDeviceProp props[8]; |
|
|
|
|
|
|
|
|
|
|
|
static uint32_t *d_tempBranch1Nonces[8]; |
|
|
|
static uint32_t *d_tempBranch1Nonces[8]; |
|
|
|
static uint32_t *d_tempBranch2Nonces[8]; |
|
|
|
static uint32_t *d_numValid[8]; |
|
|
|
static size_t *d_numValid[8]; |
|
|
|
static uint32_t *h_numValid[8]; |
|
|
|
static size_t *h_numValid[8]; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static uint32_t *d_partSum1[8], *d_partSum2[8]; // 2x partielle summen |
|
|
|
static uint32_t *d_partSum[2][8]; // für bis zu vier partielle Summen |
|
|
|
static uint32_t *d_validTemp1[8], *d_validTemp2[8]; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Zwischenspeicher |
|
|
|
|
|
|
|
static uint32_t *d_tempBranchAllNonces[8]; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// aus heavy.cu |
|
|
|
// aus heavy.cu |
|
|
|
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); |
|
|
|
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// True/False tester |
|
|
|
|
|
|
|
typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__device__ uint32_t JackpotTrueTest(uint32_t *inpHash) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
uint32_t tmp = inpHash[0] & 0x01; |
|
|
|
|
|
|
|
return (tmp == 1); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__device__ uint32_t JackpotFalseTest(uint32_t *inpHash) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
uint32_t tmp = inpHash[0] & 0x01; |
|
|
|
|
|
|
|
return (tmp == 0); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__device__ cuda_compactTestFunction_t d_JackpotTrueFunction = JackpotTrueTest, d_JackpotFalseFunction = JackpotFalseTest; |
|
|
|
|
|
|
|
cuda_compactTestFunction_t h_JackpotTrueFunction[8], h_JackpotFalseFunction[8]; |
|
|
|
|
|
|
|
|
|
|
|
// Setup-Funktionen |
|
|
|
// Setup-Funktionen |
|
|
|
__host__ void jackpot_compactTest_cpu_init(int thr_id, int threads) |
|
|
|
__host__ void jackpot_compactTest_cpu_init(int thr_id, int threads) |
|
|
|
{ |
|
|
|
{ |
|
|
|
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]); |
|
|
|
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cudaMemcpyFromSymbol(&h_JackpotTrueFunction[thr_id], d_JackpotTrueFunction, sizeof(cuda_compactTestFunction_t)); |
|
|
|
|
|
|
|
cudaMemcpyFromSymbol(&h_JackpotFalseFunction[thr_id], d_JackpotFalseFunction, sizeof(cuda_compactTestFunction_t)); |
|
|
|
|
|
|
|
|
|
|
|
// wir brauchen auch Speicherplatz auf dem Device |
|
|
|
// wir brauchen auch Speicherplatz auf dem Device |
|
|
|
cudaMalloc(&d_tempBranchAllNonces[thr_id], sizeof(uint32_t) * threads); |
|
|
|
cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2); |
|
|
|
cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads); |
|
|
|
cudaMalloc(&d_numValid[thr_id], 2*sizeof(uint32_t)); |
|
|
|
cudaMalloc(&d_tempBranch2Nonces[thr_id], sizeof(uint32_t) * threads); |
|
|
|
cudaMallocHost(&h_numValid[thr_id], 2*sizeof(uint32_t)); |
|
|
|
cudaMalloc(&d_numValid[thr_id], 2*sizeof(size_t)); |
|
|
|
|
|
|
|
cudaMallocHost(&h_numValid[thr_id], 2*sizeof(size_t)); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uint32_t s1; |
|
|
|
uint32_t s1; |
|
|
|
s1 = threads / 256; |
|
|
|
s1 = (threads / 256) * 2; |
|
|
|
|
|
|
|
|
|
|
|
cudaMalloc(&d_partSum1[thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) |
|
|
|
cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) |
|
|
|
cudaMalloc(&d_partSum2[thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) |
|
|
|
cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) |
|
|
|
|
|
|
|
|
|
|
|
cudaMalloc(&d_validTemp1[thr_id], sizeof(uint32_t) * threads); // BLOCKSIZE (Threads/Block) |
|
|
|
|
|
|
|
cudaMalloc(&d_validTemp2[thr_id], sizeof(uint32_t) * threads); // BLOCKSIZE (Threads/Block) |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Die Testfunktion (zum Erstellen der TestMap) |
|
|
|
|
|
|
|
__global__ void jackpot_compactTest_gpu_TEST_64(int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_noncesFull, |
|
|
|
|
|
|
|
uint32_t *d_nonces1, uint32_t *d_nonces2, |
|
|
|
|
|
|
|
uint32_t *d_validT1, uint32_t *d_validT2) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x); |
|
|
|
|
|
|
|
if (thread < threads) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
// bestimme den aktuellen Zähler |
|
|
|
|
|
|
|
uint32_t nounce = startNounce + thread; |
|
|
|
|
|
|
|
uint32_t *inpHash = &inpHashes[16 * thread]; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uint32_t tmp = inpHash[0] & 0x01; |
|
|
|
|
|
|
|
uint32_t val1 = (tmp == 1); |
|
|
|
|
|
|
|
uint32_t val2 = (tmp == 0); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
d_nonces1[thread] = val1; |
|
|
|
|
|
|
|
d_validT1[thread] = val1; |
|
|
|
|
|
|
|
d_nonces2[thread] = val2; |
|
|
|
|
|
|
|
d_validT2[thread] = val2; |
|
|
|
|
|
|
|
d_noncesFull[thread] = nounce; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Die Summenfunktion (vom NVIDIA SDK) |
|
|
|
// Die Summenfunktion (vom NVIDIA SDK) |
|
|
|
__global__ void jackpot_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL) |
|
|
|
__global__ void jackpot_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) |
|
|
|
{ |
|
|
|
{ |
|
|
|
extern __shared__ uint32_t sums[]; |
|
|
|
extern __shared__ uint32_t sums[]; |
|
|
|
int id = ((blockIdx.x * blockDim.x) + threadIdx.x); |
|
|
|
int id = ((blockIdx.x * blockDim.x) + threadIdx.x); |
|
|
@ -86,10 +71,38 @@ __global__ void jackpot_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t |
|
|
|
//int warp_id = threadIdx.x / warpSize; |
|
|
|
//int warp_id = threadIdx.x / warpSize; |
|
|
|
int warp_id = threadIdx.x / width; |
|
|
|
int warp_id = threadIdx.x / width; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sums[lane_id] = 0; |
|
|
|
|
|
|
|
|
|
|
|
// Below is the basic structure of using a shfl instruction |
|
|
|
// Below is the basic structure of using a shfl instruction |
|
|
|
// for a scan. |
|
|
|
// for a scan. |
|
|
|
// Record "value" as a variable - we accumulate it along the way |
|
|
|
// Record "value" as a variable - we accumulate it along the way |
|
|
|
uint32_t value = data[id]; |
|
|
|
uint32_t value; |
|
|
|
|
|
|
|
if(testFunc != NULL) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
if (id < threads) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
uint32_t *inpHash; |
|
|
|
|
|
|
|
if(d_validNonceTable == NULL) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
// keine Nonce-Liste |
|
|
|
|
|
|
|
inpHash = &inpHashes[id<<4]; |
|
|
|
|
|
|
|
}else |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
// Nonce-Liste verfügbar |
|
|
|
|
|
|
|
int nonce = d_validNonceTable[id] - startNounce; |
|
|
|
|
|
|
|
inpHash = &inpHashes[nonce<<4]; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
value = (*testFunc)(inpHash); |
|
|
|
|
|
|
|
}else |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
value = 0; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
}else |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
value = data[id]; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__syncthreads(); |
|
|
|
|
|
|
|
|
|
|
|
// Now accumulate in log steps up the chain |
|
|
|
// Now accumulate in log steps up the chain |
|
|
|
// compute sums, with another thread's value who is |
|
|
|
// compute sums, with another thread's value who is |
|
|
@ -177,39 +190,137 @@ __global__ void jackpot_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_su |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Der Scatter |
|
|
|
// Der Scatter |
|
|
|
__global__ void jackpot_compactTest_gpu_SCATTER(uint32_t *data, uint32_t *valid, uint32_t *sum, uint32_t *outp) |
|
|
|
__global__ void jackpot_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) |
|
|
|
{ |
|
|
|
{ |
|
|
|
int id = ((blockIdx.x * blockDim.x) + threadIdx.x); |
|
|
|
int id = ((blockIdx.x * blockDim.x) + threadIdx.x); |
|
|
|
if( valid[id] ) |
|
|
|
uint32_t actNounce = id; |
|
|
|
|
|
|
|
uint32_t value; |
|
|
|
|
|
|
|
if (id < threads) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
// uint32_t nounce = startNounce + id; |
|
|
|
|
|
|
|
uint32_t *inpHash; |
|
|
|
|
|
|
|
if(d_validNonceTable == NULL) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
// keine Nonce-Liste |
|
|
|
|
|
|
|
inpHash = &inpHashes[id<<4]; |
|
|
|
|
|
|
|
}else |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
// Nonce-Liste verfügbar |
|
|
|
|
|
|
|
int nonce = d_validNonceTable[id] - startNounce; |
|
|
|
|
|
|
|
actNounce = nonce; |
|
|
|
|
|
|
|
inpHash = &inpHashes[nonce<<4]; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
value = (*testFunc)(inpHash); |
|
|
|
|
|
|
|
}else |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
value = 0; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if( value ) |
|
|
|
{ |
|
|
|
{ |
|
|
|
int idx = sum[id]; |
|
|
|
int idx = sum[id]; |
|
|
|
if(idx > 0) |
|
|
|
if(idx > 0) |
|
|
|
outp[idx-1] = data[id]; |
|
|
|
outp[idx-1] = startNounce + actNounce; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__host__ static uint32_t jackpot_compactTest_roundUpExp(uint32_t val) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
if(val == 0) |
|
|
|
|
|
|
|
return 0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uint32_t mask = 0x80000000; |
|
|
|
|
|
|
|
while( (val & mask) == 0 ) mask = mask >> 1; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if( (val & (~mask)) != 0 ) |
|
|
|
|
|
|
|
return mask << 1; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return mask; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__host__ void jackpot_compactTest_cpu_singleCompaction(int thr_id, int threads, uint32_t *nrm, |
|
|
|
|
|
|
|
uint32_t *d_nonces1, cuda_compactTestFunction_t function, |
|
|
|
|
|
|
|
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
int orgThreads = threads; |
|
|
|
|
|
|
|
threads = (int)jackpot_compactTest_roundUpExp((uint32_t)threads); |
|
|
|
|
|
|
|
// threadsPerBlock ausrechnen |
|
|
|
|
|
|
|
int blockSize = 256; |
|
|
|
|
|
|
|
int nSummen = threads / blockSize; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int thr1 = (threads+blockSize-1) / blockSize; |
|
|
|
|
|
|
|
int thr2 = threads / (blockSize*blockSize); |
|
|
|
|
|
|
|
int blockSize2 = (nSummen < blockSize) ? nSummen : blockSize; |
|
|
|
|
|
|
|
int thr3 = (nSummen + blockSize2-1) / blockSize2; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
bool callThrid = (thr2 > 0) ? true : false; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Erster Initialscan |
|
|
|
|
|
|
|
jackpot_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>( |
|
|
|
|
|
|
|
d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// weitere Scans |
|
|
|
|
|
|
|
if(callThrid) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
jackpot_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]); |
|
|
|
|
|
|
|
jackpot_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2); |
|
|
|
|
|
|
|
}else |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
jackpot_compactTest_gpu_SCAN<<<thr3,blockSize2, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Sync + Anzahl merken |
|
|
|
|
|
|
|
cudaStreamSynchronize(NULL); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if(callThrid) |
|
|
|
|
|
|
|
cudaMemcpy(nrm, &(d_partSum[1][thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); |
|
|
|
|
|
|
|
else |
|
|
|
|
|
|
|
cudaMemcpy(nrm, &(d_partSum[0][thr_id])[nSummen-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Addieren |
|
|
|
|
|
|
|
if(callThrid) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
jackpot_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
jackpot_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Scatter |
|
|
|
|
|
|
|
jackpot_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1, |
|
|
|
|
|
|
|
function, orgThreads, startNounce, inpHashes, d_validNonceTable); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Sync |
|
|
|
|
|
|
|
cudaStreamSynchronize(NULL); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048) |
|
|
|
////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048) |
|
|
|
__host__ void jackpot_compactTest_cpu_dualCompaction(int thr_id, int threads, size_t *nrm, |
|
|
|
__host__ void jackpot_compactTest_cpu_dualCompaction(int thr_id, int threads, uint32_t *nrm, |
|
|
|
uint32_t *d_nonces1, uint32_t *d_nonces2) |
|
|
|
uint32_t *d_nonces1, uint32_t *d_nonces2, |
|
|
|
|
|
|
|
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) |
|
|
|
{ |
|
|
|
{ |
|
|
|
|
|
|
|
jackpot_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_JackpotTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable); |
|
|
|
|
|
|
|
jackpot_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_JackpotFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* |
|
|
|
// threadsPerBlock ausrechnen |
|
|
|
// threadsPerBlock ausrechnen |
|
|
|
int blockSize = 256; |
|
|
|
int blockSize = 256; |
|
|
|
int thr1 = threads / blockSize; |
|
|
|
int thr1 = threads / blockSize; |
|
|
|
int thr2 = threads / (blockSize*blockSize); |
|
|
|
int thr2 = threads / (blockSize*blockSize); |
|
|
|
|
|
|
|
|
|
|
|
// 1 |
|
|
|
// 1 |
|
|
|
jackpot_compactTest_gpu_SCAN<<<thr1,blockSize, 8*sizeof(uint32_t)>>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id]); |
|
|
|
jackpot_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id], h_JackpotTrueFunction[thr_id], threads, startNounce, inpHashes); |
|
|
|
jackpot_compactTest_gpu_SCAN<<<thr2,blockSize, 8*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]); |
|
|
|
jackpot_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]); |
|
|
|
jackpot_compactTest_gpu_SCAN<<<1, thr2, 8*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2); |
|
|
|
jackpot_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2); |
|
|
|
cudaStreamSynchronize(NULL); |
|
|
|
cudaStreamSynchronize(NULL); |
|
|
|
cudaMemcpy(&nrm[0], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); |
|
|
|
cudaMemcpy(&nrm[0], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); |
|
|
|
jackpot_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2); |
|
|
|
jackpot_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2); |
|
|
|
jackpot_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads); |
|
|
|
jackpot_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads); |
|
|
|
|
|
|
|
|
|
|
|
// 2 |
|
|
|
// 2 |
|
|
|
jackpot_compactTest_gpu_SCAN<<<thr1,blockSize, 8*sizeof(uint32_t)>>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id]); |
|
|
|
jackpot_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id], h_JackpotFalseFunction[thr_id], threads, startNounce, inpHashes); |
|
|
|
jackpot_compactTest_gpu_SCAN<<<thr2,blockSize, 8*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]); |
|
|
|
jackpot_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]); |
|
|
|
jackpot_compactTest_gpu_SCAN<<<1, thr2, 8*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2); |
|
|
|
jackpot_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2); |
|
|
|
cudaStreamSynchronize(NULL); |
|
|
|
cudaStreamSynchronize(NULL); |
|
|
|
cudaMemcpy(&nrm[1], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); |
|
|
|
cudaMemcpy(&nrm[1], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost); |
|
|
|
jackpot_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2); |
|
|
|
jackpot_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2); |
|
|
@ -217,39 +328,25 @@ __host__ void jackpot_compactTest_cpu_dualCompaction(int thr_id, int threads, si |
|
|
|
|
|
|
|
|
|
|
|
// Hier ist noch eine Besonderheit: in d_tempBranch1Nonces sind die element von 1...nrm1 die Interessanten |
|
|
|
// Hier ist noch eine Besonderheit: in d_tempBranch1Nonces sind die element von 1...nrm1 die Interessanten |
|
|
|
// Schritt 3: Scatter |
|
|
|
// Schritt 3: Scatter |
|
|
|
jackpot_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranchAllNonces[thr_id], d_validTemp1[thr_id], d_tempBranch1Nonces[thr_id], d_nonces1); |
|
|
|
jackpot_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1, h_JackpotTrueFunction[thr_id], threads, startNounce, inpHashes); |
|
|
|
jackpot_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranchAllNonces[thr_id], d_validTemp2[thr_id], d_tempBranch2Nonces[thr_id], d_nonces2); |
|
|
|
jackpot_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch2Nonces[thr_id], d_nonces2, h_JackpotFalseFunction[thr_id], threads, startNounce, inpHashes); |
|
|
|
cudaStreamSynchronize(NULL); |
|
|
|
cudaStreamSynchronize(NULL); |
|
|
|
|
|
|
|
*/ |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
__host__ void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, |
|
|
|
__host__ void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, |
|
|
|
uint32_t *d_nonces1, size_t *nrm1, |
|
|
|
uint32_t *d_nonces1, size_t *nrm1, |
|
|
|
uint32_t *d_nonces2, size_t *nrm2, |
|
|
|
uint32_t *d_nonces2, size_t *nrm2, |
|
|
|
int order) |
|
|
|
int order) |
|
|
|
{ |
|
|
|
{ |
|
|
|
// Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern, |
|
|
|
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind |
|
|
|
// alle anderen mit 512 Threads. |
|
|
|
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen! |
|
|
|
//int threadsperblock = (props[thr_id].major >= 3) ? 768 : 512; |
|
|
|
|
|
|
|
int threadsperblock = 256; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// berechne wie viele Thread Blocks wir brauchen |
|
|
|
|
|
|
|
dim3 grid((threads + threadsperblock-1)/threadsperblock); |
|
|
|
|
|
|
|
dim3 block(threadsperblock); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
size_t shared_size = 0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Schritt 1: Prüfen der Bedingung und Speicherung in d_tempBranch1/2Nonces |
|
|
|
|
|
|
|
jackpot_compactTest_gpu_TEST_64<<<grid, block, shared_size>>>(threads, startNounce, inpHashes, d_tempBranchAllNonces[thr_id], |
|
|
|
|
|
|
|
d_tempBranch1Nonces[thr_id], d_tempBranch2Nonces[thr_id], |
|
|
|
|
|
|
|
d_validTemp1[thr_id], d_validTemp2[thr_id]); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Strategisches Sleep Kommando zur Senkung der CPU Last |
|
|
|
|
|
|
|
jackpot_compactTest_cpu_dualCompaction(thr_id, threads, |
|
|
|
jackpot_compactTest_cpu_dualCompaction(thr_id, threads, |
|
|
|
h_numValid[thr_id], d_nonces1, d_nonces2); |
|
|
|
h_numValid[thr_id], d_nonces1, d_nonces2, |
|
|
|
|
|
|
|
startNounce, inpHashes, d_validNonceTable); |
|
|
|
|
|
|
|
|
|
|
|
cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser |
|
|
|
cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser |
|
|
|
*nrm1 = h_numValid[thr_id][0]; |
|
|
|
*nrm1 = (size_t)h_numValid[thr_id][0]; |
|
|
|
*nrm2 = h_numValid[thr_id][1]; |
|
|
|
*nrm2 = (size_t)h_numValid[thr_id][1]; |
|
|
|
} |
|
|
|
} |
|
|
|