|
|
@ -1,3 +1,7 @@ |
|
|
|
|
|
|
|
/* |
|
|
|
|
|
|
|
* REQUIRE SM 3.0 arch! |
|
|
|
|
|
|
|
*/ |
|
|
|
|
|
|
|
|
|
|
|
#include <stdio.h> |
|
|
|
#include <stdio.h> |
|
|
|
#include <memory.h> |
|
|
|
#include <memory.h> |
|
|
|
|
|
|
|
|
|
|
@ -10,6 +14,14 @@ static uint32_t *h_numValid[MAX_GPUS]; |
|
|
|
|
|
|
|
|
|
|
|
static uint32_t *d_partSum[2][MAX_GPUS]; // für bis zu vier partielle Summen |
|
|
|
static uint32_t *d_partSum[2][MAX_GPUS]; // für bis zu vier partielle Summen |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#if __CUDA_ARCH__ < 300 |
|
|
|
|
|
|
|
/** |
|
|
|
|
|
|
|
* __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1 |
|
|
|
|
|
|
|
*/ |
|
|
|
|
|
|
|
#undef __shfl_up |
|
|
|
|
|
|
|
#define __shfl_up(var, delta, width) (0) |
|
|
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
// True/False tester |
|
|
|
// True/False tester |
|
|
|
typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash); |
|
|
|
typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash); |
|
|
|
|
|
|
|
|
|
|
@ -28,7 +40,8 @@ __device__ cuda_compactTestFunction_t d_QuarkTrueFunction = QuarkTrueTest, d_Qua |
|
|
|
cuda_compactTestFunction_t h_QuarkTrueFunction[MAX_GPUS], h_QuarkFalseFunction[MAX_GPUS]; |
|
|
|
cuda_compactTestFunction_t h_QuarkTrueFunction[MAX_GPUS], h_QuarkFalseFunction[MAX_GPUS]; |
|
|
|
|
|
|
|
|
|
|
|
// Setup/Alloc Function |
|
|
|
// Setup/Alloc Function |
|
|
|
__host__ void quark_compactTest_cpu_init(int thr_id, uint32_t threads) |
|
|
|
__host__ |
|
|
|
|
|
|
|
void quark_compactTest_cpu_init(int thr_id, uint32_t threads) |
|
|
|
{ |
|
|
|
{ |
|
|
|
cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t)); |
|
|
|
cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t)); |
|
|
|
cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t)); |
|
|
|
cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t)); |
|
|
@ -46,7 +59,8 @@ __host__ void quark_compactTest_cpu_init(int thr_id, uint32_t threads) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Because all alloc should have a free... |
|
|
|
// Because all alloc should have a free... |
|
|
|
__host__ void quark_compactTest_cpu_free(int thr_id) |
|
|
|
__host__ |
|
|
|
|
|
|
|
void quark_compactTest_cpu_free(int thr_id) |
|
|
|
{ |
|
|
|
{ |
|
|
|
cudaFree(d_tempBranch1Nonces[thr_id]); |
|
|
|
cudaFree(d_tempBranch1Nonces[thr_id]); |
|
|
|
cudaFree(d_numValid[thr_id]); |
|
|
|
cudaFree(d_numValid[thr_id]); |
|
|
@ -57,16 +71,9 @@ __host__ void quark_compactTest_cpu_free(int thr_id) |
|
|
|
cudaFreeHost(h_numValid[thr_id]); |
|
|
|
cudaFreeHost(h_numValid[thr_id]); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
#if __CUDA_ARCH__ < 300 |
|
|
|
__global__ |
|
|
|
/** |
|
|
|
void quark_compactTest_gpu_SCAN(uint32_t *data, const int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, |
|
|
|
* __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1 |
|
|
|
uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) |
|
|
|
*/ |
|
|
|
|
|
|
|
#undef __shfl_up |
|
|
|
|
|
|
|
#define __shfl_up(var, delta, width) (0) |
|
|
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Die Summenfunktion (vom NVIDIA SDK) |
|
|
|
|
|
|
|
__global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) |
|
|
|
|
|
|
|
{ |
|
|
|
{ |
|
|
|
extern __shared__ uint32_t sums[]; |
|
|
|
extern __shared__ uint32_t sums[]; |
|
|
|
int id = ((blockIdx.x * blockDim.x) + threadIdx.x); |
|
|
|
int id = ((blockIdx.x * blockDim.x) + threadIdx.x); |
|
|
@ -91,19 +98,16 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t * |
|
|
|
{ |
|
|
|
{ |
|
|
|
// keine Nonce-Liste |
|
|
|
// keine Nonce-Liste |
|
|
|
inpHash = &inpHashes[id<<4]; |
|
|
|
inpHash = &inpHashes[id<<4]; |
|
|
|
}else |
|
|
|
} else { |
|
|
|
{ |
|
|
|
|
|
|
|
// Nonce-Liste verfügbar |
|
|
|
// Nonce-Liste verfügbar |
|
|
|
int nonce = d_validNonceTable[id] - startNounce; |
|
|
|
int nonce = d_validNonceTable[id] - startNounce; |
|
|
|
inpHash = &inpHashes[nonce<<4]; |
|
|
|
inpHash = &inpHashes[nonce<<4]; |
|
|
|
} |
|
|
|
} |
|
|
|
value = (*testFunc)(inpHash); |
|
|
|
value = (*testFunc)(inpHash); |
|
|
|
}else |
|
|
|
} else { |
|
|
|
{ |
|
|
|
|
|
|
|
value = 0; |
|
|
|
value = 0; |
|
|
|
} |
|
|
|
} |
|
|
|
}else |
|
|
|
} else { |
|
|
|
{ |
|
|
|
|
|
|
|
value = data[id]; |
|
|
|
value = data[id]; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -115,8 +119,8 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t * |
|
|
|
// those threads where the thread 'i' away would have |
|
|
|
// those threads where the thread 'i' away would have |
|
|
|
// been out of bounds of the warp are unaffected. This |
|
|
|
// been out of bounds of the warp are unaffected. This |
|
|
|
// creates the scan sum. |
|
|
|
// creates the scan sum. |
|
|
|
#pragma unroll |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#pragma unroll |
|
|
|
for (int i=1; i<=width; i*=2) |
|
|
|
for (int i=1; i<=width; i*=2) |
|
|
|
{ |
|
|
|
{ |
|
|
|
uint32_t n = __shfl_up((int)value, i, width); |
|
|
|
uint32_t n = __shfl_up((int)value, i, width); |
|
|
@ -147,8 +151,7 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t * |
|
|
|
for (int i=1; i<=width; i*=2) |
|
|
|
for (int i=1; i<=width; i*=2) |
|
|
|
{ |
|
|
|
{ |
|
|
|
uint32_t n = __shfl_up((int)warp_sum, i, width); |
|
|
|
uint32_t n = __shfl_up((int)warp_sum, i, width); |
|
|
|
|
|
|
|
if (lane_id >= i) warp_sum += n; |
|
|
|
if (lane_id >= i) warp_sum += n; |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
sums[lane_id] = warp_sum; |
|
|
|
sums[lane_id] = warp_sum; |
|
|
@ -178,7 +181,8 @@ __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t * |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Uniform add: add partial sums array |
|
|
|
// Uniform add: add partial sums array |
|
|
|
__global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len) |
|
|
|
__global__ |
|
|
|
|
|
|
|
void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len) |
|
|
|
{ |
|
|
|
{ |
|
|
|
__shared__ uint32_t buf; |
|
|
|
__shared__ uint32_t buf; |
|
|
|
int id = ((blockIdx.x * blockDim.x) + threadIdx.x); |
|
|
|
int id = ((blockIdx.x * blockDim.x) + threadIdx.x); |
|
|
@ -195,7 +199,8 @@ __global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Der Scatter |
|
|
|
// Der Scatter |
|
|
|
__global__ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) |
|
|
|
__global__ |
|
|
|
|
|
|
|
void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) |
|
|
|
{ |
|
|
|
{ |
|
|
|
int id = ((blockIdx.x * blockDim.x) + threadIdx.x); |
|
|
|
int id = ((blockIdx.x * blockDim.x) + threadIdx.x); |
|
|
|
uint32_t actNounce = id; |
|
|
|
uint32_t actNounce = id; |
|
|
@ -244,9 +249,9 @@ __host__ static uint32_t quark_compactTest_roundUpExp(uint32_t val) |
|
|
|
return mask; |
|
|
|
return mask; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
__host__ void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32_t *nrm, |
|
|
|
__host__ |
|
|
|
uint32_t *d_nonces1, cuda_compactTestFunction_t function, |
|
|
|
void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32_t *nrm,uint32_t *d_nonces1, |
|
|
|
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) |
|
|
|
cuda_compactTestFunction_t function, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) |
|
|
|
{ |
|
|
|
{ |
|
|
|
int orgThreads = threads; |
|
|
|
int orgThreads = threads; |
|
|
|
threads = (int)quark_compactTest_roundUpExp((uint32_t)threads); |
|
|
|
threads = (int)quark_compactTest_roundUpExp((uint32_t)threads); |
|
|
@ -300,9 +305,9 @@ __host__ void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t thread |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048) |
|
|
|
////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048) |
|
|
|
__host__ void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, uint32_t *nrm, |
|
|
|
__host__ |
|
|
|
uint32_t *d_nonces1, uint32_t *d_nonces2, |
|
|
|
void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, uint32_t *nrm, uint32_t *d_nonces1, |
|
|
|
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) |
|
|
|
uint32_t *d_nonces2, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable) |
|
|
|
{ |
|
|
|
{ |
|
|
|
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable); |
|
|
|
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable); |
|
|
|
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable); |
|
|
|
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable); |
|
|
@ -339,10 +344,9 @@ __host__ void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
__host__ void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, |
|
|
|
__host__ |
|
|
|
uint32_t *d_nonces1, uint32_t *nrm1, |
|
|
|
void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, |
|
|
|
uint32_t *d_nonces2, uint32_t *nrm2, |
|
|
|
uint32_t *d_validNonceTable, uint32_t *d_nonces1, uint32_t *nrm1, uint32_t *d_nonces2, uint32_t *nrm2, int order) |
|
|
|
int order) |
|
|
|
|
|
|
|
{ |
|
|
|
{ |
|
|
|
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind |
|
|
|
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind |
|
|
|
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen! |
|
|
|
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen! |
|
|
@ -356,9 +360,9 @@ __host__ void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32 |
|
|
|
*nrm2 = h_numValid[thr_id][1]; |
|
|
|
*nrm2 = h_numValid[thr_id][1]; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
__host__ void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, |
|
|
|
__host__ |
|
|
|
uint32_t *d_nonces1, uint32_t *nrm1, |
|
|
|
void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, |
|
|
|
int order) |
|
|
|
uint32_t *d_validNonceTable, uint32_t *d_nonces1, uint32_t *nrm1, int order) |
|
|
|
{ |
|
|
|
{ |
|
|
|
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind |
|
|
|
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind |
|
|
|
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen! |
|
|
|
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen! |
|
|
|