Browse Source

quark: static shared memory allocation for SM3+

From KlausT (committed on 4 Jan); adds a few kH/s.
master
Tanguy Pruvot 9 years ago
parent
commit
d7c2168f2b
  1. 2
      README.txt
  2. 10
      quark/cuda_quark_compactionTest.cu

2
README.txt

@ -228,7 +228,7 @@ features.
>>> RELEASE HISTORY <<< >>> RELEASE HISTORY <<<
Nov. 05th 2015 v1.7 Nov. 06th 2015 v1.7
Improve old devices compatibility (x11, lyra2, qubit...) Improve old devices compatibility (x11, lyra2, qubit...)
Add windows support for SM 2.1 and drop SM 3.5 (x86) Add windows support for SM 2.1 and drop SM 3.5 (x86)
Improve lyra2 (v1/v2) cuda implementations Improve lyra2 (v1/v2) cuda implementations

10
quark/cuda_quark_compactionTest.cu

@ -79,7 +79,7 @@ void quark_compactTest_gpu_SCAN(uint32_t *data, const int width, uint32_t *parti
uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
{ {
#if __CUDA_ARCH__ >= 300 #if __CUDA_ARCH__ >= 300
extern __shared__ uint32_t sums[]; __shared__ uint32_t sums[32];
int id = ((blockIdx.x * blockDim.x) + threadIdx.x); int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
//int lane_id = id % warpSize; //int lane_id = id % warpSize;
int lane_id = id % width; int lane_id = id % width;
@ -267,15 +267,15 @@ void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32
bool callThrid = (thr2 > 0) ? true : false; bool callThrid = (thr2 > 0) ? true : false;
// Erster Initialscan // Erster Initialscan
quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>( quark_compactTest_gpu_SCAN <<<thr1,blockSize>>>(
d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable); d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable);
// weitere Scans // weitere Scans
if(callThrid) { if(callThrid) {
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]); quark_compactTest_gpu_SCAN<<<thr2,blockSize>>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]);
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2); quark_compactTest_gpu_SCAN<<<1, thr2>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2);
} else { } else {
quark_compactTest_gpu_SCAN<<<thr3,blockSize2, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2); quark_compactTest_gpu_SCAN<<<thr3,blockSize2>>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2);
} }
// Sync + Anzahl merken // Sync + Anzahl merken

Loading…
Cancel
Save