From d7c2168f2b4b5b25789502b8540041f366a308b9 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Fri, 6 Nov 2015 15:03:36 +0100 Subject: [PATCH] quark: static shared memory allocation for SM3+ from KlausT committed on 4 Jan, add a few kH/s --- README.txt | 2 +- quark/cuda_quark_compactionTest.cu | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.txt b/README.txt index 0723f05..996689f 100644 --- a/README.txt +++ b/README.txt @@ -228,7 +228,7 @@ features. >>> RELEASE HISTORY <<< - Nov. 05th 2015 v1.7 + Nov. 06th 2015 v1.7 Improve old devices compatibility (x11, lyra2, qubit...) Add windows support for SM 2.1 and drop SM 3.5 (x86) Improve lyra2 (v1/v2) cuda implementations diff --git a/quark/cuda_quark_compactionTest.cu b/quark/cuda_quark_compactionTest.cu index 96c0afd..3c9137e 100644 --- a/quark/cuda_quark_compactionTest.cu +++ b/quark/cuda_quark_compactionTest.cu @@ -79,7 +79,7 @@ void quark_compactTest_gpu_SCAN(uint32_t *data, const int width, uint32_t *parti uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) { #if __CUDA_ARCH__ >= 300 - extern __shared__ uint32_t sums[]; + __shared__ uint32_t sums[32]; int id = ((blockIdx.x * blockDim.x) + threadIdx.x); //int lane_id = id % warpSize; int lane_id = id % width; @@ -267,15 +267,15 @@ void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32 bool callThrid = (thr2 > 0) ? true : false; // Erster Initialscan - quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>( + quark_compactTest_gpu_SCAN <<<thr1,blockSize>>>( d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable); // weitere Scans if(callThrid) { - quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]); - quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum[1][thr_id], (thr2>32) ? 
32 : thr2); + quark_compactTest_gpu_SCAN<<<thr2,blockSize>>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]); + quark_compactTest_gpu_SCAN<<<1, thr2>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2); } else { - quark_compactTest_gpu_SCAN<<<thr3,blockSize2, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2); + quark_compactTest_gpu_SCAN<<<thr3,blockSize2>>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2); } // Sync + Anzahl merken