From 0a0fd33caca64ff842a279db1218c041496c3754 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 25 Jul 2016 20:12:53 +0200 Subject: [PATCH] attempt to reduce shared mem errors --- configure.ac | 2 +- x11/c11.cu | 2 +- x11/cuda_x11_echo.cu | 3 +++ x11/cuda_x11_shavite512.cu | 4 ++++ x11/s3.cu | 2 +- x11/sib.cu | 2 +- x11/x11.cu | 2 +- x11/x11evo.cu | 2 +- x13/x13.cu | 2 +- x15/x14.cu | 2 +- x15/x15.cu | 2 +- x17/x17.cu | 2 +- 12 files changed, 17 insertions(+), 10 deletions(-) diff --git a/configure.ac b/configure.ac index d0cc35a..b478d7f 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [1.8], [], [ccminer], [http://github.com/tpruvot/ccminer]) +AC_INIT([ccminer], [1.8.1], [], [ccminer], [http://github.com/tpruvot/ccminer]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/x11/c11.cu b/x11/c11.cu index ae5068a..5536cc8 100644 --- a/x11/c11.cu +++ b/x11/c11.cu @@ -111,7 +111,7 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u const uint32_t first_nonce = pdata[19]; int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 20 : 19; uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8; - if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) ((uint32_t*)ptarget)[7] = 0x5; diff --git a/x11/cuda_x11_echo.cu b/x11/cuda_x11_echo.cu index 0e63e72..fa5c4f7 100644 --- a/x11/cuda_x11_echo.cu +++ b/x11/cuda_x11_echo.cu @@ -3,6 +3,8 @@ #include "cuda_helper.h" +extern __device__ __device_builtin__ void __threadfence_block(void); + #include "cuda_x11_aes.cuh" __device__ __forceinline__ void AES_2ROUND( @@ -284,6 +286,7 @@ void x11_echo512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g __shared__ uint32_t sharedMemory[1024]; echo_gpu_init(sharedMemory); + __threadfence_block(); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) diff --git a/x11/cuda_x11_shavite512.cu b/x11/cuda_x11_shavite512.cu index 69dee96..cfebf0d 100644 --- a/x11/cuda_x11_shavite512.cu +++ b/x11/cuda_x11_shavite512.cu @@ -2,6 +2,8 @@ #include "cuda_helper.h" +extern __device__ __device_builtin__ void __threadfence_block(void); + #define TPB 128 __constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding) @@ -1346,6 +1348,7 @@ void x11_shavite512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t __shared__ uint32_t sharedMemory[1024]; shavite_gpu_init(sharedMemory); + __threadfence_block(); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -1397,6 +1400,7 @@ void x11_shavite512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *ou __shared__ uint32_t sharedMemory[1024]; shavite_gpu_init(sharedMemory); + __threadfence_block(); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) diff --git a/x11/s3.cu b/x11/s3.cu index 97e6068..84f0998 100644 --- a/x11/s3.cu +++ b/x11/s3.cu @@ -72,7 +72,7 @@ extern "C" int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, un intensity--; #endif uint32_t throughput = cuda_default_throughput(thr_id, 1 << intensity); - if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) ptarget[7] = 0xF; diff --git a/x11/sib.cu b/x11/sib.cu index ba61469..28794fb 100644 --- a/x11/sib.cu +++ b/x11/sib.cu @@ -106,7 +106,7 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u const uint32_t first_nonce = pdata[19]; int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 19 : 18; uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8; - if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) ptarget[7] = 0xf; diff --git a/x11/x11.cu b/x11/x11.cu index 2d040d3..2c56892 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -100,7 +100,7 @@ extern "C" int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, u const uint32_t first_nonce = pdata[19]; int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 20 : 19; uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8; - if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) ptarget[7] = 0x5; diff --git a/x11/x11evo.cu b/x11/x11evo.cu index f5e161a..9368550 100644 --- a/x11/x11evo.cu +++ b/x11/x11evo.cu @@ -232,7 +232,7 @@ extern "C" int scanhash_x11evo(int thr_id, struct work* work, uint32_t max_nonce const uint32_t first_nonce = pdata[19]; int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 20 : 19; uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8; - if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_debug || s_ntime != pdata[17] || s_sequence == -1) { uint32_t ntime = swab32(work->data[17]); diff --git a/x13/x13.cu b/x13/x13.cu index 1093042..cf04d32 100644 --- a/x13/x13.cu +++ b/x13/x13.cu @@ -119,7 +119,7 @@ extern "C" int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, u const uint32_t first_nonce = pdata[19]; int intensity = 19; // (device_sm[device_map[thr_id]] > 500 && !is_windows()) ? 20 : 19; uint32_t throughput = cuda_default_throughput(thr_id, 1 << intensity); // 19=256*256*8; - if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) ((uint32_t*)ptarget)[7] = 0x000f; diff --git a/x15/x14.cu b/x15/x14.cu index 06f40e2..c2f6912 100644 --- a/x15/x14.cu +++ b/x15/x14.cu @@ -133,7 +133,7 @@ extern "C" int scanhash_x14(int thr_id, struct work* work, uint32_t max_nonce, uint32_t endiandata[20]; uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 19=256*256*8; - if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) ptarget[7] = 0x000f; diff --git a/x15/x15.cu b/x15/x15.cu index 0593c0b..2ad2900 100644 --- a/x15/x15.cu +++ b/x15/x15.cu @@ -143,7 +143,7 @@ extern "C" int scanhash_x15(int thr_id, struct work* work, uint32_t max_nonce, uint32_t endiandata[20]; uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 19=256*256*8; - if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) ptarget[7] = 0x00FF; diff --git a/x17/x17.cu b/x17/x17.cu index b20e574..fa2ab31 100644 --- a/x17/x17.cu +++ b/x17/x17.cu @@ -159,7 +159,7 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u const uint32_t first_nonce = pdata[19]; uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 19=256*256*8; - if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) ((uint32_t*)ptarget)[7] = 0x00ff;