From 588c7ba361bf331426fbec673871095b1ee7b1fa Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 14 Jan 2017 14:11:04 +0100 Subject: [PATCH] xmr: dont use shared mem hack, windows dont like --- crypto/cryptolight.h | 1 + crypto/cryptonight-core.cu | 25 ++++++++++++++----------- crypto/cryptonight.cu | 2 +- crypto/cryptonight.h | 3 ++- 4 files changed, 18 insertions(+), 13 deletions(-) diff --git a/crypto/cryptolight.h b/crypto/cryptolight.h index 9ede001..443cf5b 100644 --- a/crypto/cryptolight.h +++ b/crypto/cryptolight.h @@ -14,6 +14,7 @@ struct uint3 blockDim; #define atomicExch(p,y) (*p) = y #define __funnelshift_r(a,b,c) 1 #define __syncthreads() +#define __threadfence_block() #define asm(x) #define __shfl(a,b,c) 1 #define __umul64hi(a,b) a*b diff --git a/crypto/cryptonight-core.cu b/crypto/cryptonight-core.cu index e91fd84..6d5f721 100644 --- a/crypto/cryptonight-core.cu +++ b/crypto/cryptonight-core.cu @@ -32,12 +32,14 @@ void cryptonight_core_gpu_phase1(const uint32_t threads, uint32_t * long_state, for (uint32_t i = 0; i < 40U; i += 4U) AS_UINT4(&key[i]) = AS_UINT4(&ctx_key[i]); - __threadfence_block(); + __syncthreads(); for(uint32_t i = 0; i < LONG_LOOPS32; i += 32U) { cn_aes_pseudo_round_mut(sharedMemory, (uint32_t*) &text, key); AS_UL2(&long_state[long_oft + i]) = text; } + } else { + __syncthreads(); } } @@ -79,8 +81,8 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor, { __shared__ __align__(16) uint32_t sharedMemory[1024]; -// cn_aes_gpu_init(sharedMemory); -// __syncthreads(); + cn_aes_gpu_init(sharedMemory); + __syncthreads(); const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; @@ -125,7 +127,7 @@ void cryptonight_core_gpu_phase3(const uint32_t threads, const uint32_t * __rest { __shared__ __align__(16) uint32_t sharedMemory[1024]; - //cn_aes_gpu_init(sharedMemory); + cn_aes_gpu_init(sharedMemory); const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3U; const uint32_t sub = (threadIdx.x & 7U) << 2U; @@ -144,7 +146,7 @@ void cryptonight_core_gpu_phase3(const uint32_t threads, const uint32_t * __rest for (uint32_t i = 0; i < 40U; i += 4U) AS_UL2(&key[i]) = AS_UL2(&ctx_key[i]); - //__syncthreads(); + __syncthreads(); for(uint32_t i = 0; i < LONG_LOOPS32; i += 32U) { ulonglong2 st = AS_UL2(&long_state[long_oft + i]); @@ -153,13 +155,15 @@ void cryptonight_core_gpu_phase3(const uint32_t threads, const uint32_t * __rest } AS_UL2(&ctx_state[st_oft]) = text; + } else { + __syncthreads(); } } extern int device_bfactor[MAX_GPUS]; __host__ -void cryptonight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint64_t *d_ctx_state, +void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2) { dim3 grid(blocks); @@ -174,20 +178,19 @@ void cryptonight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_ const int bsleep = bfactor ? 100 : 0; const int dev_id = device_map[thr_id]; - int i; - cryptonight_core_gpu_phase1 <<>> (throughput, d_long_state, (uint32_t*)d_ctx_state, d_ctx_key1); + cryptonight_core_gpu_phase1 <<>> (throughput, d_long_state, (uint32_t*)d_ctx_state, d_ctx_key1); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); if(partcount > 1) usleep(bsleep); - for(i = 0; i < partcount; i++) + for (uint32_t i = 0; i < partcount; i++) { dim3 b = device_sm[dev_id] >= 300 ? block4 : block; - cryptonight_core_gpu_phase2 <<>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b); + cryptonight_core_gpu_phase2 <<>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); if(partcount > 1) usleep(bsleep); } - cryptonight_core_gpu_phase3 <<>> (throughput, d_long_state, (uint32_t*)d_ctx_state, d_ctx_key2); + cryptonight_core_gpu_phase3 <<>> (throughput, d_long_state, (uint32_t*)d_ctx_state, d_ctx_key2); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); } diff --git a/crypto/cryptonight.cu b/crypto/cryptonight.cu index af6f58b..5d5b1c6 100644 --- a/crypto/cryptonight.cu +++ b/crypto/cryptonight.cu @@ -92,7 +92,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ cryptonight_extra_cpu_setData(thr_id, pdata, ptarget); cryptonight_extra_cpu_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]); - cryptonight_core_cpu_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]); + cryptonight_core_cuda(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]); cryptonight_extra_cpu_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]); *hashes_done = nonce - first_nonce + throughput; diff --git a/crypto/cryptonight.h b/crypto/cryptonight.h index 7d10f20..b8911be 100644 --- a/crypto/cryptonight.h +++ b/crypto/cryptonight.h @@ -14,6 +14,7 @@ struct uint3 blockDim; #define atomicExch(p,y) (*p) = y #define __funnelshift_r(a,b,c) 1 #define __syncthreads() +#define __threadfence_block() #define asm(x) #define __shfl(a,b,c) 1 #define __umul64hi(a,b) a*b @@ -136,7 +137,7 @@ static inline void exit_if_cudaerror(int thr_id, const char *src, int line) exit(1); } } -void cryptonight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2); +void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2); void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn); void cryptonight_extra_cpu_init(int thr_id, uint32_t threads);