From 1b31f11252bd1dfdb6f8c5bce0343a07be5788bd Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot
Date: Sat, 9 May 2015 19:25:40 +0200
Subject: [PATCH] neoscrypt: cleanup...

My SM 3.0 functions are ok, but the djm34 implementation uses too many
registers for this arch...
---
 neoscrypt/cuda_neoscrypt.cu | 43 +++++++++++++++++++++----------------
 neoscrypt/cuda_vectors.h    | 19 ++++++++--------
 neoscrypt/neoscrypt.cpp    | 30 ++++++++------------------
 3 files changed, 43 insertions(+), 49 deletions(-)

diff --git a/neoscrypt/cuda_neoscrypt.cu b/neoscrypt/cuda_neoscrypt.cu
index cd0908f..ab23e55 100644
--- a/neoscrypt/cuda_neoscrypt.cu
+++ b/neoscrypt/cuda_neoscrypt.cu
@@ -4,9 +4,9 @@
 #include "cuda_helper.h"
 #include "cuda_vectors.h"
 /* NOT COMPATIBLE WITH SM 3.0 !!! */
-__device__ uint4* W;
-uint32_t *d_NNonce[MAX_GPUS];
-uint32_t *d_nnounce[MAX_GPUS];
+static uint32_t *d_buffer[MAX_GPUS];
+static uint32_t *d_NNonce[MAX_GPUS];
+__constant__ uint4* W;
 __constant__ uint32_t pTarget[8];
 __constant__ uint32_t key_init[16];
 __constant__ uint32_t input_init[16];
@@ -423,14 +423,14 @@ static __device__ __forceinline__ void neoscrypt_salsa(uint16 *XV)
 #define SHIFT 130
 
 __global__ __launch_bounds__(128, 1)
-void neoscrypt_gpu_hash_k0(int stratum, uint32_t threads, uint32_t startNonce)
+void neoscrypt_gpu_hash_k0(uint32_t threads, uint32_t startNonce, bool stratum)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	uint32_t shift = SHIFT * 16 * thread;
 //	if (thread < threads)
 	{
 		uint32_t data[80];
 		uint16 X[4];
+		uint32_t shift = thread * SHIFT * 16;
 		const uint32_t nonce = startNonce + thread;
 
 		for (int i = 0; i<20; i++) {
@@ -451,10 +451,10 @@ __global__ __launch_bounds__(128, 1)
 void neoscrypt_gpu_hash_k01(uint32_t threads, uint32_t startNonce)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	uint32_t shift = SHIFT * 16 * thread;
 //	if (thread < threads)
 	{
 		uint16 X[4];
+		uint32_t shift = thread * SHIFT * 16;
 
 		((uintx64 *)X)[0]= ldg256(&(W + shift)[0]);
 //#pragma unroll
@@ -471,10 +471,10 @@ __global__ __launch_bounds__(128, 1)
 void neoscrypt_gpu_hash_k2(uint32_t threads, uint32_t startNonce)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	uint32_t shift = SHIFT * 16 * thread;
 //	if (thread < threads)
 	{
 		uint16 X[4];
+		uint32_t shift = thread * SHIFT * 16;
 
 		((uintx64 *)X)[0] = ldg256(&(W + shift)[2048]);
 		for (int t = 0; t < 128; t++)
@@ -495,7 +495,7 @@ void neoscrypt_gpu_hash_k3(uint32_t threads, uint32_t startNonce)
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 //	if (thread < threads)
 	{
-		uint32_t shift = SHIFT * 16 * thread;
+		uint32_t shift = thread * SHIFT * 16;
 		uint16 Z[4];
 
 		((uintx64*)Z)[0] = ldg256(&(W + shift)[0]);
@@ -510,14 +510,14 @@ void neoscrypt_gpu_hash_k3(uint32_t threads, uint32_t startNonce)
 }
 
 __global__ __launch_bounds__(128, 1)
-void neoscrypt_gpu_hash_k4(int stratum, uint32_t threads, uint32_t startNonce, uint32_t *nonceVector)
+void neoscrypt_gpu_hash_k4(uint32_t threads, uint32_t startNonce, uint32_t *nonceRes, bool stratum)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-//	if (thread < threads)
+	if (thread < threads)
 	{
 		const uint32_t nonce = startNonce + thread;
-		uint32_t shift = SHIFT * 16 * thread;
+		uint32_t shift = thread * SHIFT * 16;
 		uint16 Z[4];
 		uint32_t outbuf[8];
 		uint32_t data[80];
@@ -539,33 +539,37 @@ void neoscrypt_gpu_hash_k4(int stratum, uint32_t threads, uint32_t startNonce, u
 		((uintx64 *)Z)[0] ^= ldg256(&(W + shift)[2064]);
 		fastkdf32(data, (uint32_t*)Z, outbuf);
 		if (outbuf[7] <= pTarget[7]) {
-			uint32_t tmp = atomicExch(&nonceVector[0], nonce);
+			atomicMin(nonceRes, nonce); // init val is UINT32_MAX
 		}
 	}
 }
 
-void neoscrypt_cpu_init(int thr_id, uint32_t threads, uint32_t *hash)
+__host__
+void neoscrypt_cpu_init(int thr_id, uint32_t threads)
 {
-	cudaMemcpyToSymbol(BLAKE2S_SIGMA, BLAKE2S_SIGMA_host, sizeof(BLAKE2S_SIGMA_host), 0, cudaMemcpyHostToDevice);
-	cudaMemcpyToSymbol(W, &hash, sizeof(hash), 0, cudaMemcpyHostToDevice);
+	cuda_get_arch(thr_id);
 	cudaMalloc(&d_NNonce[thr_id], sizeof(uint32_t));
+	CUDA_SAFE_CALL(cudaMalloc(&d_buffer[thr_id], threads * 256 * SHIFT));
+	cudaMemcpyToSymbol(W, &d_buffer[thr_id], sizeof(uint4*), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(BLAKE2S_SIGMA, BLAKE2S_SIGMA_host, sizeof(BLAKE2S_SIGMA_host), 0, cudaMemcpyHostToDevice);
 }
 
 __host__
-uint32_t neoscrypt_cpu_hash_k4(int stratum, int thr_id, uint32_t threads, uint32_t startNounce, int order)
+uint32_t neoscrypt_cpu_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, bool have_stratum, int order)
 {
-	uint32_t result[MAX_GPUS] = { 0xffffffff };
+	uint32_t result[MAX_GPUS];
+	memset(result, 0xff, sizeof(result));
 	cudaMemset(d_NNonce[thr_id], 0xff, sizeof(uint32_t));
 
 	const uint32_t threadsperblock = 128;
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 
-	neoscrypt_gpu_hash_k0 <<< grid, block >>>(stratum, threads, startNounce);
+	neoscrypt_gpu_hash_k0 <<< grid, block >>>(threads, startNounce, have_stratum);
 	neoscrypt_gpu_hash_k01 <<< grid, block >>>(threads, startNounce);
 	neoscrypt_gpu_hash_k2 <<< grid, block >>>(threads, startNounce);
 	neoscrypt_gpu_hash_k3 <<< grid, block >>>(threads, startNounce);
-	neoscrypt_gpu_hash_k4 <<< grid, block >>>(stratum, threads, startNounce, d_NNonce[thr_id]);
+	neoscrypt_gpu_hash_k4 <<< grid, block >>>(threads, startNounce, d_NNonce[thr_id], have_stratum);
 
 	MyStreamSynchronize(NULL, order, thr_id);
 	cudaMemcpy(&result[thr_id], d_NNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
@@ -578,6 +582,7 @@ void neoscrypt_setBlockTarget(uint32_t* pdata, const void *target)
 {
 	unsigned char PaddedMessage[80*4]; //bring balance to the force
 	uint32_t input[16], key[16] = { 0 };
+
 	memcpy(PaddedMessage, pdata, 80);
 	memcpy(PaddedMessage + 80, pdata, 80);
 	memcpy(PaddedMessage + 160, pdata, 80);
diff --git a/neoscrypt/cuda_vectors.h b/neoscrypt/cuda_vectors.h
index 67f77d9..08fc0ee 100644
--- a/neoscrypt/cuda_vectors.h
+++ b/neoscrypt/cuda_vectors.h
@@ -478,23 +478,25 @@ static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift
 
 #if __CUDA_ARCH__ < 320
 
-// right shift a 64 bytes input (256-bits integer) by 0 8 16 24 bits
-static __forceinline__ __device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
+// right shift a 32-byte input (256-bit integer) by 0 8 16 24 bits
+// requires a uint32_t[9] ret array
+// note: the djm34 neoscrypt implementation is near the limits of gpu capabilities
+// and weird behaviors can happen when tuning device function code...
+__device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
 {
 	uint8_t *v = (uint8_t*) &vec4.s0;
 	uint8_t *r = (uint8_t*) ret;
 	uint8_t bytes = (uint8_t) (shift >> 3);
-	for (uint8_t i=0; i<bytes; i++) r[i] = 0;
-	for (uint8_t i=0; i<32; i++) r[i+bytes] = v[i];
-	ret[8] = vec4.s7 >> (32 - shift); // shuffled part required ?
- //printf("A %02u %08x %08x > %08x %08x\n", shift, vec4.s6, vec4.s7, ret[7], ret[8]); + ret[8] = vec4.s7 >> (32 - shift); // shuffled part required } + #else -// right shift a 32 bytes input (256-bits integer) by 0 8 16 24 bits -static __forceinline__ __device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift) +// same for SM 3.5+, really faster ? +__device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift) { uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0; asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); @@ -522,7 +524,6 @@ static __forceinline__ __device__ void shift256R(uint32_t* ret, const uint8 &vec ret[1] = cuda_swab32(truc); asm("shr.b32 %0, %1, %2;" : "=r"(truc) : "r"(truc3), "r"(shift)); ret[0] = cuda_swab32(truc); - //printf("B %02u %08x %08x > %08x %08x\n", shift, vec4.s6, vec4.s7, ret[7], ret[8]); } #endif diff --git a/neoscrypt/neoscrypt.cpp b/neoscrypt/neoscrypt.cpp index f1d3290..9238e77 100644 --- a/neoscrypt/neoscrypt.cpp +++ b/neoscrypt/neoscrypt.cpp @@ -2,13 +2,9 @@ #include "miner.h" #include "neoscrypt/neoscrypt.h" -static uint32_t *d_hash[MAX_GPUS] ; extern void neoscrypt_setBlockTarget(uint32_t * data, const void *ptarget); -extern void neoscrypt_cpu_init(int thr_id, uint32_t threads, uint32_t* hash); -extern uint32_t neoscrypt_cpu_hash_k4(int stratum, int thr_id, uint32_t threads, uint32_t startNounce, int order); -extern int cuda_get_arch(int thr_id); - -#define SHIFT 130 +extern void neoscrypt_cpu_init(int thr_id, uint32_t threads); +extern uint32_t neoscrypt_cpu_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, bool have_stratum, int order); static bool init[MAX_GPUS] = { 0 }; @@ -16,43 +12,35 @@ int scanhash_neoscrypt(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uin { const uint32_t first_nonce = pdata[19]; - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; - int intensity = is_windows() ? 
 	uint32_t throughput = device_intensity(thr_id, __func__, 1U << intensity);
 	throughput = throughput / 32; /* set for max intensity ~= 20 */
 	throughput = min(throughput, max_nonce - first_nonce + 1);
 
+	if (opt_benchmark)
+		((uint32_t*)ptarget)[7] = 0x0000ff;
+
 	if (!init[thr_id])
 	{
 		int dev_id = device_map[thr_id];
 		cudaSetDevice(dev_id);
 		cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
-		cuda_get_arch(thr_id);
 
 		if (device_sm[dev_id] <= 300) {
 			applog(LOG_ERR, "Sorry neoscrypt is not supported on SM 3.0 devices");
			proper_exit(EXIT_CODE_CUDA_ERROR);
 		}
 
-		cudaMalloc(&d_hash[thr_id], 32 * SHIFT * sizeof(uint64_t) * throughput);
-		neoscrypt_cpu_init(thr_id, throughput, d_hash[thr_id]);
-
 		applog(LOG_INFO, "Using %d cuda threads", throughput);
-		if (cudaGetLastError() != cudaSuccess) {
-			cudaError_t err = cudaGetLastError();
-			fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n",
-				__FUNCTION__, __LINE__, cudaGetErrorString(err) );
-			proper_exit(EXIT_FAILURE);
-		}
+		neoscrypt_cpu_init(thr_id, throughput);
+
 		init[thr_id] = true;
 	}
 
 	uint32_t endiandata[20];
 	if (have_stratum) {
 		for (int k = 0; k < 20; k++)
-			be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
+			be32enc(&endiandata[k], pdata[k]);
 	} else {
 		for (int k = 0; k < 20; k++)
 			endiandata[k] = pdata[k];
@@ -61,7 +49,7 @@ int scanhash_neoscrypt(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uin
 	neoscrypt_setBlockTarget(endiandata,ptarget);
 
 	do {
-		uint32_t foundNonce = neoscrypt_cpu_hash_k4((int)have_stratum, thr_id, throughput, pdata[19], 0);
+		uint32_t foundNonce = neoscrypt_cpu_hash_k4(thr_id, throughput, pdata[19], have_stratum, 0);
 		if (foundNonce != UINT32_MAX)
 		{
 			uint32_t _ALIGN(64) vhash64[8];
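--
Two notes on the buffer math and the nonce reporting in this patch:

The old and new scratchpad allocations are the same size: the old
32 * SHIFT * sizeof(uint64_t) per thread and the new 256 * SHIFT bytes
per thread are both 33280 bytes (SHIFT = 130), which is exactly the
SHIFT * 16 uint4 stride walked by the per-thread `shift` index.

The atomicMin() change works because d_NNonce is memset to 0xff bytes
(UINT32_MAX) before each launch: when several threads find a share in
the same launch, the lowest nonce now wins deterministically, where the
old atomicExch() kept whichever thread happened to write last. Below is
a minimal standalone sketch of that pattern (hypothetical names, not
code from this patch):

// nonce_min.cu - illustration only, build with: nvcc nonce_min.cu
#include <cstdio>
#include <cstdint>

__global__ void report_nonce(uint32_t startNonce, uint32_t *nonceRes)
{
	uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
	// pretend every 1000th thread finds a valid share
	if (thread % 1000 == 0)
		atomicMin(nonceRes, startNonce + thread); // lowest nonce wins
}

int main()
{
	uint32_t *d_res, h_res = 0;
	cudaMalloc(&d_res, sizeof(uint32_t));
	cudaMemset(d_res, 0xff, sizeof(uint32_t)); // init val is UINT32_MAX
	report_nonce<<<16, 128>>>(0x1000, d_res);
	cudaMemcpy(&h_res, d_res, sizeof(uint32_t), cudaMemcpyDeviceToHost);
	printf("found nonce: 0x%08x\n", h_res); // UINT32_MAX means "none found"
	cudaFree(d_res);
	return 0;
}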