From b1bddb54d6f2a540200738c2a3539717164ae3ca Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot
Date: Sun, 17 May 2015 20:49:14 +0200
Subject: [PATCH] scrypt: reduce cpu usage (flush/sync) and do some minimal keccak changes (for jane)

---
 scrypt-jane.cpp        |  72 ++++++++++-----------
 scrypt.cpp             |  17 +++--
 scrypt/keccak.cu       | 141 +++++++++++++++++++++++------------------
 scrypt/nv_kernel2.cu   |  18 +++---
 scrypt/salsa_kernel.cu |  28 ++++----
 scrypt/titan_kernel.cu |  18 +++---
 6 files changed, 160 insertions(+), 134 deletions(-)

diff --git a/scrypt-jane.cpp b/scrypt-jane.cpp
index a54acab..74ffda9 100644
--- a/scrypt-jane.cpp
+++ b/scrypt-jane.cpp
@@ -514,10 +514,10 @@ int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, u
 		cuda_scrypt_done(thr_id, nxt);

 		cuda_scrypt_DtoH(thr_id, cuda_X[nxt], nxt, false);
-		cuda_scrypt_flush(thr_id, nxt);
-		if(!cuda_scrypt_sync(thr_id, cur)) {
-			return -1;
+		//cuda_scrypt_flush(thr_id, nxt);
+		if(!cuda_scrypt_sync(thr_id, nxt)) {
+			break;
 		}

 		memcpy(Xbuf[cur].ptr, cuda_X[cur], 128 * throughput);
@@ -562,51 +562,49 @@ int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, u
 		cuda_scrypt_serialize(thr_id, nxt);
 		pre_keccak512(thr_id, nxt, nonce[nxt], throughput);
 		cuda_scrypt_core(thr_id, nxt, N);
-		cuda_scrypt_flush(thr_id, nxt); // required
+		//cuda_scrypt_flush(thr_id, nxt);
+		if (!cuda_scrypt_sync(thr_id, nxt)) {
+			break;
+		}

 		post_keccak512(thr_id, nxt, nonce[nxt], throughput);
 		cuda_scrypt_done(thr_id, nxt);

 		cuda_scrypt_DtoH(thr_id, hash[nxt], nxt, true);
-		cuda_scrypt_flush(thr_id, nxt); // seems required here
-
-		if (!cuda_scrypt_sync(thr_id, cur)) {
-			return -1;
+		//cuda_scrypt_flush(thr_id, nxt); // made by cuda_scrypt_sync
+		if (!cuda_scrypt_sync(thr_id, nxt)) {
+			break;
 		}
 	}

-	if(iteration > 0)
+	for (int i=0; iteration > 0 && i<throughput; i++)

diff --git a/scrypt/keccak.cu b/scrypt/keccak.cu
--- a/scrypt/keccak.cu
+++ b/scrypt/keccak.cu
 	uint64_t *s = S->state, t[5], u[5], v, w;

 	/* absorb input */
 #pragma unroll 9
-	for (i = 0; i < 72 / 8; i++, in += 2)
+	for (int i = 0; i < 72 / 8; i++, in += 2)
 		s[i] ^= U32TO64_LE(in);

-	for (i = 0; i < 24; i++) {
+	for (int i = 0; i < 24; i++) {
 		/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
 		t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
 		t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
@@ -186,8 +186,9 @@ void keccak_block(keccak_hash_state *S, const uint32_t *in) {
 }

 __device__
-void keccak_hash_init(keccak_hash_state *S) {
-#pragma unroll 25
+void keccak_hash_init(keccak_hash_state *S)
+{
+	#pragma unroll 25
 	for (int i=0; i<25; ++i) S->state[i] = 0ULL;
 }

@@ -218,50 +219,62 @@ __device__ void keccak_hash_update64(keccak_hash_state *S, const uint32_t *in) {
 	mycpy64(S->buffer, in);
 }

-__device__ void keccak_hash_finish8(keccak_hash_state *S, uint32_t *hash) {
+__device__
+void keccak_hash_finish8(keccak_hash_state *S, uint32_t *hash)
+{
 	S->buffer[8/4] = 0x01;
-#pragma unroll 15
+	#pragma unroll 15
 	for (int i=8/4+1; i < 72/4; ++i) S->buffer[i] = 0;
-	S->buffer[72/4 - 1] |= 0x80000000;
+	S->buffer[72/4 - 1] |= 0x80000000U;
 	keccak_block(S, (const uint32_t*)S->buffer);
-#pragma unroll 8
-	for (size_t i = 0; i < 64; i += 8) {
+
+	#pragma unroll 8
+	for (int i = 0; i < 64; i += 8) {
 		U64TO32_LE((&hash[i/4]), S->state[i / 8]);
 	}
 }

-__device__ void keccak_hash_finish12(keccak_hash_state *S, uint32_t *hash) {
+__device__
+void keccak_hash_finish12(keccak_hash_state *S, uint32_t *hash)
+{
 	S->buffer[12/4] = 0x01;
-#pragma unroll 14
+	#pragma unroll 14
 	for (int i=12/4+1; i < 72/4; ++i) S->buffer[i] = 0;
-	S->buffer[72/4 - 1] |= 0x80000000;
+	S->buffer[72/4 - 1] |= 0x80000000U;
 	keccak_block(S, (const uint32_t*)S->buffer);
-#pragma unroll 8
-	for (size_t i = 0; i < 64; i += 8) {
+
+	#pragma unroll 8
+	for (int i = 0; i < 64; i += 8) {
 		U64TO32_LE((&hash[i/4]), S->state[i / 8]);
 	}
 }

-__device__ void keccak_hash_finish60(keccak_hash_state *S, uint32_t *hash) {
+__device__
+void keccak_hash_finish60(keccak_hash_state *S, uint32_t *hash)
+{
 	S->buffer[60/4] = 0x01;
-#pragma unroll 2
+	#pragma unroll
 	for (int i=60/4+1; i < 72/4; ++i) S->buffer[i] = 0;
-	S->buffer[72/4 - 1] |= 0x80000000;
+	S->buffer[72/4 - 1] |= 0x80000000U;
 	keccak_block(S, (const uint32_t*)S->buffer);
-#pragma unroll 8
-	for (size_t i = 0; i < 64; i += 8) {
+
+	#pragma unroll 8
+	for (int i = 0; i < 64; i += 8) {
 		U64TO32_LE((&hash[i/4]), S->state[i / 8]);
 	}
 }

-__device__ void keccak_hash_finish64(keccak_hash_state *S, uint32_t *hash) {
+__device__
+void keccak_hash_finish64(keccak_hash_state *S, uint32_t *hash)
+{
 	S->buffer[64/4] = 0x01;
-#pragma unroll 1
+	#pragma unroll
 	for (int i=64/4+1; i < 72/4; ++i) S->buffer[i] = 0;
-	S->buffer[72/4 - 1] |= 0x80000000;
+	S->buffer[72/4 - 1] |= 0x80000000U;
 	keccak_block(S, (const uint32_t*)S->buffer);
-#pragma unroll 8
-	for (size_t i = 0; i < 64; i += 8) {
+
+	#pragma unroll 8
+	for (int i = 0; i < 64; i += 8) {
 		U64TO32_LE((&hash[i/4]), S->state[i / 8]);
 	}
 }
@@ -275,7 +288,8 @@ typedef struct pbkdf2_hmac_state_t {

 } pbkdf2_hmac_state;

-__device__ void pbkdf2_hash(uint32_t *hash, const uint32_t *m) {
+__device__ void pbkdf2_hash(uint32_t *hash, const uint32_t *m)
+{
 	keccak_hash_state st;
 	keccak_hash_init(&st);
 	keccak_hash_update72(&st, m);
@@ -284,32 +298,32 @@ __device__ void pbkdf2_hash(uint32_t *hash, const uint32_t *m) {
 }

 /* hmac */
-__device__ void pbkdf2_hmac_init80(pbkdf2_hmac_state *st, const uint32_t *key) {
-	uint32_t pad[72/4];
-	size_t i;
+__device__
+void pbkdf2_hmac_init80(pbkdf2_hmac_state *st, const uint32_t *key)
+{
+	uint32_t pad[72/4] = { 0 };
+	//#pragma unroll 18
+	//for (int i = 0; i < 72/4; i++)
+	//	pad[i] = 0;

 	keccak_hash_init(&st->inner);
 	keccak_hash_init(&st->outer);
-#pragma unroll 18
-	for (i = 0; i < 72/4; i++)
-		pad[i] = 0;
-
 	/* key > blocksize bytes, hash it */
 	pbkdf2_hash(pad, key);

 	/* inner = (key ^ 0x36) */
 	/* h(inner || ...) */
-#pragma unroll 18
-	for (i = 0; i < 72/4; i++)
-		pad[i] ^= 0x36363636;
+	#pragma unroll 18
+	for (int i = 0; i < 72/4; i++)
+		pad[i] ^= 0x36363636U;
 	keccak_hash_update72(&st->inner, pad);

 	/* outer = (key ^ 0x5c) */
 	/* h(outer || ...) */
-#pragma unroll 18
-	for (i = 0; i < 72/4; i++)
-		pad[i] ^= 0x6a6a6a6a;
+	#pragma unroll 18
+	for (int i = 0; i < 72/4; i++)
+		pad[i] ^= 0x6a6a6a6aU;
 	keccak_hash_update72(&st->outer, pad);
 }
@@ -370,21 +384,20 @@ __device__ void pbkdf2_statecopy8(pbkdf2_hmac_state *d, pbkdf2_hmac_state *s) {

 __global__ __launch_bounds__(128)
 void cuda_pre_keccak512(uint32_t *g_idata, uint32_t nonce)
 {
-	nonce += (blockIdx.x * blockDim.x) + threadIdx.x;
-	g_idata += 32 * ((blockIdx.x * blockDim.x) + threadIdx.x);
-	uint32_t data[20];
+	const uint32_t thread = (blockIdx.x * blockDim.x) + threadIdx.x;
+	nonce += thread;
+	g_idata += thread * 32;
+
 	#pragma unroll
-	for (int i=0; i <19; ++i)
-		data[i] = cuda_swab32(pdata[i]);
+	for (int i=0; i<19; i++)
+		data[i] = cuda_swab32(c_data[i]);
 	data[19] = cuda_swab32(nonce);

 	// scrypt_pbkdf2_1((const uint8_t*)data, 80, (const uint8_t*)data, 80, (uint8_t*)g_idata, 128);

-	pbkdf2_hmac_state hmac_pw, work;
-	uint32_t ti[16];
-	uint32_t be;
+	pbkdf2_hmac_state hmac_pw;

 	/* hmac(password, ...) */
 	pbkdf2_hmac_init80(&hmac_pw, data);

@@ -393,14 +406,17 @@ void cuda_pre_keccak512(uint32_t *g_idata, uint32_t nonce)
 	pbkdf2_hmac_update72(&hmac_pw, data);
 	pbkdf2_hmac_update8(&hmac_pw, data+72/4);

+	pbkdf2_hmac_state work;
+	uint32_t ti[16];
+
 	/* U1 = hmac(password, salt || be(i)) */
-	be = cuda_swab32(1);
+	uint32_t be = 0x01000000U;//cuda_swab32(1);
 	pbkdf2_statecopy8(&work, &hmac_pw);
 	pbkdf2_hmac_update4_8(&work, &be);
 	pbkdf2_hmac_finish12(&work, ti);
 	mycpy64(g_idata, ti);

-	be = cuda_swab32(2);
+	be = 0x02000000U;//cuda_swab32(2);
 	pbkdf2_statecopy8(&work, &hmac_pw);
 	pbkdf2_hmac_update4_8(&work, &be);
 	pbkdf2_hmac_finish12(&work, ti);
@@ -411,22 +427,21 @@ void cuda_pre_keccak512(uint32_t *g_idata, uint32_t nonce)
 __global__ __launch_bounds__(128)
 void cuda_post_keccak512(uint32_t *g_odata, uint32_t *g_hash, uint32_t nonce)
 {
-	nonce += (blockIdx.x * blockDim.x) + threadIdx.x;
-	g_odata += 32 * ((blockIdx.x * blockDim.x) + threadIdx.x);
-	g_hash += 8 * ((blockIdx.x * blockDim.x) + threadIdx.x);
-	uint32_t data[20];
-#pragma unroll 19
-	for (int i=0; i <19; ++i)
-		data[i] = cuda_swab32(pdata[i]);
+	const uint32_t thread = (blockIdx.x * blockDim.x) + threadIdx.x;
+	g_hash += thread * 8;
+	g_odata += thread * 32;
+	nonce += thread;
+
+	#pragma unroll
+	for (int i=0; i<19; i++)
+		data[i] = cuda_swab32(c_data[i]);
 	data[19] = cuda_swab32(nonce);

 	// scrypt_pbkdf2_1((const uint8_t*)data, 80, (const uint8_t*)g_odata, 128, (uint8_t*)g_hash, 32);

 	pbkdf2_hmac_state hmac_pw;
-	uint32_t ti[16];
-	uint32_t be;

 	/* hmac(password, ...) */
 	pbkdf2_hmac_init80(&hmac_pw, data);

@@ -435,8 +450,10 @@ void cuda_post_keccak512(uint32_t *g_odata, uint32_t *g_hash, uint32_t nonce)
 	pbkdf2_hmac_update72(&hmac_pw, g_odata);
 	pbkdf2_hmac_update56(&hmac_pw, g_odata+72/4);

+	uint32_t ti[16];
+
 	/* U1 = hmac(password, salt || be(i)) */
-	be = cuda_swab32(1);
+	uint32_t be = 0x01000000U;//cuda_swab32(1);
 	pbkdf2_hmac_update4_56(&hmac_pw, &be);
 	pbkdf2_hmac_finish60(&hmac_pw, ti);
 	mycpy32(g_hash, ti);
@@ -455,7 +472,7 @@ extern "C" void prepare_keccak512(int thr_id, const uint32_t host_pdata[20])
 		checkCudaErrors(cudaMemcpyToSymbol(c_keccak_round_constants, host_keccak_round_constants, sizeof(host_keccak_round_constants), 0, cudaMemcpyHostToDevice));
 		init[thr_id] = true;
 	}
-	checkCudaErrors(cudaMemcpyToSymbol(pdata, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
+	checkCudaErrors(cudaMemcpyToSymbol(c_data, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
 }

 extern "C" void pre_keccak512(int thr_id, int stream, uint32_t nonce, int throughput)

diff --git a/scrypt/nv_kernel2.cu b/scrypt/nv_kernel2.cu
index cbb8d01..d875aba 100644
--- a/scrypt/nv_kernel2.cu
+++ b/scrypt/nv_kernel2.cu
@@ -54,6 +54,8 @@ void NV2Kernel::set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V)
 bool NV2Kernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream,
 	uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache)
 {
 	bool success = true;
+	bool scrypt = IS_SCRYPT();
+	bool chacha = IS_SCRYPT_JANE();

 	// make some constants available to kernel, update only initially and when changing
 	static uint32_t prev_N[MAX_GPUS] = { 0 };
@@ -77,11 +79,11 @@ bool NV2Kernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr
 	do {
 		if (LOOKUP_GAP == 1) {
-			if (IS_SCRYPT()) nv2_scrypt_core_kernelA <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
-			if (IS_SCRYPT_JANE()) nv2_scrypt_core_kernelA<<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
+			if (scrypt) nv2_scrypt_core_kernelA <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
+			if (chacha) nv2_scrypt_core_kernelA<<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
 		} else {
-			if (IS_SCRYPT()) nv2_scrypt_core_kernelA_LG <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
-			if (IS_SCRYPT_JANE()) nv2_scrypt_core_kernelA_LG<<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
+			if (scrypt) nv2_scrypt_core_kernelA_LG <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
+			if (chacha) nv2_scrypt_core_kernelA_LG<<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
 		}

 		pos += batch;
 	} while (pos < N);
@@ -91,11 +93,11 @@ bool NV2Kernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr
 	do {
 		if (LOOKUP_GAP == 1) {
-			if (IS_SCRYPT()) nv2_scrypt_core_kernelB <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
-			if (IS_SCRYPT_JANE()) nv2_scrypt_core_kernelB <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
+			if (scrypt) nv2_scrypt_core_kernelB <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
+			if (chacha) nv2_scrypt_core_kernelB <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
 		} else {
-			if (IS_SCRYPT()) nv2_scrypt_core_kernelB_LG <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
-			if (IS_SCRYPT_JANE()) nv2_scrypt_core_kernelB_LG <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
+			if (scrypt) nv2_scrypt_core_kernelB_LG <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
+			if (chacha) nv2_scrypt_core_kernelB_LG <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
 		}

 		pos += batch;

diff --git a/scrypt/salsa_kernel.cu b/scrypt/salsa_kernel.cu
index 9b179d9..a22a863 100644
--- a/scrypt/salsa_kernel.cu
+++ b/scrypt/salsa_kernel.cu
@@ -819,8 +819,9 @@ void cuda_scrypt_DtoH(int thr_id, uint32_t *X, int stream, bool postSHA)
 bool cuda_scrypt_sync(int thr_id, int stream)
 {
 	cudaError_t err;
+	uint32_t wait_us = 0;

-	if(device_interactive[thr_id] && !opt_benchmark)
+	if (device_interactive[thr_id] && !opt_benchmark)
 	{
 		// For devices that also do desktop rendering or compositing, we want to free up some time slots.
 		// That requires making a pause in work submission when there is no active task on the GPU,
@@ -830,27 +831,30 @@ bool cuda_scrypt_sync(int thr_id, int stream)
 		//err = cudaDeviceSynchronize();

 		while((err = cudaStreamQuery(context_streams[0][thr_id])) == cudaErrorNotReady ||
-			(err == cudaSuccess && (err = cudaStreamQuery(context_streams[1][thr_id])) == cudaErrorNotReady))
-			usleep(1000);
+			(err == cudaSuccess && (err = cudaStreamQuery(context_streams[1][thr_id])) == cudaErrorNotReady)) {
+			usleep(50); wait_us+=50;
+		}

-		usleep(1000);
-	}
-	else
-	{
+		usleep(50); wait_us+=50;
+	} else {
 		// this call was replaced by the loop below to workaround the high CPU usage issue
 		//err = cudaStreamSynchronize(context_streams[stream][thr_id]);

-		while((err = cudaStreamQuery(context_streams[stream][thr_id])) == cudaErrorNotReady)
-			usleep(1000);
+		while((err = cudaStreamQuery(context_streams[stream][thr_id])) == cudaErrorNotReady) {
+			usleep(50); wait_us+=50;
+		}
 	}

-	if(err != cudaSuccess)
-	{
+	if (err != cudaSuccess) {
 		if (!abort_flag)
-			applog(LOG_ERR, "GPU #%d: CUDA error `%s` while executing the kernel.", device_map[thr_id], cudaGetErrorString(err));
+			applog(LOG_ERR, "GPU #%d: CUDA error `%s` while waiting the kernel.", device_map[thr_id], cudaGetErrorString(err));
 		return false;
 	}

+	//if (opt_debug) {
+	//	applog(LOG_DEBUG, "GPU #%d: %s %u us", device_map[thr_id], __FUNCTION__, wait_us);
+	//}
+
 	return true;
 }

diff --git a/scrypt/titan_kernel.cu b/scrypt/titan_kernel.cu
index 8ed901d..a172958 100644
--- a/scrypt/titan_kernel.cu
+++ b/scrypt/titan_kernel.cu
@@ -676,6 +676,8 @@ bool TitanKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int t
 	uint32_t* d_idata, uint32_t* d_odata, unsigned int N, unsigned int LOOKUP_GAP, bool interactive, bool benchmark, int texture_cache)
 {
 	bool success = true;
+	bool scrypt = IS_SCRYPT();
+	bool chacha = IS_SCRYPT_JANE();

 	// make some constants available to kernel, update only initially and when changing
 	static uint32_t prev_N[MAX_GPUS] = { 0 };
@@ -703,11 +705,11 @@ bool TitanKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int t
 	unsigned int pos = 0;
 	do {
 		if (LOOKUP_GAP == 1) {
-			if (IS_SCRYPT()) titan_scrypt_core_kernelA <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
-			if (IS_SCRYPT_JANE()) titan_scrypt_core_kernelA <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
+			if (scrypt) titan_scrypt_core_kernelA <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
+			if (chacha) titan_scrypt_core_kernelA <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
 		} else {
-			if (IS_SCRYPT()) titan_scrypt_core_kernelA_LG <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
-			if (IS_SCRYPT_JANE()) titan_scrypt_core_kernelA_LG <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
+			if (scrypt) titan_scrypt_core_kernelA_LG <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
+			if (chacha) titan_scrypt_core_kernelA_LG <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
 		}

 		pos += batch;
@@ -718,11 +720,11 @@ bool TitanKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int t
 	pos = 0;
 	do {
 		if (LOOKUP_GAP == 1) {
-			if (IS_SCRYPT()) titan_scrypt_core_kernelB <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
-			if (IS_SCRYPT_JANE()) titan_scrypt_core_kernelB <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
+			if (scrypt) titan_scrypt_core_kernelB <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
+			if (chacha) titan_scrypt_core_kernelB <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N));
 		} else {
-			if (IS_SCRYPT()) titan_scrypt_core_kernelB_LG <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
-			if (IS_SCRYPT_JANE()) titan_scrypt_core_kernelB_LG <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
+			if (scrypt) titan_scrypt_core_kernelB_LG <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
+			if (chacha) titan_scrypt_core_kernelB_LG <<< grid, threads, 0, stream >>>(d_odata, pos, min(pos+batch, N), LOOKUP_GAP);
 		}

 		pos += batch;
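
Note on the sync change in salsa_kernel.cu: the patch replaces the 1 ms sleeps in cuda_scrypt_sync() with 50 us polls of cudaStreamQuery(), and drops the extra cuda_scrypt_flush() calls in scanhash_scrypt_jane() because the sync path already queries (and thereby flushes) the stream. Below is a minimal standalone sketch of that wait loop, assuming nothing beyond the CUDA runtime and usleep(); the helper name wait_stream_low_cpu and the surrounding scaffolding are illustrative and not part of the patch, only the cudaStreamQuery()/usleep(50) pattern mirrors the patched cuda_scrypt_sync().

	#include <cuda_runtime.h>
	#include <unistd.h>

	// Poll the stream instead of blocking in cudaStreamSynchronize():
	// a short sleep between polls keeps host CPU usage low while the
	// kernel is still running, which is the point of this commit.
	static bool wait_stream_low_cpu(cudaStream_t stream)
	{
		cudaError_t err;
		while ((err = cudaStreamQuery(stream)) == cudaErrorNotReady)
			usleep(50);              // 50 us per poll, as in the patched cuda_scrypt_sync()
		return (err == cudaSuccess); // any other status is a real CUDA error
	}

The error branch corresponds to the patched applog() call: when cudaStreamQuery() returns something other than cudaSuccess or cudaErrorNotReady, cuda_scrypt_sync() logs the error and returns false, and scanhash_scrypt_jane() now breaks out of its loop instead of returning -1.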