From e74d5884b41cf0b47c96079430645e1d9c2adc95 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot
Date: Tue, 21 Apr 2015 16:56:22 +0200
Subject: [PATCH] scrypt: final cleanup for 1.6.2 release

---
 scrypt-jane.cpp         | 75 +++++++++++++++++------------------
 scrypt/blake.cu         |  3 +-
 scrypt/keccak.cu        |  8 ++--
 scrypt/kepler_kernel.cu | 87 +++++++++++++++++++++--------------------
 scrypt/nv_kernel.cu     |  9 +++--
 scrypt/nv_kernel2.cu    |  9 +++--
 scrypt/salsa_kernel.h   |  2 -
 scrypt/sha256.cu        |  3 +-
 scrypt/test_kernel.cu   |  3 +-
 scrypt/titan_kernel.cu  |  3 +-
 10 files changed, 106 insertions(+), 96 deletions(-)

diff --git a/scrypt-jane.cpp b/scrypt-jane.cpp
index e665182..c6cc79d 100644
--- a/scrypt-jane.cpp
+++ b/scrypt-jane.cpp
@@ -1,8 +1,10 @@
 /*
-	scrypt-jane by Andrew M, https://github.com/floodyberry/scrypt-jane
-
-	Public Domain or MIT License, whichever is easier
-*/
+ * scrypt-jane by Andrew M, https://github.com/floodyberry/scrypt-jane
+ *
+ * Public Domain or MIT License, whichever is easier
+ *
+ * Adapted to ccminer by tpruvot@github (2015)
+ */
 
 #include "miner.h"
 
@@ -50,8 +52,8 @@ static const uint64_t keccak_round_constants[24] = {
 	0x0000000080000001ull, 0x8000000080008008ull
 };
 
-static void
-keccak_block(scrypt_hash_state *S, const uint8_t *in) {
+static void keccak_block(scrypt_hash_state *S, const uint8_t *in)
+{
 	size_t i;
 	uint64_t *s = S->state, t[5], u[5], v, w;
 
@@ -120,13 +122,12 @@ keccak_block(scrypt_hash_state *S, const uint8_t *in) {
 	}
 }
 
-static void
-scrypt_hash_init(scrypt_hash_state *S) {
+static void scrypt_hash_init(scrypt_hash_state *S) {
 	memset(S, 0, sizeof(*S));
 }
 
-static void
-scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
+static void scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen)
+{
 	size_t want;
 
 	/* handle the previous data */
@@ -155,8 +156,8 @@ scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {
 	memcpy(S->buffer, in, S->leftover);
 }
 
-static void
-scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {
+static void scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash)
+{
 	size_t i;
 
 	S->buffer[S->leftover] = 0x01;
@@ -178,17 +179,18 @@ typedef struct scrypt_hmac_state_t {
 } scrypt_hmac_state;
 
 
-static void
-scrypt_hash(scrypt_hash_digest hash, const uint8_t *m, size_t mlen) {
+static void scrypt_hash(scrypt_hash_digest hash, const uint8_t *m, size_t mlen)
+{
 	scrypt_hash_state st;
+
 	scrypt_hash_init(&st);
 	scrypt_hash_update(&st, m, mlen);
 	scrypt_hash_finish(&st, hash);
 }
 
 /* hmac */
-static void
-scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen) {
+static void scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen)
+{
 	uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0};
 	size_t i;
 
@@ -216,14 +218,14 @@ scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen) {
 	scrypt_hash_update(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE);
 }
 
-static void
-scrypt_hmac_update(scrypt_hmac_state *st, const uint8_t *m, size_t mlen) {
+static void scrypt_hmac_update(scrypt_hmac_state *st, const uint8_t *m, size_t mlen)
+{
 	/* h(inner || m...) */
 	scrypt_hash_update(&st->inner, m, mlen);
 }
 
-static void
-scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) {
+static void scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac)
+{
 	/* h(inner || m) */
 	scrypt_hash_digest innerhash;
 	scrypt_hash_finish(&st->inner, innerhash);
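Note: the scrypt_hmac_* functions above implement the standard HMAC construction, HMAC(K, m) = H((K ^ opad) || H((K ^ ipad) || m)), over the keccak-based scrypt hash. A minimal sketch of the key/pad setup (hmac_pads and the 72-byte keccak-512 block size are illustrative assumptions, not part of the patch):

    #include <stdint.h>
    #include <string.h>

    #define BLOCK 72  /* keccak-512 rate, i.e. SCRYPT_HASH_BLOCK_SIZE here */

    static void hmac_pads(const uint8_t *key, size_t keylen,
                          uint8_t ipad[BLOCK], uint8_t opad[BLOCK])
    {
        uint8_t k[BLOCK] = {0};
        memcpy(k, key, keylen);        /* assumes keylen <= BLOCK; longer keys get hashed first */
        for (size_t i = 0; i < BLOCK; i++) {
            ipad[i] = k[i] ^ 0x36;     /* absorbed into st->inner */
            opad[i] = k[i] ^ 0x5c;     /* absorbed into st->outer */
        }
    }

scrypt_hmac_init absorbs both pads up front, so update/finish only touch the inner state until the single outer pass in scrypt_hmac_finish.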
@@ -237,8 +239,9 @@ scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) {
  * Special version where N = 1
  *  - mikaelh
  */
-static void
-scrypt_pbkdf2_1(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint8_t *out, size_t bytes) {
+static void scrypt_pbkdf2_1(const uint8_t *password, size_t password_len,
+	const uint8_t *salt, size_t salt_len, uint8_t *out, size_t bytes)
+{
 	scrypt_hmac_state hmac_pw, hmac_pw_salt, work;
 	scrypt_hash_digest ti, u;
 	uint8_t be[4];
@@ -271,16 +274,14 @@ scrypt_pbkdf2_1(const uint8_t *password, size_t password_len, const uint8_t *sal
 
 // ---------------------------- END PBKDF2 functions ------------------------------------
 
-static void
-scrypt_fatal_error_default(const char *msg) {
+static void scrypt_fatal_error_default(const char *msg) {
 	fprintf(stderr, "%s\n", msg);
 	exit(1);
 }
 
 static scrypt_fatal_errorfn scrypt_fatal_error = scrypt_fatal_error_default;
 
-void
-scrypt_set_fatal_error_default(scrypt_fatal_errorfn fn) {
+void scrypt_set_fatal_error_default(scrypt_fatal_errorfn fn) {
 	scrypt_fatal_error = fn;
 }
 
@@ -293,8 +294,8 @@ static uint8_t *mem_base = (uint8_t *)0;
 static size_t mem_bump = 0;
 
 /* allocations are assumed to be multiples of 64 bytes and total allocations not to exceed ~1.01gb */
-static scrypt_aligned_alloc
-scrypt_alloc(uint64_t size) {
+static scrypt_aligned_alloc scrypt_alloc(uint64_t size)
+{
 	scrypt_aligned_alloc aa;
 	if (!mem_base) {
 		mem_base = (uint8_t *)malloc((1024 * 1024 * 1024) + (1024 * 1024) + (SCRYPT_BLOCK_BYTES - 1));
@@ -308,13 +309,13 @@ scrypt_alloc(uint64_t size) {
 	return aa;
 }
 
-static void
-scrypt_free(scrypt_aligned_alloc *aa) {
+static void scrypt_free(scrypt_aligned_alloc *aa)
+{
 	mem_bump = 0;
 }
 #else
-static scrypt_aligned_alloc
-scrypt_alloc(uint64_t size) {
+static scrypt_aligned_alloc scrypt_alloc(uint64_t size)
+{
 	static const size_t max_alloc = (size_t)-1;
 	scrypt_aligned_alloc aa;
 	size += (SCRYPT_BLOCK_BYTES - 1);
@@ -327,15 +328,16 @@ scrypt_alloc(uint64_t size) {
 	return aa;
 }
 
-static void
-scrypt_free(scrypt_aligned_alloc *aa) {
+static void scrypt_free(scrypt_aligned_alloc *aa)
+{
 	free(aa->mem);
 }
 #endif
 
 
 // yacoin: increasing Nfactor gradually
-unsigned char GetNfactor(unsigned int nTimestamp) {
+unsigned char GetNfactor(unsigned int nTimestamp)
+{
 	int l = 0;
 	unsigned int Nfactor = 0;
 
@@ -427,14 +429,13 @@ unsigned char GetNfactor(unsigned int nTimestamp) {
 #define bswap_32x4(x)  ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \
                       | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
 
-
 static int s_Nfactor = 0;
 
 int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, unsigned char *scratchbuf,
 	uint32_t max_nonce, unsigned long *hashes_done, struct timeval *tv_start, struct timeval *tv_end)
 {
 	const uint32_t Htarg = ptarget[7];
-	uint64_t N;
+	uint32_t N;
 
 	if (s_Nfactor == 0 && strlen(jane_params) > 0)
 		applog(LOG_INFO, "Given scrypt-jane parameters: %s", jane_params);
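Note: narrowing N from uint64_t to uint32_t is safe for scrypt-jane: N is derived as a power of two, 1 << (Nfactor + 1), and the yacoin GetNfactor() schedule keeps Nfactor well below 31. A one-line sketch of that bound (the explicit cap of 30 is an illustrative assumption, not something this patch enforces):

    #include <stdint.h>
    #include <assert.h>

    static uint32_t n_from_nfactor(unsigned char nfactor)
    {
        assert(nfactor <= 30);               /* 1u << 31 is still representable */
        return (uint32_t)1 << (nfactor + 1); /* N = 2^(Nfactor+1) */
    }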
diff --git a/scrypt/blake.cu b/scrypt/blake.cu
index bcaa965..6dde6e8 100644
--- a/scrypt/blake.cu
+++ b/scrypt/blake.cu
@@ -414,11 +414,12 @@ __global__ void cuda_blake256_hash( uint64_t *g_out, uint32_t nonce, uint32_t *g
 	}
 }
 
-static bool init[MAX_GPUS] = { 0 };
 static std::map<int, uint32_t *> context_good[2];
 
 bool default_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8])
 {
+	static bool init[MAX_GPUS] = { 0 };
+
 	if (!init[thr_id])
 	{
 		// allocate pinned host memory for good hashes
diff --git a/scrypt/keccak.cu b/scrypt/keccak.cu
index aa23e50..018a969 100644
--- a/scrypt/keccak.cu
+++ b/scrypt/keccak.cu
@@ -459,10 +459,10 @@ void cuda_post_keccak512(uint32_t *g_odata, uint32_t *g_hash, uint32_t nonce)
 // callable host code to initialize constants and to call kernels
 //
 
-static bool init[MAX_GPUS] = { 0 };
-
 extern "C" void prepare_keccak512(int thr_id, const uint32_t host_pdata[20])
 {
+	static bool init[MAX_GPUS] = { 0 };
+
 	if (!init[thr_id]) {
 		checkCudaErrors(cudaMemcpyToSymbol(c_keccak_round_constants, host_keccak_round_constants, sizeof(host_keccak_round_constants), 0, cudaMemcpyHostToDevice));
@@ -796,10 +796,10 @@ void crypto_hash(uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool validat
 
 static std::map<int, uint32_t *> context_good[2];
 
-// ... keccak???
 bool default_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8])
 {
-	static bool init[MAX_DEVICES] = {false};
+	static bool init[MAX_GPUS] = { 0 };
+
 	if (!init[thr_id])
 	{
 		checkCudaErrors(cudaMemcpyToSymbol(KeccakF_RoundConstants, host_KeccakF_RoundConstants, sizeof(host_KeccakF_RoundConstants), 0, cudaMemcpyHostToDevice));
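Note: blake.cu and keccak.cu now share one lazy-initialization shape: the per-GPU "already prepared" flags are function-local statics sized by the common MAX_GPUS. Reduced to its core (prepare_once and the MAX_GPUS value are illustrative):

    #define MAX_GPUS 16  // assumption: the real value comes from miner.h

    void prepare_once(int thr_id)
    {
        static bool init[MAX_GPUS] = { 0 };  // zero-initialized, one slot per GPU thread
        if (!init[thr_id]) {
            // one-time cudaMemcpyToSymbol() uploads for this GPU go here
            init[thr_id] = true;
        }
    }

Moving the flag array inside the function removes a file-scope global and keeps each prepare_*() responsible for its own state.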
diff --git a/scrypt/kepler_kernel.cu b/scrypt/kepler_kernel.cu
index 45b94ee..41e3256 100644
--- a/scrypt/kepler_kernel.cu
+++ b/scrypt/kepler_kernel.cu
@@ -661,43 +661,43 @@ KeplerKernel::KeplerKernel() : KernelInterface()
 
 bool KeplerKernel::bindtexture_1D(uint32_t *d_V, size_t size)
 {
-    cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc<uint4>();
-    texRef1D_4_V.normalized = 0;
-    texRef1D_4_V.filterMode = cudaFilterModePoint;
-    texRef1D_4_V.addressMode[0] = cudaAddressModeClamp;
-    checkCudaErrors(cudaBindTexture(NULL, &texRef1D_4_V, d_V, &channelDesc4, size));
-    return true;
+	cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc<uint4>();
+	texRef1D_4_V.normalized = 0;
+	texRef1D_4_V.filterMode = cudaFilterModePoint;
+	texRef1D_4_V.addressMode[0] = cudaAddressModeClamp;
+	checkCudaErrors(cudaBindTexture(NULL, &texRef1D_4_V, d_V, &channelDesc4, size));
+	return true;
 }
 
 bool KeplerKernel::bindtexture_2D(uint32_t *d_V, int width, int height, size_t pitch)
 {
-    cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc<uint4>();
-    texRef2D_4_V.normalized = 0;
-    texRef2D_4_V.filterMode = cudaFilterModePoint;
-    texRef2D_4_V.addressMode[0] = cudaAddressModeClamp;
-    texRef2D_4_V.addressMode[1] = cudaAddressModeClamp;
-    // maintain texture width of TEXWIDTH (max. limit is 65000)
-    while (width > TEXWIDTH) { width /= 2; height *= 2; pitch /= 2; }
-    while (width < TEXWIDTH) { width *= 2; height = (height+1)/2; pitch *= 2; }
-    checkCudaErrors(cudaBindTexture2D(NULL, &texRef2D_4_V, d_V, &channelDesc4, width, height, pitch));
-    return true;
+	cudaChannelFormatDesc channelDesc4 = cudaCreateChannelDesc<uint4>();
+	texRef2D_4_V.normalized = 0;
+	texRef2D_4_V.filterMode = cudaFilterModePoint;
+	texRef2D_4_V.addressMode[0] = cudaAddressModeClamp;
+	texRef2D_4_V.addressMode[1] = cudaAddressModeClamp;
+	// maintain texture width of TEXWIDTH (max. limit is 65000)
+	while (width > TEXWIDTH) { width /= 2; height *= 2; pitch /= 2; }
+	while (width < TEXWIDTH) { width *= 2; height = (height+1)/2; pitch *= 2; }
+	checkCudaErrors(cudaBindTexture2D(NULL, &texRef2D_4_V, d_V, &channelDesc4, width, height, pitch));
+	return true;
 }
 
 bool KeplerKernel::unbindtexture_1D()
 {
-    checkCudaErrors(cudaUnbindTexture(texRef1D_4_V));
-    return true;
+	checkCudaErrors(cudaUnbindTexture(texRef1D_4_V));
+	return true;
 }
 
 bool KeplerKernel::unbindtexture_2D()
 {
-    checkCudaErrors(cudaUnbindTexture(texRef2D_4_V));
-    return true;
+	checkCudaErrors(cudaUnbindTexture(texRef2D_4_V));
+	return true;
 }
 
 void KeplerKernel::set_scratchbuf_constants(int MAXWARPS, uint32_t** h_V)
 {
-    checkCudaErrors(cudaMemcpyToSymbol(c_V, h_V, MAXWARPS*sizeof(uint32_t*), 0, cudaMemcpyHostToDevice));
+	checkCudaErrors(cudaMemcpyToSymbol(c_V, h_V, MAXWARPS*sizeof(uint32_t*), 0, cudaMemcpyHostToDevice));
 }
 
 bool KeplerKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_id, cudaStream_t stream,
@@ -706,21 +707,22 @@ bool KeplerKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int
 	bool success = true;
 
 	// make some constants available to kernel, update only initially and when changing
-	static int prev_N[MAX_DEVICES] = {0};
+	static uint32_t prev_N[MAX_GPUS] = { 0 };
+
 	if (N != prev_N[thr_id]) {
-	    uint32_t h_N = N;
-	    uint32_t h_N_1 = N-1;
-	    uint32_t h_SCRATCH = SCRATCH;
-	    uint32_t h_SCRATCH_WU_PER_WARP = (SCRATCH * WU_PER_WARP);
-	    uint32_t h_SCRATCH_WU_PER_WARP_1 = (SCRATCH * WU_PER_WARP) - 1;
-
-	    cudaMemcpyToSymbolAsync(c_N, &h_N, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
-	    cudaMemcpyToSymbolAsync(c_N_1, &h_N_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
-	    cudaMemcpyToSymbolAsync(c_SCRATCH, &h_SCRATCH, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
-	    cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP, &h_SCRATCH_WU_PER_WARP, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
-	    cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP_1, &h_SCRATCH_WU_PER_WARP_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
-
-	    prev_N[thr_id] = N;
+		uint32_t h_N = N;
+		uint32_t h_N_1 = N-1;
+		uint32_t h_SCRATCH = SCRATCH;
+		uint32_t h_SCRATCH_WU_PER_WARP = (SCRATCH * WU_PER_WARP);
+		uint32_t h_SCRATCH_WU_PER_WARP_1 = (SCRATCH * WU_PER_WARP) - 1;
+
+		cudaMemcpyToSymbolAsync(c_N, &h_N, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
+		cudaMemcpyToSymbolAsync(c_N_1, &h_N_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
+		cudaMemcpyToSymbolAsync(c_SCRATCH, &h_SCRATCH, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
+		cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP, &h_SCRATCH_WU_PER_WARP, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
+		cudaMemcpyToSymbolAsync(c_SCRATCH_WU_PER_WARP_1, &h_SCRATCH_WU_PER_WARP_1, sizeof(uint32_t), 0, cudaMemcpyHostToDevice, stream);
+
+		prev_N[thr_id] = N;
 	}
 
 	// First phase: Sequential writes to scratchpad.
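Note: the two while loops in bindtexture_2D reshape (width, height) until width equals TEXWIDTH, halving or doubling both dimensions in lockstep so the addressable cell count is preserved (pitch tracks width). A standalone sketch of the arithmetic:

    #include <cstdio>

    static void reshape(int &width, int &height, size_t &pitch, const int TEXWIDTH)
    {
        while (width > TEXWIDTH) { width /= 2; height *= 2; pitch /= 2; }
        while (width < TEXWIDTH) { width *= 2; height = (height + 1) / 2; pitch *= 2; }
    }

    int main()
    {
        int w = 4096, h = 8;
        size_t p = w * sizeof(unsigned int);
        reshape(w, h, p, 1024);
        printf("%d x %d, pitch %zu\n", w, h, p);  // prints: 1024 x 32, pitch 4096
        return 0;
    }

The rounding in (height + 1) / 2 can only grow the total, never shrink it, so no scratchpad cell is lost in the reshape.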
@@ -732,14 +733,14 @@ bool KeplerKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int
 	unsigned int pos = 0;
 
 	do {
-	    if (LOOKUP_GAP == 1) {
-	        if (IS_SCRYPT())      kepler_scrypt_core_kernelA<A_SCRYPT> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
-	        if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelA<A_SCRYPT_JANE> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
-	    } else {
-	        if (IS_SCRYPT())      kepler_scrypt_core_kernelA_LG<A_SCRYPT> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
-	        if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelA_LG<A_SCRYPT_JANE> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
-	    }
-	    pos += batch;
+		if (LOOKUP_GAP == 1) {
+			if (IS_SCRYPT())      kepler_scrypt_core_kernelA<A_SCRYPT> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
+			if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelA<A_SCRYPT_JANE> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N));
+		} else {
+			if (IS_SCRYPT())      kepler_scrypt_core_kernelA_LG<A_SCRYPT> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
+			if (IS_SCRYPT_JANE()) kepler_scrypt_core_kernelA_LG<A_SCRYPT_JANE> <<< grid, threads, 0, stream >>>(d_idata, pos, min(pos+batch, N), LOOKUP_GAP);
+		}
+		pos += batch;
 	} while (pos < N);
 
 	// Second phase: Random read access from scratchpad.
diff --git a/scrypt/nv_kernel.cu b/scrypt/nv_kernel.cu
index 28a2708..b84a557 100644
--- a/scrypt/nv_kernel.cu
+++ b/scrypt/nv_kernel.cu
@@ -97,7 +97,8 @@ bool NVKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr_
 	bool success = true;
 
 	// make some constants available to kernel, update only initially and when changing
-	static int prev_N[MAX_DEVICES] = {0};
+	static uint32_t prev_N[MAX_GPUS] = { 0 };
+
 	if (N != prev_N[thr_id]) {
 		uint32_t h_N = N;
 		uint32_t h_N_1 = N-1;
@@ -1025,7 +1026,8 @@ static std::map<int, uint32_t *> context_good[2];
 
 bool NVKernel::prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8])
 {
-	static bool init[MAX_DEVICES] = {false};
+	static bool init[MAX_GPUS] = { 0 };
+
 	if (!init[thr_id])
 	{
 		checkCudaErrors(cudaMemcpyToSymbol(KeccakF_RoundConstants, host_KeccakF_RoundConstants, sizeof(host_KeccakF_RoundConstants), 0, cudaMemcpyHostToDevice));
@@ -1452,7 +1454,8 @@ void kepler_blake256_hash( uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bo
 
 bool NVKernel::prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8])
 {
-	static bool init[MAX_DEVICES] = {false};
+	static bool init[MAX_GPUS] = { 0 };
+
 	if (!init[thr_id])
 	{
 		// allocate pinned host memory for good hashes
diff --git a/scrypt/nv_kernel2.cu b/scrypt/nv_kernel2.cu
index cc01843..f832dbf 100644
--- a/scrypt/nv_kernel2.cu
+++ b/scrypt/nv_kernel2.cu
@@ -56,7 +56,8 @@ bool NV2Kernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int thr
 	bool success = true;
 
 	// make some constants available to kernel, update only initially and when changing
-	static int prev_N[MAX_DEVICES] = {0};
+	static uint32_t prev_N[MAX_GPUS] = { 0 };
+
 	if (N != prev_N[thr_id]) {
 		uint32_t h_N = N;
 		uint32_t h_N_1 = N-1;
@@ -1264,7 +1265,8 @@ static std::map<int, uint32_t *> context_good[2];
 
 bool NV2Kernel::prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8])
 {
-	static bool init[MAX_DEVICES] = {false};
+	static bool init[MAX_GPUS] = { 0 };
+
 	if (!init[thr_id])
 	{
 		checkCudaErrors(cudaMemcpyToSymbol(KeccakF_RoundConstants, host_KeccakF_RoundConstants, sizeof(host_KeccakF_RoundConstants), 0, cudaMemcpyHostToDevice));
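Note: the phase-1 loop above fills the scratchpad in slices of `batch` sequential iterations per launch, keeping individual kernel launches short (this is what the cudaminer-style -b batch size tunes). Its control flow, minus the per-algorithm template dispatch (fill_kernel stands in for kepler_scrypt_core_kernelA / kepler_scrypt_core_kernelA_LG):

    unsigned int pos = 0;
    do {
        unsigned int end = (pos + batch < N) ? pos + batch : N;  // same as min(pos+batch, N)
        fill_kernel<<< grid, threads, 0, stream >>>(d_idata, pos, end);
        pos += batch;
    } while (pos < N);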
@@ -1687,7 +1689,8 @@ __global__ void titan_blake256_hash( uint64_t *g_out, uint32_t nonce, uint32_t *
 
 bool NV2Kernel::prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8])
 {
-	static bool init[MAX_DEVICES] = {false};
+	static bool init[MAX_GPUS] = { 0 };
+
 	if (!init[thr_id])
 	{
 		// allocate pinned host memory for good hashes
diff --git a/scrypt/salsa_kernel.h b/scrypt/salsa_kernel.h
index 11011e4..ccb60ab 100644
--- a/scrypt/salsa_kernel.h
+++ b/scrypt/salsa_kernel.h
@@ -9,8 +9,6 @@
 
 #include "miner.h"
 
-#define MAX_DEVICES MAX_GPUS
-
 // from ccminer.cpp
 extern short device_map[MAX_GPUS];
 extern int device_batchsize[MAX_GPUS]; // cudaminer -b
diff --git a/scrypt/sha256.cu b/scrypt/sha256.cu
index 5b4c808..7d4c2e5 100644
--- a/scrypt/sha256.cu
+++ b/scrypt/sha256.cu
@@ -409,7 +409,8 @@ __global__ void cuda_post_sha256(uint32_t g_output[8], uint32_t g_tstate_ext[8],
 
 void prepare_sha256(int thr_id, uint32_t host_pdata[20], uint32_t host_midstate[8])
 {
-	static bool init[8] = {false, false, false, false, false, false, false, false};
+	static bool init[MAX_GPUS] = { 0 };
+
 	if (!init[thr_id]) {
 		checkCudaErrors(cudaMemcpyToSymbol(sha256_h, host_sha256_h, sizeof(host_sha256_h), 0, cudaMemcpyHostToDevice));
diff --git a/scrypt/test_kernel.cu b/scrypt/test_kernel.cu
index f7552d3..f3d2df0 100644
--- a/scrypt/test_kernel.cu
+++ b/scrypt/test_kernel.cu
@@ -710,7 +710,8 @@ bool TestKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int th
 	size_t shared = ((threads.x + 31) / 32) * (32+1) * sizeof(uint32_t);
 
 	// make some constants available to kernel, update only initially and when changing
-	static int prev_N[MAX_DEVICES] = {0};
+	static uint32_t prev_N[MAX_GPUS] = { 0 };
+
 	if (N != prev_N[thr_id]) {
 		uint32_t h_N = N;
 		uint32_t h_N_1 = N-1;
diff --git a/scrypt/titan_kernel.cu b/scrypt/titan_kernel.cu
index fe12ea6..197aabd 100644
--- a/scrypt/titan_kernel.cu
+++ b/scrypt/titan_kernel.cu
@@ -678,7 +678,8 @@ bool TitanKernel::run_kernel(dim3 grid, dim3 threads, int WARPS_PER_BLOCK, int t
 	bool success = true;
 
 	// make some constants available to kernel, update only initially and when changing
-	static int prev_N[MAX_DEVICES] = {0};
+	static uint32_t prev_N[MAX_GPUS] = { 0 };
+
 	if (N != prev_N[thr_id]) {
 		uint32_t h_N = N;
 		uint32_t h_N_1 = N-1;
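Note: with MAX_DEVICES gone from salsa_kernel.h, every run_kernel() variant now uses the same memoization: a per-GPU uint32_t prev_N[MAX_GPUS] matching the new width of N, so the __constant__ parameters are re-uploaded only when N actually changes. The shared shape, reduced to a single symbol (upload_params and the lone c_N are illustrative):

    __constant__ uint32_t c_N;

    static uint32_t prev_N[MAX_GPUS] = { 0 };

    static void upload_params(int thr_id, uint32_t N, cudaStream_t stream)
    {
        if (N != prev_N[thr_id]) {  // skip the copies when nothing changed
            uint32_t h_N = N;
            cudaMemcpyToSymbolAsync(c_N, &h_N, sizeof(uint32_t), 0,
                                    cudaMemcpyHostToDevice, stream);
            prev_N[thr_id] = N;
        }
    }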