From 5a69056ee5a566a1004357011f1defdd9bb11136 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 13 Mar 2016 19:12:20 +0100 Subject: [PATCH] blake2s cleanup --- Algo256/blake2s.cu | 80 ++++++++++++++++++++-------------------------- README.txt | 5 ++- ccminer.cpp | 5 ++- res/ccminer.rc | 8 ++--- 4 files changed, 45 insertions(+), 53 deletions(-) diff --git a/Algo256/blake2s.cu b/Algo256/blake2s.cu index 4851abb..ca0f8f2 100644 --- a/Algo256/blake2s.cu +++ b/Algo256/blake2s.cu @@ -1,3 +1,7 @@ +/** + * Blake2-S 256 CUDA implementation + * @author tpruvot@github March 2016 + */ #include #include #include @@ -5,19 +9,18 @@ #include "miner.h" -#define NATIVE_LITTLE_ENDIAN - extern "C" { +#define NATIVE_LITTLE_ENDIAN #include } -static __thread blake2s_state ALIGN(64) s_midstate; -static __thread blake2s_state ALIGN(64) s_ctx; - //#define GPU_MIDSTATE #define MIDLEN 76 #define A 64 +static __thread blake2s_state ALIGN(A) s_midstate; +static __thread blake2s_state ALIGN(A) s_ctx; + #include "cuda_helper.h" #ifdef __INTELLISENSE__ @@ -63,7 +66,7 @@ inline void blake2s_hash_end(uint32_t *output, const uint32_t *input) } __host__ -void blake2s_cpu_setBlock(uint32_t *penddata, blake2s_state *pstate) +void blake2s_setBlock(uint32_t *penddata, blake2s_state *pstate) { #ifndef GPU_MIDSTATE CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_data, penddata, 80, 0, cudaMemcpyHostToDevice)); @@ -73,31 +76,18 @@ void blake2s_cpu_setBlock(uint32_t *penddata, blake2s_state *pstate) } __device__ __forceinline__ -uint32_t gpu_load32(const void *src) { - return *(uint32_t *)(src); +uint64_t gpu_load64(void *src) { + return *(uint64_t*)(src); } __device__ __forceinline__ void gpu_store32(void *dst, uint32_t dw) { - *(uint32_t *)(dst) = dw; + *(uint32_t*)(dst) = dw; } __device__ __forceinline__ void gpu_store64(void *dst, uint64_t lw) { - *(uint64_t *)(dst) = lw; -} - -__device__ __forceinline__ -uint64_t gpu_load48(const void *src) -{ - const uint8_t *p = (const uint8_t *)src; - uint64_t w = *p++; - w |= (uint64_t)(*p++) << 8; - w |= (uint64_t)(*p++) << 16; - w |= (uint64_t)(*p++) << 24; - w |= (uint64_t)(*p++) << 32; - w |= (uint64_t)(*p++) << 40; - return w; + *(uint64_t*)(dst) = lw; } __device__ __forceinline__ @@ -202,10 +192,11 @@ void gpu_blake2s_compress(blake2s_state *S, const uint32_t *block) for(int i = 0; i < 8; i++) S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; -#undef G -#undef ROUND + #undef G + #undef ROUND } +/* unused but kept as reference */ __device__ __forceinline__ void gpu_blake2s_update(blake2s_state *S, const uint8_t *in, uint64_t inlen) { @@ -236,10 +227,10 @@ void gpu_blake2s_update(blake2s_state *S, const uint8_t *in, uint64_t inlen) } __device__ __forceinline__ -void gpu_blake2s_update76(blake2s_state *S, const void *in) +void gpu_blake2s_update76(blake2s_state *S, const void *input) { uint64_t *b64 = (uint64_t*) S->buf; - uint64_t *i64 = (uint64_t*) in; + uint64_t *i64 = (uint64_t*) input; #pragma unroll for (int i=0; i < 80/8; i++) b64[i] = i64[i]; @@ -295,14 +286,14 @@ void gpu_blake2s_init_param(blake2s_state *S, const blake2s_param *P) S->buflen = 0; #pragma unroll - for (int i = 0; i < sizeof(S->buf)/4; i++) - gpu_store32(S->buf + (4*i), 0); + for (int i = 0; i < sizeof(S->buf)/8; i++) + gpu_store64(S->buf + (8*i), 0); - uint32_t *p = (uint32_t*) P; + uint64_t *p = (uint64_t*) P; /* IV XOR ParamBlock */ - for (int i = 0; i < 8; i++) - S->h[i] ^= gpu_load32(&p[i]); + for (int i = 0; i < 4; i++) + S->h[i] ^= gpu_load64(&p[i]); } // Sequential blake2s initialization @@ -373,7 +364,7 @@ void blake2s_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_ } __host__ -uint32_t blake2s_host_hash(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2) +uint32_t blake2s_hash_cuda(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2) { uint32_t result = UINT32_MAX; @@ -406,14 +397,15 @@ extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonc const uint32_t first_nonce = pdata[19]; int dev_id = device_map[thr_id]; - int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 26 : 22; + int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 28 : 25; if (device_sm[dev_id] < 350) intensity = 22; uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) { - ptarget[7] = swab32(0xff); + ptarget[6] = swab32(0xFFFF0); + ptarget[7] = 0; } if (!init[thr_id]) @@ -442,13 +434,14 @@ extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonc blake2s_update(&s_midstate, (uint8_t*) endiandata, MIDLEN); memcpy(&s_ctx, &s_midstate, sizeof(blake2s_state)); - blake2s_cpu_setBlock(endiandata, &s_midstate); + blake2s_setBlock(endiandata, &s_midstate); - uint2 gpu_target = make_uint2(ptarget[7], ptarget[6]); - const uint32_t Htarg = ptarget[7]; + const uint2 target = make_uint2(ptarget[7], ptarget[6]); do { - uint32_t foundNonce = blake2s_host_hash(thr_id, throughput, pdata[19], gpu_target); + uint32_t foundNonce = blake2s_hash_cuda(thr_id, throughput, pdata[19], target); + + *hashes_done = pdata[19] - first_nonce + throughput; if (foundNonce != UINT32_MAX) { @@ -458,15 +451,14 @@ extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonc le32enc(&endiandata[19], foundNonce); blake2s_hash_end(vhashcpu, endiandata); - if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) { + if (vhashcpu[7] <= target.x && fulltest(vhashcpu, ptarget)) { work_set_target_ratio(work, vhashcpu); - *hashes_done = pdata[19] + throughput - first_nonce + 1; pdata[19] = work->nonces[0] = swab32(foundNonce); #if NBN > 1 if (extra_results[0] != UINT32_MAX) { le32enc(&endiandata[19], extra_results[0]); blake2s_hash_end(vhashcpu, endiandata); - if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) { + if (vhashcpu[7] <= target.x && fulltest(vhashcpu, ptarget)) { work->nonces[1] = swab32(extra_results[0]); if (bn_hash_target_ratio(vhashcpu, ptarget) > work->shareratio) { work_set_target_ratio(work, vhashcpu); @@ -490,9 +482,7 @@ extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonc } while (!work_restart[thr_id].restart && max_nonce > (uint64_t)throughput + pdata[19]); - *hashes_done = pdata[19] - first_nonce + 1; - - MyStreamSynchronize(NULL, 0, device_map[thr_id]); + *hashes_done = pdata[19] - first_nonce; return 0; } diff --git a/README.txt b/README.txt index a30d6c9..22faef5 100644 --- a/README.txt +++ b/README.txt @@ -238,9 +238,8 @@ features. >>> RELEASE HISTORY <<< - Mar. 12th 2015 v1.7.5 - Blake2S Algo - ... + Mar. 13th 2015 v1.7.5 + Blake2S Algo (NEVA/OXEN) Feb. 28th 2015 v1.7.4 (1.7.3 was a preview, not official) Decred simplified stratum (getwork over stratum) diff --git a/ccminer.cpp b/ccminer.cpp index fd6c6a0..7344c40 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -1713,6 +1713,9 @@ static void *miner_thread(void *userdata) // and make an unique work (extradata) nonceptr[1] += 1; nonceptr[2] |= thr_id; + } else if (opt_benchmark) { + // randomize work + nonceptr[-1] += 1; } pthread_mutex_unlock(&g_work_lock); @@ -1826,11 +1829,11 @@ static void *miner_thread(void *userdata) if (max64 < minmax) { switch (opt_algo) { case ALGO_BLAKECOIN: + case ALGO_BLAKE2S: case ALGO_VANILLA: minmax = 0x80000000U; break; case ALGO_BLAKE: - case ALGO_BLAKE2S: case ALGO_BMW: case ALGO_DECRED: //case ALGO_WHIRLPOOLX: diff --git a/res/ccminer.rc b/res/ccminer.rc index 0a6153b..c5514c8 100644 --- a/res/ccminer.rc +++ b/res/ccminer.rc @@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico" // VS_VERSION_INFO VERSIONINFO - FILEVERSION 1,7,4,0 - PRODUCTVERSION 1,7,4,0 + FILEVERSION 1,7,5,0 + PRODUCTVERSION 1,7,5,0 FILEFLAGSMASK 0x3fL #ifdef _DEBUG FILEFLAGS 0x21L @@ -76,10 +76,10 @@ BEGIN BEGIN BLOCK "040904e4" BEGIN - VALUE "FileVersion", "1.7.4" + VALUE "FileVersion", "1.7.5" VALUE "LegalCopyright", "Copyright (C) 2016" VALUE "ProductName", "ccminer" - VALUE "ProductVersion", "1.7.4" + VALUE "ProductVersion", "1.7.5" END END BLOCK "VarFileInfo"