From 7ffe65c262645b9542cd11ec5d3f36eec8befa49 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Wed, 9 Mar 2016 22:58:47 +0000 Subject: [PATCH] blake2s algo Signed-off-by: Tanguy Pruvot --- Algo256/blake2s.cu | 514 ++++++++++++++++++++++++++++++++++++++++ Makefile.am | 1 + README.txt | 7 +- algos.h | 2 + bench.cpp | 1 + ccminer.cpp | 8 +- ccminer.vcxproj | 2 + ccminer.vcxproj.filters | 6 + configure.ac | 2 +- cpuminer-config.h | 6 +- miner.h | 3 + sph/blake2s.c | 387 ++++++++++++++++++++++++++++++ sph/blake2s.h | 150 ++++++++++++ util.cpp | 3 + 14 files changed, 1086 insertions(+), 6 deletions(-) create mode 100644 Algo256/blake2s.cu create mode 100644 sph/blake2s.c create mode 100644 sph/blake2s.h diff --git a/Algo256/blake2s.cu b/Algo256/blake2s.cu new file mode 100644 index 0000000..4851abb --- /dev/null +++ b/Algo256/blake2s.cu @@ -0,0 +1,514 @@ +#include +#include +#include +#include + +#include "miner.h" + +#define NATIVE_LITTLE_ENDIAN + +extern "C" { +#include +} + +static __thread blake2s_state ALIGN(64) s_midstate; +static __thread blake2s_state ALIGN(64) s_ctx; + +//#define GPU_MIDSTATE +#define MIDLEN 76 +#define A 64 + +#include "cuda_helper.h" + +#ifdef __INTELLISENSE__ +#define __byte_perm(x, y, b) x +#endif + +#ifndef GPU_MIDSTATE +__constant__ uint32_t d_data[20]; +#else +__constant__ blake2s_state ALIGN(8) d_state[1]; +#endif + +/* 16 adapters max */ +static uint32_t *d_resNonce[MAX_GPUS]; +static uint32_t *h_resNonce[MAX_GPUS]; + +/* threads per block */ +#define TPB 512 + +/* max count of found nonces in one call */ +#define NBN 2 +static uint32_t extra_results[NBN] = { UINT32_MAX }; + +extern "C" void blake2s_hash(void *output, const void *input) +{ + uint8_t _ALIGN(A) hash[BLAKE2S_OUTBYTES]; + blake2s_state blake2_ctx; + + blake2s_init(&blake2_ctx, BLAKE2S_OUTBYTES); + blake2s_update(&blake2_ctx, (uint8_t*) input, 80); + blake2s_final(&blake2_ctx, hash, BLAKE2S_OUTBYTES); + + memcpy(output, hash, 32); +} + +__host__ +inline void blake2s_hash_end(uint32_t *output, const uint32_t *input) +{ + s_ctx.buflen = MIDLEN; + memcpy(&s_ctx, &s_midstate, 32 + 16 + MIDLEN); + blake2s_update(&s_ctx, (uint8_t*) &input[MIDLEN/4], 80-MIDLEN); + blake2s_final(&s_ctx, (uint8_t*) output, BLAKE2S_OUTBYTES); +} + +__host__ +void blake2s_cpu_setBlock(uint32_t *penddata, blake2s_state *pstate) +{ +#ifndef GPU_MIDSTATE + CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_data, penddata, 80, 0, cudaMemcpyHostToDevice)); +#else + CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_state, pstate, sizeof(blake2s_state), 0, cudaMemcpyHostToDevice)); +#endif +} + +__device__ __forceinline__ +uint32_t gpu_load32(const void *src) { + return *(uint32_t *)(src); +} + +__device__ __forceinline__ +void gpu_store32(void *dst, uint32_t dw) { + *(uint32_t *)(dst) = dw; +} + +__device__ __forceinline__ +void gpu_store64(void *dst, uint64_t lw) { + *(uint64_t *)(dst) = lw; +} + +__device__ __forceinline__ +uint64_t gpu_load48(const void *src) +{ + const uint8_t *p = (const uint8_t *)src; + uint64_t w = *p++; + w |= (uint64_t)(*p++) << 8; + w |= (uint64_t)(*p++) << 16; + w |= (uint64_t)(*p++) << 24; + w |= (uint64_t)(*p++) << 32; + w |= (uint64_t)(*p++) << 40; + return w; +} + +__device__ __forceinline__ +void gpu_blake2s_set_lastnode(blake2s_state *S) { + S->f[1] = ~0U; +} + +__device__ __forceinline__ +void gpu_blake2s_clear_lastnode(blake2s_state *S) { + S->f[1] = 0U; +} + +__device__ __forceinline__ +void gpu_blake2s_increment_counter(blake2s_state *S, const uint32_t inc) +{ + S->t[0] += inc; + S->t[1] += ( S->t[0] < inc ); +} + +__device__ __forceinline__ +void gpu_blake2s_set_lastblock(blake2s_state *S) +{ + if (S->last_node) gpu_blake2s_set_lastnode(S); + S->f[0] = ~0U; +} + +__device__ +void gpu_blake2s_compress(blake2s_state *S, const uint32_t *block) +{ + uint32_t m[16]; + uint32_t v[16]; + + const uint32_t blake2s_IV[8] = { + 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, + 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL + }; + + const uint8_t blake2s_sigma[10][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 }, + }; + + #pragma unroll + for(int i = 0; i < 16; i++) + m[i] = block[i]; + + #pragma unroll + for(int i = 0; i < 8; i++) + v[i] = S->h[i]; + + v[ 8] = blake2s_IV[0]; + v[ 9] = blake2s_IV[1]; + v[10] = blake2s_IV[2]; + v[11] = blake2s_IV[3]; + v[12] = S->t[0] ^ blake2s_IV[4]; + v[13] = S->t[1] ^ blake2s_IV[5]; + v[14] = S->f[0] ^ blake2s_IV[6]; + v[15] = S->f[1] ^ blake2s_IV[7]; + + #define G(r,i,a,b,c,d) { \ + a += b + m[blake2s_sigma[r][2*i+0]]; \ + d = __byte_perm(d ^ a, 0, 0x1032); /* d = ROTR32(d ^ a, 16); */ \ + c = c + d; \ + b = ROTR32(b ^ c, 12); \ + a += b + m[blake2s_sigma[r][2*i+1]]; \ + d = __byte_perm(d ^ a, 0, 0x0321); /* ROTR32(d ^ a, 8); */ \ + c = c + d; \ + b = ROTR32(b ^ c, 7); \ + } + + #define ROUND(r) { \ + G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ + G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ + G(r,2,v[ 2],v[ 6],v[10],v[14]); \ + G(r,3,v[ 3],v[ 7],v[11],v[15]); \ + G(r,4,v[ 0],v[ 5],v[10],v[15]); \ + G(r,5,v[ 1],v[ 6],v[11],v[12]); \ + G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ + G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ + } + + ROUND( 0 ); + ROUND( 1 ); + ROUND( 2 ); + ROUND( 3 ); + ROUND( 4 ); + ROUND( 5 ); + ROUND( 6 ); + ROUND( 7 ); + ROUND( 8 ); + ROUND( 9 ); + + #pragma unroll + for(int i = 0; i < 8; i++) + S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; + +#undef G +#undef ROUND +} + +__device__ __forceinline__ +void gpu_blake2s_update(blake2s_state *S, const uint8_t *in, uint64_t inlen) +{ + while(inlen > 0) + { + const int left = S->buflen; + size_t fill = 2 * BLAKE2S_BLOCKBYTES - left; + if(inlen > fill) + { + memcpy(S->buf + left, in, fill); // Fill buffer + S->buflen += fill; + + gpu_blake2s_increment_counter(S, BLAKE2S_BLOCKBYTES); + gpu_blake2s_compress(S, (uint32_t*) S->buf); // Compress + memcpy(S->buf, S->buf + BLAKE2S_BLOCKBYTES, BLAKE2S_BLOCKBYTES); // Shift buffer left + S->buflen -= BLAKE2S_BLOCKBYTES; + in += fill; + inlen -= fill; + } + else // inlen <= fill + { + memcpy(S->buf + left, in, (size_t) inlen); + S->buflen += (size_t) inlen; // Be lazy, do not compress + in += inlen; + inlen -= inlen; + } + } +} + +__device__ __forceinline__ +void gpu_blake2s_update76(blake2s_state *S, const void *in) +{ + uint64_t *b64 = (uint64_t*) S->buf; + uint64_t *i64 = (uint64_t*) in; + #pragma unroll + for (int i=0; i < 80/8; i++) + b64[i] = i64[i]; + //S->buflen = 76; +} + +__device__ __forceinline__ +void gpu_blake2s_update_nonce(blake2s_state *S, const uint32_t nonce) +{ + gpu_store32(&S->buf[76], nonce); + S->buflen = 80; +} + +__device__ __forceinline__ +void gpu_blake2s_final(blake2s_state *S, uint32_t *out) +{ + //if (S->buflen > BLAKE2S_BLOCKBYTES) + { + gpu_blake2s_increment_counter(S, BLAKE2S_BLOCKBYTES); + gpu_blake2s_compress(S, (uint32_t*) S->buf); + S->buflen -= BLAKE2S_BLOCKBYTES; + //memcpy(S->buf, S->buf + BLAKE2S_BLOCKBYTES, S->buflen); + } + + gpu_blake2s_increment_counter(S, (uint32_t)S->buflen); + gpu_blake2s_set_lastblock(S); + //memset(&S->buf[S->buflen], 0, 2 * BLAKE2S_BLOCKBYTES - S->buflen); /* Padding */ + gpu_blake2s_compress(S, (uint32_t*) (S->buf + BLAKE2S_BLOCKBYTES)); + + #pragma unroll + for (int i = 0; i < 8; i++) + out[i] = S->h[i]; +} + +/* init2 xors IV with input parameter block */ +__device__ __forceinline__ +void gpu_blake2s_init_param(blake2s_state *S, const blake2s_param *P) +{ + //blake2s_IV + S->h[0] = 0x6A09E667UL; + S->h[1] = 0xBB67AE85UL; + S->h[2] = 0x3C6EF372UL; + S->h[3] = 0xA54FF53AUL; + S->h[4] = 0x510E527FUL; + S->h[5] = 0x9B05688CUL; + S->h[6] = 0x1F83D9ABUL; + S->h[7] = 0x5BE0CD19UL; + + S->t[0] = 0; S->t[1] = 0; + S->f[0] = 0; S->f[1] = 0; + S->last_node = 0; + + S->buflen = 0; + + #pragma unroll + for (int i = 0; i < sizeof(S->buf)/4; i++) + gpu_store32(S->buf + (4*i), 0); + + uint32_t *p = (uint32_t*) P; + + /* IV XOR ParamBlock */ + for (int i = 0; i < 8; i++) + S->h[i] ^= gpu_load32(&p[i]); +} + +// Sequential blake2s initialization +__device__ __forceinline__ +void gpu_blake2s_init(blake2s_state *S, const uint8_t outlen) +{ + blake2s_param P[1]; + + // if (!outlen || outlen > BLAKE2S_OUTBYTES) return; + + P->digest_length = outlen; + P->key_length = 0; + P->fanout = 1; + P->depth = 1; + + P->leaf_length = 0; + gpu_store64(P->node_offset, 0); + //P->node_depth = 0; + //P->inner_length = 0; + + gpu_store64(&P->salt, 0); + gpu_store64(&P->personal, 0); + + gpu_blake2s_init_param(S, P); +} + +__device__ __forceinline__ +void gpu_copystate(blake2s_state *dst, blake2s_state *src) +{ + uint64_t* d64 = (uint64_t*) dst; + uint64_t* s64 = (uint64_t*) src; + #pragma unroll + for (int i=0; i < (32 + 16 + 2 * BLAKE2S_BLOCKBYTES)/8; i++) + gpu_store64(&d64[i], s64[i]); + dst->buflen = src->buflen; + dst->last_node = src->last_node; +} + +__global__ +void blake2s_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint2 target2) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t nonce = startNonce + thread; + blake2s_state ALIGN(8) blake2_ctx; + +#ifndef GPU_MIDSTATE + gpu_blake2s_init(&blake2_ctx, BLAKE2S_OUTBYTES); + //gpu_blake2s_update(&blake2_ctx, (uint8_t*) d_data, 76); + gpu_blake2s_update76(&blake2_ctx, (uint64_t*) d_data); +#else + gpu_copystate(&blake2_ctx, &d_state[0]); +#endif + gpu_blake2s_update_nonce(&blake2_ctx, nonce); + + uint32_t hash[8]; + gpu_blake2s_final(&blake2_ctx, hash); + + if (hash[7] <= target2.x && hash[6] <= target2.y) { +#if NBN == 2 + if (resNonce[0] != UINT32_MAX) + resNonce[1] = nonce; + else + resNonce[0] = nonce; +#else + resNonce[0] = nonce; +#endif + } +} + +__host__ +uint32_t blake2s_host_hash(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2) +{ + uint32_t result = UINT32_MAX; + + dim3 grid((threads + TPB-1)/TPB); + dim3 block(TPB); + + /* Check error on Ctrl+C or kill to prevent segfaults on exit */ + if (cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess) + return result; + + blake2s_gpu_hash <<>> (threads, startNonce, d_resNonce[thr_id], target2); + cudaThreadSynchronize(); + + if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { + result = h_resNonce[thr_id][0]; + for (int n=0; n < (NBN-1); n++) + extra_results[n] = h_resNonce[thr_id][n+1]; + } + return result; +} + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + + const uint32_t first_nonce = pdata[19]; + + int dev_id = device_map[thr_id]; + int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 26 : 22; + if (device_sm[dev_id] < 350) intensity = 22; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) { + ptarget[7] = swab32(0xff); + } + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage (linux) + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_LOG_ERROR(); + } + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)), -1); + CUDA_CALL_OR_RET_X(cudaMallocHost(&h_resNonce[thr_id], NBN * sizeof(uint32_t)), -1); + init[thr_id] = true; + } + + for (int i=0; i < 19; i++) { + be32enc(&endiandata[i], pdata[i]); + } + + // midstate + memset(s_midstate.buf, 0, sizeof(s_midstate.buf)); + blake2s_init(&s_midstate, BLAKE2S_OUTBYTES); + blake2s_update(&s_midstate, (uint8_t*) endiandata, MIDLEN); + memcpy(&s_ctx, &s_midstate, sizeof(blake2s_state)); + + blake2s_cpu_setBlock(endiandata, &s_midstate); + + uint2 gpu_target = make_uint2(ptarget[7], ptarget[6]); + const uint32_t Htarg = ptarget[7]; + + do { + uint32_t foundNonce = blake2s_host_hash(thr_id, throughput, pdata[19], gpu_target); + + if (foundNonce != UINT32_MAX) + { + uint32_t _ALIGN(A) vhashcpu[8]; + + //blake2s_hash(vhashcpu, endiandata); + le32enc(&endiandata[19], foundNonce); + blake2s_hash_end(vhashcpu, endiandata); + + if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) { + work_set_target_ratio(work, vhashcpu); + *hashes_done = pdata[19] + throughput - first_nonce + 1; + pdata[19] = work->nonces[0] = swab32(foundNonce); +#if NBN > 1 + if (extra_results[0] != UINT32_MAX) { + le32enc(&endiandata[19], extra_results[0]); + blake2s_hash_end(vhashcpu, endiandata); + if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) { + work->nonces[1] = swab32(extra_results[0]); + if (bn_hash_target_ratio(vhashcpu, ptarget) > work->shareratio) { + work_set_target_ratio(work, vhashcpu); + xchg(work->nonces[1], pdata[19]); + } + return 2; + } + extra_results[0] = UINT32_MAX; + } +#endif + return 1; + } else { + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce); + applog_hex(pdata, 80); + applog_hex(ptarget, 32); + applog_hex(vhashcpu, 32); + } + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart && max_nonce > (uint64_t)throughput + pdata[19]); + + *hashes_done = pdata[19] - first_nonce + 1; + + MyStreamSynchronize(NULL, 0, device_map[thr_id]); + + return 0; +} + +// cleanup +extern "C" void free_blake2s(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaDeviceSynchronize(); + + cudaFreeHost(h_resNonce[thr_id]); + cudaFree(d_resNonce[thr_id]); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/Makefile.am b/Makefile.am index 1545619..9b80922 100644 --- a/Makefile.am +++ b/Makefile.am @@ -37,6 +37,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ Algo256/cuda_bmw256.cu Algo256/cuda_cubehash256.cu \ Algo256/cuda_blake256.cu Algo256/cuda_groestl256.cu Algo256/cuda_keccak256.cu Algo256/cuda_skein256.cu \ Algo256/blake256.cu Algo256/decred.cu Algo256/vanilla.cu Algo256/keccak256.cu \ + Algo256/blake2s.cu sph/blake2s.c \ Algo256/bmw.cu Algo256/cuda_bmw.cu \ JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \ JHA/cuda_jha_compactionTest.cu cuda_checkhash.cu \ diff --git a/README.txt b/README.txt index 995bfbc..a30d6c9 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,5 @@ -ccMiner release 1.7.4 (Feb 2015) "Decred Stratum and MrM4D VNL" +ccMiner release 1.7.5 (Mar 2015) "Blake2-S" --------------------------------------------------------------- *************************************************************** @@ -70,6 +70,7 @@ its command line interface and options. -a, --algo=ALGO specify the algorithm to use blake use to mine Saffroncoin (Blake256) blakecoin use to mine Old Blake 256 + blake2s use to mine Nevacoin (Blake2-S 256) bmw use to mine Midnight c11/flax use to mine Chaincoin and Flax decred use to mine Decred 180 bytes Blake256-14 @@ -237,6 +238,10 @@ features. >>> RELEASE HISTORY <<< + Mar. 12th 2015 v1.7.5 + Blake2S Algo + ... + Feb. 28th 2015 v1.7.4 (1.7.3 was a preview, not official) Decred simplified stratum (getwork over stratum) Vanilla kernel by MrMad diff --git a/algos.h b/algos.h index a0cefb0..6e116b2 100644 --- a/algos.h +++ b/algos.h @@ -7,6 +7,7 @@ enum sha_algos { ALGO_BLAKECOIN = 0, ALGO_BLAKE, + ALGO_BLAKE2S, ALGO_BMW, ALGO_C11, ALGO_DEEP, @@ -53,6 +54,7 @@ extern volatile enum sha_algos opt_algo; static const char *algo_names[] = { "blakecoin", "blake", + "blake2s", "bmw", "c11", "deep", diff --git a/bench.cpp b/bench.cpp index c203f27..da8eecd 100644 --- a/bench.cpp +++ b/bench.cpp @@ -45,6 +45,7 @@ void algo_free_all(int thr_id) { // only initialized algos will be freed free_blake256(thr_id); + free_blake2s(thr_id); free_bmw(thr_id); free_c11(thr_id); free_decred(thr_id); diff --git a/ccminer.cpp b/ccminer.cpp index ecdff2a..fd6c6a0 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -209,6 +209,7 @@ Usage: " PROGRAM_NAME " [OPTIONS]\n\ Options:\n\ -a, --algo=ALGO specify the hash algorithm to use\n\ blake Blake 256 (SFR)\n\ + blake2s Blake2-S 256 (NEVA)\n\ blakecoin Fast Blake 256 (8 rounds)\n\ bmw BMW 256\n\ c11/flax X11 variant\n\ @@ -803,6 +804,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work) break; case ALGO_BLAKE: case ALGO_BLAKECOIN: + case ALGO_BLAKE2S: case ALGO_BMW: case ALGO_VANILLA: // fast algos require that... @@ -1828,6 +1830,7 @@ static void *miner_thread(void *userdata) minmax = 0x80000000U; break; case ALGO_BLAKE: + case ALGO_BLAKE2S: case ALGO_BMW: case ALGO_DECRED: //case ALGO_WHIRLPOOLX: @@ -1911,6 +1914,9 @@ static void *miner_thread(void *userdata) case ALGO_BLAKE: rc = scanhash_blake256(thr_id, &work, max_nonce, &hashes_done, 14); break; + case ALGO_BLAKE2S: + rc = scanhash_blake2s(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_BMW: rc = scanhash_bmw(thr_id, &work, max_nonce, &hashes_done); break; @@ -2041,7 +2047,7 @@ static void *miner_thread(void *userdata) // todo: update all algos to use work->nonces work.nonces[0] = nonceptr[0]; - if (opt_algo != ALGO_DECRED) { + if (opt_algo != ALGO_DECRED && opt_algo != ALGO_BLAKE2S) { work.nonces[1] = nonceptr[2]; } diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 3d9b559..c072aaa 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -262,6 +262,7 @@ + @@ -408,6 +409,7 @@ --ptxas-options="-dlcm=cg" %(AdditionalOptions) true + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 7f24a1d..42c1287 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -258,6 +258,9 @@ Source Files + + Source Files\sph + @@ -706,6 +709,9 @@ Source Files\CUDA\lyra2 + + Source Files\CUDA\Algo256 + diff --git a/configure.ac b/configure.ac index 53ae463..7f2b345 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [1.7.4], [], [ccminer], [http://github.com/tpruvot/ccminer]) +AC_INIT([ccminer], [1.7.5], [], [ccminer], [http://github.com/tpruvot/ccminer]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpuminer-config.h b/cpuminer-config.h index 64783ce..fff627b 100644 --- a/cpuminer-config.h +++ b/cpuminer-config.h @@ -162,7 +162,7 @@ #define PACKAGE_NAME "ccminer" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "ccminer 1.7.4" +#define PACKAGE_STRING "ccminer 1.7.5" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "ccminer" @@ -171,7 +171,7 @@ #define PACKAGE_URL "http://github.com/tpruvot/ccminer" /* Define to the version of this package. */ -#define PACKAGE_VERSION "1.7.4" +#define PACKAGE_VERSION "1.7.5" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be @@ -185,7 +185,7 @@ #define STDC_HEADERS 1 /* Version number of package */ -#define VERSION "1.7.4" +#define VERSION "1.7.5" /* Define curl_free() as free() if our version of curl lacks curl_free. */ /* #undef curl_free */ diff --git a/miner.h b/miner.h index 2f99bf9..fc2bdba 100644 --- a/miner.h +++ b/miner.h @@ -262,6 +262,7 @@ void sha256d(unsigned char *hash, const unsigned char *data, int len); struct work; extern int scanhash_blake256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int8_t blakerounds); +extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); @@ -304,6 +305,7 @@ extern int scanhash_scrypt_jane(int thr_id, struct work *work, uint32_t max_nonc void algo_free_all(int thr_id); extern void free_blake256(int thr_id); +extern void free_blake2s(int thr_id); extern void free_bmw(int thr_id); extern void free_c11(int thr_id); extern void free_decred(int thr_id); @@ -773,6 +775,7 @@ void applog_compare_hash(void *hash, void *hash_ref); void print_hash_tests(void); void blake256hash(void *output, const void *input, int8_t rounds); +void blake2s_hash(void *output, const void *input); void bmw_hash(void *state, const void *input); void c11hash(void *output, const void *input); void decred_hash(void *state, const void *input); diff --git a/sph/blake2s.c b/sph/blake2s.c new file mode 100644 index 0000000..62b5a39 --- /dev/null +++ b/sph/blake2s.c @@ -0,0 +1,387 @@ +/** + * BLAKE2 reference source code package - reference C implementations + * + * Written in 2012 by Samuel Neves + * + * To the extent possible under law, the author(s) have dedicated all copyright + * and related and neighboring rights to this software to the public domain + * worldwide. This software is distributed without any warranty. + * + * You should have received a copy of the CC0 Public Domain Dedication along with + * this software. If not, see . + */ + +#include +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +#include "sph_types.h" +#include "blake2s.h" + +static const uint32_t blake2s_IV[8] = +{ + 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, + 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL +}; + +static const uint8_t blake2s_sigma[10][16] = +{ + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , +}; + +static inline int blake2s_set_lastnode( blake2s_state *S ) +{ + S->f[1] = ~0U; + return 0; +} + +static inline int blake2s_clear_lastnode( blake2s_state *S ) +{ + S->f[1] = 0U; + return 0; +} + +/* Some helper functions, not necessarily useful */ +static inline int blake2s_set_lastblock( blake2s_state *S ) +{ + if( S->last_node ) blake2s_set_lastnode( S ); + + S->f[0] = ~0U; + return 0; +} + +static inline int blake2s_clear_lastblock( blake2s_state *S ) +{ + if( S->last_node ) blake2s_clear_lastnode( S ); + + S->f[0] = 0U; + return 0; +} + +static inline int blake2s_increment_counter( blake2s_state *S, const uint32_t inc ) +{ + S->t[0] += inc; + S->t[1] += ( S->t[0] < inc ); + return 0; +} + +// Parameter-related functions +static inline int blake2s_param_set_digest_length( blake2s_param *P, const uint8_t digest_length ) +{ + P->digest_length = digest_length; + return 0; +} + +static inline int blake2s_param_set_fanout( blake2s_param *P, const uint8_t fanout ) +{ + P->fanout = fanout; + return 0; +} + +static inline int blake2s_param_set_max_depth( blake2s_param *P, const uint8_t depth ) +{ + P->depth = depth; + return 0; +} + +static inline int blake2s_param_set_leaf_length( blake2s_param *P, const uint32_t leaf_length ) +{ + store32( &P->leaf_length, leaf_length ); + return 0; +} + +static inline int blake2s_param_set_node_offset( blake2s_param *P, const uint64_t node_offset ) +{ + store48( P->node_offset, node_offset ); + return 0; +} + +static inline int blake2s_param_set_node_depth( blake2s_param *P, const uint8_t node_depth ) +{ + P->node_depth = node_depth; + return 0; +} + +static inline int blake2s_param_set_inner_length( blake2s_param *P, const uint8_t inner_length ) +{ + P->inner_length = inner_length; + return 0; +} + +static inline int blake2s_param_set_salt( blake2s_param *P, const uint8_t salt[BLAKE2S_SALTBYTES] ) +{ + memcpy( P->salt, salt, BLAKE2S_SALTBYTES ); + return 0; +} + +static inline int blake2s_param_set_personal( blake2s_param *P, const uint8_t personal[BLAKE2S_PERSONALBYTES] ) +{ + memcpy( P->personal, personal, BLAKE2S_PERSONALBYTES ); + return 0; +} + +static inline int blake2s_init0( blake2s_state *S ) +{ + memset( S, 0, sizeof( blake2s_state ) ); + + for( int i = 0; i < 8; ++i ) S->h[i] = blake2s_IV[i]; + + return 0; +} + +/* init2 xors IV with input parameter block */ +int blake2s_init_param( blake2s_state *S, const blake2s_param *P ) +{ + blake2s_init0( S ); + uint32_t *p = ( uint32_t * )( P ); + + /* IV XOR ParamBlock */ + for( size_t i = 0; i < 8; ++i ) + S->h[i] ^= load32( &p[i] ); + + return 0; +} + + +// Sequential blake2s initialization +int blake2s_init( blake2s_state *S, const uint8_t outlen ) +{ + blake2s_param P[1]; + + /* Move interval verification here? */ + if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1; + + P->digest_length = outlen; + P->key_length = 0; + P->fanout = 1; + P->depth = 1; + store32( &P->leaf_length, 0 ); + store48( &P->node_offset, 0 ); + P->node_depth = 0; + P->inner_length = 0; + // memset(P->reserved, 0, sizeof(P->reserved) ); + memset( P->salt, 0, sizeof( P->salt ) ); + memset( P->personal, 0, sizeof( P->personal ) ); + return blake2s_init_param( S, P ); +} + +int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ) +{ + blake2s_param P[1]; + + if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1; + + if ( !key || !keylen || keylen > BLAKE2S_KEYBYTES ) return -1; + + P->digest_length = outlen; + P->key_length = keylen; + P->fanout = 1; + P->depth = 1; + store32( &P->leaf_length, 0 ); + store48( &P->node_offset, 0 ); + P->node_depth = 0; + P->inner_length = 0; + // memset(P->reserved, 0, sizeof(P->reserved) ); + memset( P->salt, 0, sizeof( P->salt ) ); + memset( P->personal, 0, sizeof( P->personal ) ); + + if( blake2s_init_param( S, P ) < 0 ) return -1; + + { + uint8_t block[BLAKE2S_BLOCKBYTES]; + memset( block, 0, BLAKE2S_BLOCKBYTES ); + memcpy( block, key, keylen ); + blake2s_update( S, block, BLAKE2S_BLOCKBYTES ); + secure_zero_memory( block, BLAKE2S_BLOCKBYTES ); /* Burn the key from stack */ + } + return 0; +} + +int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] ) +{ + uint32_t m[16]; + uint32_t v[16]; + + for( size_t i = 0; i < 16; ++i ) + m[i] = load32( block + i * sizeof( m[i] ) ); + + for( size_t i = 0; i < 8; ++i ) + v[i] = S->h[i]; + + v[ 8] = blake2s_IV[0]; + v[ 9] = blake2s_IV[1]; + v[10] = blake2s_IV[2]; + v[11] = blake2s_IV[3]; + v[12] = S->t[0] ^ blake2s_IV[4]; + v[13] = S->t[1] ^ blake2s_IV[5]; + v[14] = S->f[0] ^ blake2s_IV[6]; + v[15] = S->f[1] ^ blake2s_IV[7]; +#define G(r,i,a,b,c,d) \ + do { \ + a = a + b + m[blake2s_sigma[r][2*i+0]]; \ + d = SPH_ROTR32(d ^ a, 16); \ + c = c + d; \ + b = SPH_ROTR32(b ^ c, 12); \ + a = a + b + m[blake2s_sigma[r][2*i+1]]; \ + d = SPH_ROTR32(d ^ a, 8); \ + c = c + d; \ + b = SPH_ROTR32(b ^ c, 7); \ + } while(0) +#define ROUND(r) \ + do { \ + G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ + G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ + G(r,2,v[ 2],v[ 6],v[10],v[14]); \ + G(r,3,v[ 3],v[ 7],v[11],v[15]); \ + G(r,4,v[ 0],v[ 5],v[10],v[15]); \ + G(r,5,v[ 1],v[ 6],v[11],v[12]); \ + G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ + G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ + } while(0) + ROUND( 0 ); + ROUND( 1 ); + ROUND( 2 ); + ROUND( 3 ); + ROUND( 4 ); + ROUND( 5 ); + ROUND( 6 ); + ROUND( 7 ); + ROUND( 8 ); + ROUND( 9 ); + + for( size_t i = 0; i < 8; ++i ) + S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; + +#undef G +#undef ROUND + return 0; +} + + +int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen ) +{ + while( inlen > 0 ) + { + size_t left = S->buflen; + size_t fill = 2 * BLAKE2S_BLOCKBYTES - left; + + if( inlen > fill ) + { + memcpy( S->buf + left, in, fill ); // Fill buffer + S->buflen += fill; + blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES ); + blake2s_compress( S, S->buf ); // Compress + memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, BLAKE2S_BLOCKBYTES ); // Shift buffer left + S->buflen -= BLAKE2S_BLOCKBYTES; + in += fill; + inlen -= fill; + } + else // inlen <= fill + { + memcpy(S->buf + left, in, (size_t) inlen); + S->buflen += (size_t) inlen; // Be lazy, do not compress + in += inlen; + inlen -= inlen; + } + } + + return 0; +} + +int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen ) +{ + uint8_t buffer[BLAKE2S_OUTBYTES]; + + if( S->buflen > BLAKE2S_BLOCKBYTES ) + { + blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES ); + blake2s_compress( S, S->buf ); + S->buflen -= BLAKE2S_BLOCKBYTES; + memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, S->buflen ); + } + + blake2s_increment_counter( S, ( uint32_t )S->buflen ); + blake2s_set_lastblock( S ); + memset( S->buf + S->buflen, 0, 2 * BLAKE2S_BLOCKBYTES - S->buflen ); /* Padding */ + blake2s_compress( S, S->buf ); + + for( int i = 0; i < 8; ++i ) /* Output full hash to temp buffer */ + store32( buffer + sizeof( S->h[i] ) * i, S->h[i] ); + + memcpy( out, buffer, outlen ); + return 0; +} + +int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ) +{ + blake2s_state S[1]; + + /* Verify parameters */ + if ( NULL == in ) return -1; + + if ( NULL == out ) return -1; + + if ( NULL == key ) keylen = 0; /* Fail here instead if keylen != 0 and key == NULL? */ + + if( keylen > 0 ) + { + if( blake2s_init_key( S, outlen, key, keylen ) < 0 ) return -1; + } + else + { + if( blake2s_init( S, outlen ) < 0 ) return -1; + } + + blake2s_update( S, ( uint8_t * )in, inlen ); + blake2s_final( S, out, outlen ); + return 0; +} + +#if defined(__cplusplus) +} +#endif + + +#if defined(BLAKE2S_SELFTEST) +#include +#include "blake2-kat.h" /* test data not included */ +int main( int argc, char **argv ) +{ + uint8_t key[BLAKE2S_KEYBYTES]; + uint8_t buf[KAT_LENGTH]; + + for( size_t i = 0; i < BLAKE2S_KEYBYTES; ++i ) + key[i] = ( uint8_t )i; + + for( size_t i = 0; i < KAT_LENGTH; ++i ) + buf[i] = ( uint8_t )i; + + for( size_t i = 0; i < KAT_LENGTH; ++i ) + { + uint8_t hash[BLAKE2S_OUTBYTES]; + blake2s( hash, buf, key, BLAKE2S_OUTBYTES, i, BLAKE2S_KEYBYTES ); + + if( 0 != memcmp( hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES ) ) + { + puts( "error" ); + return -1; + } + } + + puts( "ok" ); + return 0; +} +#endif diff --git a/sph/blake2s.h b/sph/blake2s.h new file mode 100644 index 0000000..64aa25b --- /dev/null +++ b/sph/blake2s.h @@ -0,0 +1,150 @@ +/** + * BLAKE2 reference source code package - reference C implementations + * + * Written in 2012 by Samuel Neves + * + * To the extent possible under law, the author(s) have dedicated all copyright + * and related and neighboring rights to this software to the public domain + * worldwide. This software is distributed without any warranty. + * + * You should have received a copy of the CC0 Public Domain Dedication along with + * this software. If not, see . + */ +#pragma once +#ifndef __BLAKE2_H__ +#define __BLAKE2_H__ + +#include +#include + +#if defined(_MSC_VER) +#include +#define inline __inline +#define ALIGN(x) __declspec(align(x)) +#else +#define ALIGN(x) __attribute__((aligned(x))) +#endif + +/* blake2-impl.h */ + +static inline uint32_t load32(const void *src) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + return *(uint32_t *)(src); +#else + const uint8_t *p = (uint8_t *)src; + uint32_t w = *p++; + w |= (uint32_t)(*p++) << 8; + w |= (uint32_t)(*p++) << 16; + w |= (uint32_t)(*p++) << 24; + return w; +#endif +} + +static inline void store32(void *dst, uint32_t w) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + *(uint32_t *)(dst) = w; +#else + uint8_t *p = (uint8_t *)dst; + *p++ = (uint8_t)w; w >>= 8; + *p++ = (uint8_t)w; w >>= 8; + *p++ = (uint8_t)w; w >>= 8; + *p++ = (uint8_t)w; +#endif +} + +static inline uint64_t load48(const void *src) +{ + const uint8_t *p = (const uint8_t *)src; + uint64_t w = *p++; + w |= (uint64_t)(*p++) << 8; + w |= (uint64_t)(*p++) << 16; + w |= (uint64_t)(*p++) << 24; + w |= (uint64_t)(*p++) << 32; + w |= (uint64_t)(*p++) << 40; + return w; +} + +static inline void store48(void *dst, uint64_t w) +{ + uint8_t *p = (uint8_t *)dst; + *p++ = (uint8_t)w; w >>= 8; + *p++ = (uint8_t)w; w >>= 8; + *p++ = (uint8_t)w; w >>= 8; + *p++ = (uint8_t)w; w >>= 8; + *p++ = (uint8_t)w; w >>= 8; + *p++ = (uint8_t)w; +} + +/* prevents compiler optimizing out memset() */ +static inline void secure_zero_memory(void *v, size_t n) +{ + volatile uint8_t *p = ( volatile uint8_t * )v; + + while( n-- ) *p++ = 0; +} + +/* blake2.h */ + +enum blake2s_constant +{ + BLAKE2S_BLOCKBYTES = 64, + BLAKE2S_OUTBYTES = 32, + BLAKE2S_KEYBYTES = 32, + BLAKE2S_SALTBYTES = 8, + BLAKE2S_PERSONALBYTES = 8 +}; + +#pragma pack(push, 1) +typedef struct __blake2s_param +{ + uint8_t digest_length; // 1 + uint8_t key_length; // 2 + uint8_t fanout; // 3 + uint8_t depth; // 4 + uint32_t leaf_length; // 8 + uint8_t node_offset[6];// 14 + uint8_t node_depth; // 15 + uint8_t inner_length; // 16 + // uint8_t reserved[0]; + uint8_t salt[BLAKE2S_SALTBYTES]; // 24 + uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32 +} blake2s_param; + +ALIGN( 64 ) typedef struct __blake2s_state +{ + uint32_t h[8]; + uint32_t t[2]; + uint32_t f[2]; + uint8_t buf[2 * BLAKE2S_BLOCKBYTES]; + size_t buflen; + uint8_t last_node; +} blake2s_state; +#pragma pack(pop) + +#if defined(__cplusplus) +extern "C" { +#endif + + int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] ); + + // Streaming API + int blake2s_init( blake2s_state *S, const uint8_t outlen ); + int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ); + int blake2s_init_param( blake2s_state *S, const blake2s_param *P ); + int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen ); + int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen ); + + // Simple API + int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ); + + // Direct Hash Mining Helpers + #define blake2s_salt32(out, in, inlen, key32) blake2s(out, in, key32, 32, inlen, 32) /* neoscrypt */ + #define blake2s_simple(out, in, inlen) blake2s(out, in, NULL, 32, inlen, 0) + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/util.cpp b/util.cpp index 31988f5..526c9bb 100644 --- a/util.cpp +++ b/util.cpp @@ -1915,6 +1915,9 @@ void print_hash_tests(void) blake256hash(&hash[0], &buf[0], 14); printpfx("blake", hash); + blake2s_hash(&hash[0], &buf[0]); + printpfx("blake2s", hash); + bmw_hash(&hash[0], &buf[0]); printpfx("bmw", hash);