From 42737acf66e09420ac739d345da6ea2c1ede0a12 Mon Sep 17 00:00:00 2001 From: ystarnaud Date: Wed, 19 Nov 2014 12:06:16 +0100 Subject: [PATCH] Added Neoscrypt with Wolf9466 improvements. --- Makefile.am | 1 + algorithm.c | 41 + algorithm.h | 3 +- algorithm/neoscrypt.c | 1411 ++++++++++++++++++++++++++++++ algorithm/neoscrypt.h | 13 + kernel/neoscrypt.cl | 525 +++++++++++ miner.h | 1 + ocl.c | 78 +- sgminer.c | 294 +++++-- winbuild/sgminer.vcxproj | 2 + winbuild/sgminer.vcxproj.filters | 6 + 11 files changed, 2286 insertions(+), 89 deletions(-) create mode 100644 algorithm/neoscrypt.c create mode 100644 algorithm/neoscrypt.h create mode 100644 kernel/neoscrypt.cl diff --git a/Makefile.am b/Makefile.am index 95572036..d6210fdc 100644 --- a/Makefile.am +++ b/Makefile.am @@ -65,6 +65,7 @@ sgminer_SOURCES += algorithm/talkcoin.c algorithm/talkcoin.h sgminer_SOURCES += algorithm/bitblock.c algorithm/bitblock.h sgminer_SOURCES += algorithm/x14.c algorithm/x14.h sgminer_SOURCES += algorithm/fresh.c algorithm/fresh.h +sgminer_SOURCES += algorithm/neoscrypt.c algorithm/neoscrypt.h bin_SCRIPTS = $(top_srcdir)/kernel/*.cl diff --git a/algorithm.c b/algorithm.c index 0a321993..69994d5a 100644 --- a/algorithm.c +++ b/algorithm.c @@ -29,6 +29,7 @@ #include "algorithm/bitblock.h" #include "algorithm/x14.h" #include "algorithm/fresh.h" +#include "algorithm/neoscrypt.h" #include "compat.h" @@ -92,6 +93,17 @@ static void append_scrypt_compiler_options(struct _build_kernel_data *data, stru strcat(data->binary_filename, buf); } +static void append_neoscrypt_compiler_options(struct _build_kernel_data *data, struct cgpu_info *cgpu, struct _algorithm_t *algorithm) +{ + char buf[255]; + sprintf(buf, " -D MAX_GLOBAL_THREADS=%u", + (unsigned int)cgpu->thread_concurrency); + strcat(data->compiler_options, buf); + + sprintf(buf, "tc%u", (unsigned int)cgpu->thread_concurrency); + strcat(data->binary_filename, buf); +} + static void append_x11_compiler_options(struct _build_kernel_data *data, struct cgpu_info *cgpu, struct _algorithm_t *algorithm) { char buf[255]; @@ -140,6 +152,30 @@ static cl_int queue_scrypt_kernel(struct __clState *clState, struct _dev_blk_ctx return status; } +static cl_int queue_neoscrypt_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads) +{ + cl_kernel *kernel = &clState->kernel; + unsigned int num = 0; + cl_uint le_target; + cl_int status = 0; + + /* This looks like an unnecessary double cast, but to make sure that + * the target's most significant entry is addressed as a 32-bit value + * and not accidentally as something else, the double cast seems wise. + * The compiler will get rid of it anyway. 
+ */ + le_target = (cl_uint)le32toh(((uint32_t *)blk->work->/*device_*/target)[7]); + memcpy(clState->cldata, blk->work->data, 80); + status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL); + + CL_SET_ARG(clState->CLbuffer0); + CL_SET_ARG(clState->outputBuffer); + CL_SET_ARG(clState->padbuffer8); + CL_SET_ARG(le_target); + + return status; +} + static cl_int queue_maxcoin_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads) { cl_kernel *kernel = &clState->kernel; @@ -597,6 +633,11 @@ static algorithm_settings_t algos[] = { A_SCRYPT( "zuikkis" ), #undef A_SCRYPT +#define A_NEOSCRYPT(a) \ + { a, ALGO_NEOSCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, neoscrypt_regenhash, queue_neoscrypt_kernel, gen_hash, append_neoscrypt_compiler_options} + A_NEOSCRYPT("neoscrypt"), +#undef A_NEOSCRYPT + // kernels starting from this will have difficulty calculated by using quarkcoin algorithm #define A_QUARK(a, b) \ { a, ALGO_QUARK, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options } diff --git a/algorithm.h b/algorithm.h index 21499753..f9e2af1e 100644 --- a/algorithm.h +++ b/algorithm.h @@ -23,7 +23,8 @@ typedef enum { ALGO_TWE, ALGO_FUGUE, ALGO_NIST, - ALGO_FRESH + ALGO_FRESH, + ALGO_NEOSCRYPT } algorithm_type_t; extern const char *algorithm_type_str[]; diff --git a/algorithm/neoscrypt.c b/algorithm/neoscrypt.c new file mode 100644 index 00000000..cec6e5a2 --- /dev/null +++ b/algorithm/neoscrypt.c @@ -0,0 +1,1411 @@ +/* + * Copyright (c) 2009 Colin Percival, 2011 ArtForz + * Copyright (c) 2012 Andrew Moon (floodyberry) + * Copyright (c) 2012 Samuel Neves + * Copyright (c) 2014 John Doering + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "config.h" +#include "miner.h" + +#include +#include +#include + +#include "neoscrypt.h" + +#define SCRYPT_BLOCK_SIZE 64 +#define SCRYPT_HASH_BLOCK_SIZE 64 +#define SCRYPT_HASH_DIGEST_SIZE 32 + +typedef uint8_t hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +#define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b))) +#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b))) + +#define U8TO32_BE(p) \ + (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ + ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]))) + +#define U32TO8_BE(p, v) \ + (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ + (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); + +#define U64TO8_BE(p, v) \ + U32TO8_BE((p), (uint32_t)((v) >> 32)); \ + U32TO8_BE((p) + 4, (uint32_t)((v) )); + +#if (WINDOWS) +/* sizeof(unsigned long) = 4 for MinGW64 */ +typedef unsigned long long ulong; +#else +typedef unsigned long ulong; +#endif + +typedef unsigned int uint; +typedef unsigned char uchar; +typedef unsigned int ubool; + + +//#define MIN(a, b) ((a) < (b) ? a : b) +//#define MAX(a, b) ((a) > (b) ? a : b) + + +#if (SHA256) + +/* SHA-256 */ + +static const uint32_t sha256_constants[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +#define Ch(x,y,z) (z ^ (x & (y ^ z))) +#define Maj(x,y,z) (((x | y) & z) | (x & y)) +#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22)) +#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25)) +#define G0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ (x >> 3)) +#define G1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ (x >> 10)) +#define W0(in,i) (U8TO32_BE(&in[i * 4])) +#define W1(i) (G1(w[i - 2]) + w[i - 7] + G0(w[i - 15]) + w[i - 16]) +#define STEP(i) \ + t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \ + t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha256_constants[i] + w[i]; \ + r[7] = r[6]; \ + r[6] = r[5]; \ + r[5] = r[4]; \ + r[4] = r[3] + t0; \ + r[3] = r[2]; \ + r[2] = r[1]; \ + r[1] = r[0]; \ + r[0] = t0 + t1; + + +typedef struct sha256_hash_state_t { + uint32_t H[8]; + uint64_t T; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} sha256_hash_state; + + +static void sha256_blocks(sha256_hash_state *S, const uint8_t *in, size_t blocks) { + uint32_t r[8], w[64], t0, t1; + size_t i; + + for(i = 0; i < 8; i++) + r[i] = S->H[i]; + + while(blocks--) { + for(i = 0; i < 16; i++) { + w[i] = W0(in, i); + } + for(i = 16; i < 64; i++) { + w[i] = W1(i); + } + for(i = 0; i < 64; i++) { + STEP(i); + } + for(i = 0; i < 8; i++) { + r[i] += S->H[i]; + S->H[i] = r[i]; + } + S->T += SCRYPT_HASH_BLOCK_SIZE * 8; + in += SCRYPT_HASH_BLOCK_SIZE; + } +} + +static void neoscrypt_hash_init_sha256(sha256_hash_state *S) { + S->H[0] = 0x6a09e667; + S->H[1] = 0xbb67ae85; + S->H[2] = 0x3c6ef372; + S->H[3] = 0xa54ff53a; + S->H[4] = 0x510e527f; + S->H[5] = 0x9b05688c; + 
S->H[6] = 0x1f83d9ab; + S->H[7] = 0x5be0cd19; + S->T = 0; + S->leftover = 0; +} + +static void neoscrypt_hash_update_sha256(sha256_hash_state *S, const uint8_t *in, size_t inlen) { + size_t blocks, want; + + /* handle the previous data */ + if(S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (want < inlen) ? want : inlen; + memcpy(S->buffer + S->leftover, in, want); + S->leftover += (uint32_t)want; + if(S->leftover < SCRYPT_HASH_BLOCK_SIZE) + return; + in += want; + inlen -= want; + sha256_blocks(S, S->buffer, 1); + } + + /* handle the current data */ + blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); + S->leftover = (uint32_t)(inlen - blocks); + if(blocks) { + sha256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); + in += blocks; + } + + /* handle leftover data */ + if(S->leftover) + memcpy(S->buffer, in, S->leftover); +} + +static void neoscrypt_hash_finish_sha256(sha256_hash_state *S, uint8_t *hash) { + uint64_t t = S->T + (S->leftover * 8); + + S->buffer[S->leftover] = 0x80; + if(S->leftover <= 55) { + memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover); + } else { + memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover); + sha256_blocks(S, S->buffer, 1); + memset(S->buffer, 0, 56); + } + + U64TO8_BE(S->buffer + 56, t); + sha256_blocks(S, S->buffer, 1); + + U32TO8_BE(&hash[ 0], S->H[0]); + U32TO8_BE(&hash[ 4], S->H[1]); + U32TO8_BE(&hash[ 8], S->H[2]); + U32TO8_BE(&hash[12], S->H[3]); + U32TO8_BE(&hash[16], S->H[4]); + U32TO8_BE(&hash[20], S->H[5]); + U32TO8_BE(&hash[24], S->H[6]); + U32TO8_BE(&hash[28], S->H[7]); +} + +static void neoscrypt_hash_sha256(hash_digest hash, const uint8_t *m, size_t mlen) { + sha256_hash_state st; + neoscrypt_hash_init_sha256(&st); + neoscrypt_hash_update_sha256(&st, m, mlen); + neoscrypt_hash_finish_sha256(&st, hash); +} + + +/* HMAC for SHA-256 */ + +typedef struct sha256_hmac_state_t { + sha256_hash_state inner, outer; +} sha256_hmac_state; + +static void neoscrypt_hmac_init_sha256(sha256_hmac_state *st, const uint8_t *key, size_t keylen) { + uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0}; + size_t i; + + neoscrypt_hash_init_sha256(&st->inner); + neoscrypt_hash_init_sha256(&st->outer); + + if(keylen <= SCRYPT_HASH_BLOCK_SIZE) { + /* use the key directly if it's <= blocksize bytes */ + memcpy(pad, key, keylen); + } else { + /* if it's > blocksize bytes, hash it */ + neoscrypt_hash_sha256(pad, key, keylen); + } + + /* inner = (key ^ 0x36) */ + /* h(inner || ...) */ + for(i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= 0x36; + neoscrypt_hash_update_sha256(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE); + + /* outer = (key ^ 0x5c) */ + /* h(outer || ...) */ + for(i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= (0x5c ^ 0x36); + neoscrypt_hash_update_sha256(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE); +} + +static void neoscrypt_hmac_update_sha256(sha256_hmac_state *st, const uint8_t *m, size_t mlen) { + /* h(inner || m...) 
*/ + neoscrypt_hash_update_sha256(&st->inner, m, mlen); +} + +static void neoscrypt_hmac_finish_sha256(sha256_hmac_state *st, hash_digest mac) { + /* h(inner || m) */ + hash_digest innerhash; + neoscrypt_hash_finish_sha256(&st->inner, innerhash); + + /* h(outer || h(inner || m)) */ + neoscrypt_hash_update_sha256(&st->outer, innerhash, sizeof(innerhash)); + neoscrypt_hash_finish_sha256(&st->outer, mac); +} + + +/* PBKDF2 for SHA-256 */ + +static void neoscrypt_pbkdf2_sha256(const uint8_t *password, size_t password_len, + const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *output, size_t output_len) { + sha256_hmac_state hmac_pw, hmac_pw_salt, work; + hash_digest ti, u; + uint8_t be[4]; + uint32_t i, j, k, blocks; + + /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */ + + /* hmac(password, ...) */ + neoscrypt_hmac_init_sha256(&hmac_pw, password, password_len); + + /* hmac(password, salt...) */ + hmac_pw_salt = hmac_pw; + neoscrypt_hmac_update_sha256(&hmac_pw_salt, salt, salt_len); + + blocks = ((uint32_t)output_len + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE; + for(i = 1; i <= blocks; i++) { + /* U1 = hmac(password, salt || be(i)) */ + U32TO8_BE(be, i); + work = hmac_pw_salt; + neoscrypt_hmac_update_sha256(&work, be, 4); + neoscrypt_hmac_finish_sha256(&work, ti); + memcpy(u, ti, sizeof(u)); + + /* T[i] = U1 ^ U2 ^ U3... */ + for(j = 0; j < N - 1; j++) { + /* UX = hmac(password, U{X-1}) */ + work = hmac_pw; + neoscrypt_hmac_update_sha256(&work, u, SCRYPT_HASH_DIGEST_SIZE); + neoscrypt_hmac_finish_sha256(&work, u); + + /* T[i] ^= UX */ + for(k = 0; k < sizeof(u); k++) + ti[k] ^= u[k]; + } + + memcpy(output, ti, (output_len > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : output_len); + output += SCRYPT_HASH_DIGEST_SIZE; + output_len -= SCRYPT_HASH_DIGEST_SIZE; + } +} + +#endif + + +#if (BLAKE256) + +/* BLAKE-256 */ + +const uint8_t blake256_sigma[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, + 14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3, + 11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4, + 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8, + 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13, + 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9, + 12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11, + 13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10, + 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5, + 10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13 ,0, +}; + +const uint32_t blake256_constants[16] = { + 0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344,0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89, + 0x452821e6, 0x38d01377, 0xbe5466cf, 0x34e90c6c,0xc0ac29b7, 0xc97c50dd, 0x3f84d5b5, 0xb5470917 +}; + +typedef struct blake256_hash_state_t { + uint32_t H[8], T[2]; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} blake256_hash_state; + +static void blake256_blocks(blake256_hash_state *S, const uint8_t *in, size_t blocks) { + const uint8_t *sigma, *sigma_end = blake256_sigma + (10 * 16); + uint32_t m[16], v[16], h[8], t[2]; + uint32_t i; + + for(i = 0; i < 8; i++) + h[i] = S->H[i]; + for(i = 0; i < 2; i++) + t[i] = S->T[i]; + + while(blocks--) { + t[0] += 512; + t[1] += (t[0] < 512) ? 
1 : 0; + + for(i = 0; i < 8; i++) + v[i] = h[i]; + for(i = 0; i < 4; i++) + v[i + 8] = blake256_constants[i]; + for(i = 0; i < 2; i++) + v[i + 12] = blake256_constants[i+4] ^ t[0]; + for(i = 0; i < 2; i++) + v[i + 14] = blake256_constants[i+6] ^ t[1]; + + for(i = 0; i < 16; i++) + m[i] = U8TO32_BE(&in[i * 4]); + + in += 64; + +#define G(a,b,c,d,e) \ + v[a] += (m[sigma[e+0]] ^ blake256_constants[sigma[e+1]]) + v[b]; \ + v[d] = ROTR32(v[d] ^ v[a],16); \ + v[c] += v[d]; \ + v[b] = ROTR32(v[b] ^ v[c],12); \ + v[a] += (m[sigma[e+1]] ^ blake256_constants[sigma[e+0]]) + v[b]; \ + v[d] = ROTR32(v[d] ^ v[a], 8); \ + v[c] += v[d]; \ + v[b] = ROTR32(v[b] ^ v[c], 7); + + for(i = 0, sigma = blake256_sigma; i < 14; i++) { + G(0, 4, 8,12, 0); + G(1, 5, 9,13, 2); + G(2, 6,10,14, 4); + G(3, 7,11,15, 6); + + G(0, 5,10,15, 8); + G(1, 6,11,12,10); + G(2, 7, 8,13,12); + G(3, 4, 9,14,14); + + sigma += 16; + if(sigma == sigma_end) + sigma = blake256_sigma; + } + +#undef G + + for(i = 0; i < 8; i++) + h[i] ^= (v[i] ^ v[i + 8]); + } + + for(i = 0; i < 8; i++) + S->H[i] = h[i]; + for(i = 0; i < 2; i++) + S->T[i] = t[i]; +} + +static void neoscrypt_hash_init_blake256(blake256_hash_state *S) { + S->H[0] = 0x6a09e667ULL; + S->H[1] = 0xbb67ae85ULL; + S->H[2] = 0x3c6ef372ULL; + S->H[3] = 0xa54ff53aULL; + S->H[4] = 0x510e527fULL; + S->H[5] = 0x9b05688cULL; + S->H[6] = 0x1f83d9abULL; + S->H[7] = 0x5be0cd19ULL; + S->T[0] = 0; + S->T[1] = 0; + S->leftover = 0; +} + +static void neoscrypt_hash_update_blake256(blake256_hash_state *S, const uint8_t *in, size_t inlen) { + size_t blocks, want; + + /* handle the previous data */ + if(S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (want < inlen) ? want : inlen; + memcpy(S->buffer + S->leftover, in, want); + S->leftover += (uint32_t)want; + if(S->leftover < SCRYPT_HASH_BLOCK_SIZE) + return; + in += want; + inlen -= want; + blake256_blocks(S, S->buffer, 1); + } + + /* handle the current data */ + blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); + S->leftover = (uint32_t)(inlen - blocks); + if(blocks) { + blake256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); + in += blocks; + } + + /* handle leftover data */ + if(S->leftover) + memcpy(S->buffer, in, S->leftover); +} + +static void neoscrypt_hash_finish_blake256(blake256_hash_state *S, uint8_t *hash) { + uint32_t th, tl, bits; + + bits = (S->leftover << 3); + tl = S->T[0] + bits; + th = S->T[1]; + if(S->leftover == 0) { + S->T[0] = (uint32_t)0 - (uint32_t)512; + S->T[1] = (uint32_t)0 - (uint32_t)1; + } else if(S->T[0] == 0) { + S->T[0] = ((uint32_t)0 - (uint32_t)512) + bits; + S->T[1] = S->T[1] - 1; + } else { + S->T[0] -= (512 - bits); + } + + S->buffer[S->leftover] = 0x80; + if(S->leftover <= 55) { + memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover); + } else { + memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover); + blake256_blocks(S, S->buffer, 1); + S->T[0] = (uint32_t)0 - (uint32_t)512; + S->T[1] = (uint32_t)0 - (uint32_t)1; + memset(S->buffer, 0, 56); + } + S->buffer[55] |= 1; + U32TO8_BE(S->buffer + 56, th); + U32TO8_BE(S->buffer + 60, tl); + blake256_blocks(S, S->buffer, 1); + + U32TO8_BE(&hash[ 0], S->H[0]); + U32TO8_BE(&hash[ 4], S->H[1]); + U32TO8_BE(&hash[ 8], S->H[2]); + U32TO8_BE(&hash[12], S->H[3]); + U32TO8_BE(&hash[16], S->H[4]); + U32TO8_BE(&hash[20], S->H[5]); + U32TO8_BE(&hash[24], S->H[6]); + U32TO8_BE(&hash[28], S->H[7]); +} + +static void neoscrypt_hash_blake256(hash_digest hash, const uint8_t *m, size_t mlen) { + blake256_hash_state st; + 
neoscrypt_hash_init_blake256(&st); + neoscrypt_hash_update_blake256(&st, m, mlen); + neoscrypt_hash_finish_blake256(&st, hash); +} + + +/* HMAC for BLAKE-256 */ + +typedef struct blake256_hmac_state_t { + blake256_hash_state inner, outer; +} blake256_hmac_state; + +static void neoscrypt_hmac_init_blake256(blake256_hmac_state *st, const uint8_t *key, size_t keylen) { + uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0}; + size_t i; + + neoscrypt_hash_init_blake256(&st->inner); + neoscrypt_hash_init_blake256(&st->outer); + + if(keylen <= SCRYPT_HASH_BLOCK_SIZE) { + /* use the key directly if it's <= blocksize bytes */ + memcpy(pad, key, keylen); + } else { + /* if it's > blocksize bytes, hash it */ + neoscrypt_hash_blake256(pad, key, keylen); + } + + /* inner = (key ^ 0x36) */ + /* h(inner || ...) */ + for(i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= 0x36; + neoscrypt_hash_update_blake256(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE); + + /* outer = (key ^ 0x5c) */ + /* h(outer || ...) */ + for(i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= (0x5c ^ 0x36); + neoscrypt_hash_update_blake256(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE); +} + +static void neoscrypt_hmac_update_blake256(blake256_hmac_state *st, const uint8_t *m, size_t mlen) { + /* h(inner || m...) */ + neoscrypt_hash_update_blake256(&st->inner, m, mlen); +} + +static void neoscrypt_hmac_finish_blake256(blake256_hmac_state *st, hash_digest mac) { + /* h(inner || m) */ + hash_digest innerhash; + neoscrypt_hash_finish_blake256(&st->inner, innerhash); + + /* h(outer || h(inner || m)) */ + neoscrypt_hash_update_blake256(&st->outer, innerhash, sizeof(innerhash)); + neoscrypt_hash_finish_blake256(&st->outer, mac); +} + + +/* PBKDF2 for BLAKE-256 */ + +static void neoscrypt_pbkdf2_blake256(const uint8_t *password, size_t password_len, + const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *output, size_t output_len) { + blake256_hmac_state hmac_pw, hmac_pw_salt, work; + hash_digest ti, u; + uint8_t be[4]; + uint32_t i, j, k, blocks; + + /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */ + + /* hmac(password, ...) */ + neoscrypt_hmac_init_blake256(&hmac_pw, password, password_len); + + /* hmac(password, salt...) */ + hmac_pw_salt = hmac_pw; + neoscrypt_hmac_update_blake256(&hmac_pw_salt, salt, salt_len); + + blocks = ((uint32_t)output_len + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE; + for(i = 1; i <= blocks; i++) { + /* U1 = hmac(password, salt || be(i)) */ + U32TO8_BE(be, i); + work = hmac_pw_salt; + neoscrypt_hmac_update_blake256(&work, be, 4); + neoscrypt_hmac_finish_blake256(&work, ti); + memcpy(u, ti, sizeof(u)); + + /* T[i] = U1 ^ U2 ^ U3... */ + for(j = 0; j < N - 1; j++) { + /* UX = hmac(password, U{X-1}) */ + work = hmac_pw; + neoscrypt_hmac_update_blake256(&work, u, SCRYPT_HASH_DIGEST_SIZE); + neoscrypt_hmac_finish_blake256(&work, u); + + /* T[i] ^= UX */ + for(k = 0; k < sizeof(u); k++) + ti[k] ^= u[k]; + } + + memcpy(output, ti, (output_len > SCRYPT_HASH_DIGEST_SIZE) ? 
SCRYPT_HASH_DIGEST_SIZE : output_len); + output += SCRYPT_HASH_DIGEST_SIZE; + output_len -= SCRYPT_HASH_DIGEST_SIZE; + } +} + +#endif + + +/* NeoScrypt */ + +#if defined(ASM) + +extern void neoscrypt_salsa(uint *X, uint rounds); +extern void neoscrypt_salsa_tangle(uint *X, uint count); +extern void neoscrypt_chacha(uint *X, uint rounds); + +extern void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len); +extern void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len); +extern void neoscrypt_blkxor(void *dstp, const void *srcp, uint len); + +#else + +/* Salsa20, rounds must be a multiple of 2 */ +static void neoscrypt_salsa(uint *X, uint rounds) { + uint x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, t; + + x0 = X[0]; x1 = X[1]; x2 = X[2]; x3 = X[3]; + x4 = X[4]; x5 = X[5]; x6 = X[6]; x7 = X[7]; + x8 = X[8]; x9 = X[9]; x10 = X[10]; x11 = X[11]; + x12 = X[12]; x13 = X[13]; x14 = X[14]; x15 = X[15]; + +#define quarter(a, b, c, d) \ + t = a + d; t = ROTL32(t, 7); b ^= t; \ + t = b + a; t = ROTL32(t, 9); c ^= t; \ + t = c + b; t = ROTL32(t, 13); d ^= t; \ + t = d + c; t = ROTL32(t, 18); a ^= t; + + for(; rounds; rounds -= 2) { + quarter( x0, x4, x8, x12); + quarter( x5, x9, x13, x1); + quarter(x10, x14, x2, x6); + quarter(x15, x3, x7, x11); + quarter( x0, x1, x2, x3); + quarter( x5, x6, x7, x4); + quarter(x10, x11, x8, x9); + quarter(x15, x12, x13, x14); + } + + X[0] += x0; X[1] += x1; X[2] += x2; X[3] += x3; + X[4] += x4; X[5] += x5; X[6] += x6; X[7] += x7; + X[8] += x8; X[9] += x9; X[10] += x10; X[11] += x11; + X[12] += x12; X[13] += x13; X[14] += x14; X[15] += x15; + +#undef quarter +} + +/* ChaCha20, rounds must be a multiple of 2 */ +static void neoscrypt_chacha(uint *X, uint rounds) { + uint x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, t; + + x0 = X[0]; x1 = X[1]; x2 = X[2]; x3 = X[3]; + x4 = X[4]; x5 = X[5]; x6 = X[6]; x7 = X[7]; + x8 = X[8]; x9 = X[9]; x10 = X[10]; x11 = X[11]; + x12 = X[12]; x13 = X[13]; x14 = X[14]; x15 = X[15]; + +#define quarter(a,b,c,d) \ + a += b; t = d ^ a; d = ROTL32(t, 16); \ + c += d; t = b ^ c; b = ROTL32(t, 12); \ + a += b; t = d ^ a; d = ROTL32(t, 8); \ + c += d; t = b ^ c; b = ROTL32(t, 7); + + for(; rounds; rounds -= 2) { + quarter( x0, x4, x8, x12); + quarter( x1, x5, x9, x13); + quarter( x2, x6, x10, x14); + quarter( x3, x7, x11, x15); + quarter( x0, x5, x10, x15); + quarter( x1, x6, x11, x12); + quarter( x2, x7, x8, x13); + quarter( x3, x4, x9, x14); + } + + X[0] += x0; X[1] += x1; X[2] += x2; X[3] += x3; + X[4] += x4; X[5] += x5; X[6] += x6; X[7] += x7; + X[8] += x8; X[9] += x9; X[10] += x10; X[11] += x11; + X[12] += x12; X[13] += x13; X[14] += x14; X[15] += x15; + +#undef quarter +} + + +/* Fast 32-bit / 64-bit memcpy(); + * len must be a multiple of 32 bytes */ +static void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len) { + ulong *dst = (ulong *) dstp; + ulong *src = (ulong *) srcp; + uint i; + + for(i = 0; i < (len / sizeof(ulong)); i += 4) { + dst[i] = src[i]; + dst[i + 1] = src[i + 1]; + dst[i + 2] = src[i + 2]; + dst[i + 3] = src[i + 3]; + } +} + +/* Fast 32-bit / 64-bit block swapper; + * len must be a multiple of 32 bytes */ +static void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len) { + ulong *blkA = (ulong *) blkAp; + ulong *blkB = (ulong *) blkBp; + register ulong t0, t1, t2, t3; + uint i; + + for(i = 0; i < (len / sizeof(ulong)); i += 4) { + t0 = blkA[i]; + t1 = blkA[i + 1]; + t2 = blkA[i + 2]; + t3 = blkA[i + 3]; + blkA[i] = blkB[i]; + blkA[i + 1] = blkB[i + 1]; + 
blkA[i + 2] = blkB[i + 2]; + blkA[i + 3] = blkB[i + 3]; + blkB[i] = t0; + blkB[i + 1] = t1; + blkB[i + 2] = t2; + blkB[i + 3] = t3; + } +} + +/* Fast 32-bit / 64-bit block XOR engine; + * len must be a multiple of 32 bytes */ +static void neoscrypt_blkxor(void *dstp, const void *srcp, uint len) { + ulong *dst = (ulong *) dstp; + ulong *src = (ulong *) srcp; + uint i; + + for(i = 0; i < (len / sizeof(ulong)); i += 4) { + dst[i] ^= src[i]; + dst[i + 1] ^= src[i + 1]; + dst[i + 2] ^= src[i + 2]; + dst[i + 3] ^= src[i + 3]; + } +} + +#endif + +/* 32-bit / 64-bit optimised memcpy() */ +static void neoscrypt_copy(void *dstp, const void *srcp, uint len) { + ulong *dst = (ulong *) dstp; + ulong *src = (ulong *) srcp; + uint i, tail; + + for(i = 0; i < (len / sizeof(ulong)); i++) + dst[i] = src[i]; + + tail = len & (sizeof(ulong) - 1); + if(tail) { + uchar *dstb = (uchar *) dstp; + uchar *srcb = (uchar *) srcp; + + for(i = len - tail; i < len; i++) + dstb[i] = srcb[i]; + } +} + +/* 32-bit / 64-bit optimised memory erase aka memset() to zero */ +static void neoscrypt_erase(void *dstp, uint len) { + const ulong null = 0; + ulong *dst = (ulong *) dstp; + uint i, tail; + + for(i = 0; i < (len / sizeof(ulong)); i++) + dst[i] = null; + + tail = len & (sizeof(ulong) - 1); + if(tail) { + uchar *dstb = (uchar *) dstp; + + for(i = len - tail; i < len; i++) + dstb[i] = (uchar)null; + } +} + +/* 32-bit / 64-bit optimised XOR engine */ +static void neoscrypt_xor(void *dstp, const void *srcp, uint len) { + ulong *dst = (ulong *) dstp; + ulong *src = (ulong *) srcp; + uint i, tail; + + for(i = 0; i < (len / sizeof(ulong)); i++) + dst[i] ^= src[i]; + + tail = len & (sizeof(ulong) - 1); + if(tail) { + uchar *dstb = (uchar *) dstp; + uchar *srcb = (uchar *) srcp; + + for(i = len - tail; i < len; i++) + dstb[i] ^= srcb[i]; + } +} + + +/* BLAKE2s */ + +#define BLAKE2S_BLOCK_SIZE 64U +#define BLAKE2S_OUT_SIZE 32U +#define BLAKE2S_KEY_SIZE 32U + +/* Parameter block of 32 bytes */ +typedef struct blake2s_param_t { + uchar digest_length; + uchar key_length; + uchar fanout; + uchar depth; + uint leaf_length; + uchar node_offset[6]; + uchar node_depth; + uchar inner_length; + uchar salt[8]; + uchar personal[8]; +} blake2s_param; + +/* State block of 180 bytes */ +typedef struct blake2s_state_t { + uint h[8]; + uint t[2]; + uint f[2]; + uchar buf[2 * BLAKE2S_BLOCK_SIZE]; + uint buflen; +} blake2s_state; + +static const uint blake2s_IV[8] = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +static const uint8_t blake2s_sigma[10][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , +}; + +static void blake2s_compress(blake2s_state *S, const uint *buf) { + uint i; + uint m[16]; + uint v[16]; + + neoscrypt_copy(m, buf, 64); + neoscrypt_copy(v, S, 32); + + v[ 8] = blake2s_IV[0]; + v[ 9] = blake2s_IV[1]; + v[10] = blake2s_IV[2]; + v[11] = blake2s_IV[3]; + v[12] = S->t[0] ^ blake2s_IV[4]; + v[13] = S->t[1] ^ 
blake2s_IV[5]; + v[14] = S->f[0] ^ blake2s_IV[6]; + v[15] = S->f[1] ^ blake2s_IV[7]; +#define G(r,i,a,b,c,d) \ + do { \ + a = a + b + m[blake2s_sigma[r][2*i+0]]; \ + d = ROTR32(d ^ a, 16); \ + c = c + d; \ + b = ROTR32(b ^ c, 12); \ + a = a + b + m[blake2s_sigma[r][2*i+1]]; \ + d = ROTR32(d ^ a, 8); \ + c = c + d; \ + b = ROTR32(b ^ c, 7); \ + } while(0) +#define ROUND(r) \ + do { \ + G(r, 0, v[ 0], v[ 4], v[ 8], v[12]); \ + G(r, 1, v[ 1], v[ 5], v[ 9], v[13]); \ + G(r, 2, v[ 2], v[ 6], v[10], v[14]); \ + G(r, 3, v[ 3], v[ 7], v[11], v[15]); \ + G(r, 4, v[ 0], v[ 5], v[10], v[15]); \ + G(r, 5, v[ 1], v[ 6], v[11], v[12]); \ + G(r, 6, v[ 2], v[ 7], v[ 8], v[13]); \ + G(r, 7, v[ 3], v[ 4], v[ 9], v[14]); \ + } while(0) + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + + for(i = 0; i < 8; i++) + S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; + +#undef G +#undef ROUND +} + +static void blake2s_update(blake2s_state *S, const uchar *input, uint input_size) { + uint left, fill; + + while(input_size > 0) { + left = S->buflen; + fill = 2 * BLAKE2S_BLOCK_SIZE - left; + if(input_size > fill) { + /* Buffer fill */ + neoscrypt_copy(S->buf + left, input, fill); + S->buflen += fill; + /* Counter increment */ + S->t[0] += BLAKE2S_BLOCK_SIZE; + /* Compress */ + blake2s_compress(S, (uint *) S->buf); + /* Shift buffer left */ + neoscrypt_copy(S->buf, S->buf + BLAKE2S_BLOCK_SIZE, BLAKE2S_BLOCK_SIZE); + S->buflen -= BLAKE2S_BLOCK_SIZE; + input += fill; + input_size -= fill; + } else { + neoscrypt_copy(S->buf + left, input, input_size); + S->buflen += input_size; + /* Do not compress */ + input += input_size; + input_size = 0; + } + } +} + +static void neoscrypt_blake2s(const void *input, const uint input_size, const void *key, const uchar key_size, + void *output, const uchar output_size) { + uchar block[BLAKE2S_BLOCK_SIZE]; + blake2s_param P[1]; + blake2s_state S[1]; + + /* Initialise */ + neoscrypt_erase(P, 32); + P->digest_length = output_size; + P->key_length = key_size; + P->fanout = 1; + P->depth = 1; + + neoscrypt_erase(S, 180); + neoscrypt_copy(S, blake2s_IV, 32); + neoscrypt_xor(S, P, 32); + + neoscrypt_erase(block, BLAKE2S_BLOCK_SIZE); + neoscrypt_copy(block, key, key_size); + blake2s_update(S, (uchar *) block, BLAKE2S_BLOCK_SIZE); + + /* Update */ + blake2s_update(S, (uchar *) input, input_size); + + /* Finish */ + if(S->buflen > BLAKE2S_BLOCK_SIZE) { + S->t[0] += BLAKE2S_BLOCK_SIZE; + blake2s_compress(S, (uint *) S->buf); + S->buflen -= BLAKE2S_BLOCK_SIZE; + neoscrypt_copy(S->buf, S->buf + BLAKE2S_BLOCK_SIZE, S->buflen); + } + S->t[0] += S->buflen; + S->f[0] = ~0U; + neoscrypt_erase(S->buf + S->buflen, 2 * BLAKE2S_BLOCK_SIZE - S->buflen); + blake2s_compress(S, (uint *) S->buf); + /* Write back */ + neoscrypt_copy(output, S, output_size); +} + + +#define FASTKDF_BUFFER_SIZE 256U + +/* FastKDF, a fast buffered key derivation function: + * FASTKDF_BUFFER_SIZE must be a power of 2; + * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE; + * prf_output_size must be <= prf_key_size; */ +static void neoscrypt_fastkdf(const uchar *password, uint password_len, + const uchar *salt, uint salt_len, + uint N, uchar *output, uint output_len) { + const uint stack_align = 0x40, kdf_buf_size = FASTKDF_BUFFER_SIZE, + prf_input_size = BLAKE2S_BLOCK_SIZE, prf_key_size = BLAKE2S_KEY_SIZE, + prf_output_size = BLAKE2S_OUT_SIZE; + uint bufptr, a, b, i, j; + uchar *A, *B, *prf_input, *prf_key, *prf_output; + + /* Align and set up the 
buffers in stack */ + uchar stack[2 * kdf_buf_size + prf_input_size + prf_key_size + prf_output_size + stack_align]; + A = &stack[stack_align & ~(stack_align - 1)]; + B = &A[kdf_buf_size + prf_input_size]; + prf_output = &A[2 * kdf_buf_size + prf_input_size + prf_key_size]; + + /* Initialise the password buffer */ + if(password_len > kdf_buf_size) + password_len = kdf_buf_size; + + a = kdf_buf_size / password_len; + for(i = 0; i < a; i++) + neoscrypt_copy(&A[i * password_len], &password[0], password_len); + b = kdf_buf_size - a * password_len; + if(b) + neoscrypt_copy(&A[a * password_len], &password[0], b); + neoscrypt_copy(&A[kdf_buf_size], &password[0], prf_input_size); + + /* Initialise the salt buffer */ + if(salt_len > kdf_buf_size) + salt_len = kdf_buf_size; + + a = kdf_buf_size / salt_len; + for(i = 0; i < a; i++) + neoscrypt_copy(&B[i * salt_len], &salt[0], salt_len); + b = kdf_buf_size - a * salt_len; + if(b) + neoscrypt_copy(&B[a * salt_len], &salt[0], b); + neoscrypt_copy(&B[kdf_buf_size], &salt[0], prf_key_size); + + /* The primary iteration */ + for(i = 0, bufptr = 0; i < N; i++) { + + /* Map the PRF input buffer */ + prf_input = &A[bufptr]; + + /* Map the PRF key buffer */ + prf_key = &B[bufptr]; + + /* PRF */ + neoscrypt_blake2s(prf_input, prf_input_size, prf_key, prf_key_size, prf_output, prf_output_size); + + /* Calculate the next buffer pointer */ + for(j = 0, bufptr = 0; j < prf_output_size; j++) + bufptr += prf_output[j]; + bufptr &= (kdf_buf_size - 1); + /* Modify the salt buffer */ + neoscrypt_xor(&B[bufptr], &prf_output[0], prf_output_size); + + /* Head modified, tail updated */ + if(bufptr < prf_key_size) + neoscrypt_copy(&B[kdf_buf_size + bufptr], &B[bufptr], MIN(prf_output_size, prf_key_size - bufptr)); + + /* Tail modified, head updated */ + if((kdf_buf_size - bufptr) < prf_output_size) + neoscrypt_copy(&B[0], &B[kdf_buf_size], prf_output_size - (kdf_buf_size - bufptr)); + } + + /* Modify and copy into the output buffer */ + if(output_len > kdf_buf_size) + output_len = kdf_buf_size; + + a = kdf_buf_size - bufptr; + if(a >= output_len) { + neoscrypt_xor(&B[bufptr], &A[0], output_len); + neoscrypt_copy(&output[0], &B[bufptr], output_len); + } else { + neoscrypt_xor(&B[bufptr], &A[0], a); + neoscrypt_xor(&B[0], &A[a], output_len - a); + neoscrypt_copy(&output[0], &B[bufptr], a); + neoscrypt_copy(&output[a], &B[0], output_len - a); + } + +} + + +/* Configurable optimised block mixer */ +static void neoscrypt_blkmix(uint *X, uint *Y, uint r, uint mixmode) { + uint i, mixer, rounds; + + mixer = mixmode >> 8; + rounds = mixmode & 0xFF; + + /* NeoScrypt flow: Scrypt flow: + Xa ^= Xd; M(Xa'); Ya = Xa"; Xa ^= Xb; M(Xa'); Ya = Xa"; + Xb ^= Xa"; M(Xb'); Yb = Xb"; Xb ^= Xa"; M(Xb'); Yb = Xb"; + Xc ^= Xb"; M(Xc'); Yc = Xc"; Xa" = Ya; + Xd ^= Xc"; M(Xd'); Yd = Xd"; Xb" = Yb; + Xa" = Ya; Xb" = Yc; + Xc" = Yb; Xd" = Yd; */ + + if(r == 1) { + neoscrypt_blkxor(&X[0], &X[16], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[0], rounds); + else + neoscrypt_salsa(&X[0], rounds); + neoscrypt_blkxor(&X[16], &X[0], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[16], rounds); + else + neoscrypt_salsa(&X[16], rounds); + return; + } + + if(r == 2) { + neoscrypt_blkxor(&X[0], &X[48], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[0], rounds); + else + neoscrypt_salsa(&X[0], rounds); + neoscrypt_blkxor(&X[16], &X[0], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[16], rounds); + else + neoscrypt_salsa(&X[16], rounds); + neoscrypt_blkxor(&X[32], &X[16], 
SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[32], rounds); + else + neoscrypt_salsa(&X[32], rounds); + neoscrypt_blkxor(&X[48], &X[32], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[48], rounds); + else + neoscrypt_salsa(&X[48], rounds); + neoscrypt_blkswp(&X[16], &X[32], SCRYPT_BLOCK_SIZE); + return; + } + + /* Reference code for any reasonable r */ + for(i = 0; i < 2 * r; i++) { + if(i) neoscrypt_blkxor(&X[16 * i], &X[16 * (i - 1)], SCRYPT_BLOCK_SIZE); + else neoscrypt_blkxor(&X[0], &X[16 * (2 * r - 1)], SCRYPT_BLOCK_SIZE); + if(mixer) + neoscrypt_chacha(&X[16 * i], rounds); + else + neoscrypt_salsa(&X[16 * i], rounds); + neoscrypt_blkcpy(&Y[16 * i], &X[16 * i], SCRYPT_BLOCK_SIZE); + } + for(i = 0; i < r; i++) + neoscrypt_blkcpy(&X[16 * i], &Y[16 * 2 * i], SCRYPT_BLOCK_SIZE); + for(i = 0; i < r; i++) + neoscrypt_blkcpy(&X[16 * (i + r)], &Y[16 * (2 * i + 1)], SCRYPT_BLOCK_SIZE); +} + +/* NeoScrypt core engine: + * p = 1, salt = password; + * Basic customisation (required): + * profile bit 0: + * 0 = NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20; + * 1 = Scrypt(1024, 1, 1) with Salsa20/8; + * profile bits 4 to 1: + * 0000 = FastKDF-BLAKE2s; + * 0001 = PBKDF2-HMAC-SHA256; + * 0010 = PBKDF2-HMAC-BLAKE256; + * Extended customisation (optional): + * profile bit 31: + * 0 = extended customisation absent; + * 1 = extended customisation present; + * profile bits 7 to 5 (rfactor): + * 000 = r of 1; + * 001 = r of 2; + * 010 = r of 4; + * ... + * 111 = r of 128; + * profile bits 12 to 8 (Nfactor): + * 00000 = N of 2; + * 00001 = N of 4; + * 00010 = N of 8; + * ..... + * 00110 = N of 128; + * ..... + * 01001 = N of 1024; + * ..... + * 11110 = N of 2147483648; + * profile bits 30 to 13 are reserved */ +void neoscrypt(const uchar *password, uchar *output, uint profile) { + uint N = 128, r = 2, dblmix = 1, mixmode = 0x14, stack_align = 0x40; + uint kdf, i, j; + uint *X, *Y, *Z, *V; + + if(profile & 0x1) { + N = 1024; /* N = (1 << (Nfactor + 1)); */ + r = 1; /* r = (1 << rfactor); */ + dblmix = 0; /* Salsa only */ + mixmode = 0x08; /* 8 rounds */ + } + + if(profile >> 31) { + N = (1 << (((profile >> 8) & 0x1F) + 1)); + r = (1 << ((profile >> 5) & 0x7)); + } + + uchar *stack; + stack =(uchar*)malloc((N + 3) * r * 2 * SCRYPT_BLOCK_SIZE + stack_align); + + /* X = r * 2 * SCRYPT_BLOCK_SIZE */ + X = (uint *) &stack[stack_align & ~(stack_align - 1)]; + /* Z is a copy of X for ChaCha */ + Z = &X[32 * r]; + /* Y is an X sized temporal space */ + Y = &X[64 * r]; + /* V = N * r * 2 * SCRYPT_BLOCK_SIZE */ + V = &X[96 * r]; + + /* X = KDF(password, salt) */ + kdf = (profile >> 1) & 0xF; + + switch(kdf) { + + default: + case(0x0): + neoscrypt_fastkdf(password, 80, password, 80, 32, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE); + break; + +#if (SHA256) + case(0x1): + neoscrypt_pbkdf2_sha256(password, 80, password, 80, 1, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE); + break; +#endif + +#if (BLAKE256) + case(0x2): + neoscrypt_pbkdf2_blake256(password, 80, password, 80, 1, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE); + break; +#endif + + } + + /* Process ChaCha 1st, Salsa 2nd and XOR them into PBKDF2; otherwise Salsa only */ + + if(dblmix) { + /* blkcpy(Z, X) */ + neoscrypt_blkcpy(&Z[0], &X[0], r * 2 * SCRYPT_BLOCK_SIZE); + + /* Z = SMix(Z) */ + for(i = 0; i < N; i++) { + /* blkcpy(V, Z) */ + neoscrypt_blkcpy(&V[i * (32 * r)], &Z[0], r * 2 * SCRYPT_BLOCK_SIZE); + /* blkmix(Z, Y) */ + neoscrypt_blkmix(&Z[0], &Y[0], r, (mixmode | 0x0100)); + } + for(i = 0; i < N; i++) { + /* integerify(Z) mod N */ + j = (32 * 
r) * (Z[16 * (2 * r - 1)] & (N - 1)); + /* blkxor(Z, V) */ + neoscrypt_blkxor(&Z[0], &V[j], r * 2 * SCRYPT_BLOCK_SIZE); + /* blkmix(Z, Y) */ + neoscrypt_blkmix(&Z[0], &Y[0], r, (mixmode | 0x0100)); + } + } + +#if (ASM) + /* Must be called before and after SSE2 Salsa */ + neoscrypt_salsa_tangle(&X[0], r * 2); +#endif + + /* X = SMix(X) */ + for(i = 0; i < N; i++) { + /* blkcpy(V, X) */ + neoscrypt_blkcpy(&V[i * (32 * r)], &X[0], r * 2 * SCRYPT_BLOCK_SIZE); + /* blkmix(X, Y) */ + neoscrypt_blkmix(&X[0], &Y[0], r, mixmode); + } + for(i = 0; i < N; i++) { + /* integerify(X) mod N */ + j = (32 * r) * (X[16 * (2 * r - 1)] & (N - 1)); + /* blkxor(X, V) */ + neoscrypt_blkxor(&X[0], &V[j], r * 2 * SCRYPT_BLOCK_SIZE); + /* blkmix(X, Y) */ + neoscrypt_blkmix(&X[0], &Y[0], r, mixmode); + } + +#if (ASM) + neoscrypt_salsa_tangle(&X[0], r * 2); +#endif + + if(dblmix) + /* blkxor(X, Z) */ + neoscrypt_blkxor(&X[0], &Z[0], r * 2 * SCRYPT_BLOCK_SIZE); + + /* output = KDF(password, X) */ + switch(kdf) { + + default: + case(0x0): + neoscrypt_fastkdf(password, 80, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE, 32, output, 32); + break; + +#if (SHA256) + case(0x1): + neoscrypt_pbkdf2_sha256(password, 80, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE, 1, output, 32); + break; +#endif + +#if (BLAKE256) + case(0x2): + neoscrypt_pbkdf2_blake256(password, 80, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE, 1, output, 32); + break; +#endif + + } + + free(stack); +} + +void neoscrypt_regenhash(struct work *work) +{ + neoscrypt(work->data, work->hash, 0x80000620); +} + +#if (NEOSCRYPT_TEST) + +#include + +int main() { + uint prf_input_len = 64, prf_key_len = 32, prf_output_len = 32; + uint kdf_input_len = 80, kdf_output_len = 256, N = 32; + uint neoscrypt_output_len = 32; + uchar input[kdf_input_len], output[kdf_output_len]; + uint i; + ubool fail; + + for(i = 0; i < kdf_input_len; i++) { + input[i] = i; + } + + neoscrypt_blake2s(input, prf_input_len, input, prf_key_len, output, prf_output_len); + + uchar blake2s_ref[32] = { + 0x89, 0x75, 0xB0, 0x57, 0x7F, 0xD3, 0x55, 0x66, + 0xD7, 0x50, 0xB3, 0x62, 0xB0, 0x89, 0x7A, 0x26, + 0xC3, 0x99, 0x13, 0x6D, 0xF0, 0x7B, 0xAB, 0xAB, + 0xBD, 0xE6, 0x20, 0x3F, 0xF2, 0x95, 0x4E, 0xD4 }; + + for(i = 0, fail = 0; i < prf_output_len; i++) { + if(output[i] != blake2s_ref[i]) { + fail = 1; + break; + } + } + + if(fail) { + printf("BLAKE2s integrity test failed!\n"); + return(1); + } else { + printf("BLAKE2s integrity test passed.\n"); + } + + neoscrypt_fastkdf(input, kdf_input_len, input, kdf_input_len, N, output, kdf_output_len); + + uchar fastkdf_ref[256] = { + 0xCC, 0xBC, 0x19, 0x71, 0xEC, 0x44, 0xE3, 0x17, + 0xB3, 0xC9, 0xDE, 0x16, 0x76, 0x02, 0x60, 0xB8, + 0xE2, 0xD4, 0x79, 0xB6, 0x88, 0xCA, 0xB5, 0x4A, + 0xCF, 0x6E, 0x0E, 0x9A, 0xAE, 0x48, 0x78, 0x12, + 0xA1, 0x95, 0x1E, 0xE1, 0xD1, 0x0A, 0xC2, 0x94, + 0x1F, 0x0A, 0x39, 0x73, 0xFE, 0xA4, 0xCD, 0x87, + 0x4B, 0x38, 0x54, 0x72, 0xB5, 0x53, 0xC3, 0xEA, + 0xC1, 0x26, 0x8D, 0xA7, 0xFF, 0x3F, 0xC1, 0x79, + 0xA6, 0xFF, 0x96, 0x54, 0x29, 0x05, 0xC0, 0x22, + 0x90, 0xDB, 0x53, 0x87, 0x2D, 0x29, 0x00, 0xA6, + 0x14, 0x16, 0x38, 0x63, 0xDA, 0xBC, 0x0E, 0x99, + 0x68, 0xB3, 0x98, 0x92, 0x42, 0xE3, 0xF6, 0xB4, + 0x19, 0xE3, 0xE3, 0xF6, 0x8E, 0x67, 0x47, 0x7B, + 0xB6, 0xFB, 0xEA, 0xCE, 0x6D, 0x0F, 0xAF, 0xF6, + 0x19, 0x43, 0x8D, 0xF7, 0x3E, 0xB5, 0xFB, 0xA3, + 0x64, 0x5E, 0xD2, 0x72, 0x80, 0x6B, 0x39, 0x93, + 0xB7, 0x80, 0x04, 0xCB, 0xF5, 0xC2, 0x61, 0xB1, + 0x90, 0x4E, 0x2B, 0x02, 0x57, 0x53, 0x77, 0x16, + 0x6A, 0x52, 0xBD, 0xD1, 0x62, 0xEC, 0xA1, 0xCB, + 0x89, 0x03, 
0x29, 0xA2, 0x02, 0x5C, 0x9A, 0x62, + 0x99, 0x44, 0x54, 0xEA, 0x44, 0x91, 0x27, 0x3A, + 0x50, 0x82, 0x62, 0x03, 0x99, 0xB3, 0xFA, 0xF7, + 0xD4, 0x13, 0x47, 0x61, 0xFB, 0x0A, 0xE7, 0x81, + 0x61, 0x57, 0x58, 0x4C, 0x69, 0x4E, 0x67, 0x0A, + 0xC1, 0x21, 0xA7, 0xD2, 0xF6, 0x6D, 0x2F, 0x10, + 0x01, 0xFB, 0xA5, 0x47, 0x2C, 0xE5, 0x15, 0xD7, + 0x6A, 0xEF, 0xC9, 0xE2, 0xC2, 0x88, 0xA2, 0x3B, + 0x6C, 0x8D, 0xBB, 0x26, 0xE7, 0xC4, 0x15, 0xEC, + 0x5E, 0x5D, 0x74, 0x79, 0xBD, 0x81, 0x35, 0xA1, + 0x42, 0x27, 0xEB, 0x57, 0xCF, 0xF6, 0x2E, 0x51, + 0x90, 0xFD, 0xD9, 0xE4, 0x53, 0x6E, 0x12, 0xA1, + 0x99, 0x79, 0x4D, 0x29, 0x6F, 0x5B, 0x4D, 0x9A }; + + for(i = 0, fail = 0; i < kdf_output_len; i++) { + if(output[i] != fastkdf_ref[i]) { + fail = 1; + break; + } + } + + if(fail) { + printf("FastKDF integrity test failed!\n"); + return(1); + } else { + printf("FastKDF integrity test passed.\n"); + } + + neoscrypt(input, output, 0x80000620); + + uchar neoscrypt_ref[32] = { + 0x72, 0x58, 0x96, 0x1A, 0xFB, 0x33, 0xFD, 0x12, + 0xD0, 0x0C, 0xAC, 0xB8, 0xD6, 0x3F, 0x4F, 0x4F, + 0x52, 0xBB, 0x69, 0x17, 0x04, 0x38, 0x65, 0xDD, + 0x24, 0xA0, 0x8F, 0x57, 0x88, 0x53, 0x12, 0x2D }; + + for(i = 0, fail = 0; i < neoscrypt_output_len; i++) { + if(output[i] != neoscrypt_ref[i]) { + fail = 1; + break; + } + } + + if(fail) { + printf("NeoScrypt integrity test failed!\n"); + return(1); + } else { + printf("NeoScrypt integrity test passed.\n"); + } + + return(0); +} + +#endif \ No newline at end of file diff --git a/algorithm/neoscrypt.h b/algorithm/neoscrypt.h new file mode 100644 index 00000000..5337dac8 --- /dev/null +++ b/algorithm/neoscrypt.h @@ -0,0 +1,13 @@ +#ifndef NEOSCRYPT_H +#define NEOSCRYPT_H + +#include "miner.h" + +/* The neoscrypt scratch buffer needs 32kBytes memory. */ +#define NEOSCRYPT_SCRATCHBUF_SIZE (32 * 1024) + +/* These routines are always available. */ +extern void neoscrypt_regenhash(struct work *work); +extern void neoscrypt(const unsigned char *input, unsigned char *output, unsigned int profile); + +#endif /* NEOSCRYPT_H */ \ No newline at end of file diff --git a/kernel/neoscrypt.cl b/kernel/neoscrypt.cl new file mode 100644 index 00000000..7939d7ed --- /dev/null +++ b/kernel/neoscrypt.cl @@ -0,0 +1,525 @@ +/* NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20 */ +/* Adapted and improved for 14.x drivers by Wolf9466 (Wolf`) */ + +// Stupid AMD compiler ignores the unroll pragma in these two +#define SALSA_SMALL_UNROLL 3 +#define CHACHA_SMALL_UNROLL 3 + +// If SMALL_BLAKE2S is defined, BLAKE2S_UNROLL is interpreted +// as the unroll factor; must divide cleanly into ten. +// Usually a bad idea. 
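+// The kernel below hard-codes NeoScrypt(128, 2, 1), which is exactly what the
+// profile word 0x80000620 passed by neoscrypt_regenhash() selects. As a sanity
+// check against the bit layout documented in algorithm/neoscrypt.c above, the
+// following is a minimal standalone host-side sketch (illustrative only, not
+// part of this patch) that decodes that word:
+//
+//   #include <stdio.h>
+//
+//   int main(void) {
+//       unsigned int profile = 0x80000620;  /* value used by neoscrypt_regenhash() */
+//       unsigned int N = 128, r = 2;        /* defaults inside neoscrypt() */
+//
+//       if (profile >> 31) {                /* bit 31: extended customisation present */
+//           N = 1U << (((profile >> 8) & 0x1F) + 1);  /* Nfactor, bits 12..8 */
+//           r = 1U << ((profile >> 5) & 0x7);         /* rfactor, bits 7..5 */
+//       }
+//       unsigned int kdf = (profile >> 1) & 0xF;      /* 0 = FastKDF-BLAKE2s */
+//       unsigned int scrypt_compat = profile & 0x1;   /* 0 = dual Salsa/ChaCha mixing */
+//
+//       /* Prints N=128 r=2 kdf=0 scrypt=0, matching CONSTANT_N/CONSTANT_r below. */
+//       printf("N=%u r=%u kdf=%u scrypt=%u\n", N, r, kdf, scrypt_compat);
+//       return 0;
+//   }
+//
+// Any other profile value would make this kernel disagree with the CPU
+// verifier in algorithm/neoscrypt.c, since N and r are baked in here.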
+//#define SMALL_BLAKE2S +//#define BLAKE2S_UNROLL 5 + +#define BLOCK_SIZE 64U +#define FASTKDF_BUFFER_SIZE 256U +#ifndef PASSWORD_LEN +#define PASSWORD_LEN 80U +#endif + +#if !defined(cl_khr_byte_addressable_store) +#error "Device does not support unaligned stores" +#endif + +// Swaps 128 bytes at a time without using temp vars +void SwapBytes128(void *restrict A, void *restrict B, uint len) +{ + #pragma unroll 2 + for(int i = 0; i < (len >> 7); ++i) + { + ((ulong16 *)A)[i] ^= ((ulong16 *)B)[i]; + ((ulong16 *)B)[i] ^= ((ulong16 *)A)[i]; + ((ulong16 *)A)[i] ^= ((ulong16 *)B)[i]; + } +} + +void CopyBytes128(void *restrict dst, const void *restrict src, uint len) +{ + #pragma unroll 2 + for(int i = 0; i < len; ++i) + ((ulong16 *)dst)[i] = ((ulong16 *)src)[i]; +} + +void CopyBytes(void *restrict dst, const void *restrict src, uint len) +{ + for(int i = 0; i < len; ++i) + ((uchar *)dst)[i] = ((uchar *)src)[i]; +} + +void XORBytesInPlace(void *restrict dst, const void *restrict src, uint len) +{ + for(int i = 0; i < len; ++i) + ((uchar *)dst)[i] ^= ((uchar *)src)[i]; +} + +void XORBytes(void *restrict dst, const void *restrict src1, const void *restrict src2, uint len) +{ + #pragma unroll 1 + for(int i = 0; i < len; ++i) + ((uchar *)dst)[i] = ((uchar *)src1)[i] ^ ((uchar *)src2)[i]; +} + +// Blake2S + +#define BLAKE2S_BLOCK_SIZE 64U +#define BLAKE2S_OUT_SIZE 32U +#define BLAKE2S_KEY_SIZE 32U + +static const __constant uint BLAKE2S_IV[8] = +{ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, + 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 +}; + +static const __constant uchar BLAKE2S_SIGMA[10][16] = +{ + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , +}; + +#define BLAKE_G(idx0, idx1, a, b, c, d, key) do { \ + a += b + key[BLAKE2S_SIGMA[idx0][idx1]]; \ + d = rotate(d ^ a, 16U); \ + c += d; \ + b = rotate(b ^ c, 20U); \ + a += b + key[BLAKE2S_SIGMA[idx0][idx1 + 1]]; \ + d = rotate(d ^ a, 24U); \ + c += d; \ + b = rotate(b ^ c, 25U); \ +} while(0) + +void Blake2S(uint *restrict inout, const uint *restrict inkey) +{ + uint16 V; + uint8 tmpblock; + + // Load first block (IV into V.lo) and constants (IV into V.hi) + V.lo = V.hi = vload8(0U, BLAKE2S_IV); + + // XOR with initial constant + V.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + // XOR length of message so far (including this block) + // There are two uints for this field, but high uint is zero + V.sc ^= BLAKE2S_BLOCK_SIZE; + + // Compress state, using the key as the key + #ifdef SMALL_BLAKE2S + #pragma unroll BLAKE2S_UNROLL + #else + #pragma unroll + #endif + for(int x = 0; x < 10; ++x) + { + BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inkey); + BLAKE_G(x, 0x02, V.s1, V.s5, V.s9, V.sd, inkey); + BLAKE_G(x, 0x04, V.s2, V.s6, V.sa, V.se, inkey); + BLAKE_G(x, 0x06, V.s3, V.s7, V.sb, V.sf, inkey); + BLAKE_G(x, 0x08, V.s0, V.s5, V.sa, V.sf, inkey); + BLAKE_G(x, 0x0A, V.s1, V.s6, V.sb, V.sc, inkey); + BLAKE_G(x, 0x0C, V.s2, V.s7, V.s8, V.sd, inkey); + BLAKE_G(x, 0x0E, 
V.s3, V.s4, V.s9, V.se, inkey); + } + + // XOR low part of state with the high part, + // then with the original input block. + V.lo ^= V.hi ^ tmpblock; + + // Load constants (IV into V.hi) + V.hi = vload8(0U, BLAKE2S_IV); + + // Copy input block for later + tmpblock = V.lo; + + // XOR length of message into block again + V.sc ^= BLAKE2S_BLOCK_SIZE << 1; + + // Last block compression - XOR final constant into state + V.se ^= 0xFFFFFFFFU; + + // Compress block, using the input as the key + #ifdef SMALL_BLAKE2S + #pragma unroll BLAKE2S_UNROLL + #else + #pragma unroll + #endif + for(int x = 0; x < 10; ++x) + { + BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inout); + BLAKE_G(x, 0x02, V.s1, V.s5, V.s9, V.sd, inout); + BLAKE_G(x, 0x04, V.s2, V.s6, V.sa, V.se, inout); + BLAKE_G(x, 0x06, V.s3, V.s7, V.sb, V.sf, inout); + BLAKE_G(x, 0x08, V.s0, V.s5, V.sa, V.sf, inout); + BLAKE_G(x, 0x0A, V.s1, V.s6, V.sb, V.sc, inout); + BLAKE_G(x, 0x0C, V.s2, V.s7, V.s8, V.sd, inout); + BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inout); + } + + // XOR low part of state with high part, then with input block + V.lo ^= V.hi ^ tmpblock; + + // Store result in input/output buffer + vstore8(V.lo, 0, inout); +} + +/* FastKDF, a fast buffered key derivation function: + * FASTKDF_BUFFER_SIZE must be a power of 2; + * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE; + * prf_output_size must be <= prf_key_size; */ +void fastkdf(const uchar *restrict password, const uchar *restrict salt, const uint salt_len, uchar *restrict output, uint output_len) +{ + + /* WARNING! + * This algorithm uses byte-wise addressing for memory blocks. + * Or in other words, trying to copy an unaligned memory region + * will significantly slow down the algorithm, when copying uses + * words or bigger entities. It even may corrupt the data, when + * the device does not support it properly. + * Therefore use byte copying, which will not the fastest but at + * least get reliable results. */ + + // BLOCK_SIZE 64U + // FASTKDF_BUFFER_SIZE 256U + // BLAKE2S_BLOCK_SIZE 64U + // BLAKE2S_KEY_SIZE 32U + // BLAKE2S_OUT_SIZE 32U + uchar bufidx = 0; + uint8 Abuffer[9], Bbuffer[9] = { (uint8)(0) }; + uchar *A = (uchar *)Abuffer, *B = (uchar *)Bbuffer; + + // Initialize the password buffer + #pragma unroll 1 + for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i) ((ulong *)A)[i] = ((ulong *)password)[i % 10]; + + ((uint16 *)(A + FASTKDF_BUFFER_SIZE))[0] = ((uint16 *)password)[0]; + + // Initialize the salt buffer + if(salt_len == FASTKDF_BUFFER_SIZE) + { + ((ulong16 *)B)[0] = ((ulong16 *)B)[2] = ((ulong16 *)salt)[0]; + ((ulong16 *)B)[1] = ((ulong16 *)B)[3] = ((ulong16 *)salt)[1]; + } + else + { + // salt_len is 80 bytes here + #pragma unroll 1 + for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i) ((ulong *)B)[i] = ((ulong *)salt)[i % 10]; + + // Initialized the rest to zero earlier + #pragma unroll 1 + for(int i = 0; i < 10; ++i) ((ulong *)(B + FASTKDF_BUFFER_SIZE))[i] = ((ulong *)salt)[i]; + } + + // The primary iteration + #pragma unroll 1 + for(int i = 0; i < 32; ++i) + { + // Make the key buffer twice the size of the key so it fits a Blake2S block + // This way, we don't need a temp buffer in the Blake2S function. 
+ uchar input[BLAKE2S_BLOCK_SIZE], key[BLAKE2S_BLOCK_SIZE] = { 0 }; + + // Copy input and key to their buffers + CopyBytes(input, A + bufidx, BLAKE2S_BLOCK_SIZE); + CopyBytes(key, B + bufidx, BLAKE2S_KEY_SIZE); + + // PRF + Blake2S((uint *)input, (uint *)key); + + // Calculate the next buffer pointer + bufidx = 0; + + for(int x = 0; x < BLAKE2S_OUT_SIZE; ++x) + bufidx += input[x]; + + // bufidx a uchar now - always mod 255 + //bufidx &= (FASTKDF_BUFFER_SIZE - 1); + + // Modify the salt buffer + XORBytesInPlace(B + bufidx, input, BLAKE2S_OUT_SIZE); + + if(bufidx < BLAKE2S_KEY_SIZE) + { + // Head modified, tail updated + // this was made off the original code... wtf + //CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, min(BLAKE2S_OUT_SIZE, BLAKE2S_KEY_SIZE - bufidx)); + CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, BLAKE2S_KEY_SIZE - bufidx); + } + else if((FASTKDF_BUFFER_SIZE - bufidx) < BLAKE2S_OUT_SIZE) + { + // Tail modified, head updated + CopyBytes(B, B + FASTKDF_BUFFER_SIZE, BLAKE2S_OUT_SIZE - (FASTKDF_BUFFER_SIZE - bufidx)); + } + } + + // Modify and copy into the output buffer + + // Damned compiler crashes + // Fuck you, AMD + + //for(uint i = 0; i < output_len; ++i, ++bufidx) + // output[i] = B[bufidx] ^ A[i]; + + uint left = FASTKDF_BUFFER_SIZE - bufidx; + //uint left = (~bufidx) + 1 + + if(left < output_len) + { + XORBytes(output, B + bufidx, A, left); + XORBytes(output + left, B, A + left, output_len - left); + } + else + { + XORBytes(output, B + bufidx, A, output_len); + } +} + +#define SALSA_CORE(state) do { \ + state.s4 ^= rotate(state.s0 + state.sc, 7U); state.s8 ^= rotate(state.s4 + state.s0, 9U); state.sc ^= rotate(state.s8 + state.s4, 13U); state.s0 ^= rotate(state.sc + state.s8, 18U); \ + state.s9 ^= rotate(state.s5 + state.s1, 7U); state.sd ^= rotate(state.s9 + state.s5, 9U); state.s1 ^= rotate(state.sd + state.s9, 13U); state.s5 ^= rotate(state.s1 + state.sd, 18U); \ + state.se ^= rotate(state.sa + state.s6, 7U); state.s2 ^= rotate(state.se + state.sa, 9U); state.s6 ^= rotate(state.s2 + state.se, 13U); state.sa ^= rotate(state.s6 + state.s2, 18U); \ + state.s3 ^= rotate(state.sf + state.sb, 7U); state.s7 ^= rotate(state.s3 + state.sf, 9U); state.sb ^= rotate(state.s7 + state.s3, 13U); state.sf ^= rotate(state.sb + state.s7, 18U); \ + state.s1 ^= rotate(state.s0 + state.s3, 7U); state.s2 ^= rotate(state.s1 + state.s0, 9U); state.s3 ^= rotate(state.s2 + state.s1, 13U); state.s0 ^= rotate(state.s3 + state.s2, 18U); \ + state.s6 ^= rotate(state.s5 + state.s4, 7U); state.s7 ^= rotate(state.s6 + state.s5, 9U); state.s4 ^= rotate(state.s7 + state.s6, 13U); state.s5 ^= rotate(state.s4 + state.s7, 18U); \ + state.sb ^= rotate(state.sa + state.s9, 7U); state.s8 ^= rotate(state.sb + state.sa, 9U); state.s9 ^= rotate(state.s8 + state.sb, 13U); state.sa ^= rotate(state.s9 + state.s8, 18U); \ + state.sc ^= rotate(state.sf + state.se, 7U); state.sd ^= rotate(state.sc + state.sf, 9U); state.se ^= rotate(state.sd + state.sc, 13U); state.sf ^= rotate(state.se + state.sd, 18U); \ +} while(0) + +uint16 salsa_small_scalar_rnd(uint16 X) +{ + uint16 st = X; + + #if SALSA_SMALL_UNROLL == 1 + + for(int i = 0; i < 10; ++i) + { + SALSA_CORE(st); + } + + #elif SALSA_SMALL_UNROLL == 2 + + for(int i = 0; i < 5; ++i) + { + SALSA_CORE(st); + SALSA_CORE(st); + } + + #elif SALSA_SMALL_UNROLL == 3 + + for(int i = 0; i < 4; ++i) + { + SALSA_CORE(st); + if(i == 3) break; + SALSA_CORE(st); + SALSA_CORE(st); + } + + #elif SALSA_SMALL_UNROLL == 4 + + for(int i = 0; i < 3; ++i) + { + 
SALSA_CORE(st); + SALSA_CORE(st); + if(i == 2) break; + SALSA_CORE(st); + SALSA_CORE(st); + } + + #else + + for(int i = 0; i < 2; ++i) + { + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + } + + #endif + + return(X + st); +} + +#define CHACHA_CORE_PARALLEL(state) do { \ + state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], (uint4)(16U, 16U, 16U, 16U)); \ + state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], (uint4)(12U, 12U, 12U, 12U)); \ + state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], (uint4)(8U, 8U, 8U, 8U)); \ + state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], (uint4)(7U, 7U, 7U, 7U)); \ + \ + state[0] += state[1].yzwx; state[3].wxyz = rotate(state[3].wxyz ^ state[0], (uint4)(16U, 16U, 16U, 16U)); \ + state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, (uint4)(12U, 12U, 12U, 12U)); \ + state[0] += state[1].yzwx; state[3].wxyz = rotate(state[3].wxyz ^ state[0], (uint4)(8U, 8U, 8U, 8U)); \ + state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, (uint4)(7U, 7U, 7U, 7U)); \ +} while(0) + +uint16 chacha_small_parallel_rnd(uint16 X) +{ + uint4 t, st[4]; + + ((uint16 *)st)[0] = X; + + #if CHACHA_SMALL_UNROLL == 1 + + for(int i = 0; i < 10; ++i) + { + CHACHA_CORE_PARALLEL(st); + } + + #elif CHACHA_SMALL_UNROLL == 2 + + for(int i = 0; i < 5; ++i) + { + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + } + + #elif CHACHA_SMALL_UNROLL == 3 + + for(int i = 0; i < 4; ++i) + { + CHACHA_CORE_PARALLEL(st); + if(i == 3) break; + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + } + + #elif CHACHA_SMALL_UNROLL == 4 + + for(int i = 0; i < 3; ++i) + { + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + if(i == 2) break; + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + } + + #else + + for(int i = 0; i < 2; ++i) + { + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + } + + #endif + + return(X + ((uint16 *)st)[0]); +} + +void neoscrypt_blkmix(uint16 *XV, bool alg) +{ + + /* NeoScrypt flow: Scrypt flow: + Xa ^= Xd; M(Xa'); Ya = Xa"; Xa ^= Xb; M(Xa'); Ya = Xa"; + Xb ^= Xa"; M(Xb'); Yb = Xb"; Xb ^= Xa"; M(Xb'); Yb = Xb"; + Xc ^= Xb"; M(Xc'); Yc = Xc"; Xa" = Ya; + Xd ^= Xc"; M(Xd'); Yd = Xd"; Xb" = Yb; + Xa" = Ya; Xb" = Yc; + Xc" = Yb; Xd" = Yd; */ + + XV[0] ^= XV[3]; + + if(!alg) + { + XV[0] = salsa_small_scalar_rnd(XV[0]); XV[1] ^= XV[0]; + XV[1] = salsa_small_scalar_rnd(XV[1]); XV[2] ^= XV[1]; + XV[2] = salsa_small_scalar_rnd(XV[2]); XV[3] ^= XV[2]; + XV[3] = salsa_small_scalar_rnd(XV[3]); + } + else + { + XV[0] = chacha_small_parallel_rnd(XV[0]); XV[1] ^= XV[0]; + XV[1] = chacha_small_parallel_rnd(XV[1]); XV[2] ^= XV[1]; + XV[2] = chacha_small_parallel_rnd(XV[2]); XV[3] ^= XV[2]; + XV[3] = chacha_small_parallel_rnd(XV[3]); + } + + XV[1] ^= XV[2]; + XV[2] ^= XV[1]; + XV[1] ^= XV[2]; +} + +void ScratchpadStore(__global void *V, void *X, uchar idx) +{ + ((__global ulong16 *)V)[idx << 1] = ((ulong16 *)X)[0]; + ((__global ulong16 *)V)[(idx << 1) + 1] = ((ulong16 *)X)[1]; +} + +void ScratchpadMix(void *X, const __global void *V, uchar idx) +{ + ((ulong16 *)X)[0] ^= ((__global ulong16 *)V)[idx << 1]; + ((ulong16 *)X)[1] ^= ((__global ulong16 *)V)[(idx << 1) + 1]; +} + +void SMix(uint16 *X, __global uint16 *V, bool flag) +{ + #pragma unroll 1 + for(int i = 0; i < 128; ++i) + { + ScratchpadStore(V, X, i); + neoscrypt_blkmix(X, flag); + } + + #pragma 
+  for(int i = 0; i < 128; ++i)
+  {
+    // The low 7 bits of the first word of X[3] select one of the 128
+    // scratchpad entries to mix back in
+    const uint idx = convert_uchar(((uint *)X)[48] & 0x7F);
+    ScratchpadMix(X, V, idx);
+    neoscrypt_blkmix(X, flag);
+  }
+}
+
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+__kernel void search(__global const uchar* restrict input, __global uint* restrict output, __global uchar *padcache, const uint target)
+{
+#define CONSTANT_N 128
+#define CONSTANT_r 2
+  // X = CONSTANT_r * 2 * BLOCK_SIZE(64); Z is a copy of X for ChaCha
+  uint16 X[4], Z[4];
+  /* V = CONSTANT_N * CONSTANT_r * 2 * BLOCK_SIZE */
+  __global ulong16 *V = (__global ulong16 *)(padcache + (0x8000 * (get_global_id(0) % MAX_GLOBAL_THREADS)));
+  uchar outbuf[32];
+  uchar data[PASSWORD_LEN];
+
+  ((ulong8 *)data)[0] = ((__global const ulong8 *)input)[0];
+  ((ulong *)data)[8] = ((__global const ulong *)input)[8];
+  ((uint *)data)[18] = ((__global const uint *)input)[18];
+  ((uint *)data)[19] = get_global_id(0);
+
+  // X = KDF(password, salt)
+  fastkdf(data, data, PASSWORD_LEN, (uchar *)X, 256);
+
+  // Keep a second copy in Z: one copy is mixed with Salsa, the other
+  // with ChaCha, and the results are XORed before the final FastKDF pass
+  CopyBytes128(Z, X, 2);
+
+  // X = SMix(X); X & Z are swapped, repeat.
+  for(bool flag = false;; ++flag)
+  {
+    SMix(X, V, flag);
+    if(flag) break;
+    SwapBytes128(X, Z, 256);
+  }
+
+  // blkxor(X, Z)
+  ((ulong16 *)X)[0] ^= ((ulong16 *)Z)[0];
+  ((ulong16 *)X)[1] ^= ((ulong16 *)Z)[1];
+
+  // output = KDF(password, X)
+  fastkdf(data, (uchar *)X, FASTKDF_BUFFER_SIZE, outbuf, 32);
+  if(((uint *)outbuf)[7] <= target) output[atomic_add(output + 0xFF, 1)] = get_global_id(0);
+}
\ No newline at end of file
diff --git a/miner.h b/miner.h
index c8190bd5..0a366e7f 100644
--- a/miner.h
+++ b/miner.h
@@ -1100,6 +1100,7 @@ extern pthread_cond_t restart_cond;
 extern void clear_stratum_shares(struct pool *pool);
 extern void clear_pool_work(struct pool *pool);
 extern void set_target(unsigned char *dest_target, double diff, double diff_multiplier2);
+extern void set_target_neoscrypt(unsigned char *target, double diff);
 
 extern void kill_work(void);
 
diff --git a/ocl.c b/ocl.c
index ea458143..8be317a5 100644
--- a/ocl.c
+++ b/ocl.c
@@ -34,6 +34,7 @@
 #include "ocl.h"
 #include "ocl/build_kernel.h"
 #include "ocl/binary_kernel.h"
+#include "algorithm/neoscrypt.h"
 
 /* FIXME: only here for global config vars, replace with configuration.h
  * or similar as soon as config is in a struct instead of littered all
@@ -344,19 +345,55 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
     cgpu->lookup_gap = 2;
   }
 
-  if (!cgpu->opt_tc) {
+  // neoscrypt calculates TC differently
+  if (!safe_cmp(cgpu->algorithm.name, "neoscrypt")) {
+    int max_int = ((cgpu->dynamic) ? MAX_INTENSITY : cgpu->intensity);
+    size_t glob_thread_count = 1UL << max_int;
+
+    // if TC is entered by user, use that value... otherwise use default
+    cgpu->thread_concurrency = ((cgpu->opt_tc) ? cgpu->opt_tc : ((glob_thread_count < cgpu->work_size) ? cgpu->work_size : glob_thread_count));
+
+    // if TC * scratchbuf size is too big for memory... reduce to max
+    if (((uint64_t)cgpu->thread_concurrency * NEOSCRYPT_SCRATCHBUF_SIZE) > (uint64_t)cgpu->max_alloc) {
+      /* Selected intensity will not run on this GPU. Not enough memory.
+       * Adapt the memory setting. */
+      glob_thread_count = cgpu->max_alloc / NEOSCRYPT_SCRATCHBUF_SIZE;
+
+      /* Find highest significant bit in glob_thread_count, which gives
+       * the intensity. */
+      while (max_int && ((1U << max_int) & glob_thread_count) == 0) {
+        --max_int;
+      }
+
+      /* Make sure the resulting intensity is not below the minimum. */
+      if (max_int < MIN_INTENSITY) {
+        applog(LOG_ERR, "GPU %d: Max intensity is below minimum.", gpu);
+        max_int = MIN_INTENSITY;
+      }
+
+      cgpu->intensity = max_int;
+      cgpu->thread_concurrency = 1U << max_int;
+    }
+
+    applog(LOG_DEBUG, "GPU %d: using max. global thread count of %u", gpu, (unsigned)(cgpu->thread_concurrency));
+
+  }
+  else if (!cgpu->opt_tc) {
     unsigned int sixtyfours;
 
     sixtyfours = cgpu->max_alloc / 131072 / 64 / (algorithm->n/1024) - 1;
     cgpu->thread_concurrency = sixtyfours * 64;
     if (cgpu->shaders && cgpu->thread_concurrency > cgpu->shaders) {
       cgpu->thread_concurrency -= cgpu->thread_concurrency % cgpu->shaders;
-      if (cgpu->thread_concurrency > cgpu->shaders * 5)
+      if (cgpu->thread_concurrency > cgpu->shaders * 5) {
         cgpu->thread_concurrency = cgpu->shaders * 5;
+      }
     }
     applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %d", gpu, (int)(cgpu->thread_concurrency));
-  } else
+  }
+  else {
     cgpu->thread_concurrency = cgpu->opt_tc;
+  }
 
   cl_uint slot, cpnd;
 
@@ -445,17 +482,36 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
   }
 
   size_t bufsize;
+  size_t readbufsize = 128;
 
   if (algorithm->rw_buffer_size < 0) {
-    size_t ipt = (algorithm->n / cgpu->lookup_gap +
-          (algorithm->n % cgpu->lookup_gap > 0));
-    bufsize = 128 * ipt * cgpu->thread_concurrency;
-  } else
-    bufsize = (size_t) algorithm->rw_buffer_size;
+    // calc buffer size for neoscrypt
+    if (!safe_cmp(algorithm->name, "neoscrypt")) {
+      /* The scratch/pad-buffer needs 32kBytes memory per thread. */
+      bufsize = NEOSCRYPT_SCRATCHBUF_SIZE * cgpu->thread_concurrency;
+
+      /* This is the input buffer. For neoscrypt this is guaranteed to be
+       * 80 bytes only. */
+      readbufsize = 80;
+
+      applog(LOG_DEBUG, "Neoscrypt buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
+      // scrypt/n-scrypt
+    }
+    else {
+      size_t ipt = (algorithm->n / cgpu->lookup_gap + (algorithm->n % cgpu->lookup_gap > 0));
+      bufsize = 128 * ipt * cgpu->thread_concurrency;
+      applog(LOG_DEBUG, "Scrypt buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
+    }
+  }
+  else {
+    bufsize = (size_t)algorithm->rw_buffer_size;
+    applog(LOG_DEBUG, "Buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
+  }
 
   clState->padbuffer8 = NULL;
 
   if (bufsize > 0) {
+    applog(LOG_DEBUG, "Creating read/write buffer sized %lu", (unsigned long)bufsize);
    /* Use the max alloc value which has been rounded to a power of
     * 2 greater >= required amount earlier */
    if (bufsize > cgpu->max_alloc) {
@@ -463,7 +519,6 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
         gpu, (unsigned long)(cgpu->max_alloc));
       applog(LOG_WARNING, "Your settings come to %lu", (unsigned long)bufsize);
     }
-    applog(LOG_DEBUG, "Creating buffer sized %lu", (unsigned long)bufsize);
 
     /* This buffer is weird and might work to some degree even if
      * the create buffer call has apparently failed, so check if we
@@ -475,11 +530,14 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
     }
   }
 
-  clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 128, NULL, &status);
+  applog(LOG_DEBUG, "Using read buffer sized %lu", (unsigned long)readbufsize);
+  clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, readbufsize, NULL, &status);
   if (status != CL_SUCCESS) {
     applog(LOG_ERR, "Error %d: clCreateBuffer (CLbuffer0)", status);
     return NULL;
   }
+
+  applog(LOG_DEBUG, "Using output buffer sized %lu", (unsigned long)BUFFERSIZE);
 
   clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status);
   if (status != CL_SUCCESS) {
diff --git a/sgminer.c b/sgminer.c
index 0ad3fe69..202de0b9 100644
--- a/sgminer.c
+++ b/sgminer.c
@@ -2019,11 +2019,26 @@ static void update_gbt(struct pool *pool)
 /* Return the work coin/network difficulty */
 static double get_work_blockdiff(const struct work *work)
 {
-  uint8_t pow = work->data[72];
-  int powdiff = (8 * (0x1d - 3)) - (8 * (pow - 3));
-  uint32_t diff32 = be32toh(*((uint32_t *)(work->data + 72))) & 0x00FFFFFF;
-  double numerator = work->pool->algorithm.diff_numerator << powdiff;
-  return numerator / (double)diff32;
+  uint64_t diff64;
+  double numerator;
+
+  // Neoscrypt has the data reversed
+  if (!safe_cmp(work->pool->algorithm.name, "neoscrypt")) {
+    diff64 = bswap_64(((uint64_t)(be32toh(*((uint32_t *)(work->data + 72))) & 0xFFFFFF00)) << 8);
+    numerator = (double)work->pool->algorithm.diff_numerator;
+  }
+  else {
+    uint8_t pow = work->data[72];
+    int powdiff = (8 * (0x1d - 3)) - (8 * (pow - 3));
+    diff64 = be32toh(*((uint32_t *)(work->data + 72))) & 0x0000000000FFFFFF;
+    numerator = work->pool->algorithm.diff_numerator << powdiff;
+  }
+
+  if (unlikely(!diff64)) {
+    diff64 = 1;
+  }
+
+  return numerator / (double)diff64;
 }
 
 static void gen_gbt_work(struct pool *pool, struct work *work)
@@ -2073,7 +2088,10 @@ static void gen_gbt_work(struct pool *pool, struct work *work)
     free(header);
   }
 
-  calc_midstate(work);
+  // Neoscrypt doesn't use a midstate
+  if (safe_cmp(pool->algorithm.name, "neoscrypt")) {
+    calc_midstate(work);
+  }
   local_work++;
   work->pool = pool;
   work->gbt = true;
@@ -2189,10 +2207,15 @@ static bool getwork_decode(json_t *res_val, struct work *work)
     return false;
   }
 
-  if (!jobj_binary(res_val, "midstate", work->midstate, sizeof(work->midstate), false)) {
-    // Calculate it ourselves
-    applog(LOG_DEBUG, "%s: Calculating midstate locally", isnull(get_pool_name(work->pool), ""));
-    calc_midstate(work);
+  // Neoscrypt doesn't use a midstate
+  if (safe_cmp(work->pool->algorithm.name, "neoscrypt")) {
+    if (!jobj_binary(res_val, "midstate", work->midstate, sizeof(work->midstate), false)) {
+      // Calculate it ourselves
+      if (opt_morenotices) {
+        applog(LOG_DEBUG, "%s: Calculating midstate locally", isnull(get_pool_name(work->pool), ""));
+      }
+      calc_midstate(work);
+    }
   }
 
   if (unlikely(!jobj_binary(res_val, "target", work->target, sizeof(work->target), true))) {
@@ -2936,8 +2959,8 @@ static bool submit_upstream_work(struct work *work, CURL *curl, char *curl_err_s
 
   endian_flip128(work->data, work->data);
 
-  /* build hex string */
-  hexstr = bin2hex(work->data, sizeof(work->data));
+  /* build hex string - make sure to restrict to 80 bytes for Neoscrypt */
+  hexstr = bin2hex(work->data, ((!safe_cmp(work->pool->algorithm.name, "neoscrypt")) ? 80 : sizeof(work->data)));
 
   /* build JSON-RPC request */
   if (work->gbt) {
@@ -3304,11 +3327,19 @@ static void calc_diff(struct work *work, double known)
 
     d64 = work->pool->algorithm.diff_multiplier2 * truediffone;
 
-    dcut64 = le256todouble(work->target);
+    applog(LOG_DEBUG, "calc_diff() algorithm = %s", work->pool->algorithm.name);
+    // Neoscrypt: read the upper part of the target directly
+    if (!safe_cmp(work->pool->algorithm.name, "neoscrypt")) {
+      dcut64 = (double)*((uint64_t *)(work->target + 22));
+    }
+    else {
+      dcut64 = le256todouble(work->target);
+    }
     if (unlikely(!dcut64))
       dcut64 = 1;
     work->work_difficulty = d64 / dcut64;
   }
+
   difficulty = work->work_difficulty;
 
   pool_stats->last_diff = difficulty;
@@ -5465,8 +5496,21 @@ static void *stratum_sthread(void *userdata)
     sshare->sshare_time = time(NULL);
     /* This work item is freed in parse_stratum_response */
     sshare->work = work;
-    nonce = *((uint32_t *)(work->data + 76));
+
+    applog(LOG_DEBUG, "stratum_sthread() algorithm = %s", pool->algorithm.name);
+
+    // Neoscrypt keeps the nonce little-endian, so byte-swap it for submission
+    if (!safe_cmp(pool->algorithm.name, "neoscrypt")) {
+      nonce = htobe32(*((uint32_t *)(work->data + 76)));
+      //*((uint32_t *)nonce2) = htole32(work->nonce2);
+    }
+    else {
+      nonce = *((uint32_t *)(work->data + 76));
+    }
    __bin2hex(noncehex, (const unsigned char *)&nonce, 4);
+
+    *((uint64_t *)nonce2) = htole64(work->nonce2);
+    __bin2hex(nonce2hex, nonce2, work->nonce2_len);
 
     memset(s, 0, 1024);
     mutex_lock(&sshare_lock);
@@ -5474,10 +5518,6 @@
     sshare->id = swork_id++;
     mutex_unlock(&sshare_lock);
 
-    nonce2_64 = (uint64_t *)nonce2;
-    *nonce2_64 = htole64(work->nonce2);
-    __bin2hex(nonce2hex, nonce2, work->nonce2_len);
-
     snprintf(s, sizeof(s),
       "{\"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\": %d, \"method\": \"mining.submit\"}",
       pool->rpc_user, work->job_id, nonce2hex, work->ntime, noncehex, sshare->id);
@@ -5885,6 +5925,50 @@ void set_target(unsigned char *dest_target, double diff, double diff_multiplier2
   memcpy(dest_target, target, 32);
 }
 
+/*****************************************************
+* Special set_target() function for Neoscrypt
+****************************************************/
+void set_target_neoscrypt(unsigned char *target, double diff)
+{
+  uint64_t m;
+  int k;
+
+  /* Fold out the 2^16 difficulty scaling, then drop one 32-bit word
+   * per 2^32 until the remaining difficulty fits. */
+  diff /= 65536.0;
+  for (k = 6; k > 0 && diff > 1.0; --k) {
+    diff /= 4294967296.0;
+  }
+
+  /* 4294901760 = 0xFFFF0000 */
+  m = 4294901760.0 / diff;
+
+  if (m == 0 && k == 6) {
+    memset(target, 0xff, 32);
+  }
+  else {
+    memset(target, 0, 32);
+    ((uint32_t *)target)[k] = (uint32_t)m;
+    ((uint32_t *)target)[k + 1] = (uint32_t)(m >> 32);
+  }
+
+  if (opt_debug) {
+    /* The target is computed in this system's endianness and stored
+     * in that endianness at the uint32 level. But because the target
+     * consists of eight uint32s, it is stored in mixed mode, i.e., each
+     * uint32 is kept in the local endianness while the least significant
+     * bit lives in target[0] bit 0.
+     *
+     * To print this large number in a human-readable form, the order of
+     * the array entries is swapped, i.e., target[7] <-> target[0], and
+     * each entry is byte-swapped so the least significant bit ends up
+     * on the right. */
+    uint32_t swapped[8];
+    swab256(swapped, target);
+    char *htarget = bin2hex((unsigned char *)swapped, 32);
+
+    applog(LOG_DEBUG, "Generated neoscrypt target 0x%s", htarget);
+    free(htarget);
+  }
+}
+
 /* Generates stratum based work based on the most recent notify information
 * from the pool. This will keep generating work while a pool is down so we use
 * other means to detect when the pool has died in stratum_thread */
@@ -5893,12 +5977,12 @@ static void gen_stratum_work(struct pool *pool, struct work *work)
 {
   unsigned char merkle_root[32], merkle_sha[64];
   uint32_t *data32, *swap32;
   uint64_t nonce2le;
-  int i;
+  int i, j;
 
   cg_wlock(&pool->data_lock);
 
   /* Update coinbase. Always use an LE encoded nonce2 to fill in values
-   * from left to right and prevent overflow errors with small n2sizes */
+  * from left to right and prevent overflow errors with small n2sizes */
   nonce2le = htole64(pool->nonce2);
   memcpy(pool->coinbase + pool->nonce2_offset, &nonce2le, pool->n2size);
   work->nonce2 = pool->nonce2++;
@@ -5915,16 +5999,50 @@
     gen_hash(merkle_sha, 64, merkle_root);
     memcpy(merkle_sha, merkle_root, 32);
   }
-  data32 = (uint32_t *)merkle_sha;
-  swap32 = (uint32_t *)merkle_root;
-  flip32(swap32, data32);
-
-  /* Copy the data template from header_bin */
-  memcpy(work->data, pool->header_bin, 128);
-  memcpy(work->data + pool->merkle_offset, merkle_root, 32);
+
+  applog(LOG_DEBUG, "gen_stratum_work() - algorithm = %s", pool->algorithm.name);
+
+  // Different for Neoscrypt because of little-endian encoding
+  if (!safe_cmp(pool->algorithm.name, "neoscrypt")) {
+    /* Incoming data is in little endian. */
+    memcpy(merkle_root, merkle_sha, 32);
+
+    uint32_t temp = pool->merkle_offset / sizeof(uint32_t), i;
+    /* Put the version (4 bytes) and prev_hash (4 bytes * 8), big-endian
+     * encoded, into work. */
+    for (i = 0; i < temp; ++i) {
+      ((uint32_t *)work->data)[i] = be32toh(((uint32_t *)pool->header_bin)[i]);
+    }
+
+    /* Now add the merkle_root (4 bytes * 8); it is encoded in little endian. */
+    temp += 8;
+
+    for (j = 0; i < temp; ++i, ++j) {
+      ((uint32_t *)work->data)[i] = le32toh(((uint32_t *)merkle_root)[j]);
+    }
+
+    /* Add the time (big-endian). */
+    hex2bin((unsigned char *)&temp, pool->swork.ntime, 4);
+    ((uint32_t *)work->data)[17] = be32toh(temp);
+
+    /* Add the nbits (big-endian). */
+    hex2bin((unsigned char *)&temp, pool->swork.nbit, 4);
+    ((uint32_t *)work->data)[18] = be32toh(temp);
+
+    /* SHA-256 style padding for the 80-byte header: the appended 1 bit
+     * and the 640-bit (0x280) message length. */
+    ((uint32_t *)work->data)[20] = 0x80000000;
+    ((uint32_t *)work->data)[31] = 0x00000280;
+  }
+  else {
+    data32 = (uint32_t *)merkle_sha;
+    swap32 = (uint32_t *)merkle_root;
+    flip32(swap32, data32);
+
+    /* Copy the data template from header_bin */
+    memcpy(work->data, pool->header_bin, 128);
+    memcpy(work->data + pool->merkle_offset, merkle_root, 32);
+  }
 
   /* Store the stratum work diff to check it still matches the pool's
-   * stratum diff when submitting shares */
+  * stratum diff when submitting shares */
   work->sdiff = pool->swork.diff;
 
   /* Copy parameters required for share submission */
@@ -5941,13 +6059,19 @@
     applog(LOG_DEBUG, "Generated stratum merkle %s", merkle_hash);
     applog(LOG_DEBUG, "Generated stratum header %s", header);
     applog(LOG_DEBUG, "Work job_id %s nonce2 %"PRIu64" ntime %s", work->job_id,
-      work->nonce2, work->ntime);
+    work->nonce2, work->ntime);
 
     free(header);
     free(merkle_hash);
   }
 
-  calc_midstate(work);
-  set_target(work->target, work->sdiff, pool->algorithm.diff_multiplier2);
+  // Neoscrypt skips the midstate and uses its own target function
+  if (!safe_cmp(pool->algorithm.name, "neoscrypt")) {
+    set_target_neoscrypt(work->target, work->sdiff);
+  }
+  else {
+    calc_midstate(work);
+    set_target(work->target, work->sdiff, pool->algorithm.diff_multiplier2);
+  }
 
   local_work++;
   work->pool = pool;
@@ -6124,15 +6248,17 @@ static unsigned long compare_pool_settings(struct pool *pool1, struct pool *pool
   unsigned int options = 0;
   const char *opt1, *opt2;
 
-  if(!pool1 || !pool2)
+  applog(LOG_DEBUG, "compare_pool_settings()");
+
+  if (!pool1 || !pool2)
     return 0;
 
   //compare pool devices
-  opt1 = get_pool_setting(pool1->devices, ((!empty_string(default_profile.devices))?default_profile.devices:"all"));
-  opt2 = get_pool_setting(pool2->devices, ((!empty_string(default_profile.devices))?default_profile.devices:"all"));
+  opt1 = get_pool_setting(pool1->devices, ((!empty_string(default_profile.devices)) ? default_profile.devices : "all"));
+  opt2 = get_pool_setting(pool2->devices, ((!empty_string(default_profile.devices)) ? 
default_profile.devices : "all")); //changing devices means a hard reset of mining threads - if(strcasecmp(opt1, opt2) != 0) + if (strcasecmp(opt1, opt2) != 0) options |= (SWITCHER_APPLY_DEVICE | SWITCHER_HARD_RESET); //compare gpu threads @@ -6140,11 +6266,11 @@ static unsigned long compare_pool_settings(struct pool *pool1, struct pool *pool opt2 = get_pool_setting(pool2->gpu_threads, default_profile.gpu_threads); //changing gpu threads means a hard reset of mining threads - if(strcasecmp(opt1, opt2) != 0) + if (strcasecmp(opt1, opt2) != 0) options |= (SWITCHER_APPLY_GT | SWITCHER_HARD_RESET); //compare algorithm - if(!cmp_algorithm(&pool1->algorithm, &pool2->algorithm)) + if (!cmp_algorithm(&pool1->algorithm, &pool2->algorithm)) options |= (SWITCHER_APPLY_ALGO | SWITCHER_SOFT_RESET); //lookup gap @@ -6152,46 +6278,46 @@ static unsigned long compare_pool_settings(struct pool *pool1, struct pool *pool opt2 = get_pool_setting(pool2->lookup_gap, default_profile.lookup_gap); //lookup gap means soft reset but only if hard reset isnt set - if(strcasecmp(opt1, opt2) != 0) + if (strcasecmp(opt1, opt2) != 0) options |= (SWITCHER_APPLY_LG | SWITCHER_SOFT_RESET); //intensities opt1 = get_pool_setting(pool1->rawintensity, default_profile.rawintensity); opt2 = get_pool_setting(pool2->rawintensity, default_profile.rawintensity); - if(strcasecmp(opt1, opt2) != 0) + if (strcasecmp(opt1, opt2) != 0) { //intensity is soft reset - if(!empty_string(opt2)) + if (!empty_string(opt2)) options |= (SWITCHER_APPLY_RAWINT | SWITCHER_SOFT_RESET); } //xintensity -- only if raw intensity not set - if(!opt_isset(options, SWITCHER_APPLY_RAWINT)) + if (!opt_isset(options, SWITCHER_APPLY_RAWINT)) { opt1 = get_pool_setting(pool1->xintensity, default_profile.xintensity); opt2 = get_pool_setting(pool2->xintensity, default_profile.xintensity); //if different... - if(strcasecmp(opt1, opt2) != 0) + if (strcasecmp(opt1, opt2) != 0) { //intensity is soft reset - if(!empty_string(opt2)) + if (!empty_string(opt2)) options |= (SWITCHER_APPLY_XINT | SWITCHER_SOFT_RESET); } } //intensity -- only if raw intensity and xintensity not set - if(!opt_isset(options, SWITCHER_APPLY_RAWINT) && !opt_isset(options, SWITCHER_APPLY_XINT)) + if (!opt_isset(options, SWITCHER_APPLY_RAWINT) && !opt_isset(options, SWITCHER_APPLY_XINT)) { opt1 = get_pool_setting(pool1->intensity, default_profile.intensity); opt2 = get_pool_setting(pool2->intensity, default_profile.intensity); //if different... 
- if(strcasecmp(opt1, opt2) != 0) + if (strcasecmp(opt1, opt2) != 0) { //intensity is soft reset - if(!empty_string(opt2)) + if (!empty_string(opt2)) options |= (SWITCHER_APPLY_INT | SWITCHER_SOFT_RESET); //if blank, set default profile to intensity 8 and apply else @@ -6203,10 +6329,10 @@ static unsigned long compare_pool_settings(struct pool *pool1, struct pool *pool opt1 = get_pool_setting(pool1->shaders, default_profile.shaders); opt2 = get_pool_setting(pool2->shaders, default_profile.shaders); - if(strcasecmp(opt1, opt2) != 0) + if (strcasecmp(opt1, opt2) != 0) { //shaders is soft reset - if(!empty_string(opt2)) + if (!empty_string(opt2)) options |= (SWITCHER_APPLY_SHADER | SWITCHER_SOFT_RESET); } @@ -6215,7 +6341,7 @@ static unsigned long compare_pool_settings(struct pool *pool1, struct pool *pool opt2 = get_pool_setting(pool2->thread_concurrency, default_profile.thread_concurrency); //thread-concurrency is soft reset - if(strcasecmp(opt1, opt2) != 0 && !empty_string(opt2)) + if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2)) options |= (SWITCHER_APPLY_TC | SWITCHER_SOFT_RESET); //worksize @@ -6223,45 +6349,45 @@ static unsigned long compare_pool_settings(struct pool *pool1, struct pool *pool opt2 = get_pool_setting(pool2->worksize, default_profile.worksize); //worksize is soft reset - if(strcasecmp(opt1, opt2) != 0 && !empty_string(opt2)) - options |= (SWITCHER_APPLY_WORKSIZE | SWITCHER_SOFT_RESET); + if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2)) + options |= (SWITCHER_APPLY_WORKSIZE | SWITCHER_SOFT_RESET); - #ifdef HAVE_ADL - //gpu-engine - opt1 = get_pool_setting(pool1->gpu_engine, default_profile.gpu_engine); - opt2 = get_pool_setting(pool2->gpu_engine, default_profile.gpu_engine); +#ifdef HAVE_ADL + //gpu-engine + opt1 = get_pool_setting(pool1->gpu_engine, default_profile.gpu_engine); + opt2 = get_pool_setting(pool2->gpu_engine, default_profile.gpu_engine); - if(strcasecmp(opt1, opt2) != 0 && !empty_string(opt2)) - options |= SWITCHER_APPLY_GPU_ENGINE; + if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2)) + options |= SWITCHER_APPLY_GPU_ENGINE; - //gpu-memclock - opt1 = get_pool_setting(pool1->gpu_memclock, default_profile.gpu_memclock); - opt2 = get_pool_setting(pool2->gpu_memclock, default_profile.gpu_memclock); + //gpu-memclock + opt1 = get_pool_setting(pool1->gpu_memclock, default_profile.gpu_memclock); + opt2 = get_pool_setting(pool2->gpu_memclock, default_profile.gpu_memclock); - if(strcasecmp(opt1, opt2) != 0 && !empty_string(opt2)) - options |= SWITCHER_APPLY_GPU_MEMCLOCK; + if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2)) + options |= SWITCHER_APPLY_GPU_MEMCLOCK; - //GPU fans - opt1 = get_pool_setting(pool1->gpu_fan, default_profile.gpu_fan); - opt2 = get_pool_setting(pool2->gpu_fan, default_profile.gpu_fan); + //GPU fans + opt1 = get_pool_setting(pool1->gpu_fan, default_profile.gpu_fan); + opt2 = get_pool_setting(pool2->gpu_fan, default_profile.gpu_fan); - if(strcasecmp(opt1, opt2) != 0 && !empty_string(opt2)) - options |= SWITCHER_APPLY_GPU_FAN; + if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2)) + options |= SWITCHER_APPLY_GPU_FAN; - //GPU powertune - opt1 = get_pool_setting(pool1->gpu_powertune, default_profile.gpu_powertune); - opt2 = get_pool_setting(pool2->gpu_powertune, default_profile.gpu_powertune); + //GPU powertune + opt1 = get_pool_setting(pool1->gpu_powertune, default_profile.gpu_powertune); + opt2 = get_pool_setting(pool2->gpu_powertune, default_profile.gpu_powertune); - if(strcasecmp(opt1, opt2) != 0 && 
!empty_string(opt2))
-      options |= SWITCHER_APPLY_GPU_POWERTUNE;
+    if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
+      options |= SWITCHER_APPLY_GPU_POWERTUNE;
 
-    //GPU vddc
-    opt1 = get_pool_setting(pool1->gpu_vddc, default_profile.gpu_vddc);
-    opt2 = get_pool_setting(pool2->gpu_vddc, default_profile.gpu_vddc);
+    //GPU vddc
+    opt1 = get_pool_setting(pool1->gpu_vddc, default_profile.gpu_vddc);
+    opt2 = get_pool_setting(pool2->gpu_vddc, default_profile.gpu_vddc);
 
-    if(strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
-      options |= SWITCHER_APPLY_GPU_VDDC;
-  #endif
+    if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
+      options |= SWITCHER_APPLY_GPU_VDDC;
+#endif
 
   // Remove soft reset if hard reset is set
   if (opt_isset(options, SWITCHER_HARD_RESET) &&
@@ -6281,6 +6407,8 @@ static void get_work_prepare_thread(struct thr_info *mythr, struct work *work)
 {
   int i;
 
+  applog(LOG_DEBUG, "get_work_prepare_thread()");
+
   //if switcher is disabled
   if(opt_switchmode == SWITCH_OFF)
     return;
@@ -6608,6 +6736,7 @@ struct work *get_work(struct thr_info *thr, const int thr_id)
     }
   }
 
+  applog(LOG_DEBUG, "preparing thread...");
   get_work_prepare_thread(thr, work);
 
   diff_t = time(NULL) - diff_t;
@@ -6700,7 +6829,16 @@ bool test_nonce(struct work *work, uint32_t nonce)
   uint32_t diff1targ;
 
   rebuild_nonce(work, nonce);
-  diff1targ = work->pool->algorithm.diff1targ;
+
+  applog(LOG_DEBUG, "test_nonce() algorithm = %s", work->pool->algorithm.name);
+
+  // for Neoscrypt, the diff1targ value is in work->target
+  if (!safe_cmp(work->pool->algorithm.name, "neoscrypt")) {
+    diff1targ = ((uint32_t *)work->target)[7];
+  }
+  else {
+    diff1targ = work->pool->algorithm.diff1targ;
+  }
 
   return (le32toh(*hash_32) <= diff1targ);
 }
diff --git a/winbuild/sgminer.vcxproj b/winbuild/sgminer.vcxproj
index e6b6ab22..0728c416 100644
--- a/winbuild/sgminer.vcxproj
+++ b/winbuild/sgminer.vcxproj
@@ -263,6 +263,7 @@
+    <ClCompile Include="..\algorithm\neoscrypt.c" />
@@ -321,6 +322,7 @@
+    <ClInclude Include="..\algorithm\neoscrypt.h" />
diff --git a/winbuild/sgminer.vcxproj.filters b/winbuild/sgminer.vcxproj.filters
index 68d73780..0ba37eab 100644
--- a/winbuild/sgminer.vcxproj.filters
+++ b/winbuild/sgminer.vcxproj.filters
@@ -197,6 +197,9 @@
       <Filter>Source Files</Filter>
+    <ClCompile Include="..\algorithm\neoscrypt.c">
+      <Filter>Source Files\algorithm</Filter>
+    </ClCompile>
@@ -373,6 +376,9 @@
       <Filter>Header Files\sph</Filter>
+    <ClInclude Include="..\algorithm\neoscrypt.h">
+      <Filter>Header Files\algorithm</Filter>
+    </ClInclude>
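
Editor's note - two standalone C sketches for reviewers; neither is part of the patch itself.

First, set_target_neoscrypt() packs the share difficulty into the 256-bit target in a slightly unusual way, so here is a minimal self-contained version of the same derivation. The uint32_t-array parameter, the function name and the main() driver are illustrative assumptions, not sgminer API:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Mirrors set_target_neoscrypt() from this patch: scale the share
     * difficulty down by 2^16, then by 2^32 for every 32-bit word it
     * spills over, and store the 64-bit quotient m = 0xFFFF0000 / diff
     * in words k and k+1 of the eight-word target. */
    static void neoscrypt_target_sketch(uint32_t target[8], double diff)
    {
      uint64_t m;
      int k;

      diff /= 65536.0;
      for (k = 6; k > 0 && diff > 1.0; --k)
        diff /= 4294967296.0;

      m = (uint64_t)(4294901760.0 / diff);

      if (m == 0 && k == 6) {
        /* Difficulty so low that every hash qualifies. */
        memset(target, 0xff, 32);
      }
      else {
        memset(target, 0, 32);
        target[k] = (uint32_t)m;            /* low word of the quotient  */
        target[k + 1] = (uint32_t)(m >> 32); /* high word of the quotient */
      }
    }

    int main(void)
    {
      uint32_t t[8];
      int i;

      /* At diff 1.0 the scaling loop never runs (k stays 6), so
       * m = 0xFFFF00000000, t[6] = 0 and t[7] = 0x0000ffff - the word
       * the patched test_nonce() and the kernel's final comparison
       * read back. */
      neoscrypt_target_sketch(t, 1.0);

      for (i = 7; i >= 0; --i)
        printf("%08x", t[i]);
      printf("\n");
      return 0;
    }

Second, the tail of fastkdf() in kernel/neoscrypt.cl XORs the output in at most two spans only to work around the AMD compiler crash noted there; the intended semantics are a plain wrap-around read of the 256-byte ring buffer. A hypothetical scalar equivalent, not in the patch:

    #include <stdint.h>

    /* XOR the password buffer A against the 256-byte salt ring buffer B
     * starting at bufidx, wrapping back to B[0] if the read runs off
     * the end - the same split the kernel performs with XORBytes(). */
    static void fastkdf_output_sketch(uint8_t *output, const uint8_t *B,
                                      const uint8_t *A, uint8_t bufidx,
                                      uint32_t output_len)
    {
      uint32_t i, left = 256 - bufidx;

      if (left < output_len) {
        for (i = 0; i < left; ++i)
          output[i] = B[bufidx + i] ^ A[i];  /* tail of B */
        for (i = left; i < output_len; ++i)
          output[i] = B[i - left] ^ A[i];    /* wrapped head of B */
      }
      else {
        for (i = 0; i < output_len; ++i)
          output[i] = B[bufidx + i] ^ A[i];
      }
    }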