From 580676affbc7b5a2b0cc60356e643174bd3a3861 Mon Sep 17 00:00:00 2001 From: elbandi Date: Mon, 9 Nov 2015 17:15:15 +0100 Subject: [PATCH] Get some algos from 'master' of https://github.com/djm34/sgminer --- Makefile.am | 3 + algorithm.c | 206 ++++- algorithm.h | 6 +- algorithm/credits.c | 148 ++++ algorithm/credits.h | 10 + algorithm/lyra2.c | 22 +- algorithm/lyra2.h | 8 - algorithm/lyra2_old.c | 208 ++++++ algorithm/lyra2_old.h | 50 ++ algorithm/lyra2re.c | 25 +- algorithm/lyra2re.h | 2 + algorithm/lyra2re_old.c | 169 +++++ algorithm/lyra2re_old.h | 10 + algorithm/pluck.h | 2 + algorithm/sponge.c | 22 +- algorithm/sponge.h | 8 +- algorithm/sponge_old.c | 405 ++++++++++ algorithm/sponge_old.h | 98 +++ algorithm/sysendian.h | 140 ++++ algorithm/yescrypt-opt.c | 1364 ++++++++++++++++++++++++++++++++++ algorithm/yescrypt.c | 128 ++++ algorithm/yescrypt.h | 10 + algorithm/yescrypt_core.h | 376 ++++++++++ algorithm/yescryptcommon.c | 360 +++++++++ driver-opencl.c | 13 +- findnonce.c | 6 +- kernel/bmw256.cl | 162 ++++ kernel/credits.cl | 232 ++++++ kernel/cubehash256.cl | 132 ++++ kernel/lyra2rev2.cl | 525 +++++++++++++ kernel/lyra2v2.cl | 184 +++++ kernel/yescrypt-multi.cl | 314 ++++++++ kernel/yescrypt.cl | 253 +++++++ kernel/yescrypt_essential.cl | 760 +++++++++++++++++++ miner.h | 27 +- ocl.c | 242 +++++- ocl.h | 5 +- sgminer.c | 26 +- sph/Makefile.am | 2 +- sph/sha256_Y.c | 418 +++++++++++ sph/sha256_Y.h | 63 ++ 41 files changed, 7083 insertions(+), 61 deletions(-) create mode 100644 algorithm/credits.c create mode 100644 algorithm/credits.h create mode 100644 algorithm/lyra2_old.c create mode 100644 algorithm/lyra2_old.h create mode 100644 algorithm/lyra2re_old.c create mode 100644 algorithm/lyra2re_old.h create mode 100644 algorithm/sponge_old.c create mode 100644 algorithm/sponge_old.h create mode 100644 algorithm/sysendian.h create mode 100644 algorithm/yescrypt-opt.c create mode 100644 algorithm/yescrypt.c create mode 100644 algorithm/yescrypt.h create mode 100644 algorithm/yescrypt_core.h create mode 100644 algorithm/yescryptcommon.c create mode 100644 kernel/bmw256.cl create mode 100644 kernel/credits.cl create mode 100644 kernel/cubehash256.cl create mode 100644 kernel/lyra2rev2.cl create mode 100644 kernel/lyra2v2.cl create mode 100644 kernel/yescrypt-multi.cl create mode 100644 kernel/yescrypt.cl create mode 100644 kernel/yescrypt_essential.cl create mode 100644 sph/sha256_Y.c create mode 100644 sph/sha256_Y.h diff --git a/Makefile.am b/Makefile.am index af6915e8..5da38cc5 100644 --- a/Makefile.am +++ b/Makefile.am @@ -73,7 +73,10 @@ sgminer_SOURCES += algorithm/whirlcoin.c algorithm/whirlcoin.h sgminer_SOURCES += algorithm/neoscrypt.c algorithm/neoscrypt.h sgminer_SOURCES += algorithm/whirlpoolx.c algorithm/whirlpoolx.h sgminer_SOURCES += algorithm/lyra2re.c algorithm/lyra2re.h algorithm/lyra2.c algorithm/lyra2.h algorithm/sponge.c algorithm/sponge.h +sgminer_SOURCES += algorithm/lyra2re_old.c algorithm/lyra2re_old.h algorithm/lyra2_old.c algorithm/lyra2_old.h algorithm/sponge_old.c algorithm/sponge_old.h sgminer_SOURCES += algorithm/pluck.c algorithm/pluck.h +sgminer_SOURCES += algorithm/credits.c algorithm/credits.h +sgminer_SOURCES += algorithm/yescrypt.h algorithm/yescrypt.c algorithm/yescrypt_core.h algorithm/yescrypt-opt.c algorithm/yescryptcommon.c algorithm/sysendian.h bin_SCRIPTS = $(top_srcdir)/kernel/*.cl diff --git a/algorithm.c b/algorithm.c index fc0ee4a9..6acab924 100644 --- a/algorithm.c +++ b/algorithm.c @@ -33,7 +33,10 @@ #include "algorithm/neoscrypt.h" 
#include "algorithm/whirlpoolx.h" #include "algorithm/lyra2re.h" +#include "algorithm/lyra2re_old.h" #include "algorithm/pluck.h" +#include "algorithm/yescrypt.h" +#include "algorithm/credits.h" #include "compat.h" @@ -42,6 +45,7 @@ const char *algorithm_type_str[] = { "Unknown", + "Credits", "Scrypt", "NScrypt", "X11", @@ -58,7 +62,10 @@ const char *algorithm_type_str[] = { "Neoscrypt", "WhirlpoolX", "Lyra2RE", + "Lyra2REv2" "Pluck" + "Yescrypt", + "Yescrypt-multi" }; void sha256(const unsigned char *message, unsigned int len, unsigned char *digest) @@ -184,6 +191,125 @@ static cl_int queue_neoscrypt_kernel(_clState *clState, dev_blk_ctx *blk, __mayb return status; } +static cl_int queue_credits_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads) +{ + cl_kernel *kernel = &clState->kernel; + unsigned int num = 0; + cl_ulong le_target; + cl_int status = 0; + + + // le_target = (*(cl_uint *)(blk->work->device_target + 24)); + le_target = (cl_ulong)le64toh(((uint64_t *)blk->work->/*device_*/target)[3]); + // le_target = (cl_uint)((uint32_t *)blk->work->target)[6]; + + + memcpy(clState->cldata, blk->work->data, 168); +// flip168(clState->cldata, blk->work->data); + status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 168, clState->cldata, 0, NULL, NULL); + + CL_SET_ARG(clState->CLbuffer0); + CL_SET_ARG(clState->outputBuffer); + CL_SET_ARG(le_target); + CL_SET_ARG(blk->work->midstate); + + return status; +} + +static cl_int queue_yescrypt_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads) +{ + cl_kernel *kernel = &clState->kernel; + unsigned int num = 0; + cl_uint le_target; + cl_int status = 0; + + +// le_target = (*(cl_uint *)(blk->work->device_target + 28)); + le_target = (cl_uint)le32toh(((uint32_t *)blk->work->/*device_*/target)[7]); +// le_target = (cl_uint)((uint32_t *)blk->work->target)[7]; + + +// memcpy(clState->cldata, blk->work->data, 80); + flip80(clState->cldata, blk->work->data); + status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL); + + CL_SET_ARG(clState->CLbuffer0); + CL_SET_ARG(clState->outputBuffer); + CL_SET_ARG(clState->padbuffer8); + CL_SET_ARG(clState->buffer1); + CL_SET_ARG(clState->buffer2); + CL_SET_ARG(le_target); + + return status; +} + +static cl_int queue_yescrypt_multikernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads) +{ +// cl_kernel *kernel = &clState->kernel; + cl_kernel *kernel; + unsigned int num = 0; + cl_uint le_target; + cl_int status = 0; + + + // le_target = (*(cl_uint *)(blk->work->device_target + 28)); + le_target = (cl_uint)le32toh(((uint32_t *)blk->work->/*device_*/target)[7]); + memcpy(clState->cldata, blk->work->data, 80); +// flip80(clState->cldata, blk->work->data); + status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL); +//pbkdf and initial sha + kernel = &clState->kernel; + + CL_SET_ARG(clState->CLbuffer0); + CL_SET_ARG(clState->outputBuffer); + CL_SET_ARG(clState->padbuffer8); + CL_SET_ARG(clState->buffer1); + CL_SET_ARG(clState->buffer2); + CL_SET_ARG(clState->buffer3); + CL_SET_ARG(le_target); + +//inactive kernel + num = 0; + kernel = clState->extra_kernels; + CL_SET_ARG_N(0,clState->buffer1); + CL_SET_ARG_N(1,clState->buffer2); +// CL_SET_ARG_N(3, clState->buffer3); + +//mix2_2 + num = 0; + CL_NEXTKERNEL_SET_ARG_N(0, clState->padbuffer8); + CL_SET_ARG_N(1,clState->buffer1); + CL_SET_ARG_N(2,clState->buffer2); 
+ //mix2_2 +//inactive kernel + num = 0; + CL_NEXTKERNEL_SET_ARG_N(0, clState->buffer1); + CL_SET_ARG_N(1, clState->buffer2); + //mix2_2 + + num = 0; + CL_NEXTKERNEL_SET_ARG_N(0, clState->padbuffer8); + CL_SET_ARG_N(1, clState->buffer1); + CL_SET_ARG_N(2, clState->buffer2); + + //inactive kernel + num = 0; + CL_NEXTKERNEL_SET_ARG_N(0, clState->buffer1); + CL_SET_ARG_N(1, clState->buffer2); + //mix2_2 + + +//pbkdf and finalization + num=0; + CL_NEXTKERNEL_SET_ARG(clState->CLbuffer0); + CL_SET_ARG(clState->outputBuffer); + CL_SET_ARG(clState->buffer2); + CL_SET_ARG(clState->buffer3); + CL_SET_ARG(le_target); + + return status; +} + static cl_int queue_maxcoin_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads) { cl_kernel *kernel = &clState->kernel; @@ -716,6 +842,60 @@ static cl_int queue_lyra2RE_kernel(struct __clState *clState, struct _dev_blk_ct return status; } +static cl_int queue_lyra2REv2_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads) +{ + cl_kernel *kernel; + unsigned int num; + cl_int status = 0; + cl_ulong le_target; + + // le_target = *(cl_uint *)(blk->work->device_target + 28); + le_target = *(cl_ulong *)(blk->work->device_target + 24); + flip80(clState->cldata, blk->work->data); + status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL); + + // blake - search + kernel = &clState->kernel; + num = 0; + // CL_SET_ARG(clState->CLbuffer0); + CL_SET_ARG(clState->buffer1); + CL_SET_ARG(blk->work->blk.ctx_a); + CL_SET_ARG(blk->work->blk.ctx_b); + CL_SET_ARG(blk->work->blk.ctx_c); + CL_SET_ARG(blk->work->blk.ctx_d); + CL_SET_ARG(blk->work->blk.ctx_e); + CL_SET_ARG(blk->work->blk.ctx_f); + CL_SET_ARG(blk->work->blk.ctx_g); + CL_SET_ARG(blk->work->blk.ctx_h); + CL_SET_ARG(blk->work->blk.cty_a); + CL_SET_ARG(blk->work->blk.cty_b); + CL_SET_ARG(blk->work->blk.cty_c); + + // keccak - search1 + kernel = clState->extra_kernels; + CL_SET_ARG_0(clState->buffer1); + // cubehash - search2 + num = 0; + CL_NEXTKERNEL_SET_ARG_0(clState->buffer1); + // lyra - search3 + num = 0; + CL_NEXTKERNEL_SET_ARG_N(0, clState->buffer1); + CL_SET_ARG_N(1, clState->padbuffer8); + // skein -search4 + num = 0; + CL_NEXTKERNEL_SET_ARG_0(clState->buffer1); + // cubehash - search5 + num = 0; + CL_NEXTKERNEL_SET_ARG_0(clState->buffer1); + // bmw - search6 + num = 0; + CL_NEXTKERNEL_SET_ARG(clState->buffer1); + CL_SET_ARG(clState->outputBuffer); + CL_SET_ARG(le_target); + + return status; +} + static cl_int queue_pluck_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads) { cl_kernel *kernel = &clState->kernel; @@ -757,6 +937,25 @@ static algorithm_settings_t algos[] = { { a, ALGO_PLUCK, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, pluck_regenhash, queue_pluck_kernel, gen_hash, append_neoscrypt_compiler_options } A_PLUCK("pluck"), #undef A_PLUCK + +#define A_CREDITS(a) \ + { a, ALGO_CRE, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, credits_regenhash, queue_credits_kernel, gen_hash, NULL} + A_CREDITS("credits"), +#undef A_CREDITS + + + +#define A_YESCRYPT(a) \ + { a, ALGO_YESCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, yescrypt_regenhash, queue_yescrypt_kernel, gen_hash, append_neoscrypt_compiler_options} + A_YESCRYPT("yescrypt"), +#undef 
A_YESCRYPT + +#define A_YESCRYPT_MULTI(a) \ + { a, ALGO_YESCRYPT_MULTI, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 6,-1,CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE , yescrypt_regenhash, queue_yescrypt_multikernel, gen_hash, append_neoscrypt_compiler_options} + A_YESCRYPT_MULTI("yescrypt-multi"), +#undef A_YESCRYPT_MULTI + + // kernels starting from this will have difficulty calculated by using quarkcoin algorithm #define A_QUARK(a, b) \ { a, ALGO_QUARK, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options } @@ -793,7 +992,10 @@ static algorithm_settings_t algos[] = { { "fresh", ALGO_FRESH, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 4 * 16 * 4194304, 0, fresh_regenhash, queue_fresh_kernel, gen_hash, NULL }, - { "lyra2re", ALGO_LYRA2RE, "", 1, 128, 128, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 2 * 8 * 4194304, 0, lyra2re_regenhash, queue_lyra2RE_kernel, gen_hash, NULL }, + { "lyra2re", ALGO_LYRA2RE, "", 1, 128, 128, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 2 * 8 * 4194304, 0, lyra2reold_regenhash, queue_lyra2RE_kernel, gen_hash, NULL }, + + { "lyra2rev2", ALGO_LYRA2REv2, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 6, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, lyra2re_regenhash, queue_lyra2REv2_kernel, gen_hash, append_neoscrypt_compiler_options }, + // kernels starting from this will have difficulty calculated by using fuguecoin algorithm #define A_FUGUE(a, b, c) \ @@ -877,8 +1079,8 @@ static const char *lookup_algorithm_alias(const char *lookup_alias, uint8_t *nfa ALGO_ALIAS("nist5", "talkcoin-mod"); ALGO_ALIAS("keccak", "maxcoin"); ALGO_ALIAS("whirlpool", "whirlcoin"); - ALGO_ALIAS("Lyra2RE", "lyra2re"); ALGO_ALIAS("lyra2", "lyra2re"); + ALGO_ALIAS("lyra2v2", "lyra2rev2"); #undef ALGO_ALIAS #undef ALGO_ALIAS_NF diff --git a/algorithm.h b/algorithm.h index b2527d17..8b7185a4 100644 --- a/algorithm.h +++ b/algorithm.h @@ -13,6 +13,7 @@ typedef enum { ALGO_UNK, + ALGO_CRE, ALGO_SCRYPT, ALGO_NSCRYPT, ALGO_X11, @@ -29,7 +30,10 @@ typedef enum { ALGO_NEOSCRYPT, ALGO_WHIRLPOOLX, ALGO_LYRA2RE, - ALGO_PLUCK + ALGO_LYRA2REv2, + ALGO_PLUCK, + ALGO_YESCRYPT, + ALGO_YESCRYPT_MULTI, } algorithm_type_t; extern const char *algorithm_type_str[]; diff --git a/algorithm/credits.c b/algorithm/credits.c new file mode 100644 index 00000000..b69514bc --- /dev/null +++ b/algorithm/credits.c @@ -0,0 +1,148 @@ +/*- + * Copyright 2015 djm34 + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "miner.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "sph/sph_sha2.h"
+
+static const uint32_t diff1targ = 0x0000ffff;
+
+
+
+inline void credits_hash(void *state, const void *input)
+{
+  sph_sha256_context sha1, sha2;
+  uint32_t hash[8], hash2[8];
+
+  sph_sha256_init(&sha1);
+  sph_sha256(&sha1, input, 168);
+  sph_sha256_close(&sha1, hash);
+
+
+  sph_sha256_init(&sha2);
+  sph_sha256(&sha2, hash, 32);
+  sph_sha256_close(&sha2, hash2);
+
+  memcpy(state, hash2, 32);
+
+}
+static inline void
+be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
+{
+  uint32_t i;
+
+  for (i = 0; i < len; i++)
+    dst[i] = htobe32(src[i]);
+}
+
+/* Used externally as confirmation of correct OCL code */
+int credits_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce)
+{
+  uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]);
+  uint32_t data[42], ohash[8];
+  printf("coming here credits test\n");
+
+  be32enc_vect(data, (const uint32_t *)pdata, 42);
+  data[35] = htobe32(nonce);
+  credits_hash((unsigned char*)ohash, (unsigned char*)data);
+
+  tmp_hash7 = be32toh(ohash[7]);
+
+  applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx",
+    (long unsigned int)Htarg,
+    (long unsigned int)diff1targ,
+    (long unsigned int)tmp_hash7);
+
+  if (tmp_hash7 > diff1targ)
+    return -1;
+
+  if (tmp_hash7 > Htarg)
+    return 0;
+
+  return 1;
+}
+
+void credits_regenhash(struct work *work)
+{
+  uint32_t data[42];
+  uint32_t *nonce = (uint32_t *)(work->data + 140);
+  uint32_t *ohash = (uint32_t *)(work->hash);
+
+  be32enc_vect(data, (const uint32_t *)work->data, 42);
+  data[35] = htobe32(*nonce);
+
+  credits_hash((unsigned char*)ohash, (unsigned char*)data);
+
+}
+
+
+bool scanhash_credits(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate,
+  unsigned char *pdata, unsigned char __maybe_unused *phash1,
+  unsigned char __maybe_unused *phash, const unsigned char *ptarget,
+  uint32_t max_nonce, uint32_t *last_nonce, uint32_t n)
+{
+  uint32_t *nonce = (uint32_t *)(pdata + 140);
+  uint32_t data[42];
+  uint32_t tmp_hash7;
+  uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]);
+  bool ret = false;
+
+  be32enc_vect(data, (const uint32_t *)pdata, 35);
+
+
+  while (1)
+  {
+    uint32_t ostate[8];
+
+    *nonce = ++n;
+    data[35] = (n);
+    credits_hash(ostate, data);
+    tmp_hash7 = (ostate[7]);
+
+    applog(LOG_INFO, "data7 %08lx", (long unsigned int)ostate[7]);
+
+    if (unlikely(tmp_hash7 <= Htarg))
+    {
+      ((uint32_t *)pdata)[35] = htobe32(n);
+      *last_nonce = n;
+      ret = true;
+      break;
+    }
+
+    if (unlikely((n >= max_nonce) || thr->work_restart))
+    {
+      *last_nonce = n;
+      break;
+    }
+  }
+
+  return ret;
+}
\ No newline at end of file
diff --git a/algorithm/credits.h b/algorithm/credits.h
new file mode 100644
index 00000000..9d74ad20
--- /dev/null
+++ b/algorithm/credits.h
@@ -0,0 +1,10 @@
+#ifndef CREDITS_H
+#define CREDITS_H
+
+#include "miner.h"
+
+
+extern int credits_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce);
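+/* Editorial sketch (not part of the original patch): CREDITS hashes a
+   168-byte block header with double SHA-256, and the nonce lives at byte
+   offset 140 (32-bit word 35), mirroring credits.c above:
+
+     uint32_t data[42], ohash[8];
+     be32enc_vect(data, (const uint32_t *)work->data, 42);
+     data[35] = htobe32(nonce);
+     credits_hash(ohash, data);
+*/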
+extern void credits_regenhash(struct work *work); + +#endif /* CREDITS_H */ diff --git a/algorithm/lyra2.c b/algorithm/lyra2.c index 6944b22f..42640e76 100644 --- a/algorithm/lyra2.c +++ b/algorithm/lyra2.c @@ -58,15 +58,19 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * //========== Initializing the Memory Matrix and pointers to it =============// //Tries to allocate enough space for the whole memory matrix + + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; + const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES); - uint64_t *wholeMatrix = (uint64_t*)malloc(i); + uint64_t *wholeMatrix = malloc(i); if (wholeMatrix == NULL) { return -1; } memset(wholeMatrix, 0, i); //Allocates pointers to each row of the matrix - uint64_t **memMatrix = (uint64_t**)malloc(nRows * sizeof (uint64_t*)); + uint64_t **memMatrix = malloc(nRows * sizeof (uint64_t*)); if (memMatrix == NULL) { return -1; } @@ -118,7 +122,7 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * //======================= Initializing the Sponge State ====================// //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) - uint64_t *state = (uint64_t*)malloc(16 * sizeof (uint64_t)); + uint64_t *state = malloc(16 * sizeof (uint64_t)); if (state == NULL) { return -1; } @@ -130,16 +134,16 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * ptrWord = wholeMatrix; for (i = 0; i < nBlocksInput; i++) { absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil) - ptrWord += BLOCK_LEN_BLAKE2_SAFE_BYTES; //goes to next block of pad(pwd || salt || basil) + ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil) } //Initializes M[0] and M[1] - reducedSqueezeRow0(state, memMatrix[0]); //The locally copied password is most likely overwritten here - reducedDuplexRow1(state, memMatrix[0], memMatrix[1]); + reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here + reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols); do { //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) - reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]); + reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); //updates the value of row* (deterministically picked during Setup)) @@ -172,7 +176,7 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * //------------------------------------------------------------------------------------------ //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] - reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]); + reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); //update prev: it now points to the last row ever computed prev = row; @@ -192,7 +196,7 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * absorbBlock(state, memMatrix[rowa]); //Squeezes the key - squeeze(state, (unsigned char*)K, kLen); + squeeze(state, K, kLen); //==========================================================================/ //========================= Freeing the memory =============================// diff --git a/algorithm/lyra2.h b/algorithm/lyra2.h index 13c7dbd3..798e6af1 100644 --- 
a/algorithm/lyra2.h
+++ b/algorithm/lyra2.h
@@ -37,14 +37,6 @@ typedef unsigned char byte;
   #define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes
 #endif
 
-#ifndef N_COLS
-  #define N_COLS 8 //Number of columns in the memory matrix: fixed to 64 by default
-#endif
-
-#define ROW_LEN_INT64 (BLOCK_LEN_INT64 * N_COLS) //Total length of a row: N_COLS blocks
-#define ROW_LEN_BYTES (ROW_LEN_INT64 * 8) //Number of bytes per row
-
-
 int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols);
 
 #endif /* LYRA2_H_ */
diff --git a/algorithm/lyra2_old.c b/algorithm/lyra2_old.c
new file mode 100644
index 00000000..3b381987
--- /dev/null
+++ b/algorithm/lyra2_old.c
@@ -0,0 +1,208 @@
+/**
+ * Implementation of the Lyra2 Password Hashing Scheme (PHS).
+ *
+ * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
+ *
+ * This software is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "lyra2_old.h"
+#include "sponge_old.h"
+
+/**
+ * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
+ * whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits,
+ * where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all
+ * integer parameters (treated as type "unsigned int") in the order they are provided, plus the value
+ * of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols).
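+ *
+ * (Editorial note, not in the original patch: in this tree the legacy
+ * lyra2re chain invokes this function as
+ *     LYRA2O(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
+ * i.e. timeCost = 1, nRows = nCols = 8 -- see algorithm/lyra2re_old.c.)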
+ * + * @param K The derived key to be output by the algorithm + * @param kLen Desired key length + * @param pwd User password + * @param pwdlen Password length + * @param salt Salt + * @param saltlen Salt length + * @param timeCost Parameter to determine the processing time (T) + * @param nRows Number or rows of the memory matrix (R) + * @param nCols Number of columns of the memory matrix (C) + * + * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation) + */ +int LYRA2O(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols) { + + //============================= Basic variables ============================// + int64_t row = 2; //index of row to be processed + int64_t prev = 1; //index of prev (last row ever computed/modified) + int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering) + int64_t tau; //Time Loop iterator + int64_t step = 1; //Visitation step (used during Setup and Wandering phases) + int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) + int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 + int64_t i; //auxiliary iteration counter + //==========================================================================/ + + //========== Initializing the Memory Matrix and pointers to it =============// + //Tries to allocate enough space for the whole memory matrix + i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES); + uint64_t *wholeMatrix = malloc(i); + if (wholeMatrix == NULL) { + return -1; + } + memset(wholeMatrix, 0, i); + + //Allocates pointers to each row of the matrix + uint64_t **memMatrix = malloc(nRows * sizeof (uint64_t*)); + if (memMatrix == NULL) { + return -1; + } + //Places the pointers in the correct positions + uint64_t *ptrWord = wholeMatrix; + for (i = 0; i < nRows; i++) { + memMatrix[i] = ptrWord; + ptrWord += ROW_LEN_INT64; + } + //==========================================================================/ + + //============= Getting the password + salt + basil padded with 10*1 ===============// + //OBS.:The memory matrix will temporarily hold the password: not for saving memory, + //but this ensures that the password copied locally will be overwritten as soon as possible + + //First, we clean enough blocks for the password, salt, basil and padding + uint64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof (uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1; + byte *ptrByte = (byte*) wholeMatrix; + memset(ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES); + + //Prepends the password + memcpy(ptrByte, pwd, pwdlen); + ptrByte += pwdlen; + + //Concatenates the salt + memcpy(ptrByte, salt, saltlen); + ptrByte += saltlen; + + //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface + memcpy(ptrByte, &kLen, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &pwdlen, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &saltlen, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &timeCost, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &nRows, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &nCols, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + + //Now comes the padding + *ptrByte = 0x80; //first byte of padding: right after the 
password + ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix + ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block + *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block + //==========================================================================/ + + //======================= Initializing the Sponge State ====================// + //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) + uint64_t *state = malloc(16 * sizeof (uint64_t)); + if (state == NULL) { + return -1; + } + initStateO(state); + //==========================================================================/ + + //================================ Setup Phase =============================// + //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits + ptrWord = wholeMatrix; + for (i = 0; i < nBlocksInput; i++) { + absorbBlockBlake2SafeO(state, ptrWord); //absorbs each block of pad(pwd || salt || basil) + ptrWord += BLOCK_LEN_BLAKE2_SAFE_BYTES; //goes to next block of pad(pwd || salt || basil) + } + + //Initializes M[0] and M[1] + reducedSqueezeRow0O(state, memMatrix[0]); //The locally copied password is most likely overwritten here + reducedDuplexRow1O(state, memMatrix[0], memMatrix[1]); + + do { + //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) + reducedDuplexRowSetupO(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]); + + + //updates the value of row* (deterministically picked during Setup)) + rowa = (rowa + step) & (window - 1); + //update prev: it now points to the last row ever computed + prev = row; + //updates row: goes to the next row to be computed + row++; + + //Checks if all rows in the window where visited. + if (rowa == 0) { + step = window + gap; //changes the step: approximately doubles its value + window *= 2; //doubles the size of the re-visitation window + gap = -gap; //inverts the modifier to the step + } + + } while (row < nRows); + //==========================================================================/ + + //============================ Wandering Phase =============================// + row = 0; //Resets the visitation to the first row of the memory matrix + for (tau = 1; tau <= timeCost; tau++) { + //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1 + step = (tau % 2 == 0) ? 
-1 : nRows / 2 - 1; + do { + //Selects a pseudorandom index row* + //------------------------------------------------------------------------------------------ + //rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------------------------------------------------------ + + //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] + reducedDuplexRowO(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]); + + //update prev: it now points to the last row ever computed + prev = row; + + //updates row: goes to the next row to be computed + //------------------------------------------------------------------------------------------ + //row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------------------------------------------------------ + + } while (row != 0); + } + //==========================================================================/ + + //============================ Wrap-up Phase ===============================// + //Absorbs the last block of the memory matrix + absorbBlockO(state, memMatrix[rowa]); + + //Squeezes the key + squeezeO(state, K, kLen); + //==========================================================================/ + + //========================= Freeing the memory =============================// + free(memMatrix); + free(wholeMatrix); + + //Wiping out the sponge's internal state before freeing it + memset(state, 0, 16 * sizeof (uint64_t)); + free(state); + //==========================================================================/ + + return 0; +} diff --git a/algorithm/lyra2_old.h b/algorithm/lyra2_old.h new file mode 100644 index 00000000..9dbe5668 --- /dev/null +++ b/algorithm/lyra2_old.h @@ -0,0 +1,50 @@ +/** + * Header file for the Lyra2 Password Hashing Scheme (PHS). + * + * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014. + * + * This software is hereby placed in the public domain. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+#ifndef LYRA2OLD_H_
+#define LYRA2OLD_H_
+
+#include <stdint.h>
+
+typedef unsigned char byte;
+
+//Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED)
+#define BLOCK_LEN_BLAKE2_SAFE_INT64 8 //512 bits (=64 bytes, =8 uint64_t)
+#define BLOCK_LEN_BLAKE2_SAFE_BYTES (BLOCK_LEN_BLAKE2_SAFE_INT64 * 8) //same as above, in bytes
+
+
+#ifdef BLOCK_LEN_BITS
+  #define BLOCK_LEN_INT64 (BLOCK_LEN_BITS/64) //Block length: 768 bits (=96 bytes, =12 uint64_t)
+  #define BLOCK_LEN_BYTES (BLOCK_LEN_BITS/8) //Block length, in bytes
+#else //default block length: 768 bits
+  #define BLOCK_LEN_INT64 12 //Block length: 768 bits (=96 bytes, =12 uint64_t)
+  #define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes
+#endif
+
+#ifndef N_COLS
+  #define N_COLS 8 //Number of columns in the memory matrix: fixed to 8 by default
+#endif
+
+#define ROW_LEN_INT64 (BLOCK_LEN_INT64 * N_COLS) //Total length of a row: N_COLS blocks
+#define ROW_LEN_BYTES (ROW_LEN_INT64 * 8) //Number of bytes per row
+
+
+int LYRA2O(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols);
+
+#endif /* LYRA2OLD_H_ */
diff --git a/algorithm/lyra2re.c b/algorithm/lyra2re.c
index ba37094a..9d89853f 100644
--- a/algorithm/lyra2re.c
+++ b/algorithm/lyra2re.c
@@ -36,6 +36,8 @@
 #include "sph/sph_groestl.h"
 #include "sph/sph_skein.h"
 #include "sph/sph_keccak.h"
+#include "sph/sph_bmw.h"
+#include "sph/sph_cubehash.h"
 #include "lyra2.h"
 
 /*
@@ -55,9 +57,10 @@ be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
 inline void lyra2rehash(void *state, const void *input)
 {
   sph_blake256_context ctx_blake;
-  sph_groestl256_context ctx_groestl;
+  sph_bmw256_context ctx_bmw;
   sph_keccak256_context ctx_keccak;
   sph_skein256_context ctx_skein;
+  sph_cubehash256_context ctx_cube;
 
   uint32_t hashA[8], hashB[8];
 
@@ -72,17 +75,23 @@ inline void lyra2rehash(void *state, const void *input)
   sph_keccak256 (&ctx_keccak,hashA, 32);
   sph_keccak256_close(&ctx_keccak, hashB);
 
-  LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
+  sph_cubehash256_init(&ctx_cube);
+  sph_cubehash256(&ctx_cube, hashB, 32);
+  sph_cubehash256_close(&ctx_cube, hashA);
 
+  LYRA2(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4);
 
-  sph_skein256_init(&ctx_skein);
-  sph_skein256 (&ctx_skein, hashA, 32);
-  sph_skein256_close(&ctx_skein, hashB);
+  sph_skein256_init(&ctx_skein);
+  sph_skein256 (&ctx_skein, hashB, 32);
+  sph_skein256_close(&ctx_skein, hashA);
 
+  sph_cubehash256_init(&ctx_cube);
+  sph_cubehash256(&ctx_cube, hashA, 32);
+  sph_cubehash256_close(&ctx_cube, hashB);
 
-  sph_groestl256_init(&ctx_groestl);
-  sph_groestl256 (&ctx_groestl, hashB, 32);
-  sph_groestl256_close(&ctx_groestl, hashA);
+  sph_bmw256_init(&ctx_bmw);
+  sph_bmw256 (&ctx_bmw, hashB, 32);
+  sph_bmw256_close(&ctx_bmw, hashA);
 
 //printf("cpu hash %08x %08x %08x %08x\n",hashA[0],hashA[1],hashA[2],hashA[3]);
 
diff --git a/algorithm/lyra2re.h b/algorithm/lyra2re.h
index 8a58e747..8bc52ac4 100644
--- a/algorithm/lyra2re.h
+++ b/algorithm/lyra2re.h
@@ -2,6 +2,8 @@
 #define LYRA2RE_H
 
 #include "miner.h"
+#define LYRA_SCRATCHBUF_SIZE (1536) // matrix size [12][4][4] uint64_t or equivalent
+#define LYRA_SECBUF_SIZE (4) // (not used)
 
 extern int lyra2re_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce);
diff --git a/algorithm/lyra2re_old.c b/algorithm/lyra2re_old.c
new file mode 100644
index 00000000..3aa4be9a
--- /dev/null
+++ b/algorithm/lyra2re_old.c
@@ -0,0 +1,169 @@
+/*-
+ * Copyright 2014 James Lovejoy
+ * Copyright 2014 phm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "miner.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "sph/sph_blake.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_skein.h"
+#include "sph/sph_keccak.h"
+#include "lyra2_old.h"
+
+/*
+ * Encode a length len/4 vector of (uint32_t) into a length len vector of
+ * (unsigned char) in big-endian form. Assumes len is a multiple of 4.
+ */
+static inline void
+be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
+{
+  uint32_t i;
+
+  for (i = 0; i < len; i++)
+    dst[i] = htobe32(src[i]);
+}
+
+
+inline void lyra2rehash_old(void *state, const void *input)
+{
+  sph_blake256_context ctx_blake;
+  sph_groestl256_context ctx_groestl;
+  sph_keccak256_context ctx_keccak;
+  sph_skein256_context ctx_skein;
+
+  uint32_t hashA[8], hashB[8];
+
+  sph_blake256_init(&ctx_blake);
+  sph_blake256 (&ctx_blake, input, 80);
+  sph_blake256_close (&ctx_blake, hashA);
+
+
+
+
+  sph_keccak256_init(&ctx_keccak);
+  sph_keccak256 (&ctx_keccak,hashA, 32);
+  sph_keccak256_close(&ctx_keccak, hashB);
+
+  LYRA2O(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
+
+
+  sph_skein256_init(&ctx_skein);
+  sph_skein256 (&ctx_skein, hashA, 32);
+  sph_skein256_close(&ctx_skein, hashB);
+
+
+  sph_groestl256_init(&ctx_groestl);
+  sph_groestl256 (&ctx_groestl, hashB, 32);
+  sph_groestl256_close(&ctx_groestl, hashA);
+
+//printf("cpu hash %08x %08x %08x %08x\n",hashA[0],hashA[1],hashA[2],hashA[3]);
+
+  memcpy(state, hashA, 32);
+}
+
+static const uint32_t diff1targ = 0x0000ffff;
+
+
+/* Used externally as confirmation of correct OCL code */
+int lyra2reold_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce)
+{
+  uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]);
+  uint32_t data[20], ohash[8];
+
+  be32enc_vect(data, (const uint32_t *)pdata, 19);
+  data[19] = htobe32(nonce);
+  lyra2rehash_old(ohash, data);
+  tmp_hash7 = be32toh(ohash[7]);
+
+  applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx",
+    (long unsigned int)Htarg,
+    (long unsigned int)diff1targ,
+    (long unsigned int)tmp_hash7);
+  if (tmp_hash7 > diff1targ)
+    return -1;
+  if (tmp_hash7 > Htarg)
+    return 0;
+  return 1;
+}
+
+void 
lyra2reold_regenhash(struct work *work) +{ + uint32_t data[20]; + uint32_t *nonce = (uint32_t *)(work->data + 76); + uint32_t *ohash = (uint32_t *)(work->hash); + + be32enc_vect(data, (const uint32_t *)work->data, 19); + data[19] = htobe32(*nonce); + lyra2rehash_old(ohash, data); +} + +bool scanhash_lyra2reold(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate, + unsigned char *pdata, unsigned char __maybe_unused *phash1, + unsigned char __maybe_unused *phash, const unsigned char *ptarget, + uint32_t max_nonce, uint32_t *last_nonce, uint32_t n) +{ + uint32_t *nonce = (uint32_t *)(pdata + 76); + uint32_t data[20]; + uint32_t tmp_hash7; + uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]); + bool ret = false; + + be32enc_vect(data, (const uint32_t *)pdata, 19); + + while(1) { + uint32_t ostate[8]; + + *nonce = ++n; + data[19] = (n); + lyra2rehash_old(ostate, data); + tmp_hash7 = (ostate[7]); + + applog(LOG_INFO, "data7 %08lx", + (long unsigned int)data[7]); + + if (unlikely(tmp_hash7 <= Htarg)) { + ((uint32_t *)pdata)[19] = htobe32(n); + *last_nonce = n; + ret = true; + break; + } + + if (unlikely((n >= max_nonce) || thr->work_restart)) { + *last_nonce = n; + break; + } + } + + return ret; +} + + + diff --git a/algorithm/lyra2re_old.h b/algorithm/lyra2re_old.h new file mode 100644 index 00000000..0788dfb3 --- /dev/null +++ b/algorithm/lyra2re_old.h @@ -0,0 +1,10 @@ +#ifndef LYRA2REOLD_H +#define LYRA2REOLD_H + +#include "miner.h" + +extern int lyra2reold_test(unsigned char *pdata, const unsigned char *ptarget, + uint32_t nonce); +extern void lyra2reold_regenhash(struct work *work); + +#endif /* LYRA2RE_H */ diff --git a/algorithm/pluck.h b/algorithm/pluck.h index 7582554e..619eb013 100644 --- a/algorithm/pluck.h +++ b/algorithm/pluck.h @@ -3,6 +3,8 @@ #include "miner.h" #define PLUCK_SCRATCHBUF_SIZE (128 * 1024) +#define PLUCK_SECBUF_SIZE (64 * 1024) + extern int pluck_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce); extern void pluck_regenhash(struct work *work); diff --git a/algorithm/sponge.c b/algorithm/sponge.c index e717a508..c788952a 100644 --- a/algorithm/sponge.c +++ b/algorithm/sponge.c @@ -158,11 +158,11 @@ void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) { * @param state The current state of the sponge * @param rowOut Row to receive the data squeezed */ -void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut) { - uint64_t* ptrWord = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] +void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut, uint64_t nCols) { + uint64_t* ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] int i; //M[row][C-1-col] = H.reduced_squeeze() - for (i = 0; i < N_COLS; i++) { + for (i = 0; i < nCols; i++) { ptrWord[0] = state[0]; ptrWord[1] = state[1]; ptrWord[2] = state[2]; @@ -193,12 +193,12 @@ void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut) { * @param rowIn Row to feed the sponge * @param rowOut Row to receive the sponge's output */ -void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut) { +void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols) { uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row int i; - for (i = 0; i < N_COLS; i++) { + for (i = 0; i < nCols; i++) { //Absorbing "M[prev][col]" 
state[0] ^= (ptrWordIn[0]); @@ -253,13 +253,13 @@ void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut) { * @param rowOut Row receiving the output * */ -void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { +void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols) { uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row int i; - for (i = 0; i < N_COLS; i++) { + for (i = 0; i < nCols; i++) { //Absorbing "M[prev] [+] M[row*]" state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); @@ -327,13 +327,13 @@ void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, * @param rowOut Row receiving the output * */ -void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { +void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols) { uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row int i; - for (i = 0; i < N_COLS; i++) { + for (i = 0; i < nCols; i++) { //Absorbing "M[prev] [+] M[row*]" state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); diff --git a/algorithm/sponge.h b/algorithm/sponge.h index 3fcff0d7..19822979 100644 --- a/algorithm/sponge.h +++ b/algorithm/sponge.h @@ -78,16 +78,16 @@ void initState(uint64_t state[/*16*/]); //---- Squeezes void squeeze(uint64_t *state, unsigned char *out, unsigned int len); -void reducedSqueezeRow0(uint64_t* state, uint64_t* row); +void reducedSqueezeRow0(uint64_t* state, uint64_t* row, uint64_t nCols); //---- Absorbs void absorbBlock(uint64_t *state, const uint64_t *in); void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in); //---- Duplexes -void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut); -void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); +void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols); +void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols); +void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols); //---- Misc void printArray(unsigned char *array, unsigned int size, char *name); diff --git a/algorithm/sponge_old.c b/algorithm/sponge_old.c new file mode 100644 index 00000000..7152687f --- /dev/null +++ b/algorithm/sponge_old.c @@ -0,0 +1,405 @@ +/** + * A simple implementation of Blake2b's internal permutation + * in the form of a sponge. + * + * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014. + * + * This software is hereby placed in the public domain. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "sponge_old.h"
+#include "lyra2_old.h"
+
+
+
+/**
+ * Initializes the Sponge State. The first 512 bits are set to zeros and the remainder
+ * receive Blake2b's IV as per Blake2b's specification. Note: Even though sponges
+ * typically have their internal state initialized with zeros, Blake2b's G function
+ * has a fixed point: if the internal state and message are both filled with zeros, the
+ * resulting permutation will always be a block filled with zeros; this happens because
+ * Blake2b does not use the constants originally employed in Blake2 inside its G function,
+ * relying on the IV for avoiding possible fixed points.
+ *
+ * @param state The 1024-bit array to be initialized
+ */
+void initStateO(uint64_t state[/*16*/]) {
+    //First 512 bits are zeros
+    memset(state, 0, 64);
+    //Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV
+    state[8] = blake2b_IV[0];
+    state[9] = blake2b_IV[1];
+    state[10] = blake2b_IV[2];
+    state[11] = blake2b_IV[3];
+    state[12] = blake2b_IV[4];
+    state[13] = blake2b_IV[5];
+    state[14] = blake2b_IV[6];
+    state[15] = blake2b_IV[7];
+}
+
+/**
+ * Execute Blake2b's G function, with all 12 rounds.
+ *
+ * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
+ */
+static void blake2bLyra(uint64_t *v) {
+    ROUND_LYRA(0);
+    ROUND_LYRA(1);
+    ROUND_LYRA(2);
+    ROUND_LYRA(3);
+    ROUND_LYRA(4);
+    ROUND_LYRA(5);
+    ROUND_LYRA(6);
+    ROUND_LYRA(7);
+    ROUND_LYRA(8);
+    ROUND_LYRA(9);
+    ROUND_LYRA(10);
+    ROUND_LYRA(11);
+}
+
+/**
+ * Executes a reduced version of Blake2b's G function with only one round
+ * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
+ */
+static void reducedBlake2bLyra(uint64_t *v) {
+    ROUND_LYRA(0);
+}
+
+/**
+ * Performs a squeeze operation, using Blake2b's G function as the
+ * internal permutation
+ *
+ * @param state The current state of the sponge
+ * @param out Array that will receive the data squeezed
+ * @param len The number of bytes to be squeezed into the "out" array
+ */
+void squeezeO(uint64_t *state, byte *out, unsigned int len) {
+    int fullBlocks = len / BLOCK_LEN_BYTES;
+    byte *ptr = out;
+    int i;
+    //Squeezes full blocks
+    for (i = 0; i < fullBlocks; i++) {
+        memcpy(ptr, state, BLOCK_LEN_BYTES);
+        blake2bLyra(state);
+        ptr += BLOCK_LEN_BYTES;
+    }
+
+    //Squeezes remaining bytes
+    memcpy(ptr, state, (len % BLOCK_LEN_BYTES));
+}
+
+/**
+ * Performs an absorb operation for a single block (BLOCK_LEN_INT64 words
+ * of type uint64_t), using Blake2b's G function as the internal permutation
+ *
+ * @param state The current state of the sponge
+ * @param in The block to be absorbed (BLOCK_LEN_INT64 words)
+ */
+void absorbBlockO(uint64_t *state, const uint64_t *in) {
+    //XORs the first BLOCK_LEN_INT64 words of "in" with the current state
+    state[0] ^= in[0];
+    state[1] ^= in[1];
+    state[2] ^= in[2];
+    state[3] ^= in[3];
+    state[4] ^= in[4];
+    state[5] ^= in[5];
+    state[6] ^= in[6];
+    state[7] ^= in[7];
+    state[8] 
^= in[8]; + state[9] ^= in[9]; + state[10] ^= in[10]; + state[11] ^= in[11]; + + //Applies the transformation f to the sponge's state + blake2bLyra(state); +} + +/** + * Performs an absorb operation for a single block (BLOCK_LEN_BLAKE2_SAFE_INT64 + * words of type uint64_t), using Blake2b's G function as the internal permutation + * + * @param state The current state of the sponge + * @param in The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words) + */ +void absorbBlockBlake2SafeO(uint64_t *state, const uint64_t *in) { + //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state + state[0] ^= in[0]; + state[1] ^= in[1]; + state[2] ^= in[2]; + state[3] ^= in[3]; + state[4] ^= in[4]; + state[5] ^= in[5]; + state[6] ^= in[6]; + state[7] ^= in[7]; + + //Applies the transformation f to the sponge's state + blake2bLyra(state); +} + +/** + * Performs a reduced squeeze operation for a single row, from the highest to + * the lowest index, using the reduced-round Blake2b's G function as the + * internal permutation + * + * @param state The current state of the sponge + * @param rowOut Row to receive the data squeezed + */ +void reducedSqueezeRow0O(uint64_t* state, uint64_t* rowOut) { + uint64_t* ptrWord = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] + int i; + //M[row][C-1-col] = H.reduced_squeeze() + for (i = 0; i < N_COLS; i++) { + ptrWord[0] = state[0]; + ptrWord[1] = state[1]; + ptrWord[2] = state[2]; + ptrWord[3] = state[3]; + ptrWord[4] = state[4]; + ptrWord[5] = state[5]; + ptrWord[6] = state[6]; + ptrWord[7] = state[7]; + ptrWord[8] = state[8]; + ptrWord[9] = state[9]; + ptrWord[10] = state[10]; + ptrWord[11] = state[11]; + + //Goes to next block (column) that will receive the squeezed data + ptrWord -= BLOCK_LEN_INT64; + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + } +} + +/** + * Performs a reduced duplex operation for a single row, from the highest to + * the lowest index, using the reduced-round Blake2b's G function as the + * internal permutation + * + * @param state The current state of the sponge + * @param rowIn Row to feed the sponge + * @param rowOut Row to receive the sponge's output + */ +void reducedDuplexRow1O(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut) { + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + int i; + + for (i = 0; i < N_COLS; i++) { + + //Absorbing "M[prev][col]" + state[0] ^= (ptrWordIn[0]); + state[1] ^= (ptrWordIn[1]); + state[2] ^= (ptrWordIn[2]); + state[3] ^= (ptrWordIn[3]); + state[4] ^= (ptrWordIn[4]); + state[5] ^= (ptrWordIn[5]); + state[6] ^= (ptrWordIn[6]); + state[7] ^= (ptrWordIn[7]); + state[8] ^= (ptrWordIn[8]); + state[9] ^= (ptrWordIn[9]); + state[10] ^= (ptrWordIn[10]); + state[11] ^= (ptrWordIn[11]); + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[row][C-1-col] = M[prev][col] XOR rand + ptrWordOut[0] = ptrWordIn[0] ^ state[0]; + ptrWordOut[1] = ptrWordIn[1] ^ state[1]; + ptrWordOut[2] = ptrWordIn[2] ^ state[2]; + ptrWordOut[3] = ptrWordIn[3] ^ state[3]; + ptrWordOut[4] = ptrWordIn[4] ^ state[4]; + ptrWordOut[5] = ptrWordIn[5] ^ state[5]; + ptrWordOut[6] = ptrWordIn[6] ^ state[6]; + ptrWordOut[7] = ptrWordIn[7] ^ state[7]; + ptrWordOut[8] = ptrWordIn[8] ^ state[8]; + ptrWordOut[9] = ptrWordIn[9] ^ state[9]; + ptrWordOut[10] = ptrWordIn[10] ^ state[10]; + ptrWordOut[11] = 
ptrWordIn[11] ^ state[11]; + + + //Input: next column (i.e., next block in sequence) + ptrWordIn += BLOCK_LEN_INT64; + //Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } +} + +/** + * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e., + * the wordwise addition of two columns, ignoring carries between words). The + * output of this operation, "rand", is then used to make + * "M[rowOut][(N_COLS-1)-col] = M[rowIn][col] XOR rand" and + * "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit + * rotation to the left and N_COLS is a system parameter. + * + * @param state The current state of the sponge + * @param rowIn Row used only as input + * @param rowInOut Row used as input and to receive output after rotation + * @param rowOut Row receiving the output + * + */ +void reducedDuplexRowSetupO(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + int i; + + for (i = 0; i < N_COLS; i++) { + //Absorbing "M[prev] [+] M[row*]" + state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); + state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); + state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); + state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); + state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); + state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); + state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); + state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); + state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); + state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); + state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); + state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[row][col] = M[prev][col] XOR rand + ptrWordOut[0] = ptrWordIn[0] ^ state[0]; + ptrWordOut[1] = ptrWordIn[1] ^ state[1]; + ptrWordOut[2] = ptrWordIn[2] ^ state[2]; + ptrWordOut[3] = ptrWordIn[3] ^ state[3]; + ptrWordOut[4] = ptrWordIn[4] ^ state[4]; + ptrWordOut[5] = ptrWordIn[5] ^ state[5]; + ptrWordOut[6] = ptrWordIn[6] ^ state[6]; + ptrWordOut[7] = ptrWordIn[7] ^ state[7]; + ptrWordOut[8] = ptrWordIn[8] ^ state[8]; + ptrWordOut[9] = ptrWordIn[9] ^ state[9]; + ptrWordOut[10] = ptrWordIn[10] ^ state[10]; + ptrWordOut[11] = ptrWordIn[11] ^ state[11]; + + //M[row*][col] = M[row*][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[11]; + ptrWordInOut[1] ^= state[0]; + ptrWordInOut[2] ^= state[1]; + ptrWordInOut[3] ^= state[2]; + ptrWordInOut[4] ^= state[3]; + ptrWordInOut[5] ^= state[4]; + ptrWordInOut[6] ^= state[5]; + ptrWordInOut[7] ^= state[6]; + ptrWordInOut[8] ^= state[7]; + ptrWordInOut[9] ^= state[8]; + ptrWordInOut[10] ^= state[9]; + ptrWordInOut[11] ^= state[10]; + + //Inputs: next column (i.e., next block in sequence) + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + //Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } +} + +/** + * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e., + * the wordwise addition of two columns, ignoring carries between words). The + * output of this operation, "rand", is then used to make + * "M[rowOut][col] = M[rowOut][col] XOR rand" and + * "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit + * rotation to the left. 
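+ *
+ * (Editorial note, not in the original patch: in the body below, rotW is
+ * realized as a one-word shift of the 12-word block -- state[11] is XORed
+ * into ptrWordInOut[0], state[0] into ptrWordInOut[1], and so on.)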
+ * + * @param state The current state of the sponge + * @param rowIn Row used only as input + * @param rowInOut Row used as input and to receive output after rotation + * @param rowOut Row receiving the output + * + */ +void reducedDuplexRowO(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row + int i; + + for (i = 0; i < N_COLS; i++) { + + //Absorbing "M[prev] [+] M[row*]" + state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); + state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); + state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); + state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); + state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); + state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); + state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); + state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); + state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); + state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); + state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); + state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[rowOut][col] = M[rowOut][col] XOR rand + ptrWordOut[0] ^= state[0]; + ptrWordOut[1] ^= state[1]; + ptrWordOut[2] ^= state[2]; + ptrWordOut[3] ^= state[3]; + ptrWordOut[4] ^= state[4]; + ptrWordOut[5] ^= state[5]; + ptrWordOut[6] ^= state[6]; + ptrWordOut[7] ^= state[7]; + ptrWordOut[8] ^= state[8]; + ptrWordOut[9] ^= state[9]; + ptrWordOut[10] ^= state[10]; + ptrWordOut[11] ^= state[11]; + + //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[11]; + ptrWordInOut[1] ^= state[0]; + ptrWordInOut[2] ^= state[1]; + ptrWordInOut[3] ^= state[2]; + ptrWordInOut[4] ^= state[3]; + ptrWordInOut[5] ^= state[4]; + ptrWordInOut[6] ^= state[5]; + ptrWordInOut[7] ^= state[6]; + ptrWordInOut[8] ^= state[7]; + ptrWordInOut[9] ^= state[8]; + ptrWordInOut[10] ^= state[9]; + ptrWordInOut[11] ^= state[10]; + + //Goes to next block + ptrWordOut += BLOCK_LEN_INT64; + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + } +} + + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/** + Prints an array of unsigned chars + */ +void printArrayO(unsigned char *array, unsigned int size, char *name) { + int i; + printf("%s: ", name); + for (i = 0; i < size; i++) { + printf("%2x|", array[i]); + } + printf("\n"); +} + +//////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/algorithm/sponge_old.h b/algorithm/sponge_old.h new file mode 100644 index 00000000..c23781d3 --- /dev/null +++ b/algorithm/sponge_old.h @@ -0,0 +1,98 @@ +/** + * Header file for Blake2b's internal permutation in the form of a sponge. + * This code is based on the original Blake2b's implementation provided by + * Samuel Neves (https://blake2.net/) + * + * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014. + * + * This software is hereby placed in the public domain. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPONGEOLD_H_ +#define SPONGEOLD_H_ + +#include + +#if defined(__GNUC__) +#define ALIGN __attribute__ ((aligned(32))) +#elif defined(_MSC_VER) +#define ALIGN __declspec(align(32)) +#else +#define ALIGN +#endif + + +/*Blake2b IV Array*/ +static const uint64_t blake2b_IV[8] = +{ + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +}; + +/*Blake2b's rotation*/ +static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ + return ( w >> c ) | ( w << ( 64 - c ) ); +} + +/*Blake2b's G function*/ +#define G(r,i,a,b,c,d) \ + do { \ + a = a + b; \ + d = rotr64(d ^ a, 32); \ + c = c + d; \ + b = rotr64(b ^ c, 24); \ + a = a + b; \ + d = rotr64(d ^ a, 16); \ + c = c + d; \ + b = rotr64(b ^ c, 63); \ + } while(0) + + +/*One Round of the Blake2b's compression function*/ +#define ROUND_LYRA(r) \ + G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ + G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ + G(r,2,v[ 2],v[ 6],v[10],v[14]); \ + G(r,3,v[ 3],v[ 7],v[11],v[15]); \ + G(r,4,v[ 0],v[ 5],v[10],v[15]); \ + G(r,5,v[ 1],v[ 6],v[11],v[12]); \ + G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ + G(r,7,v[ 3],v[ 4],v[ 9],v[14]); + + +//---- Housekeeping +void initStateO(uint64_t state[/*16*/]); + +//---- Squeezes +void squeezeO(uint64_t *state, unsigned char *out, unsigned int len); +void reducedSqueezeRow0O(uint64_t* state, uint64_t* row); + +//---- Absorbs +void absorbBlockO(uint64_t *state, const uint64_t *in); +void absorbBlockBlake2SafeO(uint64_t *state, const uint64_t *in); + +//---- Duplexes +void reducedDuplexRow1O(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut); +void reducedDuplexRowSetupO(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); +void reducedDuplexRowO(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); + +//---- Misc +void printArrayO(unsigned char *array, unsigned int size, char *name); + +//////////////////////////////////////////////////////////////////////////////////////////////// + + +#endif /* SPONGE_H_ */ diff --git a/algorithm/sysendian.h b/algorithm/sysendian.h new file mode 100644 index 00000000..31ac985f --- /dev/null +++ b/algorithm/sysendian.h @@ -0,0 +1,140 @@ +/*- + * Copyright 2007-2009 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
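(Editorial aside on the sponge_old.h hunk above, placed here only because of
how this excerpt is split: G is Blake2b's quarter-round and ROUND_LYRA one
full round over the 16-word state v. In the Lyra2 reference code the two
permutations called from sponge_old.c are conventionally built as

    void blake2bLyra(uint64_t *v)        { int r; for (r = 0; r < 12; r++) ROUND_LYRA(r); }
    void reducedBlake2bLyra(uint64_t *v) { ROUND_LYRA(0); }

i.e. the full transformation f applies twelve rounds and the "reduced-round"
f a single one; this sketch is an assumption about code not shown in this
excerpt.)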
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ +#ifndef _SYSENDIAN_H_ +#define _SYSENDIAN_H_ + +/* If we don't have be64enc, the we have isn't usable. */ +#if !HAVE_DECL_BE64ENC +#undef HAVE_SYS_ENDIAN_H +#endif + +#ifdef HAVE_SYS_ENDIAN_H + +#include + +#else + +#include + +#if !HAVE_DECL_LE32DEC +static uint32_t le32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) + + ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24)); +} +#endif + +#if !HAVE_DECL_BE32ENC +static void be32enc(void *pp, uint32_t x) +{ + uint8_t *p = (uint8_t *)pp; + p[3] = x & 0xff; + p[2] = (x >> 8) & 0xff; + p[1] = (x >> 16) & 0xff; + p[0] = (x >> 24) & 0xff; +} +#endif + +#if !HAVE_DECL_BE32DEC +static uint32_t be32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + + ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); +} +#endif + +#if !HAVE_DECL_LE32ENC +static void le32enc(void *pp, uint32_t x) +{ + uint8_t *p = (uint8_t *)pp; + p[0] = x & 0xff; + p[1] = (x >> 8) & 0xff; + p[2] = (x >> 16) & 0xff; + p[3] = (x >> 24) & 0xff; +} +#endif + +static uint64_t +be64dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + + return ((uint64_t)(p[7]) + ((uint64_t)(p[6]) << 8) + + ((uint64_t)(p[5]) << 16) + ((uint64_t)(p[4]) << 24) + + ((uint64_t)(p[3]) << 32) + ((uint64_t)(p[2]) << 40) + + ((uint64_t)(p[1]) << 48) + ((uint64_t)(p[0]) << 56)); +} + +static void +be64enc(void *pp, uint64_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[7] = x & 0xff; + p[6] = (x >> 8) & 0xff; + p[5] = (x >> 16) & 0xff; + p[4] = (x >> 24) & 0xff; + p[3] = (x >> 32) & 0xff; + p[2] = (x >> 40) & 0xff; + p[1] = (x >> 48) & 0xff; + p[0] = (x >> 56) & 0xff; +} + + + +static uint64_t +le64dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + + return ((uint64_t)(p[0]) + ((uint64_t)(p[1]) << 8) + + ((uint64_t)(p[2]) << 16) + ((uint64_t)(p[3]) << 24) + + ((uint64_t)(p[4]) << 32) + ((uint64_t)(p[5]) << 40) + + ((uint64_t)(p[6]) << 48) + ((uint64_t)(p[7]) << 56)); +} + +static void +le64enc(void *pp, uint64_t x) +{ + uint8_t * p = (uint8_t *)pp; + + p[0] = x & 0xff; + p[1] = (x >> 8) & 0xff; + p[2] = (x >> 16) & 0xff; + p[3] = (x >> 24) & 0xff; + p[4] = (x >> 32) & 0xff; + p[5] = (x >> 40) & 0xff; + p[6] = (x >> 48) & 0xff; + p[7] = (x >> 56) & 0xff; +} +#endif /* !HAVE_SYS_ENDIAN_H */ + +#endif /* !_SYSENDIAN_H_ */ diff --git a/algorithm/yescrypt-opt.c b/algorithm/yescrypt-opt.c new file mode 100644 index 00000000..b54be469 --- /dev/null +++ b/algorithm/yescrypt-opt.c @@ -0,0 +1,1364 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2013,2014 Alexander 
Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ + +#ifdef __i386__ +#warning "This implementation does not use SIMD, and thus it runs a lot slower than the SIMD-enabled implementation. Enable at least SSE2 in the C compiler and use yescrypt-best.c instead unless you're building this SIMD-less implementation on purpose (portability to older CPUs or testing)." +#elif defined(__x86_64__) +#warning "This implementation does not use SIMD, and thus it runs a lot slower than the SIMD-enabled implementation. Use yescrypt-best.c instead unless you're building this SIMD-less implementation on purpose (for testing only)." +#endif + +#include +#include +#include +#include "algorithm/yescrypt_core.h" +#include "sph/sha256_Y.h" +#include "algorithm/sysendian.h" + +// #include "sph/yescrypt-platform.c" +#define HUGEPAGE_THRESHOLD (12 * 1024 * 1024) + +#ifdef __x86_64__ +#define HUGEPAGE_SIZE (2 * 1024 * 1024) +#else +#undef HUGEPAGE_SIZE +#endif + + +static void * +alloc_region(yescrypt_region_t * region, size_t size) +{ + size_t base_size = size; + uint8_t * base, *aligned; +#ifdef MAP_ANON + int flags = +#ifdef MAP_NOCORE + MAP_NOCORE | +#endif + MAP_ANON | MAP_PRIVATE; +#if defined(MAP_HUGETLB) && defined(HUGEPAGE_SIZE) + size_t new_size = size; + const size_t hugepage_mask = (size_t)HUGEPAGE_SIZE - 1; + if (size >= HUGEPAGE_THRESHOLD && size + hugepage_mask >= size) { + flags |= MAP_HUGETLB; + /* + * Linux's munmap() fails on MAP_HUGETLB mappings if size is not a multiple of + * huge page size, so let's round up to huge page size here. 
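+ * (Editorial example: with HUGEPAGE_SIZE = 2 MiB, hugepage_mask is 0x1FFFFF,
+ * so a request of 12 MiB + 1 byte becomes
+ *     new_size = ((12 MiB + 1) + 0x1FFFFF) & ~0x1FFFFF = 14 MiB,
+ * the next multiple of the huge page size.)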
+ */ + new_size = size + hugepage_mask; + new_size &= ~hugepage_mask; + } + base = mmap(NULL, new_size, PROT_READ | PROT_WRITE, flags, -1, 0); + if (base != MAP_FAILED) { + base_size = new_size; + } + else + if (flags & MAP_HUGETLB) { + flags &= ~MAP_HUGETLB; + base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); + } + +#else + base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); +#endif + if (base == MAP_FAILED) + base = NULL; + aligned = base; +#elif defined(HAVE_POSIX_MEMALIGN) + if ((errno = posix_memalign((void **)&base, 64, size)) != 0) + base = NULL; + aligned = base; +#else + base = aligned = NULL; + if (size + 63 < size) { + errno = ENOMEM; + } + else if ((base = malloc(size + 63)) != NULL) { + aligned = base + 63; + aligned -= (uintptr_t)aligned & 63; + } +#endif + region->base = base; + region->aligned = aligned; + region->base_size = base ? base_size : 0; + region->aligned_size = base ? size : 0; + return aligned; +} + +static void init_region(yescrypt_region_t * region) +{ + region->base = region->aligned = NULL; + region->base_size = region->aligned_size = 0; +} + +static int +free_region(yescrypt_region_t * region) +{ + if (region->base) { +#ifdef MAP_ANON + if (munmap(region->base, region->base_size)) + return -1; +#else + free(region->base); +#endif + } + init_region(region); + return 0; +} + +int +yescrypt_init_shared(yescrypt_shared_t * shared, +const uint8_t * param, size_t paramlen, +uint64_t N, uint32_t r, uint32_t p, +yescrypt_init_shared_flags_t flags, uint32_t mask, +uint8_t * buf, size_t buflen) +{ + yescrypt_shared1_t * shared1 = &shared->shared1; + yescrypt_shared_t dummy, half1, half2; + // yescrypt_shared_t * half2; + uint8_t salt[32]; + + if (flags & YESCRYPT_SHARED_PREALLOCATED) { + if (!shared1->aligned || !shared1->aligned_size) + return -1; + } + else { + init_region(shared1); + } + shared->mask1 = 1; + if (!param && !paramlen && !N && !r && !p && !buf && !buflen) + return 0; + + init_region(&dummy.shared1); + dummy.mask1 = 1; + if (yescrypt_kdf(&dummy, shared1, + param, paramlen, NULL, 0, N, r, p, 0, + YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1, + salt, sizeof(salt))) + goto out; + + half1 = half2 = *shared; + half1.shared1.aligned_size /= 2; + half2.shared1.aligned_size = half1.shared1.aligned_size; + half2.shared1.aligned = (char*)half2.shared1.aligned + half1.shared1.aligned_size; + + N /= 2; + + if (p > 1 && yescrypt_kdf(&half1, &half2.shared1, + param, paramlen, salt, sizeof(salt), N, r, p, 0, + YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_2, + salt, sizeof(salt))) + goto out; + + if (yescrypt_kdf(&half2, &half1.shared1, + param, paramlen, salt, sizeof(salt), N, r, p, 0, + YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1, + salt, sizeof(salt))) + goto out; + + if (yescrypt_kdf(&half1, &half2.shared1, + param, paramlen, salt, sizeof(salt), N, r, p, 0, + YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | __YESCRYPT_INIT_SHARED_1, + buf, buflen)) + goto out; + + shared->mask1 = mask; + + return 0; + +out: + if (!(flags & YESCRYPT_SHARED_PREALLOCATED)) + free_region(shared1); + return -1; +} + +int +yescrypt_free_shared(yescrypt_shared_t * shared) +{ + return free_region(&shared->shared1); +} + +int +yescrypt_init_local(yescrypt_local_t * local) +{ + init_region(local); + return 0; +} + +int +yescrypt_free_local(yescrypt_local_t * local) +{ + return free_region(local); +} + + +static void +blkcpy(uint64_t * dest, const uint64_t * src, size_t count) +{ + do { + *dest++ = *src++; *dest++ = 
*src++; + *dest++ = *src++; *dest++ = *src++; + } while (count -= 4); +}; + +static void +blkxor(uint64_t * dest, const uint64_t * src, size_t count) +{ + do { + *dest++ ^= *src++; *dest++ ^= *src++; + *dest++ ^= *src++; *dest++ ^= *src++; + } while (count -= 4); +}; + +typedef union { + uint32_t w[16]; + uint64_t d[8]; +} salsa20_blk_t; + +static void +salsa20_simd_shuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout) +{ +#define COMBINE(out, in1, in2) \ + Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32); + COMBINE(0, 0, 2) + COMBINE(1, 5, 7) + COMBINE(2, 2, 4) + COMBINE(3, 7, 1) + COMBINE(4, 4, 6) + COMBINE(5, 1, 3) + COMBINE(6, 6, 0) + COMBINE(7, 3, 5) +#undef COMBINE +} + +static void +salsa20_simd_unshuffle(const salsa20_blk_t * Bin, salsa20_blk_t * Bout) +{ +#define COMBINE(out, in1, in2) \ + Bout->w[out * 2] = Bin->d[in1]; \ + Bout->w[out * 2 + 1] = Bin->d[in2] >> 32; + COMBINE(0, 0, 6) + COMBINE(1, 5, 3) + COMBINE(2, 2, 0) + COMBINE(3, 7, 5) + COMBINE(4, 4, 2) + COMBINE(5, 1, 7) + COMBINE(6, 6, 4) + COMBINE(7, 3, 1) +#undef COMBINE +} + +/** + * salsa20_8(B): + * Apply the salsa20/8 core to the provided block. + */ + +static void +salsa20_8(uint64_t B[8]) +{ + size_t i; + salsa20_blk_t X; + +#define x X.w + + salsa20_simd_unshuffle((const salsa20_blk_t *)B, &X); + + for (i = 0; i < 8; i += 2) { +#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) + /* Operate on columns */ + x[ 4] ^= R(x[ 0]+x[12], 7); x[ 8] ^= R(x[ 4]+x[ 0], 9); + x[12] ^= R(x[ 8]+x[ 4],13); x[ 0] ^= R(x[12]+x[ 8],18); + + x[ 9] ^= R(x[ 5]+x[ 1], 7); x[13] ^= R(x[ 9]+x[ 5], 9); + x[ 1] ^= R(x[13]+x[ 9],13); x[ 5] ^= R(x[ 1]+x[13],18); + + x[14] ^= R(x[10]+x[ 6], 7); x[ 2] ^= R(x[14]+x[10], 9); + x[ 6] ^= R(x[ 2]+x[14],13); x[10] ^= R(x[ 6]+x[ 2],18); + + x[ 3] ^= R(x[15]+x[11], 7); x[ 7] ^= R(x[ 3]+x[15], 9); + x[11] ^= R(x[ 7]+x[ 3],13); x[15] ^= R(x[11]+x[ 7],18); + + /* Operate on rows */ + x[ 1] ^= R(x[ 0]+x[ 3], 7); x[ 2] ^= R(x[ 1]+x[ 0], 9); + x[ 3] ^= R(x[ 2]+x[ 1],13); x[ 0] ^= R(x[ 3]+x[ 2],18); + + x[ 6] ^= R(x[ 5]+x[ 4], 7); x[ 7] ^= R(x[ 6]+x[ 5], 9); + x[ 4] ^= R(x[ 7]+x[ 6],13); x[ 5] ^= R(x[ 4]+x[ 7],18); + + x[11] ^= R(x[10]+x[ 9], 7); x[ 8] ^= R(x[11]+x[10], 9); + x[ 9] ^= R(x[ 8]+x[11],13); x[10] ^= R(x[ 9]+x[ 8],18); + + x[12] ^= R(x[15]+x[14], 7); x[13] ^= R(x[12]+x[15], 9); + x[14] ^= R(x[13]+x[12],13); x[15] ^= R(x[14]+x[13],18); +#undef R + } +#undef x + + { + salsa20_blk_t Y; + salsa20_simd_shuffle(&X, &Y); + for (i = 0; i < 16; i += 4) { + ((salsa20_blk_t *)B)->w[i] += Y.w[i]; + ((salsa20_blk_t *)B)->w[i + 1] += Y.w[i + 1]; + ((salsa20_blk_t *)B)->w[i + 2] += Y.w[i + 2]; + ((salsa20_blk_t *)B)->w[i + 3] += Y.w[i + 3]; + } + } +} + +/** + * blockmix_salsa8(Bin, Bout, X, r): + * Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r + * bytes in length; the output Bout must also be the same size. The + * temporary space X must be 64 bytes. + */ +static void +blockmix_salsa8(const uint64_t * Bin, uint64_t * Bout, uint64_t * X, size_t r) +{ + size_t i; + + /* 1: X <-- B_{2r - 1} */ + blkcpy(X, &Bin[(2 * r - 1) * 8], 8); + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < 2 * r; i += 2) { + /* 3: X <-- H(X \xor B_i) */ + blkxor(X, &Bin[i * 8], 8); + salsa20_8(X); + + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */ + blkcpy(&Bout[i * 4], X, 8); + + /* 3: X <-- H(X \xor B_i) */ + blkxor(X, &Bin[i * 8 + 8], 8); + salsa20_8(X); + + /* 4: Y_i <-- X */ + /* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... 
Y_{2r-1}) */ + blkcpy(&Bout[i * 4 + r * 8], X, 8); + } + +} + +/* These are tunable */ +#define S_BITS 8 +#define S_SIMD 2 +#define S_P 4 +#define S_ROUNDS 6 + +/* Number of S-boxes. Not tunable, hard-coded in a few places. */ +#define S_N 2 + +/* Derived values. Not tunable on their own. */ +#define S_SIZE1 (1 << S_BITS) +#define S_MASK ((S_SIZE1 - 1) * S_SIMD * 8) +#define S_MASK2 (((uint64_t)S_MASK << 32) | S_MASK) +#define S_SIZE_ALL (S_N * S_SIZE1 * S_SIMD) +#define S_P_SIZE (S_P * S_SIMD) +#define S_MIN_R ((S_P * S_SIMD + 15) / 16) + +/** + * pwxform(B): + * Transform the provided block using the provided S-boxes. + */ + +static void +block_pwxform(uint64_t * B, const uint64_t * S) +{ + uint64_t(*X)[S_SIMD] = (uint64_t(*)[S_SIMD])B; + const uint8_t *S0 = (const uint8_t *)S; + const uint8_t *S1 = (const uint8_t *)(S + S_SIZE1 * S_SIMD); + size_t i, j; + + for (j = 0; j < S_P; j++) { + + uint64_t *Xj = X[j]; + uint64_t x0 = Xj[0]; + uint64_t x1 = Xj[1]; + + for (i = 0; i < S_ROUNDS; i++) { + uint64_t x = x0 & S_MASK2; + const uint64_t *p0, *p1; + + p0 = (const uint64_t *)(S0 + (uint32_t)x); + p1 = (const uint64_t *)(S1 + (x >> 32)); + + x0 = (uint64_t)(x0 >> 32) * (uint32_t)x0; + x0 += p0[0]; + x0 ^= p1[0]; + + x1 = (uint64_t)(x1 >> 32) * (uint32_t)x1; + x1 += p0[1]; + x1 ^= p1[1]; + } + Xj[0] = x0; + Xj[1] = x1; + } + + + +} + + +/** + * blockmix_pwxform(Bin, Bout, S, r): + * Compute Bout = BlockMix_pwxform{salsa20/8, S, r}(Bin). The input Bin must + * be 128r bytes in length; the output Bout must also be the same size. + * + * S lacks const qualifier to match blockmix_salsa8()'s prototype, which we + * need to refer to both functions via the same function pointers. + */ +static void +blockmix_pwxform(const uint64_t * Bin, uint64_t * Bout, uint64_t * S, size_t r) +{ + size_t r1, r2, i; + // S_P_SIZE = 8; + /* Convert 128-byte blocks to (S_P_SIZE * 64-bit) blocks */ + + r1 = r * 128 / (S_P_SIZE * 8); + /* X <-- B_{r1 - 1} */ + blkcpy(Bout, &Bin[(r1 - 1) * S_P_SIZE], S_P_SIZE); + + /* X <-- X \xor B_i */ + blkxor(Bout, Bin, S_P_SIZE); + + /* X <-- H'(X) */ + /* B'_i <-- X */ + block_pwxform(Bout, S); + + /* for i = 0 to r1 - 1 do */ + for (i = 1; i < r1; i++) { + /* X <-- X \xor B_i */ + blkcpy(&Bout[i * S_P_SIZE], &Bout[(i - 1) * S_P_SIZE],S_P_SIZE); + blkxor(&Bout[i * S_P_SIZE], &Bin[i * S_P_SIZE], S_P_SIZE); + + /* X <-- H'(X) */ + /* B'_i <-- X */ + block_pwxform(&Bout[i * S_P_SIZE], S); + } + + /* Handle partial blocks */ + if (i * S_P_SIZE < r * 16) { + blkcpy(&Bout[i * S_P_SIZE], &Bin[i * S_P_SIZE],r * 16 - i * S_P_SIZE); +} + + i = (r1 - 1) * S_P_SIZE / 8; + /* Convert 128-byte blocks to 64-byte blocks */ + r2 = r * 2; + + /* B'_i <-- H(B'_i) */ + salsa20_8(&Bout[i * 8]); + + + i++; +/// not used yescrypt + + for (; i < r2; i++) { + /* B'_i <-- H(B'_i \xor B'_{i-1}) */ + blkxor(&Bout[i * 8], &Bout[(i - 1) * 8], 8); + salsa20_8(&Bout[i * 8]); + } +} + + + +/** + * integerify(B, r): + * Return the result of parsing B_{2r-1} as a little-endian integer. + */ +static uint64_t +integerify(const uint64_t * B, size_t r) +{ +/* + * Our 64-bit words are in host byte order, and word 6 holds the second 32-bit + * word of B_{2r-1} due to SIMD shuffling. The 64-bit value we return is also + * in host byte order, as it should be. + */ + const uint64_t * X = &B[(2 * r - 1) * 8]; + uint32_t lo = X[0]; + uint32_t hi = X[6] >> 32; + return ((uint64_t)hi << 32) + lo; +} + +/** + * smix1(B, r, N, flags, V, NROM, shared, XY, S): + * Compute first loop of B = SMix_r(B, N). 
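+ * (Editorial note on the S-box tunables defined further up: with S_BITS = 8,
+ * S_SIMD = 2, S_P = 4 the derived values come out as S_SIZE1 = 256,
+ * S_MASK = (256 - 1) * 2 * 8 = 0xFF0, S_P_SIZE = 8 and S_MIN_R = 1, so each
+ * pwxform block is eight 64-bit words and the two S-boxes together occupy
+ * S_SIZE_ALL = 2 * 256 * 2 = 1024 words, i.e. 8 KiB.)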
The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 256r + 64 bytes in length. The value N must be even and + * no smaller than 2. + */ +static void +smix1(uint64_t * B, size_t r, uint64_t N, yescrypt_flags_t flags, + uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared, + uint64_t * XY, uint64_t * S) +{ + void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) = (S ? blockmix_pwxform : blockmix_salsa8); + const uint64_t * VROM = shared->shared1.aligned; + uint32_t VROM_mask = shared->mask1; + size_t s = 16 * r; + uint64_t * X = V; + uint64_t * Y = &XY[s]; + uint64_t * Z = S ? S : &XY[2 * s]; + uint64_t n, i, j; + size_t k; + + /* 1: X <-- B */ + /* 3: V_i <-- X */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8]; + for (k = 0; k < 16; k++) + tmp->w[k] = le32dec(&src->w[k]); + + salsa20_simd_shuffle(tmp, dst); + } + + /* 4: X <-- H(X) */ + /* 3: V_i <-- X */ + + blockmix(X, Y, Z, r); + blkcpy(&V[s], Y, s); + X = XY; + + if (NROM && (VROM_mask & 1)) { + if ((1 & VROM_mask) == 1) { + /* j <-- Integerify(X) mod NROM */ + j = integerify(Y, r) & (NROM - 1); + + /* X <-- H(X \xor VROM_j) */ + blkxor(Y, &VROM[j * s], s); + } + + blockmix(Y, X, Z, r); + + + /* 2: for i = 0 to N - 1 do */ + for (n = 1, i = 2; i < N; i += 2) { + /* 3: V_i <-- X */ + blkcpy(&V[i * s], X, s); + + if ((i & (i - 1)) == 0) + n <<= 1; + + /* j <-- Wrap(Integerify(X), i) */ + j = integerify(X, r) & (n - 1); + j += i - n; + + /* X <-- X \xor V_j */ + blkxor(X, &V[j * s], s); + + /* 4: X <-- H(X) */ + blockmix(X, Y, Z, r); + + /* 3: V_i <-- X */ + blkcpy(&V[(i + 1) * s], Y, s); + + j = integerify(Y, r); + if (((i + 1) & VROM_mask) == 1) { + /* j <-- Integerify(X) mod NROM */ + j &= NROM - 1; + + /* X <-- H(X \xor VROM_j) */ + blkxor(Y, &VROM[j * s], s); + } else { + /* j <-- Wrap(Integerify(X), i) */ + j &= n - 1; + j += i + 1 - n; + + /* X <-- H(X \xor V_j) */ + blkxor(Y, &V[j * s], s); + } + + blockmix(Y, X, Z, r); + } + } else { + yescrypt_flags_t rw = flags & YESCRYPT_RW; + /* 4: X <-- H(X) */ + blockmix(Y, X, Z, r); + + /* 2: for i = 0 to N - 1 do */ + for (n = 1, i = 2; i < N; i += 2) { + /* 3: V_i <-- X */ + blkcpy(&V[i * s], X, s); + + if (rw) { + if ((i & (i - 1)) == 0) + n <<= 1; + + /* j <-- Wrap(Integerify(X), i) */ + j = integerify(X, r) & (n - 1); + j += i - n; + + /* X <-- X \xor V_j */ + blkxor(X, &V[j * s], s); + } + + /* 4: X <-- H(X) */ + blockmix(X, Y, Z, r); + + /* 3: V_i <-- X */ + blkcpy(&V[(i + 1) * s], Y, s); + + if (rw) { + /* j <-- Wrap(Integerify(X), i) */ + j = integerify(Y, r) & (n - 1); + j += (i + 1) - n; + + + /* X <-- X \xor V_j */ + blkxor(Y, &V[j * s], s); + } + + /* 4: X <-- H(X) */ + blockmix(Y, X, Z, r); + } + } + + /* B' <-- X */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8]; + for (k = 0; k < 16; k++) + le32enc(&tmp->w[k], src->w[k]); + salsa20_simd_unshuffle(tmp, dst); + } +} + + + +/** + * smix2(B, r, N, Nloop, flags, V, NROM, shared, XY, S): + * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in + * length; the temporary storage V must be 128rN bytes in length; the temporary + * storage XY must be 256r + 64 bytes in length. The value N must be a + * power of 2 greater than 1. 
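+ * (Editorial sketch of the wrapped indexing used by smix1 above: with
+ * n = p2floor(i), the largest power of two not exceeding i,
+ *
+ *     j = (integerify(X, r) & (n - 1)) + (i - n);
+ *
+ * maps the pseudorandom value onto [i - n, i), i.e. onto rows of V that
+ * have already been written.)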
The value Nloop must be even. + */ +static void +smix2(uint64_t * B, size_t r, uint64_t N, uint64_t Nloop, + yescrypt_flags_t flags, + uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared, + uint64_t * XY, uint64_t * S) +{ + + void (*blockmix)(const uint64_t *, uint64_t *, uint64_t *, size_t) = + (S ? blockmix_pwxform : blockmix_salsa8); + const uint64_t * VROM = shared->shared1.aligned; + uint32_t VROM_mask = shared->mask1 | 1; + size_t s = 16 * r; + yescrypt_flags_t rw = flags & YESCRYPT_RW; + uint64_t * X = XY; + uint64_t * Y = &XY[s]; + uint64_t * Z = S ? S : &XY[2 * s]; + uint64_t i, j; + size_t k; + + if (Nloop == 0) + return; + + /* X <-- B' */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&B[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&X[i * 8]; + for (k = 0; k < 16; k++) + tmp->w[k] = le32dec(&src->w[k]); + salsa20_simd_shuffle(tmp, dst); + } + if (NROM) { + + /* 6: for i = 0 to N - 1 do */ + for (i = 0; i < Nloop; i += 2) { + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(X, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], X, s); + blockmix(X, Y, Z, r); + + j = integerify(Y, r); + if (((i + 1) & VROM_mask) == 1) { + /* j <-- Integerify(X) mod NROM */ + j &= NROM - 1; + + /* X <-- H(X \xor VROM_j) */ + blkxor(Y, &VROM[j * s], s); + } else { + /* 7: j <-- Integerify(X) mod N */ + j &= N - 1; + + /* 8: X <-- H(X \xor V_j) */ + blkxor(Y, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], Y, s); + } + + blockmix(Y, X, Z, r); + } + } else { + + /* 6: for i = 0 to N - 1 do */ + i = Nloop / 2; + do { + /* 7: j <-- Integerify(X) mod N */ + j = integerify(X, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(X, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], X, s); + blockmix(X, Y, Z, r); + + /* 7: j <-- Integerify(X) mod N */ + j = integerify(Y, r) & (N - 1); + + /* 8: X <-- H(X \xor V_j) */ + blkxor(Y, &V[j * s], s); + /* V_j <-- Xprev \xor V_j */ + if (rw) + blkcpy(&V[j * s], Y, s); + blockmix(Y, X, Z, r); + } while (--i); + } + + /* 10: B' <-- X */ + for (i = 0; i < 2 * r; i++) { + const salsa20_blk_t *src = (const salsa20_blk_t *)&X[i * 8]; + salsa20_blk_t *tmp = (salsa20_blk_t *)Y; + salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 8]; + for (k = 0; k < 16; k++) + le32enc(&tmp->w[k], src->w[k]); + salsa20_simd_unshuffle(tmp, dst); + } +} + + + + +/** + * p2floor(x): + * Largest power of 2 not greater than argument. + */ +static uint64_t +p2floor(uint64_t x) +{ + uint64_t y; + while ((y = x & (x - 1))) + x = y; + return x; +} + +/** + * smix(B, r, N, p, t, flags, V, NROM, shared, XY, S): + * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the + * temporary storage V must be 128rN bytes in length; the temporary storage + * XY must be 256r+64 or (256r+64)*p bytes in length (the larger size is + * required with OpenMP-enabled builds). The value N must be a power of 2 + * greater than 1. 
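+ * (Editorial note: p2floor(), defined just above, repeatedly clears the
+ * lowest set bit until a single bit remains, returning the largest power of
+ * two not exceeding its argument -- e.g. p2floor(1) = 1, p2floor(12) = 8,
+ * p2floor(1023) = 512.)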
+ */ +static void +smix(uint64_t * B, size_t r, uint64_t N, uint32_t p, uint32_t t, + yescrypt_flags_t flags, + uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared, + uint64_t * XY, uint64_t * S) +{ + size_t s = 16 * r; + uint64_t Nchunk = N / p, Nloop_all, Nloop_rw; + uint32_t i; + + Nloop_all = Nchunk; + if (flags & YESCRYPT_RW) { + if (t <= 1) { + if (t) + Nloop_all *= 2; /* 2/3 */ + Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */ + } else { + Nloop_all *= t - 1; + } + } else if (t) { + if (t == 1) + Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */ + Nloop_all *= t; + } + + Nloop_rw = 0; + if (flags & __YESCRYPT_INIT_SHARED) + Nloop_rw = Nloop_all; + else if (flags & YESCRYPT_RW) + Nloop_rw = Nloop_all / p; + + Nchunk &= ~(uint64_t)1; /* round down to even */ + Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */ + Nloop_rw &= ~(uint64_t)1; /* round down to even */ + + + for (i = 0; i < p; i++) { + uint64_t Vchunk = i * Nchunk; + uint64_t * Bp = &B[i * s]; + uint64_t * Vp = &V[Vchunk * s]; + uint64_t * XYp = XY; + + uint64_t Np = (i < p - 1) ? Nchunk : (N - Vchunk); + uint64_t * Sp = S ? &S[i * S_SIZE_ALL] : S; + + if (Sp) + smix1(Bp, 1, S_SIZE_ALL / 16, flags & ~YESCRYPT_PWXFORM,Sp, NROM, shared, XYp, NULL); + + + + if (!(flags & __YESCRYPT_INIT_SHARED_2)) + smix1(Bp, r, Np, flags, Vp, NROM, shared, XYp, Sp); + + + smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp, NROM, shared, XYp, Sp); + + + + } + if (Nloop_all > Nloop_rw) { + + for (i = 0; i < p; i++) { + uint64_t * Bp = &B[i * s]; + + uint64_t * XYp = XY; + + uint64_t * Sp = S ? &S[i * S_SIZE_ALL] : S; + smix2(Bp, r, N, Nloop_all - Nloop_rw,flags & ~YESCRYPT_RW, V, NROM, shared, XYp, Sp); + + } + } + + + + +} + +static void +smix_old(uint64_t * B, size_t r, uint64_t N, uint32_t p, uint32_t t, +yescrypt_flags_t flags, +uint64_t * V, uint64_t NROM, const yescrypt_shared_t * shared, +uint64_t * XY, uint64_t * S) +{ + size_t s = 16 * r; + uint64_t Nchunk = N / p, Nloop_all, Nloop_rw; + uint32_t i; + + Nloop_all = Nchunk; + if (flags & YESCRYPT_RW) { + if (t <= 1) { + if (t) + Nloop_all *= 2; /* 2/3 */ + Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */ + } + else { + Nloop_all *= t - 1; + } + } + else if (t) { + if (t == 1) + Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */ + Nloop_all *= t; + } + + Nloop_rw = 0; + if (flags & __YESCRYPT_INIT_SHARED) + Nloop_rw = Nloop_all; + else if (flags & YESCRYPT_RW) + Nloop_rw = Nloop_all / p; + + Nchunk &= ~(uint64_t)1; /* round down to even */ + Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */ + Nloop_rw &= ~(uint64_t)1; /* round down to even */ + + + for (i = 0; i < p; i++) { + uint64_t Vchunk = i * Nchunk; + uint64_t * Bp = &B[i * s]; + uint64_t * Vp = &V[Vchunk * s]; + uint64_t * XYp = XY; + + uint64_t Np = (i < p - 1) ? Nchunk : (N - Vchunk); + uint64_t * Sp = S ? &S[i * S_SIZE_ALL] : S; + + if (Sp) { + smix1(Bp, 1, S_SIZE_ALL / 16, flags & ~YESCRYPT_PWXFORM, Sp, NROM, shared, XYp, NULL); + + + } + if (!(flags & __YESCRYPT_INIT_SHARED_2)) { + smix1(Bp, r, Np, flags, Vp, NROM, shared, XYp, Sp); + } + + + smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp, NROM, shared, XYp, Sp); + } + + if (Nloop_all > Nloop_rw) { + + for (i = 0; i < p; i++) { + uint64_t * Bp = &B[i * s]; + + uint64_t * XYp = XY; + + uint64_t * Sp = S ? 
&S[i * S_SIZE_ALL] : S; + smix2(Bp, r, N, Nloop_all - Nloop_rw, flags & ~YESCRYPT_RW, V, NROM, shared, XYp, Sp); + } + } +} + +/** + * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, + * N, r, p, t, flags, buf, buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen), or a revision of scrypt as requested by flags and shared, and + * write the result into buf. The parameters r, p, and buflen must satisfy + * r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N must be a power + * of 2 greater than 1. + * + * t controls computation time while not affecting peak memory usage. shared + * and flags may request special modes as described in yescrypt.h. local is + * the thread-local data structure, allowing to preserve and reuse a memory + * allocation across calls, thereby reducing its overhead. + * + * Return 0 on success; or -1 on error. + */ +int +yescrypt_kdf(const yescrypt_shared_t * shared, yescrypt_local_t * local, + const uint8_t * passwd, size_t passwdlen, + const uint8_t * salt, size_t saltlen, + uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags, + uint8_t * buf, size_t buflen) +{ + yescrypt_region_t tmp; + uint64_t NROM; + size_t B_size, V_size, XY_size, need; + uint64_t * B, * V, * XY, * S; + uint64_t sha256[4]; + + /* + * YESCRYPT_PARALLEL_SMIX is a no-op at p = 1 for its intended purpose, + * so don't let it have side-effects. Without this adjustment, it'd + * enable the SHA-256 password pre-hashing and output post-hashing, + * because any deviation from classic scrypt implies those. + */ + if (p == 1) + flags &= ~YESCRYPT_PARALLEL_SMIX; + + /* Sanity-check parameters */ + if (flags & ~YESCRYPT_KNOWN_FLAGS) { + errno = EINVAL; + return -1; + } +#if SIZE_MAX > UINT32_MAX + if (buflen > (((uint64_t)(1) << 32) - 1) * 32) { + errno = EFBIG; + return -1; + } +#endif + if ((uint64_t)(r) * (uint64_t)(p) >= (1 << 30)) { + errno = EFBIG; + return -1; + } + if (((N & (N - 1)) != 0) || (N <= 1) || (r < 1) || (p < 1)) { + errno = EINVAL; + return -1; + } + if ((flags & YESCRYPT_PARALLEL_SMIX) && (N / p <= 1)) { + errno = EINVAL; + return -1; + } +#if S_MIN_R > 1 + if ((flags & YESCRYPT_PWXFORM) && (r < S_MIN_R)) { + errno = EINVAL; + return -1; + } +#endif + if ((p > SIZE_MAX / ((size_t)256 * r + 64)) || +#if SIZE_MAX / 256 <= UINT32_MAX + (r > SIZE_MAX / 256) || +#endif + (N > SIZE_MAX / 128 / r)) { + errno = ENOMEM; + return -1; + } + if (N > UINT64_MAX / ((uint64_t)t + 1)) { + errno = EFBIG; + return -1; + } + + if ((flags & YESCRYPT_PWXFORM) && + p > SIZE_MAX / (S_SIZE_ALL * sizeof(*S))) { + errno = ENOMEM; + return -1; + } + + NROM = 0; + if (shared->shared1.aligned) { + NROM = shared->shared1.aligned_size / ((size_t)128 * r); + if (((NROM & (NROM - 1)) != 0) || (NROM <= 1) || + !(flags & YESCRYPT_RW)) { + errno = EINVAL; + return -1; + } + } + + /* Allocate memory */ + V = NULL; + V_size = (size_t)128 * r * N; + + need = V_size; + if (flags & __YESCRYPT_INIT_SHARED) { + if (local->aligned_size < need) { + if (local->base || local->aligned || + local->base_size || local->aligned_size) { + errno = EINVAL; + return -1; + } + if (!alloc_region(local, need)) + return -1; + } + V = (uint64_t *)local->aligned; + need = 0; + } + B_size = (size_t)128 * r * p; + need += B_size; + if (need < B_size) { + errno = ENOMEM; + return -1; + } + XY_size = (size_t)256 * r + 64; + + need += XY_size; + if (need < XY_size) { + errno = ENOMEM; + return -1; + } + if (flags & YESCRYPT_PWXFORM) { + size_t S_size = S_SIZE_ALL * 
sizeof(*S); + + if (flags & YESCRYPT_PARALLEL_SMIX) + S_size *= p; + + need += S_size; + if (need < S_size) { + errno = ENOMEM; + return -1; + } + } + if (flags & __YESCRYPT_INIT_SHARED) { + if (!alloc_region(&tmp, need)) + return -1; + B = (uint64_t *)tmp.aligned; + XY = (uint64_t *)((uint8_t *)B + B_size); + } else { + init_region(&tmp); + if (local->aligned_size < need) { + if (free_region(local)) + return -1; + if (!alloc_region(local, need)) + return -1; + } + B = (uint64_t *)local->aligned; + V = (uint64_t *)((uint8_t *)B + B_size); + XY = (uint64_t *)((uint8_t *)V + V_size); + } + S = NULL; + if (flags & YESCRYPT_PWXFORM) + S = (uint64_t *)((uint8_t *)XY + XY_size); + + + if (t || flags) { + SHA256_CTX_Y ctx; + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, passwd, passwdlen); + SHA256_Final_Y((uint8_t *)sha256, &ctx); + passwd = (uint8_t *)sha256; + passwdlen = sizeof(sha256); + } + /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ + PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1,(uint8_t *)B, B_size); + + if (t || flags) + { + blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0])); + } + if (p == 1 || (flags & YESCRYPT_PARALLEL_SMIX)) { + smix(B, r, N, p, t, flags, V, NROM, shared, XY, S); + } else { + uint32_t i; + /* 2: for i = 0 to p - 1 do */ + for (i = 0; i < p; i++) { + /* 3: B_i <-- MF(B_i, N) */ + smix(&B[(size_t)16 * r * i], r, N, 1, t, flags, V, NROM, shared, XY, S); + } + } + + /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ + + PBKDF2_SHA256(passwd, passwdlen, (uint8_t *)B, B_size, 1, buf, buflen); + /* + * Except when computing classic scrypt, allow all computation so far + * to be performed on the client. The final steps below match those of + * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so + * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of + * SCRAM's use of SHA-1) would be usable with yescrypt hashes. + */ + if ((t || flags) && buflen == sizeof(sha256)) { + /* Compute ClientKey */ + + { + HMAC_SHA256_CTX_Y ctx; + HMAC_SHA256_Init_Y(&ctx, buf, buflen); + HMAC_SHA256_Update_Y(&ctx, salt, saltlen); + HMAC_SHA256_Final_Y((uint8_t *)sha256, &ctx); + } + /* Compute StoredKey */ + { + SHA256_CTX_Y ctx; + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, (uint8_t *)sha256, sizeof(sha256)); + SHA256_Final_Y(buf, &ctx); + } + } + + if (free_region(&tmp)) + return -1; + + /* Success! */ + return 0; +} + +int +yescrypt_kdf_old(const yescrypt_shared_t * shared, yescrypt_local_t * local, +const uint8_t * passwd, size_t passwdlen, +const uint8_t * salt, size_t saltlen, +uint64_t N, uint32_t r, uint32_t p, uint32_t t, yescrypt_flags_t flags, +uint8_t * buf, size_t buflen) +{ + yescrypt_region_t tmp; + uint64_t NROM; + size_t B_size, V_size, XY_size, need; + uint64_t * B, *V, *XY, *S; + uint64_t sha256[4]; + + /* + * YESCRYPT_PARALLEL_SMIX is a no-op at p = 1 for its intended purpose, + * so don't let it have side-effects. Without this adjustment, it'd + * enable the SHA-256 password pre-hashing and output post-hashing, + * because any deviation from classic scrypt implies those. 
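+ * (Editorial summary of the yescrypt_kdf() flow implemented above, with
+ * error handling elided:
+ *
+ *     if (t || flags) passwd = SHA256(passwd);            // pre-hash
+ *     PBKDF2-SHA256(passwd, salt, 1 iter)  -> B           // fill blocks
+ *     smix(B, r, N, p, t, flags, ...)                     // memory-hard core
+ *     PBKDF2-SHA256(passwd, B, 1 iter)     -> buf         // derive output
+ *     if ((t || flags) && buflen == 32)
+ *         buf = SHA256(HMAC-SHA256(key=buf, salt))        // SCRAM-style finish
+ *
+ * Note also the recurring guard in the size computations,
+ *     need += X_size; if (need < X_size) { errno = ENOMEM; return -1; }
+ * which catches size_t overflow: on unsigned wraparound the sum ends up
+ * smaller than the addend just added.)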
+ */ + if (p == 1) + flags &= ~YESCRYPT_PARALLEL_SMIX; + + /* Sanity-check parameters */ + if (flags & ~YESCRYPT_KNOWN_FLAGS) { + errno = EINVAL; + return -1; + } +#if SIZE_MAX > UINT32_MAX + if (buflen > (((uint64_t)(1) << 32) - 1) * 32) { + errno = EFBIG; + return -1; + } +#endif + if ((uint64_t)(r)* (uint64_t)(p) >= (1 << 30)) { + errno = EFBIG; + return -1; + } + if (((N & (N - 1)) != 0) || (N <= 1) || (r < 1) || (p < 1)) { + errno = EINVAL; + return -1; + } + if ((flags & YESCRYPT_PARALLEL_SMIX) && (N / p <= 1)) { + errno = EINVAL; + return -1; + } +#if S_MIN_R > 1 + if ((flags & YESCRYPT_PWXFORM) && (r < S_MIN_R)) { + errno = EINVAL; + return -1; + } +#endif + if ((p > SIZE_MAX / ((size_t)256 * r + 64)) || +#if SIZE_MAX / 256 <= UINT32_MAX + (r > SIZE_MAX / 256) || +#endif + (N > SIZE_MAX / 128 / r)) { + errno = ENOMEM; + return -1; + } + if (N > UINT64_MAX / ((uint64_t)t + 1)) { + errno = EFBIG; + return -1; + } + + if ((flags & YESCRYPT_PWXFORM) && + p > SIZE_MAX / (S_SIZE_ALL * sizeof(*S))) { + errno = ENOMEM; + return -1; + } + + NROM = 0; + if (shared->shared1.aligned) { + NROM = shared->shared1.aligned_size / ((size_t)128 * r); + if (((NROM & (NROM - 1)) != 0) || (NROM <= 1) || + !(flags & YESCRYPT_RW)) { + errno = EINVAL; + return -1; + } + } + + /* Allocate memory */ + V = NULL; + V_size = (size_t)128 * r * N; + + need = V_size; + if (flags & __YESCRYPT_INIT_SHARED) { + if (local->aligned_size < need) { + if (local->base || local->aligned || + local->base_size || local->aligned_size) { + errno = EINVAL; + return -1; + } + if (!alloc_region(local, need)) + return -1; + } + V = (uint64_t *)local->aligned; + need = 0; + } + B_size = (size_t)128 * r * p; + need += B_size; + if (need < B_size) { + errno = ENOMEM; + return -1; + } + XY_size = (size_t)256 * r + 64; + + need += XY_size; + if (need < XY_size) { + errno = ENOMEM; + return -1; + } + if (flags & YESCRYPT_PWXFORM) { + size_t S_size = S_SIZE_ALL * sizeof(*S); + + if (flags & YESCRYPT_PARALLEL_SMIX) + S_size *= p; + + need += S_size; + if (need < S_size) { + errno = ENOMEM; + return -1; + } + } + if (flags & __YESCRYPT_INIT_SHARED) { + if (!alloc_region(&tmp, need)) + return -1; + B = (uint64_t *)tmp.aligned; + XY = (uint64_t *)((uint8_t *)B + B_size); + } + else { + init_region(&tmp); + if (local->aligned_size < need) { + if (free_region(local)) + return -1; + if (!alloc_region(local, need)) + return -1; + } + B = (uint64_t *)local->aligned; + V = (uint64_t *)((uint8_t *)B + B_size); + XY = (uint64_t *)((uint8_t *)V + V_size); + } + S = NULL; + if (flags & YESCRYPT_PWXFORM) + S = (uint64_t *)((uint8_t *)XY + XY_size); + + + if (t || flags) { + SHA256_CTX_Y ctx; + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, passwd, passwdlen); + SHA256_Final_Y((uint8_t *)sha256, &ctx); + passwd = (uint8_t *)sha256; + passwdlen = sizeof(sha256); + } + + /* 1: (B_0 ... B_{p-1}) <-- PBKDF2(P, S, 1, p * MFLen) */ + PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, 1, (uint8_t *)B, B_size); + + + if (t || flags) + { + blkcpy(sha256, B, sizeof(sha256) / sizeof(sha256[0])); + } + smix(B, r, N, p, t, flags, V, NROM, shared, XY, S); + + + /* 5: DK <-- PBKDF2(P, B, 1, dkLen) */ + PBKDF2_SHA256(passwd, passwdlen, (uint8_t *)B, B_size, 1, buf, buflen); + + /* + * Except when computing classic scrypt, allow all computation so far + * to be performed on the client. 
The final steps below match those of + * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so + * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of + * SCRAM's use of SHA-1) would be usable with yescrypt hashes. + */ + if ((t || flags) && buflen == sizeof(sha256)) { + /* Compute ClientKey */ + + { + HMAC_SHA256_CTX_Y ctx; + HMAC_SHA256_Init_Y(&ctx, buf, buflen); + HMAC_SHA256_Update_Y(&ctx, salt, saltlen); + HMAC_SHA256_Final_Y((uint8_t *)sha256, &ctx); + } + /* Compute StoredKey */ + { + SHA256_CTX_Y ctx; + SHA256_Init_Y(&ctx); + SHA256_Update_Y(&ctx, (uint8_t *)sha256, sizeof(sha256)); + SHA256_Final_Y(buf, &ctx); + } + } + + if (free_region(&tmp)) + return -1; + + /* Success! */ + return 0; +} + diff --git a/algorithm/yescrypt.c b/algorithm/yescrypt.c new file mode 100644 index 00000000..de00d0f3 --- /dev/null +++ b/algorithm/yescrypt.c @@ -0,0 +1,128 @@ +/*- + * Copyright 2015 djm34 + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "config.h" +#include "miner.h" + +#include +#include +#include + +#include "algorithm/yescrypt_core.h" + +static const uint32_t diff1targ = 0x0000ffff; + +static inline void +be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len) +{ + uint32_t i; + + for (i = 0; i < len; i++) + dst[i] = htobe32(src[i]); +} + +/* Used externally as confirmation of correct OCL code */ +int yescrypt_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce) +{ + uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]); + uint32_t data[20], ohash[8]; + + be32enc_vect(data, (const uint32_t *)pdata, 19); + data[19] = htobe32(nonce); + yescrypt_hash((unsigned char*)data,(unsigned char*)ohash); + + tmp_hash7 = be32toh(ohash[7]); + + applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx", + (long unsigned int)Htarg, + (long unsigned int)diff1targ, + (long unsigned int)tmp_hash7); + + if (tmp_hash7 > diff1targ) + return -1; + + if (tmp_hash7 > Htarg) + return 0; + + return 1; +} + +void yescrypt_regenhash(struct work *work) +{ + uint32_t data[20]; + uint32_t *nonce = (uint32_t *)(work->data + 76); + uint32_t *ohash = (uint32_t *)(work->hash); + + be32enc_vect(data, (const uint32_t *)work->data, 19); + data[19] = htobe32(*nonce); + + yescrypt_hash((unsigned char*)data, (unsigned char*)ohash); + +} + + +bool scanhash_yescrypt(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate, + unsigned char *pdata, unsigned char __maybe_unused *phash1, + unsigned char __maybe_unused *phash, const unsigned char *ptarget, + uint32_t max_nonce, uint32_t *last_nonce, uint32_t n) +{ + uint32_t *nonce = (uint32_t *)(pdata + 76); + uint32_t data[20]; + uint32_t tmp_hash7; + uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]); + bool ret = false; + + be32enc_vect(data, (const uint32_t *)pdata, 19); + + while (1) + { + uint32_t ostate[8]; + + *nonce = ++n; + data[19] = (n); + + yescrypt_hash((unsigned char*)data, (unsigned char*)ostate); + tmp_hash7 = (ostate[7]); + + applog(LOG_INFO, "data7 %08lx", (long unsigned int)data[7]); + + if (unlikely(tmp_hash7 <= Htarg)) + { + ((uint32_t *)pdata)[19] = htobe32(n); + *last_nonce = n; + ret = true; + break; + } + + if (unlikely((n >= max_nonce) || thr->work_restart)) + { + *last_nonce = n; + break; + } + } + + return ret; +} \ No newline at end of file diff --git a/algorithm/yescrypt.h b/algorithm/yescrypt.h new file mode 100644 index 00000000..b51cb495 --- /dev/null +++ b/algorithm/yescrypt.h @@ -0,0 +1,10 @@ +#ifndef YESCRYPT_H +#define YESCRYPT_H + +#include "miner.h" +#define YESCRYPT_SCRATCHBUF_SIZE (128 * 2048 * 8 ) //uchar +#define YESCRYP_SECBUF_SIZE (128*64*8) +extern int yescrypt_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce); +extern void yescrypt_regenhash(struct work *work); + +#endif /* YESCRYPT_H */ diff --git a/algorithm/yescrypt_core.h b/algorithm/yescrypt_core.h new file mode 100644 index 00000000..64b9a11f --- /dev/null +++ b/algorithm/yescrypt_core.h @@ -0,0 +1,376 @@ +/*- + * Copyright 2009 Colin Percival + * Copyright 2013,2014 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * This file was originally written by Colin Percival as part of the Tarsnap + * online backup system. + */ +#ifndef _YESCRYPT_H_ +#define _YESCRYPT_H_ + +#include +#include /* for size_t */ +#include + +#ifdef __cplusplus +extern "C" { +#endif + + +//extern void yescrypt_hash_sp(const unsigned char *input, unsigned char *output); +extern void yescrypt_hash(const unsigned char *input, unsigned char *output); + + + +/** + * crypto_scrypt(passwd, passwdlen, salt, saltlen, N, r, p, buf, buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen) and write the result into buf. The parameters r, p, and buflen + * must satisfy r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N + * must be a power of 2 greater than 1. + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as buf is local to the thread. + */ +extern int crypto_scrypt(const uint8_t * __passwd, size_t __passwdlen, + const uint8_t * __salt, size_t __saltlen, + uint64_t __N, uint32_t __r, uint32_t __p, + uint8_t * __buf, size_t __buflen); + +/** + * Internal type used by the memory allocator. Please do not use it directly. + * Use yescrypt_shared_t and yescrypt_local_t as appropriate instead, since + * they might differ from each other in a future version. + */ +typedef struct { + void * base, * aligned; + size_t base_size, aligned_size; +} yescrypt_region_t; + +/** + * Types for shared (ROM) and thread-local (RAM) data structures. + */ +typedef yescrypt_region_t yescrypt_shared1_t; +typedef struct { + yescrypt_shared1_t shared1; + uint32_t mask1; +} yescrypt_shared_t; +typedef yescrypt_region_t yescrypt_local_t; + +/** + * Possible values for yescrypt_init_shared()'s flags argument. + */ +typedef enum { + YESCRYPT_SHARED_DEFAULTS = 0, + YESCRYPT_SHARED_PREALLOCATED = 0x100 +} yescrypt_init_shared_flags_t; + +/** + * Possible values for the flags argument of yescrypt_kdf(), + * yescrypt_gensalt_r(), yescrypt_gensalt(). These may be OR'ed together, + * except that YESCRYPT_WORM and YESCRYPT_RW are mutually exclusive. + * Please refer to the description of yescrypt_kdf() below for the meaning of + * these flags. 
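+ * (Editorial example: a typical combination is
+ *     yescrypt_flags_t f = YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | YESCRYPT_PWXFORM;
+ * YESCRYPT_WORM is defined as 0 below, i.e. it is the absence of the
+ * YESCRYPT_RW bit rather than a flag of its own, which is why the two are
+ * mutually exclusive.)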
+ */ +typedef enum { +/* public */ + YESCRYPT_WORM = 0, + YESCRYPT_RW = 1, + YESCRYPT_PARALLEL_SMIX = 2, + YESCRYPT_PWXFORM = 4, +/* private */ + __YESCRYPT_INIT_SHARED_1 = 0x10000, + __YESCRYPT_INIT_SHARED_2 = 0x20000, + __YESCRYPT_INIT_SHARED = 0x30000 +} yescrypt_flags_t; + +#define YESCRYPT_KNOWN_FLAGS \ + (YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | YESCRYPT_PWXFORM | \ + __YESCRYPT_INIT_SHARED) + +/** + * yescrypt_init_shared(shared, param, paramlen, N, r, p, flags, mask, + * buf, buflen): + * Optionally allocate memory for and initialize the shared (ROM) data + * structure. The parameters N, r, and p must satisfy the same conditions as + * with crypto_scrypt(). param and paramlen specify a local parameter with + * which the ROM is seeded. If buf is not NULL, then it is used to return + * buflen bytes of message digest for the initialized ROM (the caller may use + * this to verify that the ROM has been computed in the same way that it was on + * a previous run). + * + * Return 0 on success; or -1 on error. + * + * If bit YESCRYPT_SHARED_PREALLOCATED in flags is set, then memory for the + * ROM is assumed to have been preallocated by the caller, with + * shared->shared1.aligned being the start address of the ROM and + * shared->shared1.aligned_size being its size (which must be consistent with + * N, r, and p). This may be used e.g. when the ROM is to be placed in a SysV + * shared memory segment allocated by the caller. + * + * mask controls the frequency of ROM accesses by yescrypt_kdf(). Normally it + * should be set to 1, to interleave RAM and ROM accesses, which works well + * when both regions reside in the machine's RAM anyway. Other values may be + * used e.g. when the ROM is memory-mapped from a disk file. Recommended mask + * values are powers of 2 minus 1 or minus 2. Here's the effect of some mask + * values: + * mask value ROM accesses in SMix 1st loop ROM accesses in SMix 2nd loop + * 0 0 1/2 + * 1 1/2 1/2 + * 2 0 1/4 + * 3 1/4 1/4 + * 6 0 1/8 + * 7 1/8 1/8 + * 14 0 1/16 + * 15 1/16 1/16 + * 1022 0 1/1024 + * 1023 1/1024 1/1024 + * + * Actual computation of the ROM contents may be avoided, if you don't intend + * to use a ROM but need a dummy shared structure, by calling this function + * with NULL, 0, 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0 for the + * arguments starting with param and on. + * + * MT-safe as long as shared is local to the thread. + */ +extern int yescrypt_init_shared(yescrypt_shared_t * __shared, + const uint8_t * __param, size_t __paramlen, + uint64_t __N, uint32_t __r, uint32_t __p, + yescrypt_init_shared_flags_t __flags, uint32_t __mask, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt_free_shared(shared): + * Free memory that had been allocated with yescrypt_init_shared(). + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as shared is local to the thread. + */ +extern int yescrypt_free_shared(yescrypt_shared_t * __shared); + +/** + * yescrypt_init_local(local): + * Initialize the thread-local (RAM) data structure. Actual memory allocation + * is currently fully postponed until a call to yescrypt_kdf() or yescrypt_r(). + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as local is local to the thread. + */ +extern int yescrypt_init_local(yescrypt_local_t * __local); + +/** + * yescrypt_free_local(local): + * Free memory that may have been allocated for an initialized thread-local + * (RAM) data structure. + * + * Return 0 on success; or -1 on error. + * + * MT-safe as long as local is local to the thread. 
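+ * (Editorial sketch: per the yescrypt_init_shared() description above, the
+ * "dummy" shared structure needed for plain scrypt-style use is obtained by
+ *
+ *     yescrypt_shared_t shared;
+ *     yescrypt_init_shared(&shared, NULL, 0, 0, 0, 0,
+ *                          YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0);
+ *
+ * which skips ROM computation entirely.)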
+ */ +extern int yescrypt_free_local(yescrypt_local_t * __local); + +/** + * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, + * N, r, p, t, flags, buf, buflen): + * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, + * p, buflen), or a revision of scrypt as requested by flags and shared, and + * write the result into buf. The parameters N, r, p, and buflen must satisfy + * the same conditions as with crypto_scrypt(). t controls computation time + * while not affecting peak memory usage. shared and flags may request + * special modes as described below. local is the thread-local data + * structure, allowing to preserve and reuse a memory allocation across calls, + * thereby reducing its overhead. + * + * Return 0 on success; or -1 on error. + * + * t controls computation time. t = 0 is optimal in terms of achieving the + * highest area-time for ASIC attackers. Thus, higher computation time, if + * affordable, is best achieved by increasing N rather than by increasing t. + * However, if the higher memory usage (which goes along with higher N) is not + * affordable, or if fine-tuning of the time is needed (recall that N must be a + * power of 2), then t = 1 or above may be used to increase time while staying + * at the same peak memory usage. t = 1 increases the time by 25% and + * decreases the normalized area-time to 96% of optimal. (Of course, in + * absolute terms the area-time increases with higher t. It's just that it + * would increase slightly more with higher N*r rather than with higher t.) + * t = 2 increases the time by another 20% and decreases the normalized + * area-time to 89% of optimal. Thus, these two values are reasonable to use + * for fine-tuning. Values of t higher than 2 result in further increase in + * time while reducing the efficiency much further (e.g., down to around 50% of + * optimal for t = 5, which runs 3 to 4 times slower than t = 0, with exact + * numbers varying by the flags settings). + * + * Classic scrypt is available by setting t = 0 and flags to YESCRYPT_WORM and + * passing a dummy shared structure (see the description of + * yescrypt_init_shared() above for how to produce one). In this mode, the + * thread-local memory region (RAM) is first sequentially written to and then + * randomly read from. This algorithm is friendly towards time-memory + * tradeoffs (TMTO), available both to defenders (albeit not in this + * implementation) and to attackers. + * + * Setting YESCRYPT_RW adds extra random reads and writes to the thread-local + * memory region (RAM), which makes TMTO a lot less efficient. This may be + * used to slow down the kinds of attackers who would otherwise benefit from + * classic scrypt's efficient TMTO. Since classic scrypt's TMTO allows not + * only for the tradeoff, but also for a decrease of attacker's area-time (by + * up to a constant factor), setting YESCRYPT_RW substantially increases the + * cost of attacks in area-time terms as well. Yet another benefit of it is + * that optimal area-time is reached at an earlier time than with classic + * scrypt, and t = 0 actually corresponds to this earlier completion time, + * resulting in quicker hash computations (and thus in higher request rate + * capacity). Due to these properties, YESCRYPT_RW should almost always be + * set, except when compatibility with classic scrypt or TMTO-friendliness are + * desired. + * + * YESCRYPT_PARALLEL_SMIX moves parallelism that is present with p > 1 to a + * lower level as compared to where it is in classic scrypt. 
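+ * (Editorial sketch of the classic-scrypt usage described earlier in this
+ * comment, assuming illustrative parameters N = 16384, r = 8, p = 1 and the
+ * dummy shared structure from yescrypt_init_shared():
+ *
+ *     yescrypt_local_t local;
+ *     uint8_t dk[32];
+ *     yescrypt_init_local(&local);
+ *     yescrypt_kdf(&shared, &local, passwd, passwdlen, salt, saltlen,
+ *                  16384, 8, 1, 0, YESCRYPT_WORM, dk, sizeof(dk));
+ *     yescrypt_free_local(&local);
+ * )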
This reduces + * flexibility for efficient computation (for both attackers and defenders) by + * requiring that, short of resorting to TMTO, the full amount of memory be + * allocated as needed for the specified p, regardless of whether that + * parallelism is actually being fully made use of or not. (For comparison, a + * single instance of classic scrypt may be computed in less memory without any + * CPU time overhead, but in more real time, by not making full use of the + * parallelism.) This may be desirable when the defender has enough memory + * with sufficiently low latency and high bandwidth for efficient full parallel + * execution, yet the required memory size is high enough that some likely + * attackers might end up being forced to choose between using higher latency + * memory than they could use otherwise (waiting for data longer) or using TMTO + * (waiting for data more times per one hash computation). The area-time cost + * for other kinds of attackers (who would use the same memory type and TMTO + * factor or no TMTO either way) remains roughly the same, given the same + * running time for the defender. In the TMTO-friendly YESCRYPT_WORM mode, as + * long as the defender has enough memory that is just as fast as the smaller + * per-thread regions would be, doesn't expect to ever need greater + * flexibility (except possibly via TMTO), and doesn't need backwards + * compatibility with classic scrypt, there are no other serious drawbacks to + * this setting. In the YESCRYPT_RW mode, which is meant to discourage TMTO, + * this new approach to parallelization makes TMTO less inefficient. (This is + * an unfortunate side-effect of avoiding some random writes, as we have to in + * order to allow for parallel threads to access a common memory region without + * synchronization overhead.) Thus, in this mode this setting poses an extra + * tradeoff of its own (higher area-time cost for a subset of attackers vs. + * better TMTO resistance). Setting YESCRYPT_PARALLEL_SMIX also changes the + * way the running time is to be controlled from N*r*p (for classic scrypt) to + * N*r (in this modification). All of this applies only when p > 1. For + * p = 1, this setting is a no-op. + * + * Passing a real shared structure, with ROM contents previously computed by + * yescrypt_init_shared(), enables the use of ROM and requires YESCRYPT_RW for + * the thread-local RAM region. In order to allow for initialization of the + * ROM to be split into a separate program, the shared->shared1.aligned and + * shared->shared1.aligned_size fields may be set by the caller of + * yescrypt_kdf() manually rather than with yescrypt_init_shared(). + * + * local must be initialized with yescrypt_init_local(). + * + * MT-safe as long as local and buf are local to the thread. + */ +extern int yescrypt_kdf(const yescrypt_shared_t * __shared, + yescrypt_local_t * __local, + const uint8_t * __passwd, size_t __passwdlen, + const uint8_t * __salt, size_t __saltlen, + uint64_t __N, uint32_t __r, uint32_t __p, uint32_t __t, + yescrypt_flags_t __flags, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt_r(shared, local, passwd, passwdlen, setting, buf, buflen): + * Compute and encode an scrypt or enhanced scrypt hash of passwd given the + * parameters and salt value encoded in setting. If the shared structure is + * not dummy, a ROM is used and YESCRYPT_RW is required. 
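+ *
+ * Editorial illustration (not part of the original patch; buffer sizes and
+ * cost values are examples only, and salt_bytes/dummy_shared/local stand in
+ * for caller-provided data): a typical call pairs this function with
+ * yescrypt_gensalt_r() declared below, using a dummy shared structure and a
+ * local structure initialized as described above:
+ *
+ *	uint8_t setting[64], out[128];
+ *	yescrypt_gensalt_r(11, 8, 1, YESCRYPT_RW,
+ *	    salt_bytes, salt_bytes_len, setting, sizeof(setting));
+ *	yescrypt_r(&dummy_shared, &local, passwd, passwdlen,
+ *	    setting, out, sizeof(out));
+ *
+ * On success, out holds the full "$7X$..." hash string; both calls return
+ * NULL on error and should be checked.
+ *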
Otherwise, whether to + * use the YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff + * discouraging modification) is determined by the setting string. shared and + * local must be initialized as described above for yescrypt_kdf(). buf must + * be large enough (as indicated by buflen) to hold the encoded hash string. + * + * Return the encoded hash string on success; or NULL on error. + * + * MT-safe as long as local and buf are local to the thread. + */ +extern uint8_t * yescrypt_r(const yescrypt_shared_t * __shared, + yescrypt_local_t * __local, + const uint8_t * __passwd, size_t __passwdlen, + const uint8_t * __setting, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt(passwd, setting): + * Compute and encode an scrypt or enhanced scrypt hash of passwd given the + * parameters and salt value encoded in setting. Whether to use the + * YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff + * discouraging modification) is determined by the setting string. + * + * Return the encoded hash string on success; or NULL on error. + * + * This is a crypt(3)-like interface, which is simpler to use than + * yescrypt_r(), but it is not MT-safe, it does not allow for the use of a ROM, + * and it is slower than yescrypt_r() for repeated calls because it allocates + * and frees memory on each call. + * + * MT-unsafe. + */ +extern uint8_t * yescrypt(const uint8_t * __passwd, const uint8_t * __setting); + +/** + * yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen, buf, buflen): + * Generate a setting string for use with yescrypt_r() and yescrypt() by + * encoding into it the parameters N_log2 (which is to be set to base 2 + * logarithm of the desired value for N), r, p, flags, and a salt given by src + * (of srclen bytes). buf must be large enough (as indicated by buflen) to + * hold the setting string. + * + * Return the setting string on success; or NULL on error. + * + * MT-safe as long as buf is local to the thread. + */ +extern uint8_t * yescrypt_gensalt_r( + uint32_t __N_log2, uint32_t __r, uint32_t __p, + yescrypt_flags_t __flags, + const uint8_t * __src, size_t __srclen, + uint8_t * __buf, size_t __buflen); + +/** + * yescrypt_gensalt(N_log2, r, p, flags, src, srclen): + * Generate a setting string for use with yescrypt_r() and yescrypt(). This + * function is the same as yescrypt_gensalt_r() except that it uses a static + * buffer and thus is not MT-safe. + * + * Return the setting string on success; or NULL on error. + * + * MT-unsafe. + */ +extern uint8_t * yescrypt_gensalt( + uint32_t __N_log2, uint32_t __r, uint32_t __p, + yescrypt_flags_t __flags, + const uint8_t * __src, size_t __srclen); + +#ifdef __cplusplus +} +#endif + +#endif /* !_YESCRYPT_H_ */ diff --git a/algorithm/yescryptcommon.c b/algorithm/yescryptcommon.c new file mode 100644 index 00000000..cf7067d0 --- /dev/null +++ b/algorithm/yescryptcommon.c @@ -0,0 +1,360 @@ +/*- + * Copyright 2013,2014 Alexander Peslyak + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include "algorithm/yescrypt_core.h"
+
+/* Number of base-64 characters needed to encode the given byte count */
+#define BYTES2CHARS(bytes) \
+	((((bytes) * 8) + 5) / 6)
+
+#define HASH_SIZE 32 /* bytes */
+#define HASH_LEN BYTES2CHARS(HASH_SIZE) /* base-64 chars */
+#define YESCRYPT_FLAGS (YESCRYPT_RW | YESCRYPT_PWXFORM)
+
+static const char * const itoa64 =
+	"./0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+
+/* Emit src as base-64, 6 bits at a time, least-significant bits first */
+static uint8_t * encode64_uint32(uint8_t * dst, size_t dstlen,
+    uint32_t src, uint32_t srcbits)
+{
+	uint32_t bit;
+
+	for (bit = 0; bit < srcbits; bit += 6) {
+		if (dstlen < 1)
+			return NULL;
+		*dst++ = itoa64[src & 0x3f];
+		dstlen--;
+		src >>= 6;
+	}
+
+	return dst;
+}
+
+static uint8_t * encode64(uint8_t * dst, size_t dstlen,
+    const uint8_t * src, size_t srclen)
+{
+	size_t i;
+
+	for (i = 0; i < srclen; ) {
+		uint8_t * dnext;
+		uint32_t value = 0, bits = 0;
+		do {
+			value |= (uint32_t)src[i++] << bits;
+			bits += 8;
+		} while (bits < 24 && i < srclen);
+		dnext = encode64_uint32(dst, dstlen, value, bits);
+		if (!dnext)
+			return NULL;
+		dstlen -= dnext - dst;
+		dst = dnext;
+	}
+
+	return dst;
+}
+
+/* Map a base-64 character back to its 6-bit value; -1 if invalid */
+static int decode64_one(uint32_t * dst, uint8_t src)
+{
+	const char * ptr = strchr(itoa64, src);
+	if (ptr) {
+		*dst = ptr - itoa64;
+		return 0;
+	}
+	*dst = 0;
+	return -1;
+}
+
+/* Inverse of encode64_uint32() */
+static const uint8_t * decode64_uint32(uint32_t * dst, uint32_t dstbits,
+    const uint8_t * src)
+{
+	uint32_t bit;
+	uint32_t value;
+
+	value = 0;
+	for (bit = 0; bit < dstbits; bit += 6) {
+		uint32_t one;
+		if (decode64_one(&one, *src)) {
+			*dst = 0;
+			return NULL;
+		}
+		src++;
+		value |= one << bit;
+	}
+
+	*dst = value;
+	return src;
+}
+
+uint8_t *
+yescrypt_r(const yescrypt_shared_t * shared, yescrypt_local_t * local,
+    const uint8_t * passwd, size_t passwdlen,
+    const uint8_t * setting,
+    uint8_t * buf, size_t buflen)
+{
+	uint8_t hash[HASH_SIZE];
+	const uint8_t * src, * salt;
+	uint8_t * dst;
+	size_t prefixlen, saltlen, need;
+	uint8_t version;
+	uint64_t N;
+	uint32_t r, p;
+	yescrypt_flags_t flags = YESCRYPT_WORM;
+
+	if (setting[0] != '$' || setting[1] != '7')
+		return NULL;
+
+	src = setting + 2;
+
+	switch ((version = *src)) {
+	case '$':
+		break;
+	case 'X':
+		src++;
+		flags = YESCRYPT_RW;
+		break;
+	default:
+		return NULL;
+	}
+
+	if (*src != '$') {
+		uint32_t decoded_flags;
+		if (decode64_one(&decoded_flags, *src))
+			return NULL;
+		flags = decoded_flags;
+		if (*++src != '$')
+			return NULL;
+	}
+	src++;
+
+	{
+		uint32_t N_log2;
+		if (decode64_one(&N_log2, *src))
+			return NULL;
+		src++;
+		N = (uint64_t)1 << N_log2;
+	}
+
+	src = decode64_uint32(&r, 30, src);
+	if (!src)
+		return NULL;
+
+	src = decode64_uint32(&p, 30, src);
+	if (!src)
+		return NULL;
+
+	prefixlen = src - setting;
+
+	salt = src;
+	src = (uint8_t *)strrchr((char *)salt, '$');
+	if (src)
saltlen = src - salt;
+	else
+		saltlen = strlen((char *)salt);
+
+	need = prefixlen + saltlen + 1 + HASH_LEN + 1;
+	if (need > buflen || need < saltlen)
+		return NULL;
+
+	if (yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen,
+	    N, r, p, 0, flags, hash, sizeof(hash)))
+		return NULL;
+
+	dst = buf;
+	memcpy(dst, setting, prefixlen + saltlen);
+	dst += prefixlen + saltlen;
+	*dst++ = '$';
+
+	dst = encode64(dst, buflen - (dst - buf), hash, sizeof(hash));
+	/* Could zeroize hash[] here, but yescrypt_kdf() doesn't zeroize its
+	 * memory allocations yet anyway. */
+	if (!dst || dst >= buf + buflen) /* Can't happen */
+		return NULL;
+
+	*dst = 0; /* NUL termination */
+	return buf;
+}
+
+uint8_t *
+yescrypt(const uint8_t * passwd, const uint8_t * setting)
+{
+	static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1 + HASH_LEN + 1];
+	yescrypt_shared_t shared;
+	yescrypt_local_t local;
+	uint8_t * retval;
+	if (yescrypt_init_shared(&shared, NULL, 0,
+	    0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0))
+		return NULL;
+	if (yescrypt_init_local(&local)) {
+		yescrypt_free_shared(&shared);
+		return NULL;
+	}
+	/* passwdlen is fixed at 80 bytes here: this interface is only used by
+	 * the miner on 80-byte block headers (see yescrypt_hash() below). */
+	retval = yescrypt_r(&shared, &local,
+	    passwd, 80, setting, buf, sizeof(buf));
+	// printf("hash='%s'\n", (char *)retval);
+	if (yescrypt_free_local(&local)) {
+		yescrypt_free_shared(&shared);
+		return NULL;
+	}
+	if (yescrypt_free_shared(&shared))
+		return NULL;
+	return retval;
+}
+
+uint8_t *
+yescrypt_gensalt_r(uint32_t N_log2, uint32_t r, uint32_t p,
+    yescrypt_flags_t flags,
+    const uint8_t * src, size_t srclen,
+    uint8_t * buf, size_t buflen)
+{
+	uint8_t * dst;
+	size_t prefixlen = 3 + 1 + 5 + 5;
+	size_t saltlen = BYTES2CHARS(srclen);
+	size_t need;
+
+	if (p == 1)
+		flags &= ~YESCRYPT_PARALLEL_SMIX;
+
+	if (flags) {
+		if (flags & ~0x3f)
+			return NULL;
+
+		prefixlen++;
+		if (flags != YESCRYPT_RW)
+			prefixlen++;
+	}
+
+	need = prefixlen + saltlen + 1;
+	if (need > buflen || need < saltlen || saltlen < srclen)
+		return NULL;
+
+	if (N_log2 > 63 || ((uint64_t)r * (uint64_t)p >= (1U << 30)))
+		return NULL;
+
+	dst = buf;
+	*dst++ = '$';
+	*dst++ = '7';
+	if (flags) {
+		*dst++ = 'X'; /* eXperimental, subject to change */
+		if (flags != YESCRYPT_RW)
+			*dst++ = itoa64[flags];
+	}
+	*dst++ = '$';
+
+	*dst++ = itoa64[N_log2];
+
+	dst = encode64_uint32(dst, buflen - (dst - buf), r, 30);
+	if (!dst) /* Can't happen */
+		return NULL;
+
+	dst = encode64_uint32(dst, buflen - (dst - buf), p, 30);
+	if (!dst) /* Can't happen */
+		return NULL;
+
+	dst = encode64(dst, buflen - (dst - buf), src, srclen);
+	if (!dst || dst >= buf + buflen) /* Can't happen */
+		return NULL;
+
+	*dst = 0; /* NUL termination */
+
+	return buf;
+}
+
+uint8_t *
+yescrypt_gensalt(uint32_t N_log2, uint32_t r, uint32_t p,
+    yescrypt_flags_t flags,
+    const uint8_t * src, size_t srclen)
+{
+	static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1];
+	return yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen,
+	    buf, sizeof(buf));
+}
+
+static int
+yescrypt_bsty(const uint8_t * passwd, size_t passwdlen,
+    const uint8_t * salt, size_t saltlen, uint64_t N, uint32_t r, uint32_t p,
+    uint8_t * buf, size_t buflen)
+{
+	static __thread int initialized = 0;
+	static __thread yescrypt_shared_t shared;
+	static __thread yescrypt_local_t local;
+
+// static __declspec(thread) int initialized = 0;	/* MSVC equivalents */
+// static __declspec(thread) yescrypt_shared_t shared;
+// static __declspec(thread) yescrypt_local_t local;
+
+	int retval;
+	if
(!initialized) { +/* "shared" could in fact be shared, but it's simpler to keep it private + * along with "local". It's dummy and tiny anyway. */ + if (yescrypt_init_shared(&shared, NULL, 0, + 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0)) + return -1; + if (yescrypt_init_local(&local)) { + yescrypt_free_shared(&shared); + return -1; + } + initialized = 1; + } + retval = yescrypt_kdf(&shared, &local, + passwd, passwdlen, salt, saltlen, N, r, p, 0, YESCRYPT_FLAGS, + buf, buflen); + + return retval; +} + +void yescrypt_hash(const unsigned char *input, unsigned char *output) +{ + + yescrypt_bsty((const uint8_t *)input, 80, (const uint8_t *) input, 80, 2048, 8, 1, (uint8_t *)output, 32); +} diff --git a/driver-opencl.c b/driver-opencl.c index 72ee9559..f0a88a6f 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -257,14 +257,14 @@ char *set_gpu_threads(const char *_arg) if (nextptr == NULL) return "Invalid parameters for set_gpu_threads"; val = atoi(nextptr); - if (val < 1 || val > 10) + if (val < 1 || val > 20) // gpu_threads increase max value to 20 return "Invalid value passed to set_gpu_threads"; gpus[device++].threads = val; while ((nextptr = strtok(NULL, ",")) != NULL) { val = atoi(nextptr); - if (val < 1 || val > 10) + if (val < 1 || val > 20) // gpu_threads increase max value to 20 return "Invalid value passed to set_gpu_threads"; gpus[device++].threads = val; @@ -1472,6 +1472,9 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, } applog(LOG_DEBUG, "GPU %d found something?", gpu->device_id); postcalc_hash_async(thr, work, thrdata->res); +// postcalc_hash(thr); +// submit_tested_work(thr, work); +// submit_work_async(work); memset(thrdata->res, 0, buffersize); /* This finish flushes the writebuffer set with CL_FALSE in clEnqueueWriteBuffer */ clFinish(clState->commandQueue); @@ -1493,6 +1496,12 @@ static void opencl_thread_shutdown(struct thr_info *thr) clFinish(clState->commandQueue); clReleaseMemObject(clState->outputBuffer); clReleaseMemObject(clState->CLbuffer0); + if (clState->buffer1) + clReleaseMemObject(clState->buffer1); + if (clState->buffer2) + clReleaseMemObject(clState->buffer2); + if (clState->buffer3) + clReleaseMemObject(clState->buffer3); if (clState->padbuffer8) clReleaseMemObject(clState->padbuffer8); clReleaseKernel(clState->kernel); diff --git a/findnonce.c b/findnonce.c index c2402997..8858cfa6 100644 --- a/findnonce.c +++ b/findnonce.c @@ -214,6 +214,7 @@ static void *postcalc_hash(void *userdata) void postcalc_hash_async(struct thr_info *thr, struct work *work, uint32_t *res) { + struct pc_data *pcd = (struct pc_data *)malloc(sizeof(struct pc_data)); int buffersize; @@ -225,8 +226,7 @@ void postcalc_hash_async(struct thr_info *thr, struct work *work, uint32_t *res) pcd->thr = thr; pcd->work = copy_work(work); buffersize = BUFFERSIZE; - - memcpy(&pcd->res, res, buffersize); + memcpy(&pcd->res, res, buffersize); if (pthread_create(&pcd->pth, NULL, postcalc_hash, (void *)pcd)) { applog(LOG_ERR, "Failed to create postcalc_hash thread"); @@ -366,4 +366,4 @@ void precalc_hash_blake256(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) blk->cty_a = data[16]; blk->cty_b = data[17]; blk->cty_c = data[18]; -} \ No newline at end of file +} diff --git a/kernel/bmw256.cl b/kernel/bmw256.cl new file mode 100644 index 00000000..19c85cbc --- /dev/null +++ b/kernel/bmw256.cl @@ -0,0 +1,162 @@ +/* +* bmw256 kernel implementation. 
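+*
+* (Editorial note: Blue Midnight Wish reduced to a 256-bit digest; the
+* Compression256() function defined below is consumed by the final search6
+* stage of kernel/lyra2rev2.cl, which #includes this file.)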
+* +* ==========================(LICENSE BEGIN)============================ +* Copyright (c) 2015 djm34 +* +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +* +* ===========================(LICENSE END)============================= +* +* @author djm34 +*/ + + + +#define shl(x, n) ((x) << (n)) +#define shr(x, n) ((x) >> (n)) +//#define SHR(x, n) SHR2(x, n) +//#define SHL(x, n) SHL2(x, n) + + +#define SPH_ROTL32(x,n) rotate(x,(uint)n) +#define ss0(x) (shr((x), 1) ^ shl((x), 3) ^ SPH_ROTL32((x), 4) ^ SPH_ROTL32((x), 19)) +#define ss1(x) (shr((x), 1) ^ shl((x), 2) ^ SPH_ROTL32((x), 8) ^ SPH_ROTL32((x), 23)) +#define ss2(x) (shr((x), 2) ^ shl((x), 1) ^ SPH_ROTL32((x), 12) ^ SPH_ROTL32((x), 25)) +#define ss3(x) (shr((x), 2) ^ shl((x), 2) ^ SPH_ROTL32((x), 15) ^ SPH_ROTL32((x), 29)) +#define ss4(x) (shr((x), 1) ^ (x)) +#define ss5(x) (shr((x), 2) ^ (x)) +#define rs1(x) SPH_ROTL32((x), 3) +#define rs2(x) SPH_ROTL32((x), 7) +#define rs3(x) SPH_ROTL32((x), 13) +#define rs4(x) SPH_ROTL32((x), 16) +#define rs5(x) SPH_ROTL32((x), 19) +#define rs6(x) SPH_ROTL32((x), 23) +#define rs7(x) SPH_ROTL32((x), 27) + +/* Message expansion function 1 */ +uint expand32_1(int i, uint *M32, uint *H, uint *Q) +{ + + return (ss1(Q[i - 16]) + ss2(Q[i - 15]) + ss3(Q[i - 14]) + ss0(Q[i - 13]) + + ss1(Q[i - 12]) + ss2(Q[i - 11]) + ss3(Q[i - 10]) + ss0(Q[i - 9]) + + ss1(Q[i - 8]) + ss2(Q[i - 7]) + ss3(Q[i - 6]) + ss0(Q[i - 5]) + + ss1(Q[i - 4]) + ss2(Q[i - 3]) + ss3(Q[i - 2]) + ss0(Q[i - 1]) + + ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16])); + +} + +/* Message expansion function 2 */ +uint expand32_2(int i, uint *M32, uint *H, uint *Q) +{ + + return (Q[i - 16] + rs1(Q[i - 15]) + Q[i - 14] + rs2(Q[i - 13]) + + Q[i - 12] + rs3(Q[i - 11]) + Q[i - 10] + rs4(Q[i - 9]) + + Q[i - 8] + rs5(Q[i - 7]) + Q[i - 6] + rs6(Q[i - 5]) + + Q[i - 4] + rs7(Q[i - 3]) + ss4(Q[i - 2]) + ss5(Q[i - 1]) + + ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16])); + +} + +void Compression256(uint *M32, uint *H) +{ + + int i; + uint XL32, XH32, Q[32]; + + + Q[0] = (M32[5] ^ H[5]) - (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[13] ^ H[13]) + (M32[14] ^ H[14]); + Q[1] = (M32[6] ^ H[6]) - (M32[8] ^ H[8]) + (M32[11] ^ H[11]) + (M32[14] ^ H[14]) - 
(M32[15] ^ H[15]);
+	Q[2] = (M32[0] ^ H[0]) + (M32[7] ^ H[7]) + (M32[9] ^ H[9]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]);
+	Q[3] = (M32[0] ^ H[0]) - (M32[1] ^ H[1]) + (M32[8] ^ H[8]) - (M32[10] ^ H[10]) + (M32[13] ^ H[13]);
+	Q[4] = (M32[1] ^ H[1]) + (M32[2] ^ H[2]) + (M32[9] ^ H[9]) - (M32[11] ^ H[11]) - (M32[14] ^ H[14]);
+	Q[5] = (M32[3] ^ H[3]) - (M32[2] ^ H[2]) + (M32[10] ^ H[10]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]);
+	Q[6] = (M32[4] ^ H[4]) - (M32[0] ^ H[0]) - (M32[3] ^ H[3]) - (M32[11] ^ H[11]) + (M32[13] ^ H[13]);
+	Q[7] = (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[5] ^ H[5]) - (M32[12] ^ H[12]) - (M32[14] ^ H[14]);
+	Q[8] = (M32[2] ^ H[2]) - (M32[5] ^ H[5]) - (M32[6] ^ H[6]) + (M32[13] ^ H[13]) - (M32[15] ^ H[15]);
+	Q[9] = (M32[0] ^ H[0]) - (M32[3] ^ H[3]) + (M32[6] ^ H[6]) - (M32[7] ^ H[7]) + (M32[14] ^ H[14]);
+	Q[10] = (M32[8] ^ H[8]) - (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[7] ^ H[7]) + (M32[15] ^ H[15]);
+	Q[11] = (M32[8] ^ H[8]) - (M32[0] ^ H[0]) - (M32[2] ^ H[2]) - (M32[5] ^ H[5]) + (M32[9] ^ H[9]);
+	Q[12] = (M32[1] ^ H[1]) + (M32[3] ^ H[3]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[10] ^ H[10]);
+	Q[13] = (M32[2] ^ H[2]) + (M32[4] ^ H[4]) + (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[11] ^ H[11]);
+	Q[14] = (M32[3] ^ H[3]) - (M32[5] ^ H[5]) + (M32[8] ^ H[8]) - (M32[11] ^ H[11]) - (M32[12] ^ H[12]);
+	Q[15] = (M32[12] ^ H[12]) - (M32[4] ^ H[4]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[13] ^ H[13]);
+
+	/* Diffuse the differences in every word in a bijective manner with ss_i, and then add the values of the previous double pipe. */
+	Q[0] = ss0(Q[0]) + H[1];
+	Q[1] = ss1(Q[1]) + H[2];
+	Q[2] = ss2(Q[2]) + H[3];
+	Q[3] = ss3(Q[3]) + H[4];
+	Q[4] = ss4(Q[4]) + H[5];
+	Q[5] = ss0(Q[5]) + H[6];
+	Q[6] = ss1(Q[6]) + H[7];
+	Q[7] = ss2(Q[7]) + H[8];
+	Q[8] = ss3(Q[8]) + H[9];
+	Q[9] = ss4(Q[9]) + H[10];
+	Q[10] = ss0(Q[10]) + H[11];
+	Q[11] = ss1(Q[11]) + H[12];
+	Q[12] = ss2(Q[12]) + H[13];
+	Q[13] = ss3(Q[13]) + H[14];
+	Q[14] = ss4(Q[14]) + H[15];
+	Q[15] = ss0(Q[15]) + H[0];
+
+	/* This is the Message expansion, f_1 in the documentation. */
+	/* It has 16 rounds. */
+	/* Blue Midnight Wish has two tunable security parameters, */
+	/* named EXPAND_1_ROUNDS and EXPAND_2_ROUNDS. */
+	/* The following relation for these parameters should be satisfied: */
+	/* EXPAND_1_ROUNDS + EXPAND_2_ROUNDS = 16 */
+#pragma unroll
+	for (i = 0; i < 2; i++)
+		Q[i + 16] = expand32_1(i + 16, M32, H, Q);
+
+#pragma unroll
+	for (i = 2; i < 16; i++)
+		Q[i + 16] = expand32_2(i + 16, M32, H, Q);
+
+	/* Blue Midnight Wish has two temporary cumulative variables that */
+	/* accumulate, via XOR, the 16 new variables produced in the Message */
+	/* Expansion part. */
+	XL32 = Q[16] ^ Q[17] ^ Q[18] ^ Q[19] ^ Q[20] ^ Q[21] ^ Q[22] ^ Q[23];
+	XH32 = XL32 ^ Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31];
+
+	/* This part is the function f_2 in the documentation. */
+
+	/* Compute the double chaining pipe for the next message block.
*/
+	H[0] = (shl(XH32, 5) ^ shr(Q[16], 5) ^ M32[0]) + (XL32 ^ Q[24] ^ Q[0]);
+	H[1] = (shr(XH32, 7) ^ shl(Q[17], 8) ^ M32[1]) + (XL32 ^ Q[25] ^ Q[1]);
+	H[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]);
+	H[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]);
+	H[4] = (shr(XH32, 3) ^ Q[20] ^ M32[4]) + (XL32 ^ Q[28] ^ Q[4]);
+	H[5] = (shl(XH32, 6) ^ shr(Q[21], 6) ^ M32[5]) + (XL32 ^ Q[29] ^ Q[5]);
+	H[6] = (shr(XH32, 4) ^ shl(Q[22], 6) ^ M32[6]) + (XL32 ^ Q[30] ^ Q[6]);
+	H[7] = (shr(XH32, 11) ^ shl(Q[23], 2) ^ M32[7]) + (XL32 ^ Q[31] ^ Q[7]);
+
+	H[8] = SPH_ROTL32(H[4], 9) + (XH32 ^ Q[24] ^ M32[8]) + (shl(XL32, 8) ^ Q[23] ^ Q[8]);
+	H[9] = SPH_ROTL32(H[5], 10) + (XH32 ^ Q[25] ^ M32[9]) + (shr(XL32, 6) ^ Q[16] ^ Q[9]);
+	H[10] = SPH_ROTL32(H[6], 11) + (XH32 ^ Q[26] ^ M32[10]) + (shl(XL32, 6) ^ Q[17] ^ Q[10]);
+	H[11] = SPH_ROTL32(H[7], 12) + (XH32 ^ Q[27] ^ M32[11]) + (shl(XL32, 4) ^ Q[18] ^ Q[11]);
+	H[12] = SPH_ROTL32(H[0], 13) + (XH32 ^ Q[28] ^ M32[12]) + (shr(XL32, 3) ^ Q[19] ^ Q[12]);
+	H[13] = SPH_ROTL32(H[1], 14) + (XH32 ^ Q[29] ^ M32[13]) + (shr(XL32, 4) ^ Q[20] ^ Q[13]);
+	H[14] = SPH_ROTL32(H[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]);
+	H[15] = SPH_ROTL32(H[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]);
+
+}
 diff --git a/kernel/credits.cl b/kernel/credits.cl
 new file mode 100644
 index 00000000..19cbea67
 --- /dev/null
 +++ b/kernel/credits.cl
 @@ -0,0 +1,232 @@
+/*
+* "credits" kernel implementation.
+*
+* ==========================(LICENSE BEGIN)============================
+*
+* Copyright (c) 2015 djm34
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files (the
+* "Software"), to deal in the Software without restriction, including
+* without limitation the rights to use, copy, modify, merge, publish,
+* distribute, sublicense, and/or sell copies of the Software, and to
+* permit persons to whom the Software is furnished to do so, subject to
+* the following conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*
+* ===========================(LICENSE END)=============================
+*
+* @author djm34
+*/
+#if !defined(cl_khr_byte_addressable_store)
+#error "Device does not support unaligned stores"
+#endif
+
+
+#define ROL32(x, n) rotate(x, (uint) n)
+#define SWAP32(a) (as_uint(as_uchar4(a).wzyx))
+#define SWAP64(x) as_ulong(as_uchar8(x).s32107654) /* byte-swap each 32-bit half; the halves keep their positions */
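+/*
+ * Editorial illustration (not part of the original patch): the two swap
+ * macros above are easy to misread.  In plain C, swap32() models SWAP32
+ * (full byte reversal of a 32-bit word) and swap64_halves() models SWAP64
+ * with the .s32107654 shuffle (each 32-bit half is byte-swapped in place;
+ * the halves are not exchanged):
+ *
+ *	static uint32_t swap32(uint32_t x)
+ *	{
+ *		return (x >> 24) | ((x >> 8) & 0x0000ff00) |
+ *		       ((x << 8) & 0x00ff0000) | (x << 24);
+ *	}
+ *
+ *	static uint64_t swap64_halves(uint64_t x)
+ *	{
+ *		return ((uint64_t)swap32((uint32_t)(x >> 32)) << 32) |
+ *		       swap32((uint32_t)x);
+ *	}
+ */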
+ +#define SHR(x, n) ((x) >> n) + +#define S0(x) (ROL32(x, 25) ^ ROL32(x, 14) ^ SHR(x, 3)) +#define S1(x) (ROL32(x, 15) ^ ROL32(x, 13) ^ SHR(x, 10)) + +#define S2(x) (ROL32(x, 30) ^ ROL32(x, 19) ^ ROL32(x, 10)) +#define S3(x) (ROL32(x, 26) ^ ROL32(x, 21) ^ ROL32(x, 7)) + +#define P(a,b,c,d,e,f,g,h,x,K) \ +{ \ + temp1 = h + S3(e) + F1(e,f,g) + (K + x); \ + d += temp1; h = temp1 + S2(a) + F0(a,b,c); \ +} + +#define F0(y, x, z) bitselect(z, y, z ^ x) +#define F1(x, y, z) bitselect(z, y, x) + +#define R0 (W0 = S1(W14) + W9 + S0(W1) + W0) +#define R1 (W1 = S1(W15) + W10 + S0(W2) + W1) +#define R2 (W2 = S1(W0) + W11 + S0(W3) + W2) +#define R3 (W3 = S1(W1) + W12 + S0(W4) + W3) +#define R4 (W4 = S1(W2) + W13 + S0(W5) + W4) +#define R5 (W5 = S1(W3) + W14 + S0(W6) + W5) +#define R6 (W6 = S1(W4) + W15 + S0(W7) + W6) +#define R7 (W7 = S1(W5) + W0 + S0(W8) + W7) +#define R8 (W8 = S1(W6) + W1 + S0(W9) + W8) +#define R9 (W9 = S1(W7) + W2 + S0(W10) + W9) +#define R10 (W10 = S1(W8) + W3 + S0(W11) + W10) +#define R11 (W11 = S1(W9) + W4 + S0(W12) + W11) +#define R12 (W12 = S1(W10) + W5 + S0(W13) + W12) +#define R13 (W13 = S1(W11) + W6 + S0(W14) + W13) +#define R14 (W14 = S1(W12) + W7 + S0(W15) + W14) +#define R15 (W15 = S1(W13) + W8 + S0(W0) + W15) + +#define RD14 (S1(W12) + W7 + S0(W15) + W14) +#define RD15 (S1(W13) + W8 + S0(W0) + W15) + +/// generic sha transform +inline uint8 sha256_Transform(uint16 data, uint8 state) +{ + uint temp1; + uint8 res = state; + uint W0 = data.s0; + uint W1 = data.s1; + uint W2 = data.s2; + uint W3 = data.s3; + uint W4 = data.s4; + uint W5 = data.s5; + uint W6 = data.s6; + uint W7 = data.s7; + uint W8 = data.s8; + uint W9 = data.s9; + uint W10 = data.sA; + uint W11 = data.sB; + uint W12 = data.sC; + uint W13 = data.sD; + uint W14 = data.sE; + uint W15 = data.sF; + +#define v0 res.s0 +#define v1 res.s1 +#define v2 res.s2 +#define v3 res.s3 +#define v4 res.s4 +#define v5 res.s5 +#define v6 res.s6 +#define v7 res.s7 + + P(v0, v1, v2, v3, v4, v5, v6, v7, W0, 0x428A2F98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W1, 0x71374491); + P(v6, v7, v0, v1, v2, v3, v4, v5, W2, 0xB5C0FBCF); + P(v5, v6, v7, v0, v1, v2, v3, v4, W3, 0xE9B5DBA5); + P(v4, v5, v6, v7, v0, v1, v2, v3, W4, 0x3956C25B); + P(v3, v4, v5, v6, v7, v0, v1, v2, W5, 0x59F111F1); + P(v2, v3, v4, v5, v6, v7, v0, v1, W6, 0x923F82A4); + P(v1, v2, v3, v4, v5, v6, v7, v0, W7, 0xAB1C5ED5); + P(v0, v1, v2, v3, v4, v5, v6, v7, W8, 0xD807AA98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W9, 0x12835B01); + P(v6, v7, v0, v1, v2, v3, v4, v5, W10, 0x243185BE); + P(v5, v6, v7, v0, v1, v2, v3, v4, W11, 0x550C7DC3); + P(v4, v5, v6, v7, v0, v1, v2, v3, W12, 0x72BE5D74); + P(v3, v4, v5, v6, v7, v0, v1, v2, W13, 0x80DEB1FE); + P(v2, v3, v4, v5, v6, v7, v0, v1, W14, 0x9BDC06A7); + P(v1, v2, v3, v4, v5, v6, v7, v0, W15, 0xC19BF174); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0xE49B69C1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0xEFBE4786); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x0FC19DC6); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x240CA1CC); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x2DE92C6F); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4A7484AA); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5CB0A9DC); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x76F988DA); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x983E5152); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA831C66D); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xB00327C8); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xBF597FC7); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xC6E00BF3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 
0xD5A79147); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0x06CA6351); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x14292967); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x27B70A85); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x2E1B2138); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x4D2C6DFC); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x53380D13); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x650A7354); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x766A0ABB); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x81C2C92E); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x92722C85); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0xA2BFE8A1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA81A664B); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xC24B8B70); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xC76C51A3); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xD192E819); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD6990624); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0xF40E3585); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x106AA070); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x19A4C116); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x1E376C08); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x2748774C); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x34B0BCB5); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x391C0CB3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4ED8AA4A); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5B9CCA4F); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x682E6FF3); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x748F82EE); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0x78A5636F); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0x84C87814); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0x8CC70208); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0x90BEFFFA); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xA4506CEB); + P(v2, v3, v4, v5, v6, v7, v0, v1, RD14, 0xBEF9A3F7); + P(v1, v2, v3, v4, v5, v6, v7, v0, RD15, 0xC67178F2); +#undef v0 +#undef v1 +#undef v2 +#undef v3 +#undef v4 +#undef v5 +#undef v6 +#undef v7 + return (res + state); +} + + + +static __constant uint8 H256 = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, + 0xA54FF53A, 0x510E527F, 0x9B05688C, + 0x1F83D9AB, 0x5BE0CD19 +}; + + +static __constant uint8 pad_data = +{ + 0x00000000, 0x00000000, 0x80000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000540 +}; + +static __constant uint8 pad_state = +{ + 0x80000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000100 +}; + + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search(__global const uchar* restrict input, __global uint* restrict output,const ulong target, uint8 midstate ) +{ + + + uint nonce = get_global_id(0); + uint16 in; + uint8 state1; + + in.lo = ((__global const uint8 *)input)[4]; + in.hi = pad_data; + in.hi.s0 = ((__global const uint *)input)[40]; + in.hi.s1 = ((__global const uint *)input)[41]; + in.s3 = nonce; + state1 = sha256_Transform(in, midstate); + in.lo = state1; + in.hi = pad_state; + state1 = sha256_Transform(in,H256); + +if (SWAP64(state1.s67) <= target) + output[atomic_inc(output + 0xFF)] = nonce; + +} + diff --git a/kernel/cubehash256.cl b/kernel/cubehash256.cl new file mode 100644 index 00000000..9bc4c654 --- /dev/null +++ b/kernel/cubehash256.cl @@ -0,0 +1,132 @@ +// cubehash256 +// djm34 2015 based on ccminer cubehash512 + +#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ +#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ + + +#define LROT(x, bits) rotate( x,(uint) bits) + + +#define ROTATEUPWARDS7(a) LROT(a,7) +#define ROTATEUPWARDS11(a) LROT(a,11) + +#define SWAP(a,b) { uint u = a; a 
= b; b = u; } + +inline void rrounds(uint x[2][2][2][2][2]) +{ + int r; + int j; + int k; + int l; + int m; + +//#pragma unroll 2 + for (r = 0; r < CUBEHASH_ROUNDS; ++r) { + + /* "add x_0jklm into x_1jklmn modulo 2^32" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 7 bits" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]); + + /* "swap x_00klm with x_01klm" */ +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[0][0][k][l][m], x[0][1][k][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jk0m with x_1jk1m" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[1][j][k][0][m], x[1][j][k][1][m]) + + /* "add x_0jklm into x_1jklm modulo 2^32" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 11 bits" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]); + + /* "swap x_0j0lm with x_0j1lm" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[0][j][0][l][m], x[0][j][1][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jkl0 with x_1jkl1" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) + SWAP(x[1][j][k][l][0], x[1][j][k][l][1]) + + } +} + + diff --git a/kernel/lyra2rev2.cl b/kernel/lyra2rev2.cl new file mode 100644 index 00000000..0fe0440d --- /dev/null +++ b/kernel/lyra2rev2.cl @@ -0,0 +1,525 @@ +/* + * Lyra2RE kernel implementation. 
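+ * (Editorial note: despite the header's name, this file implements the
+ * Lyra2REv2 chain; the kernels search..search6 below run blake256,
+ * keccak256, cubehash256, Lyra2v2, skein256, cubehash256 and bmw256 in
+ * sequence.)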
+ * + * ==========================(LICENSE BEGIN)============================ + * Copyright (c) 2014 djm34 + * Copyright (c) 2014 James Lovejoy + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author djm34 + */ +// typedef unsigned int uint; +#pragma OPENCL EXTENSION cl_amd_printf : enable + +#ifndef LYRA2RE_CL +#define LYRA2RE_CL + +#if __ENDIAN_LITTLE__ +#define SPH_LITTLE_ENDIAN 1 +#else +#define SPH_BIG_ENDIAN 1 +#endif + +#define SPH_UPTR sph_u64 + +typedef unsigned int sph_u32; +typedef int sph_s32; +#ifndef __OPENCL_VERSION__ +typedef unsigned long sph_u64; +typedef long sph_s64; +#else +typedef unsigned long sph_u64; +typedef long sph_s64; +#endif + + +#define SPH_64 1 +#define SPH_64_TRUE 1 + +#define SPH_C32(x) ((sph_u32)(x ## U)) +#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) + +#define SPH_C64(x) ((sph_u64)(x ## UL)) +#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) + +//#define SPH_ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) +//#define SPH_ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +//#define SPH_ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +//#define SPH_ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) + +#define SPH_ROTL32(x,n) rotate(x,(uint)n) //faster with driver 14.6 +#define SPH_ROTR32(x,n) rotate(x,(uint)(32-n)) +#define SPH_ROTL64(x,n) rotate(x,(ulong)n) +#define SPH_ROTR64(x,n) rotate(x,(ulong)(64-n)) +static inline sph_u64 ror64(sph_u64 vw, unsigned a) { + uint2 result; + uint2 v = as_uint2(vw); + unsigned n = (unsigned)(64 - a); + if (n == 32) { return as_ulong((uint2)(v.y, v.x)); } + if (n < 32) { + result.y = ((v.y << (n)) | (v.x >> (32 - n))); + result.x = ((v.x << (n)) | (v.y >> (32 - n))); + } + else { + result.y = ((v.x << (n - 32)) | (v.y >> (64 - n))); + result.x = ((v.y << (n - 32)) | (v.x >> (64 - n))); + } + return as_ulong(result); +} + +//#define SPH_ROTR64(l,n) ror64(l,n) +#define memshift 3 +#include "blake256.cl" +#include "lyra2v2.cl" +#include "keccak1600.cl" +#include "skein256.cl" +#include "cubehash.cl" +#include "bmw256.cl" + +#define SWAP4(x) as_uint(as_uchar4(x).wzyx) +#define SWAP8(x) as_ulong(as_uchar8(x).s76543210) +//#define SWAP8(x) as_ulong(as_uchar8(x).s32107654) +#if SPH_BIG_ENDIAN + #define DEC64E(x) (x) + #define DEC64BE(x) (*(const __global sph_u64 *) (x)); + #define DEC64LE(x) SWAP8(*(const __global sph_u64 *) (x)); + #define DEC32LE(x) (*(const __global sph_u32 *) (x)); +#else + #define 
DEC64E(x) SWAP8(x)
 	#define DEC64BE(x) SWAP8(*(const __global sph_u64 *) (x));
 	#define DEC64LE(x) (*(const __global sph_u64 *) (x));
 	#define DEC32LE(x) SWAP4(*(const __global sph_u32 *) (x));
+#endif
+
+typedef union {
+	unsigned char h1[32];
+	uint h4[8];
+	ulong h8[4];
+} hash_t;
+
+// blake256: second compression of the 80-byte block header; h0..h7 is the
+// midstate precalculated on the host from the first 64 bytes
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+__kernel void search(
+	__global uchar* hashes,
+	// precalc hash from first part of message
+	const uint h0,
+	const uint h1,
+	const uint h2,
+	const uint h3,
+	const uint h4,
+	const uint h5,
+	const uint h6,
+	const uint h7,
+	// last 12 bytes of original message
+	const uint in16,
+	const uint in17,
+	const uint in18
+)
+{
+	uint gid = get_global_id(0);
+	__global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong) * (get_global_id(0) % MAX_GLOBAL_THREADS)));
+
+//	__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
+
+	unsigned int h[8];
+	unsigned int m[16];
+	unsigned int v[16];
+
+	h[0] = h0;
+	h[1] = h1;
+	h[2] = h2;
+	h[3] = h3;
+	h[4] = h4;
+	h[5] = h5;
+	h[6] = h6;
+	h[7] = h7;
+
+	// compress 2nd round
+	m[0] = in16;
+	m[1] = in17;
+	m[2] = in18;
+	m[3] = SWAP4(gid);
+
+	for (int i = 4; i < 16; i++) { m[i] = c_Padding[i]; }
+
+	for (int i = 0; i < 8; i++) { v[i] = h[i]; }
+
+	v[8] = c_u256[0];
+	v[9] = c_u256[1];
+	v[10] = c_u256[2];
+	v[11] = c_u256[3];
+	v[12] = c_u256[4] ^ 640;
+	v[13] = c_u256[5] ^ 640;
+	v[14] = c_u256[6];
+	v[15] = c_u256[7];
+
+	for (int r = 0; r < 14; r++) {
+		GS(0, 4, 0x8, 0xC, 0x0);
+		GS(1, 5, 0x9, 0xD, 0x2);
+		GS(2, 6, 0xA, 0xE, 0x4);
+		GS(3, 7, 0xB, 0xF, 0x6);
+		GS(0, 5, 0xA, 0xF, 0x8);
+		GS(1, 6, 0xB, 0xC, 0xA);
+		GS(2, 7, 0x8, 0xD, 0xC);
+		GS(3, 4, 0x9, 0xE, 0xE);
+	}
+
+	for (int i = 0; i < 16; i++) {
+		int j = i & 7;
+		h[j] ^= v[i];
+	}
+
+	for (int i = 0; i < 8; i++) { hash->h4[i] = SWAP4(h[i]); }
+
+	barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+// keccak256
+
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+__kernel void search1(__global uchar* hashes)
+{
+	uint gid = get_global_id(0);
+//	__global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
+	__global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong) * (get_global_id(0) % MAX_GLOBAL_THREADS)));
+
+	sph_u64 keccak_gpu_state[25];
+
+	for (int i = 0; i < 25; i++) {
+		if (i < 4) { keccak_gpu_state[i] = hash->h8[i]; }
+		else { keccak_gpu_state[i] = 0; }
+	}
+	keccak_gpu_state[4] = 0x0000000000000001;
+	keccak_gpu_state[16] = 0x8000000000000000;
+
+	keccak_block(keccak_gpu_state);
+	for (int i = 0; i < 4; i++) { hash->h8[i] = keccak_gpu_state[i]; }
+	barrier(CLK_LOCAL_MEM_FENCE);
+}
+
+// cubehash256
+
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+__kernel void search2(__global uchar* hashes)
+{
+	uint gid = get_global_id(0);
+	__global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong) * (get_global_id(0) % MAX_GLOBAL_THREADS)));
+
+	sph_u32 x0 = 0xEA2BD4B4; sph_u32 x1 = 0xCCD6F29F; sph_u32 x2 = 0x63117E71;
+	sph_u32 x3 = 0x35481EAE; sph_u32 x4 = 0x22512D5B; sph_u32 x5 = 0xE5D94E63;
+	sph_u32 x6 = 0x7E624131; sph_u32 x7 = 0xF4CC12BE; sph_u32 x8 = 0xC2D0B696;
+	sph_u32 x9 = 0x42AF2070; sph_u32 xa = 0xD0720C35; sph_u32 xb = 0x3361DA8C;
+	sph_u32 xc = 0x28CCECA4; sph_u32 xd = 0x8EF8AD83; sph_u32 xe = 0x4680AC00;
+	sph_u32 xf = 0x40E5FBAB;
+
+	sph_u32 xg = 0xD89041C3; sph_u32 xh = 0x6107FBD5;
+	sph_u32 xi = 0x6C859D41; sph_u32 xj = 0xF0B26679; sph_u32 xk = 0x09392549;
+	sph_u32 xl = 0x5FA25603; sph_u32 xm = 0x65C892FD; sph_u32 xn = 0x93CB6285;
+	sph_u32 xo = 0x2AF2B5AE; sph_u32 xp = 0x9E4B4E60; sph_u32 xq = 0x774ABFDD;
+	sph_u32 xr =
0x85254725; sph_u32 xs = 0x15815AEB; sph_u32 xt = 0x4AB6AAD6; + sph_u32 xu = 0x9CDAF8AF; sph_u32 xv = 0xD6032C0A; + + x0 ^= (hash->h4[0]); + x1 ^= (hash->h4[1]); + x2 ^= (hash->h4[2]); + x3 ^= (hash->h4[3]); + x4 ^= (hash->h4[4]); + x5 ^= (hash->h4[5]); + x6 ^= (hash->h4[6]); + x7 ^= (hash->h4[7]); + + + SIXTEEN_ROUNDS; + x0 ^= 0x80; + SIXTEEN_ROUNDS; + xv ^= 0x01; + for (int i = 0; i < 10; ++i) SIXTEEN_ROUNDS; + + hash->h4[0] = x0; + hash->h4[1] = x1; + hash->h4[2] = x2; + hash->h4[3] = x3; + hash->h4[4] = x4; + hash->h4[5] = x5; + hash->h4[6] = x6; + hash->h4[7] = x7; + + + barrier(CLK_GLOBAL_MEM_FENCE); + +} + + +/// lyra2 algo + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search3(__global uchar* hashes,__global uchar* matrix ) +{ + uint gid = get_global_id(0); + // __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + __global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong4 *DMatrix = (__global ulong4 *)(matrix + (4 * memshift * 4 * 4 * 8 * (get_global_id(0) % MAX_GLOBAL_THREADS))); + +// uint offset = (4 * memshift * 4 * 4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))/32; + ulong4 state[4]; + + state[0].x = hash->h8[0]; //password + state[0].y = hash->h8[1]; //password + state[0].z = hash->h8[2]; //password + state[0].w = hash->h8[3]; //password + state[1] = state[0]; + state[2] = (ulong4)(0x6a09e667f3bcc908UL, 0xbb67ae8584caa73bUL, 0x3c6ef372fe94f82bUL, 0xa54ff53a5f1d36f1UL); + state[3] = (ulong4)(0x510e527fade682d1UL, 0x9b05688c2b3e6c1fUL, 0x1f83d9abfb41bd6bUL, 0x5be0cd19137e2179UL); + for (int i = 0; i<12; i++) { round_lyra(state); } + + state[0] ^= (ulong4)(0x20,0x20,0x20,0x01); + state[1] ^= (ulong4)(0x04,0x04,0x80,0x0100000000000000); + + for (int i = 0; i<12; i++) { round_lyra(state); } + + + uint ps1 = (memshift * 3); +//#pragma unroll 4 + for (int i = 0; i < 4; i++) + { + uint s1 = ps1 - memshift * i; + for (int j = 0; j < 3; j++) + (DMatrix)[j+s1] = state[j]; + + round_lyra(state); + } + + reduceDuplexf(state,DMatrix); + + reduceDuplexRowSetupf(1, 0, 2,state, DMatrix); + reduceDuplexRowSetupf(2, 1, 3, state,DMatrix); + + + uint rowa; + uint prev = 3; + for (uint i = 0; i<4; i++) { + rowa = state[0].x & 3; + reduceDuplexRowf(prev, rowa, i, state, DMatrix); + prev = i; + } + + + + uint shift = (memshift * 4 * rowa); + + for (int j = 0; j < 3; j++) + state[j] ^= (DMatrix)[j+shift]; + + for (int i = 0; i < 12; i++) + round_lyra(state); +////////////////////////////////////// + + + for (int i = 0; i<4; i++) {hash->h8[i] = ((ulong*)state)[i];} +barrier(CLK_LOCAL_MEM_FENCE); + + + +} + +//skein256 + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search4(__global uchar* hashes) +{ + uint gid = get_global_id(0); + // __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + __global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + + sph_u64 h[9]; + sph_u64 t[3]; + sph_u64 dt0,dt1,dt2,dt3; + sph_u64 p0, p1, p2, p3, p4, p5, p6, p7; + h[8] = skein_ks_parity; + + for (int i = 0; i<8; i++) { + h[i] = SKEIN_IV512_256[i]; + h[8] ^= h[i];} + + t[0]=t12[0]; + t[1]=t12[1]; + t[2]=t12[2]; + + dt0=hash->h8[0]; + dt1=hash->h8[1]; + dt2=hash->h8[2]; + dt3=hash->h8[3]; + + p0 = h[0] + dt0; + p1 = h[1] + dt1; + p2 = h[2] + dt2; + p3 = h[3] + dt3; + p4 = h[4]; + p5 = h[5] + t[0]; + p6 = h[6] + t[1]; + p7 = h[7]; + + #pragma unroll + for (int i = 1; i<19; i+=2) 
{Round_8_512(p0,p1,p2,p3,p4,p5,p6,p7,i);} + p0 ^= dt0; + p1 ^= dt1; + p2 ^= dt2; + p3 ^= dt3; + + h[0] = p0; + h[1] = p1; + h[2] = p2; + h[3] = p3; + h[4] = p4; + h[5] = p5; + h[6] = p6; + h[7] = p7; + h[8] = skein_ks_parity; + + for (int i = 0; i<8; i++) { h[8] ^= h[i]; } + + t[0] = t12[3]; + t[1] = t12[4]; + t[2] = t12[5]; + p5 += t[0]; //p5 already equal h[5] + p6 += t[1]; + + #pragma unroll + for (int i = 1; i<19; i+=2) { Round_8_512(p0, p1, p2, p3, p4, p5, p6, p7, i); } + + hash->h8[0] = p0; + hash->h8[1] = p1; + hash->h8[2] = p2; + hash->h8[3] = p3; + barrier(CLK_LOCAL_MEM_FENCE); + +} + +//cubehash + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search5(__global uchar* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + sph_u32 x0 = 0xEA2BD4B4; sph_u32 x1 = 0xCCD6F29F; sph_u32 x2 = 0x63117E71; + sph_u32 x3 = 0x35481EAE; sph_u32 x4 = 0x22512D5B; sph_u32 x5 = 0xE5D94E63; + sph_u32 x6 = 0x7E624131; sph_u32 x7 = 0xF4CC12BE; sph_u32 x8 = 0xC2D0B696; + sph_u32 x9 = 0x42AF2070; sph_u32 xa = 0xD0720C35; sph_u32 xb = 0x3361DA8C; + sph_u32 xc = 0x28CCECA4; sph_u32 xd = 0x8EF8AD83; sph_u32 xe = 0x4680AC00; + sph_u32 xf = 0x40E5FBAB; + + sph_u32 xg = 0xD89041C3; sph_u32 xh = 0x6107FBD5; + sph_u32 xi = 0x6C859D41; sph_u32 xj = 0xF0B26679; sph_u32 xk = 0x09392549; + sph_u32 xl = 0x5FA25603; sph_u32 xm = 0x65C892FD; sph_u32 xn = 0x93CB6285; + sph_u32 xo = 0x2AF2B5AE; sph_u32 xp = 0x9E4B4E60; sph_u32 xq = 0x774ABFDD; + sph_u32 xr = 0x85254725; sph_u32 xs = 0x15815AEB; sph_u32 xt = 0x4AB6AAD6; + sph_u32 xu = 0x9CDAF8AF; sph_u32 xv = 0xD6032C0A; + + x0 ^= (hash->h4[0]); + x1 ^= (hash->h4[1]); + x2 ^= (hash->h4[2]); + x3 ^= (hash->h4[3]); + x4 ^= (hash->h4[4]); + x5 ^= (hash->h4[5]); + x6 ^= (hash->h4[6]); + x7 ^= (hash->h4[7]); + + + SIXTEEN_ROUNDS; + x0 ^= 0x80; + SIXTEEN_ROUNDS; + xv ^= 0x01; + for (int i = 0; i < 10; ++i) SIXTEEN_ROUNDS; + + hash->h4[0] = x0; + hash->h4[1] = x1; + hash->h4[2] = x2; + hash->h4[3] = x3; + hash->h4[4] = x4; + hash->h4[5] = x5; + hash->h4[6] = x6; + hash->h4[7] = x7; + + + barrier(CLK_GLOBAL_MEM_FENCE); + +} + + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search6(__global uchar* hashes, __global uint* output, const ulong target) +{ + uint gid = get_global_id(0); + __global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + uint dh[16] = { + 0x40414243, 0x44454647, + 0x48494A4B, 0x4C4D4E4F, + 0x50515253, 0x54555657, + 0x58595A5B, 0x5C5D5E5F, + 0x60616263, 0x64656667, + 0x68696A6B, 0x6C6D6E6F, + 0x70717273, 0x74757677, + 0x78797A7B, 0x7C7D7E7F + }; + uint final_s[16] = { + 0xaaaaaaa0, 0xaaaaaaa1, 0xaaaaaaa2, + 0xaaaaaaa3, 0xaaaaaaa4, 0xaaaaaaa5, + 0xaaaaaaa6, 0xaaaaaaa7, 0xaaaaaaa8, + 0xaaaaaaa9, 0xaaaaaaaa, 0xaaaaaaab, + 0xaaaaaaac, 0xaaaaaaad, 0xaaaaaaae, + 0xaaaaaaaf + }; + + uint message[16]; + for (int i = 0; i<8; i++) message[i] = hash->h4[i]; + for (int i = 9; i<14; i++) message[i] = 0; + message[8]= 0x80; + message[14]=0x100; + message[15]=0; + + Compression256(message, dh); + Compression256(dh, final_s); + barrier(CLK_LOCAL_MEM_FENCE); + + + bool result = ( ((ulong*)final_s)[7] <= target); + if (result) { + output[atomic_inc(output + 0xFF)] = SWAP4(gid); + } + +} + + +#endif // LYRA2RE_CL \ No newline at end of file diff --git a/kernel/lyra2v2.cl b/kernel/lyra2v2.cl new file mode 100644 index 00000000..be0f1f28 --- /dev/null +++ 
b/kernel/lyra2v2.cl @@ -0,0 +1,184 @@ +/* +* Lyra2 kernel implementation. +* +* ==========================(LICENSE BEGIN)============================ +* Copyright (c) 2014 djm34 +* +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +* +* ===========================(LICENSE END)============================= +* +* @author djm34 +*/ + + + +#define ROTL64(x,n) rotate(x,(ulong)n) +#define ROTR64(x,n) rotate(x,(ulong)(64-n)) +#define SWAP32(x) as_ulong(as_uint2(x).s10) +#define SWAP24(x) as_ulong(as_uchar8(x).s34567012) +#define SWAP16(x) as_ulong(as_uchar8(x).s23456701) + +#define G(a,b,c,d) \ + do { \ + a += b; d ^= a; d = SWAP32(d); \ + c += d; b ^= c; b = ROTR64(b,24); \ + a += b; d ^= a; d = ROTR64(d,16); \ + c += d; b ^= c; b = ROTR64(b, 63); \ +\ + } while (0) + +#define G_old(a,b,c,d) \ + do { \ + a += b; d ^= a; d = ROTR64(d, 32); \ + c += d; b ^= c; b = ROTR64(b, 24); \ + a += b; d ^= a; d = ROTR64(d, 16); \ + c += d; b ^= c; b = ROTR64(b, 63); \ +\ + } while (0) + + +/*One Round of the Blake2b's compression function*/ + +#define round_lyra(s) \ + do { \ + G(s[0].x, s[1].x, s[2].x, s[3].x); \ + G(s[0].y, s[1].y, s[2].y, s[3].y); \ + G(s[0].z, s[1].z, s[2].z, s[3].z); \ + G(s[0].w, s[1].w, s[2].w, s[3].w); \ + G(s[0].x, s[1].y, s[2].z, s[3].w); \ + G(s[0].y, s[1].z, s[2].w, s[3].x); \ + G(s[0].z, s[1].w, s[2].x, s[3].y); \ + G(s[0].w, s[1].x, s[2].y, s[3].z); \ + } while(0) + + + +void reduceDuplexf(ulong4* state ,__global ulong4* DMatrix) +{ + + ulong4 state1[3]; + uint ps1 = 0; + uint ps2 = (memshift * 3 + memshift * 4); +//#pragma unroll 4 + for (int i = 0; i < 4; i++) + { + uint s1 = ps1 + i*memshift; + uint s2 = ps2 - i*memshift; + + for (int j = 0; j < 3; j++) state1[j] = (DMatrix)[j + s1]; + + for (int j = 0; j < 3; j++) state[j] ^= state1[j]; + round_lyra(state); + for (int j = 0; j < 3; j++) state1[j] ^= state[j]; + + for (int j = 0; j < 3; j++) (DMatrix)[j + s2] = state1[j]; + } + +} + + + +void reduceDuplexRowf(uint rowIn,uint rowInOut,uint rowOut,ulong4 * state, __global ulong4 * DMatrix) +{ + +ulong4 state1[3], state2[3]; +uint ps1 = (memshift * 4 * rowIn); +uint ps2 = (memshift * 4 * rowInOut); +uint ps3 = (memshift * 4 * rowOut); + + + for (int i = 0; i < 4; i++) + { + uint s1 = ps1 + i*memshift; + uint s2 = ps2 + i*memshift; + uint s3 = ps3 + i*memshift; + + + for (int j = 0; j < 3; j++) state1[j] = (DMatrix)[j + s1]; + + for (int j = 0; j < 3; j++) state2[j] = (DMatrix)[j + s2]; + + for (int j = 0; j < 3; j++) state1[j] += state2[j]; + + for (int j = 0; j < 3; 
j++) state[j] ^= state1[j]; + + + round_lyra(state); + + ((ulong*)state2)[0] ^= ((ulong*)state)[11]; + for (int j = 0; j < 11; j++) + ((ulong*)state2)[j + 1] ^= ((ulong*)state)[j]; + + if (rowInOut != rowOut) { + for (int j = 0; j < 3; j++) + (DMatrix)[j + s2] = state2[j]; + for (int j = 0; j < 3; j++) + (DMatrix)[j + s3] ^= state[j]; + } + else { + for (int j = 0; j < 3; j++) + state2[j] ^= state[j]; + for (int j = 0; j < 3; j++) + (DMatrix)[j + s2] = state2[j]; + } + + } + } + + + + +void reduceDuplexRowSetupf(uint rowIn, uint rowInOut, uint rowOut, ulong4 *state, __global ulong4* DMatrix) { + + ulong4 state2[3], state1[3]; + uint ps1 = (memshift * 4 * rowIn); + uint ps2 = (memshift * 4 * rowInOut); + uint ps3 = (memshift * 3 + memshift * 4 * rowOut); + + for (int i = 0; i < 4; i++) + { + uint s1 = ps1 + i*memshift; + uint s2 = ps2 + i*memshift; + uint s3 = ps3 - i*memshift; + + for (int j = 0; j < 3; j++) state1[j] = (DMatrix)[j + s1]; + + for (int j = 0; j < 3; j++) state2[j] = (DMatrix)[j + s2]; + for (int j = 0; j < 3; j++) { + ulong4 tmp = state1[j] + state2[j]; + state[j] ^= tmp; + } + round_lyra(state); + + for (int j = 0; j < 3; j++) { + state1[j] ^= state[j]; + (DMatrix)[j + s3] = state1[j]; + } + + ((ulong*)state2)[0] ^= ((ulong*)state)[11]; + for (int j = 0; j < 11; j++) + ((ulong*)state2)[j + 1] ^= ((ulong*)state)[j]; + for (int j = 0; j < 3; j++) + (DMatrix)[j + s2] = state2[j]; + } + } + diff --git a/kernel/yescrypt-multi.cl b/kernel/yescrypt-multi.cl new file mode 100644 index 00000000..3af7b28a --- /dev/null +++ b/kernel/yescrypt-multi.cl @@ -0,0 +1,314 @@ +/* +* "yescrypt" kernel implementation. +* +* ==========================(LICENSE BEGIN)============================ +* +* Copyright (c) 2015 djm34 +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+* +* ===========================(LICENSE END)============================= +* +* @author djm34 +*/ +#if !defined(cl_khr_byte_addressable_store) +#error "Device does not support unaligned stores" +#endif + +#include "yescrypt_essential.cl" + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search(__global const uchar* restrict input, __global uint* restrict output, __global uchar *padcache, __global uchar* buff1, __global uchar* buff2, __global uchar* buff3, const uint target) +{ + + __global ulong16 *hashbuffer = (__global ulong16 *)(padcache + (2048 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong16 *prevstate = (__global ulong16 *)(buff1 + (64 * 128 * sizeof(ulong)*(get_global_id(0) % MAX_GLOBAL_THREADS))); + __global uint8 *sha256tokeep = (__global uint8 *)(buff3 + (8 * sizeof(uint)*(get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong16 *Bdev = (__global ulong16 *)(buff2 + (8 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + + + uint nonce = (get_global_id(0)); + uint data[20]; + uint16 in; + uint8 state1, state2; +// uint8 sha256tokeep; + +// ulong16 Bdev[8]; // will require an additional buffer + ((uint16 *)data)[0] = ((__global const uint16 *)input)[0]; + ((uint4 *)data)[4] = ((__global const uint4 *)input)[4]; +// for (int i = 0; i<20; i++) { data[i] = SWAP32(data[i]); } + // if (nonce == 10) { printf("data %08x %08x\n", data[0], data[1]); } + uint8 passwd = sha256_80(data, nonce); + //pbkdf + in.lo = pad1.lo ^ passwd; + in.hi = pad1.hi; + state1 = sha256_Transform(in, H256); + + in.lo = pad2.lo ^ passwd; + in.hi = pad2.hi; + state2 = sha256_Transform(in, H256); + + in = ((uint16*)data)[0]; + state1 = sha256_Transform(in, state1); +#pragma unroll 1 + for (int i = 0; i<8; i++) + { + uint16 result; + in = pad3; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = nonce; + in.s4 = 4 * i + 1; + in.lo = sha256_Transform(in, state1); + in.hi = pad4; + result.lo = swapvec(sha256_Transform(in, state2)); + if (i == 0) sha256tokeep[0] = result.lo; + in = pad3; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = nonce; + in.s4 = 4 * i + 2; + in.lo = sha256_Transform(in, state1); + in.hi = pad4; + result.hi = swapvec(sha256_Transform(in, state2)); + Bdev[i].lo = as_ulong8(shuffle(result)); +// Bdev[i].lo = as_ulong8(result); + in = pad3; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = nonce; + in.s4 = 4 * i + 3; + in.lo = sha256_Transform(in, state1); + in.hi = pad4; + result.lo = swapvec(sha256_Transform(in, state2)); + in = pad3; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = nonce; + in.s4 = 4 * i + 4; + in.lo = sha256_Transform(in, state1); + in.hi = pad4; + result.hi = swapvec(sha256_Transform(in, state2)); + + + Bdev[i].hi = as_ulong8(shuffle(result)); +// Bdev[i].hi = as_ulong8(result); + } + + //mixing1 + + prevstate[0] = Bdev[0]; + Bdev[0] = blockmix_salsa8_small2(Bdev[0]); + prevstate[1] = Bdev[0]; + Bdev[0] = blockmix_salsa8_small2(Bdev[0]); + + uint n = 1; +#pragma unroll 1 + for (uint i = 2; i < 64; i++) + { + + prevstate[i] = Bdev[0]; + + if ((i&(i - 1)) == 0) n = n << 1; + + uint j = as_uint2(Bdev[0].hi.s0).x & (n - 1); + + j += i - n; + Bdev[0] ^= prevstate[j]; + + Bdev[0] = blockmix_salsa8_small2(Bdev[0]); + } + + +} + + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search1(__global uchar *buffer1, __global uchar *buffer2) +{ +} + + + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) 
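+/* + * Rough map of this split yescrypt pipeline, as read from the kernels in this file (a sketch, not an authoritative spec): search() expands the 80-byte header through SHA-256/PBKDF2 and runs the 64-iteration warm-up of prevstate; search1() is left empty; search2() below runs the 2048-iteration sequential-write loop over hashbuffer; search3() runs the 684 read/write mixing iterations; search4() finishes the PBKDF2/HMAC and tests the target. + * The power-of-two index arithmetic in the mixing loops matches yescrypt's Wrap(x, i) = (x mod p2floor(i)) + (i - p2floor(i)); an illustrative scalar form (hypothetical helper, not used by these kernels): + * + * uint wrap(uint x, uint i, uint n) + * { // caller keeps n = p2floor(i) via: if ((i & (i - 1)) == 0) n <<= 1; + * return (x & (n - 1)) + (i - n); + * } + */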
+__kernel void search2(__global uchar *padcache, __global uchar *buff1, __global uchar *buff2) +{ + + __global ulong16 *hashbuffer = (__global ulong16 *)(padcache + (2048 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong16* prevstate = (__global ulong16 *)(buff1 + (64 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong16 *Bdev = (__global ulong16 *)(buff2 + (8 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + + for (int i = 0; i<8; i++) + hashbuffer[i] = Bdev[i]; + + blockmix_pwxform((__global ulong8*)Bdev, prevstate); + + + for (int i = 0; i<8; i++) + hashbuffer[i + 8] = Bdev[i]; + + blockmix_pwxform((__global ulong8*)Bdev, prevstate); + int n = 1; +#pragma unroll 1 + for (int i = 2; i < 2048; i ++) + { + + for (int k = 0; k<8; k++) + (hashbuffer + 8 * i)[k] = Bdev[k]; + + + if ((i&(i - 1)) == 0) n = n << 1; + + uint j = as_uint2(Bdev[7].hi.s0).x & (n - 1); + j += i - n; + + for (int k = 0; k < 8; k++) + Bdev[k] ^= (hashbuffer + 8 * j)[k]; + + + blockmix_pwxform((__global ulong8*)Bdev, prevstate); + } +} + +/* +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search3(__global uchar *buffer1, __global uchar *buffer2) +{ +} +*/ + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search3(__global uchar *padcache, __global uchar *buff1, __global uchar *buff2) +{ + + __global ulong16 *hashbuffer = (__global ulong16 *)(padcache + (2048 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong16* prevstate = (__global ulong16 *)(buff1 + (64 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong16 *Bdev = (__global ulong16 *)(buff2 + (8 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + +#pragma unroll 1 + for (int z = 0; z < 684; z++) + { + + uint j = as_uint2(Bdev[7].hi.s0).x & 2047; + + + for (int k = 0; k < 8; k++) + Bdev[k] ^= (hashbuffer + 8 * j)[k]; + + if (z<682) + for (int k = 0; k<8; k++) + (hashbuffer + 8 * j)[k] = Bdev[k]; + + blockmix_pwxform((__global ulong8*)Bdev, prevstate); +//// + } + +} + +/* +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search5(__global uchar *buffer1, __global uchar *buffer2) +{ +} +*/ + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search4(__global const uchar* restrict input, __global uint* restrict output, __global uchar *buff2,__global uchar* buff3, const uint target) +{ + + __global ulong16 *Bdev = (__global ulong16 *)(buff2 + (8 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global uint8 *sha256tokeep = (__global uint8 *)(buff3 + (8 * sizeof(uint)*(get_global_id(0) % MAX_GLOBAL_THREADS))); + + uint nonce = (get_global_id(0)); + + + uint data[20]; + ((uint16 *)data)[0] = ((__global const uint16 *)input)[0]; + ((uint4 *)data)[4] = ((__global const uint4 *)input)[4]; +// for (int i = 0; i<20; i++) { data[i] = SWAP32(data[i]); } + uint8 swpass = swapvec(sha256tokeep[0]); + uint16 in; + uint8 state1,state2; + in.lo = pad1.lo ^ swpass; + in.hi = pad1.hi; + + + state1 = sha256_Transform(in, H256); + + in.lo = pad2.lo ^ swpass; + in.hi = pad2.hi; + state2 = sha256_Transform(in, H256); + +#pragma unroll 1 + for (int i = 0; i<8; i++) { + in = unshuffle(Bdev[i].lo); + in = swapvec16(in); + state1 = sha256_Transform(in, state1); + in = unshuffle(Bdev[i].hi); + in = swapvec16(in); + state1 = sha256_Transform(in, state1); + } + in = pad5; + state1 = sha256_Transform(in, state1); + in.lo = state1; + 
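+ /* pad4 holds the second half of the final SHA-256 block here: a 0x80 terminator word plus a 768-bit (0x300) length field, i.e. padding for the 64-byte opad block followed by the 32-byte inner digest of the HMAC outer hash */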
in.hi = pad4; + uint8 res = sha256_Transform(in, state2); + + //hmac and final sha + + in.lo = pad1.lo ^ res; + in.hi = pad1.hi; + state1 = sha256_Transform(in, H256); + in.lo = pad2.lo ^ res; + in.hi = pad2.hi; + state2 = sha256_Transform(in, H256); + in = ((uint16*)data)[0]; + state1 = sha256_Transform(in, state1); + in = padsha80; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = get_global_id(0); + in.sf = 0x480; + state1 = sha256_Transform(in, state1); + in.lo = state1; + in.hi = pad4; + state1 = sha256_Transform(in, state2); + // state2 = H256; + in.lo = state1; + in.hi = pad4; + in.sf = 0x100; + res = sha256_Transform(in, H256); + + + if (SWAP32(res.s7) <= (target)) + output[atomic_inc(output + 0xFF)] = (nonce); + +} diff --git a/kernel/yescrypt.cl b/kernel/yescrypt.cl new file mode 100644 index 00000000..0a94ebca --- /dev/null +++ b/kernel/yescrypt.cl @@ -0,0 +1,253 @@ +/* +* "yescrypt" kernel implementation. +* +* ==========================(LICENSE BEGIN)============================ +* +* Copyright (c) 2015 djm34 +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+* +* ===========================(LICENSE END)============================= +* +* @author djm34 +*/ +#if !defined(cl_khr_byte_addressable_store) +#error "Device does not support unaligned stores" +#endif + +#include "yescrypt_essential.cl" + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search(__global const uchar* restrict input, __global uint* restrict output, __global uchar *padcache, __global uchar* buff1, __global uchar* buff2, const uint target) +{ + + __global ulong16 *hashbuffer = (__global ulong16 *)(padcache + (2048 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong16 *prevstate = (__global ulong16 *)(buff1 + (64 * 128 * sizeof(ulong)*(get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong16 *Bdev = (__global ulong16 *)(buff2 + (8 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + + + uint nonce = (get_global_id(0)); + uint data[20]; + uint16 in; + uint8 state1, state2; + uint8 sha256tokeep; + + ((uint16 *)data)[0] = ((__global const uint16 *)input)[0]; + ((uint4 *)data)[4] = ((__global const uint4 *)input)[4]; + for (int i = 0; i<20; i++) { data[i] = SWAP32(data[i]); } + // if (nonce == 10) { printf("data %08x %08x\n", data[0], data[1]); } + uint8 passwd = sha256_80(data, nonce); + //pbkdf + in.lo = pad1.lo ^ passwd; + in.hi = pad1.hi; + state1 = sha256_Transform(in, H256); + + in.lo = pad2.lo ^ passwd; + in.hi = pad2.hi; + state2 = sha256_Transform(in, H256); + + in = ((uint16*)data)[0]; + state1 = sha256_Transform(in, state1); +#pragma unroll 1 + for (int i = 0; i<8; i++) + { + uint16 result; + in = pad3; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = nonce; + in.s4 = 4 * i + 1; + in.lo = sha256_Transform(in, state1); + in.hi = pad4; + result.lo = swapvec(sha256_Transform(in, state2)); + if (i == 0) sha256tokeep = result.lo; + in = pad3; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = nonce; + in.s4 = 4 * i + 2; + in.lo = sha256_Transform(in, state1); + in.hi = pad4; + result.hi = swapvec(sha256_Transform(in, state2)); + Bdev[i].lo = as_ulong8(shuffle(result)); + in = pad3; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = nonce; + in.s4 = 4 * i + 3; + in.lo = sha256_Transform(in, state1); + in.hi = pad4; + result.lo = swapvec(sha256_Transform(in, state2)); + in = pad3; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = nonce; + in.s4 = 4 * i + 4; + in.lo = sha256_Transform(in, state1); + in.hi = pad4; + result.hi = swapvec(sha256_Transform(in, state2)); + + + Bdev[i].hi = as_ulong8(shuffle(result)); + } + + //mixing1 + + prevstate[0] = Bdev[0]; + Bdev[0] = blockmix_salsa8_small2(Bdev[0]); + prevstate[1] = Bdev[0]; + Bdev[0] = blockmix_salsa8_small2(Bdev[0]); + + uint n = 1; +#pragma unroll 1 + for (uint i = 2; i < 64; i++) + { + + prevstate[i] = Bdev[0]; + + if ((i&(i - 1)) == 0) n = n << 1; + + uint j = as_uint2(Bdev[0].hi.s0).x & (n - 1); + + j += i - n; + Bdev[0] ^= prevstate[j]; + + Bdev[0] = blockmix_salsa8_small2(Bdev[0]); + } + + + for (int i = 0; i<8; i++) + hashbuffer[i] = Bdev[i]; + + blockmix_pwxform((__global ulong8*)Bdev, prevstate); + + + for (int i = 0; i<8; i++) + hashbuffer[i + 8] = Bdev[i]; + + blockmix_pwxform((__global ulong8*)Bdev, prevstate); + n = 1; +#pragma unroll 1 + for (int i = 2; i < 2048; i++) + { + + for (int k = 0; k<8; k++) + (hashbuffer + 8 * i)[k] = Bdev[k]; + + + if ((i&(i - 1)) == 0) n = n << 1; + + uint j = as_uint2(Bdev[7].hi.s0).x & (n - 1); + j += i - n; + + for (int k = 0; 
k < 8; k++) + Bdev[k] ^= (hashbuffer + 8 * j)[k]; + + + blockmix_pwxform((__global ulong8*)Bdev, prevstate); + } + + +#pragma unroll 1 + for (int z = 0; z < 684; z++) + { + + uint j = as_uint2(Bdev[7].hi.s0).x & 2047; + + + for (int k = 0; k < 8; k++) + Bdev[k] ^= (hashbuffer + 8 * j)[k]; + + if (z<682) + for (int k = 0; k<8; k++) + (hashbuffer + 8 * j)[k] = Bdev[k]; + + blockmix_pwxform((__global ulong8*)Bdev, prevstate); + //// + } + + + + uint8 swpass = swapvec(sha256tokeep); +// uint16 in; +// uint8 state1, state2; + in.lo = pad1.lo ^ swpass; + in.hi = pad1.hi; + + + state1 = sha256_Transform(in, H256); + + in.lo = pad2.lo ^ swpass; + in.hi = pad2.hi; + state2 = sha256_Transform(in, H256); + +#pragma unroll 1 + for (int i = 0; i<8; i++) { + in = unshuffle(Bdev[i].lo); + in = swapvec16(in); + state1 = sha256_Transform(in, state1); + in = unshuffle(Bdev[i].hi); + in = swapvec16(in); + state1 = sha256_Transform(in, state1); + } + in = pad5; + state1 = sha256_Transform(in, state1); + in.lo = state1; + in.hi = pad4; + uint8 res = sha256_Transform(in, state2); + + //hmac and final sha + + in.lo = pad1.lo ^ res; + in.hi = pad1.hi; + state1 = sha256_Transform(in, H256); + in.lo = pad2.lo ^ res; + in.hi = pad2.hi; + state2 = sha256_Transform(in, H256); + in = ((uint16*)data)[0]; + state1 = sha256_Transform(in, state1); + in = padsha80; + in.s0 = data[16]; + in.s1 = data[17]; + in.s2 = data[18]; + in.s3 = get_global_id(0); + in.sf = 0x480; + state1 = sha256_Transform(in, state1); + in.lo = state1; + in.hi = pad4; + state1 = sha256_Transform(in, state2); + // state2 = H256; + in.lo = state1; + in.hi = pad4; + in.sf = 0x100; + res = sha256_Transform(in, H256); + + + if (SWAP32(res.s7) <= (target)) + output[atomic_inc(output + 0xFF)] = (nonce); + +} + diff --git a/kernel/yescrypt_essential.cl b/kernel/yescrypt_essential.cl new file mode 100644 index 00000000..ba1816a8 --- /dev/null +++ b/kernel/yescrypt_essential.cl @@ -0,0 +1,760 @@ +/* +* "yescrypt" kernel implementation. +* +* ==========================(LICENSE BEGIN)============================ +* +* Copyright (c) 2015 djm34 +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+* +* ===========================(LICENSE END)============================= +* +* @author djm34 +*/ + +#define ROL32(x, n) rotate(x, (uint) n) +#define SWAP32(a) (as_uint(as_uchar4(a).wzyx)) +//#define ROL32(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) +#define HASH_MEMORY 4096 + + +#define SALSA(a,b,c,d) do { \ + t =a+d; b^=ROL32(t, 7U); \ + t =b+a; c^=ROL32(t, 9U); \ + t =c+b; d^=ROL32(t, 13U); \ + t =d+c; a^=ROL32(t, 18U); \ +} while(0) + + +#define SALSA_CORE(state) do { \ +\ +SALSA(state.s0,state.s4,state.s8,state.sc); \ +SALSA(state.s5,state.s9,state.sd,state.s1); \ +SALSA(state.sa,state.se,state.s2,state.s6); \ +SALSA(state.sf,state.s3,state.s7,state.sb); \ +SALSA(state.s0,state.s1,state.s2,state.s3); \ +SALSA(state.s5,state.s6,state.s7,state.s4); \ +SALSA(state.sa,state.sb,state.s8,state.s9); \ +SALSA(state.sf,state.sc,state.sd,state.se); \ + } while(0) + +#define uSALSA_CORE(state) do { \ +\ +SALSA(state.s0,state.s4,state.s8,state.sc); \ +SALSA(state.s1,state.s5,state.s9,state.sd); \ +SALSA(state.s2,state.s6,state.sa,state.se); \ +SALSA(state.s3,state.s7,state.sb,state.sf); \ +SALSA(state.s0,state.sd,state.sa,state.s7); \ +SALSA(state.s1,state.se,state.sb,state.s4); \ +SALSA(state.s2,state.sf,state.s8,state.s5); \ +SALSA(state.s3,state.sc,state.s9,state.s6); \ +} while(0) + + +#define unshuffle(state) (as_uint16(state).s0da741eb852fc963) + +#define shuffle(state) (as_uint16(state).s05af49e38d27c16b) + +static __constant uint16 pad1 = +{ + 0x36363636, 0x36363636, 0x36363636, 0x36363636, + 0x36363636, 0x36363636, 0x36363636, 0x36363636, + 0x36363636, 0x36363636, 0x36363636, 0x36363636, + 0x36363636, 0x36363636, 0x36363636, 0x36363636 +}; + +static __constant uint16 pad2 = +{ + 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c, + 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c, + 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c, + 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c +}; + +static __constant uint16 pad5 = +{ + 0x00000001, 0x80000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00002220 +}; + +static __constant uint16 pad3 = +{ + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x80000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x000004a0 +}; + +static __constant uint16 padsha80 = +{ + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x80000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000280 +}; + +static __constant uint8 pad4 = +{ + 0x80000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000300 +}; + + + +static __constant uint8 H256 = { + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, + 0xA54FF53A, 0x510E527F, 0x9B05688C, + 0x1F83D9AB, 0x5BE0CD19 +}; + +inline uint8 swapvec(uint8 buf) +{ + uint8 vec; + vec.s0 = SWAP32(buf.s0); + vec.s1 = SWAP32(buf.s1); + vec.s2 = SWAP32(buf.s2); + vec.s3 = SWAP32(buf.s3); + vec.s4 = SWAP32(buf.s4); + vec.s5 = SWAP32(buf.s5); + vec.s6 = SWAP32(buf.s6); + vec.s7 = SWAP32(buf.s7); + return vec; +} + + + +inline uint16 swapvec16(uint16 buf) +{ + uint16 vec; + vec.s0 = SWAP32(buf.s0); + vec.s1 = SWAP32(buf.s1); + vec.s2 = SWAP32(buf.s2); + vec.s3 = SWAP32(buf.s3); + vec.s4 = SWAP32(buf.s4); + vec.s5 = SWAP32(buf.s5); + vec.s6 = SWAP32(buf.s6); + vec.s7 = SWAP32(buf.s7); + vec.s8 = SWAP32(buf.s8); + vec.s9 = SWAP32(buf.s9); + vec.sa = 
SWAP32(buf.sa); + vec.sb = SWAP32(buf.sb); + vec.sc = SWAP32(buf.sc); + vec.sd = SWAP32(buf.sd); + vec.se = SWAP32(buf.se); + vec.sf = SWAP32(buf.sf); + return vec; +} + + ulong8 salsa20_8(uint16 Bx) +{ +uint t; + uint16 st = Bx; + uSALSA_CORE(st); + uSALSA_CORE(st); + uSALSA_CORE(st); + uSALSA_CORE(st); + return(as_ulong8(st + Bx)); +} + + ulong8 salsa20_8n(uint16 Bx) + { + uint t; + uint16 st = Bx; + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + return(as_ulong8(st + Bx)); + } + + + ulong16 blockmix_salsa8_small2(ulong16 Bin) +{ + ulong8 X = Bin.hi; + X ^= Bin.lo; + X = salsa20_8(as_uint16(X)); + Bin.lo = X; + X ^= Bin.hi; + X = salsa20_8(as_uint16(X)); + Bin.hi = X; + return(Bin); +} +/* + uint16 salsa20_8_2(uint16 Bx) + { + uint t; + uint16 st = Bx; + uSALSA_CORE(st); + uSALSA_CORE(st); + uSALSA_CORE(st); + uSALSA_CORE(st); + return(st + Bx); + } + + ulong16 blockmix_salsa8_small2(ulong16 Bin) + { + uint16 X = as_uint16(Bin.hi); + X ^= as_uint16(Bin.lo); + X = salsa20_8_2(as_uint16(X)); + Bin.lo = as_ulong8(X); + X ^= as_uint16(Bin.hi); + X = salsa20_8_2(as_uint16(X)); + Bin.hi = as_ulong8(X); + return(Bin); + } +*/ + + +inline ulong2 madd4long2(uint4 a, uint4 b) +{ + uint4 result; + result.x = a.x*a.y + b.x; + result.y = b.y + mad_hi(a.x, a.y, b.x); + result.z = a.z*a.w + b.z; + result.w = b.w + mad_hi(a.z, a.w, b.z); + return as_ulong2(result); +} + +inline ulong2 madd4long3(uint4 a, ulong2 b) +{ + ulong2 result; + result.x = (ulong)a.x*(ulong)a.y + b.x; + result.y = (ulong)a.z*(ulong)a.w + b.y; + return result; +} + + +inline ulong8 block_pwxform_long_old(ulong8 Bout, __global ulong16 *prevstate) +{ + + ulong2 vec = Bout.lo.lo; + + for (int i = 0; i < 6; i++) + { + ulong2 p0, p1; + uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF); + p0 = ((__global ulong2*)(prevstate ))[x.x]; + vec = madd4long3(as_uint4(vec), p0); + p1 = ((__global ulong2*)(prevstate + 32))[x.y]; + + vec ^= p1; + } + Bout.lo.lo = vec; + vec = Bout.lo.hi; + for (int i = 0; i < 6; i++) + { + + ulong2 p0, p1; + uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF); + p0 = ((__global ulong2*)(prevstate))[x.x]; + vec = madd4long3(as_uint4(vec), p0); + p1 = ((__global ulong2*)(prevstate + 32))[x.y]; + + vec ^= p1; + } + Bout.lo.hi = vec; + + vec = Bout.hi.lo; + for (int i = 0; i < 6; i++) + { + ulong2 p0, p1; + uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF); + p0 = ((__global ulong2*)(prevstate))[x.x]; + vec = madd4long3(as_uint4(vec), p0); + p1 = ((__global ulong2*)(prevstate + 32))[x.y]; + vec ^= p1; + } + Bout.hi.lo = vec; + vec = Bout.hi.hi; + for (int i = 0; i < 6; i++) + { + ulong2 p0, p1; + uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF); + p0 = ((__global ulong2*)(prevstate))[x.x]; + vec = madd4long3(as_uint4(vec), p0); + p1 = ((__global ulong2*)(prevstate + 32))[x.y]; + + vec ^= p1; + } + Bout.hi.hi = vec; + + return(Bout); +} + +inline ulong8 block_pwxform_long(ulong8 Bout, __global ulong2 *prevstate) +{ + + ulong2 vec = Bout.lo.lo; + + for (int i = 0; i < 6; i++) + { + ulong2 p0, p1; + uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF); + p0 = prevstate[x.x]; + vec = madd4long3(as_uint4(vec), p0); + p1 = (prevstate + 32*8)[x.y]; + + vec ^= p1; + } + Bout.lo.lo = vec; + vec = Bout.lo.hi; + for (int i = 0; i < 6; i++) + { + + ulong2 p0, p1; + uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF); + p0 = prevstate[x.x]; + vec = madd4long3(as_uint4(vec), p0); + p1 = (prevstate + 32 * 8)[x.y]; + + vec ^= p1; + } + Bout.lo.hi = vec; + + vec = Bout.hi.lo; + for (int i = 
0; i < 6; i++) + { + ulong2 p0, p1; + uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF); + p0 = prevstate[x.x]; + vec = madd4long3(as_uint4(vec), p0); + p1 = (prevstate + 32 * 8)[x.y]; + vec ^= p1; + } + Bout.hi.lo = vec; + vec = Bout.hi.hi; + for (int i = 0; i < 6; i++) + { + ulong2 p0, p1; + uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF); + p0 = prevstate[x.x]; + vec = madd4long3(as_uint4(vec), p0); + p1 = (prevstate + 32 * 8)[x.y]; + + vec ^= p1; + } + Bout.hi.hi = vec; + + return(Bout); +} + + + + +inline void blockmix_pwxform(__global ulong8 *Bin, __global ulong16 *prevstate) +{ + Bin[0] ^= Bin[15]; + Bin[0] = block_pwxform_long_old(Bin[0], prevstate); +#pragma unroll 1 + for (int i = 1; i < 16; i++) + { + Bin[i] ^= Bin[i - 1]; + Bin[i] = block_pwxform_long_old(Bin[i], prevstate); + } + Bin[15] = salsa20_8(as_uint16(Bin[15])); +} + +#define SHR(x, n) ((x) >> n) + + +#define S0(x) (ROL32(x, 25) ^ ROL32(x, 14) ^ SHR(x, 3)) +#define S1(x) (ROL32(x, 15) ^ ROL32(x, 13) ^ SHR(x, 10)) + +#define S2(x) (ROL32(x, 30) ^ ROL32(x, 19) ^ ROL32(x, 10)) +#define S3(x) (ROL32(x, 26) ^ ROL32(x, 21) ^ ROL32(x, 7)) + +#define P(a,b,c,d,e,f,g,h,x,K) \ +{ \ + temp1 = h + S3(e) + F1(e,f,g) + (K + x); \ + d += temp1; h = temp1 + S2(a) + F0(a,b,c); \ +} + +#define PLAST(a,b,c,d,e,f,g,h,x,K) \ +{ \ + d += h + S3(e) + F1(e,f,g) + (x + K); \ +} + +#define F0(y, x, z) bitselect(z, y, z ^ x) +#define F1(x, y, z) bitselect(z, y, x) + +#define R0 (W0 = S1(W14) + W9 + S0(W1) + W0) +#define R1 (W1 = S1(W15) + W10 + S0(W2) + W1) +#define R2 (W2 = S1(W0) + W11 + S0(W3) + W2) +#define R3 (W3 = S1(W1) + W12 + S0(W4) + W3) +#define R4 (W4 = S1(W2) + W13 + S0(W5) + W4) +#define R5 (W5 = S1(W3) + W14 + S0(W6) + W5) +#define R6 (W6 = S1(W4) + W15 + S0(W7) + W6) +#define R7 (W7 = S1(W5) + W0 + S0(W8) + W7) +#define R8 (W8 = S1(W6) + W1 + S0(W9) + W8) +#define R9 (W9 = S1(W7) + W2 + S0(W10) + W9) +#define R10 (W10 = S1(W8) + W3 + S0(W11) + W10) +#define R11 (W11 = S1(W9) + W4 + S0(W12) + W11) +#define R12 (W12 = S1(W10) + W5 + S0(W13) + W12) +#define R13 (W13 = S1(W11) + W6 + S0(W14) + W13) +#define R14 (W14 = S1(W12) + W7 + S0(W15) + W14) +#define R15 (W15 = S1(W13) + W8 + S0(W0) + W15) + +#define RD14 (S1(W12) + W7 + S0(W15) + W14) +#define RD15 (S1(W13) + W8 + S0(W0) + W15) + +/// generic sha transform +inline uint8 sha256_Transform(uint16 data, uint8 state) +{ +uint temp1; + uint8 res = state; + uint W0 = data.s0; + uint W1 = data.s1; + uint W2 = data.s2; + uint W3 = data.s3; + uint W4 = data.s4; + uint W5 = data.s5; + uint W6 = data.s6; + uint W7 = data.s7; + uint W8 = data.s8; + uint W9 = data.s9; + uint W10 = data.sA; + uint W11 = data.sB; + uint W12 = data.sC; + uint W13 = data.sD; + uint W14 = data.sE; + uint W15 = data.sF; + +#define v0 res.s0 +#define v1 res.s1 +#define v2 res.s2 +#define v3 res.s3 +#define v4 res.s4 +#define v5 res.s5 +#define v6 res.s6 +#define v7 res.s7 + + P(v0, v1, v2, v3, v4, v5, v6, v7, W0, 0x428A2F98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W1, 0x71374491); + P(v6, v7, v0, v1, v2, v3, v4, v5, W2, 0xB5C0FBCF); + P(v5, v6, v7, v0, v1, v2, v3, v4, W3, 0xE9B5DBA5); + P(v4, v5, v6, v7, v0, v1, v2, v3, W4, 0x3956C25B); + P(v3, v4, v5, v6, v7, v0, v1, v2, W5, 0x59F111F1); + P(v2, v3, v4, v5, v6, v7, v0, v1, W6, 0x923F82A4); + P(v1, v2, v3, v4, v5, v6, v7, v0, W7, 0xAB1C5ED5); + P(v0, v1, v2, v3, v4, v5, v6, v7, W8, 0xD807AA98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W9, 0x12835B01); + P(v6, v7, v0, v1, v2, v3, v4, v5, W10, 0x243185BE); + P(v5, v6, v7, v0, v1, v2, v3, v4, W11, 0x550C7DC3); + 
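+ /* the remaining rounds repeat this scheme: each P() invocation shifts the v0..v7 argument list by one position, unrolling the SHA-256 state rotation without moving data between variables */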
P(v4, v5, v6, v7, v0, v1, v2, v3, W12, 0x72BE5D74); + P(v3, v4, v5, v6, v7, v0, v1, v2, W13, 0x80DEB1FE); + P(v2, v3, v4, v5, v6, v7, v0, v1, W14, 0x9BDC06A7); + P(v1, v2, v3, v4, v5, v6, v7, v0, W15, 0xC19BF174); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0xE49B69C1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0xEFBE4786); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x0FC19DC6); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x240CA1CC); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x2DE92C6F); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4A7484AA); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5CB0A9DC); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x76F988DA); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x983E5152); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA831C66D); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xB00327C8); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xBF597FC7); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xC6E00BF3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD5A79147); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0x06CA6351); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x14292967); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x27B70A85); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x2E1B2138); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x4D2C6DFC); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x53380D13); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x650A7354); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x766A0ABB); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x81C2C92E); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x92722C85); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0xA2BFE8A1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA81A664B); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xC24B8B70); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xC76C51A3); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xD192E819); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD6990624); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0xF40E3585); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x106AA070); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x19A4C116); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x1E376C08); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x2748774C); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x34B0BCB5); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x391C0CB3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4ED8AA4A); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5B9CCA4F); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x682E6FF3); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x748F82EE); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0x78A5636F); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0x84C87814); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0x8CC70208); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0x90BEFFFA); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xA4506CEB); + P(v2, v3, v4, v5, v6, v7, v0, v1, RD14, 0xBEF9A3F7); + P(v1, v2, v3, v4, v5, v6, v7, v0, RD15, 0xC67178F2); +#undef v0 +#undef v1 +#undef v2 +#undef v3 +#undef v4 +#undef v5 +#undef v6 +#undef v7 + return (res+state); +} + + +static inline uint8 sha256_round1(uint16 data) +{ + uint temp1; + uint8 res; + uint W0 = data.s0; + uint W1 = data.s1; + uint W2 = data.s2; + uint W3 = data.s3; + uint W4 = data.s4; + uint W5 = data.s5; + uint W6 = data.s6; + uint W7 = data.s7; + uint W8 = data.s8; + uint W9 = data.s9; + uint W10 = data.sA; + uint W11 = data.sB; + uint W12 = data.sC; + uint W13 = data.sD; + uint W14 = data.sE; + uint W15 = data.sF; + + uint v0 = 0x6A09E667; + uint v1 = 0xBB67AE85; + uint v2 = 0x3C6EF372; + uint v3 = 0xA54FF53A; + uint v4 = 0x510E527F; + uint v5 = 0x9B05688C; + uint v6 = 0x1F83D9AB; + uint v7 = 0x5BE0CD19; + + P(v0, v1, v2, v3, v4, v5, v6, v7, 
W0, 0x428A2F98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W1, 0x71374491); + P(v6, v7, v0, v1, v2, v3, v4, v5, W2, 0xB5C0FBCF); + P(v5, v6, v7, v0, v1, v2, v3, v4, W3, 0xE9B5DBA5); + P(v4, v5, v6, v7, v0, v1, v2, v3, W4, 0x3956C25B); + P(v3, v4, v5, v6, v7, v0, v1, v2, W5, 0x59F111F1); + P(v2, v3, v4, v5, v6, v7, v0, v1, W6, 0x923F82A4); + P(v1, v2, v3, v4, v5, v6, v7, v0, W7, 0xAB1C5ED5); + P(v0, v1, v2, v3, v4, v5, v6, v7, W8, 0xD807AA98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W9, 0x12835B01); + P(v6, v7, v0, v1, v2, v3, v4, v5, W10, 0x243185BE); + P(v5, v6, v7, v0, v1, v2, v3, v4, W11, 0x550C7DC3); + P(v4, v5, v6, v7, v0, v1, v2, v3, W12, 0x72BE5D74); + P(v3, v4, v5, v6, v7, v0, v1, v2, W13, 0x80DEB1FE); + P(v2, v3, v4, v5, v6, v7, v0, v1, W14, 0x9BDC06A7); + P(v1, v2, v3, v4, v5, v6, v7, v0, W15, 0xC19BF174); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0xE49B69C1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0xEFBE4786); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x0FC19DC6); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x240CA1CC); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x2DE92C6F); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4A7484AA); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5CB0A9DC); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x76F988DA); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x983E5152); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA831C66D); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xB00327C8); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xBF597FC7); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xC6E00BF3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD5A79147); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0x06CA6351); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x14292967); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x27B70A85); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x2E1B2138); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x4D2C6DFC); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x53380D13); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x650A7354); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x766A0ABB); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x81C2C92E); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x92722C85); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0xA2BFE8A1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA81A664B); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xC24B8B70); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xC76C51A3); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xD192E819); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD6990624); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0xF40E3585); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x106AA070); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x19A4C116); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x1E376C08); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x2748774C); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x34B0BCB5); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x391C0CB3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4ED8AA4A); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5B9CCA4F); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x682E6FF3); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x748F82EE); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0x78A5636F); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0x84C87814); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0x8CC70208); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0x90BEFFFA); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xA4506CEB); + P(v2, v3, v4, v5, v6, v7, v0, v1, RD14, 0xBEF9A3F7); + P(v1, v2, v3, v4, v5, v6, v7, v0, RD15, 0xC67178F2); + + res.s0 = v0 + 0x6A09E667; + res.s1 = v1 + 0xBB67AE85; + res.s2 = v2 + 0x3C6EF372; + res.s3 = v3 + 0xA54FF53A; + res.s4 = v4 + 0x510E527F; + res.s5 = v5 + 
0x9B05688C; + res.s6 = v6 + 0x1F83D9AB; + res.s7 = v7 + 0x5BE0CD19; + return (res); +} + + +static inline uint8 sha256_round2(uint16 data,uint8 buf) +{ + uint temp1; + uint8 res; + uint W0 = data.s0; + uint W1 = data.s1; + uint W2 = data.s2; + uint W3 = data.s3; + uint W4 = data.s4; + uint W5 = data.s5; + uint W6 = data.s6; + uint W7 = data.s7; + uint W8 = data.s8; + uint W9 = data.s9; + uint W10 = data.sA; + uint W11 = data.sB; + uint W12 = data.sC; + uint W13 = data.sD; + uint W14 = data.sE; + uint W15 = data.sF; + + uint v0 = buf.s0; + uint v1 = buf.s1; + uint v2 = buf.s2; + uint v3 = buf.s3; + uint v4 = buf.s4; + uint v5 = buf.s5; + uint v6 = buf.s6; + uint v7 = buf.s7; + + P(v0, v1, v2, v3, v4, v5, v6, v7, W0, 0x428A2F98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W1, 0x71374491); + P(v6, v7, v0, v1, v2, v3, v4, v5, W2, 0xB5C0FBCF); + P(v5, v6, v7, v0, v1, v2, v3, v4, W3, 0xE9B5DBA5); + P(v4, v5, v6, v7, v0, v1, v2, v3, W4, 0x3956C25B); + P(v3, v4, v5, v6, v7, v0, v1, v2, W5, 0x59F111F1); + P(v2, v3, v4, v5, v6, v7, v0, v1, W6, 0x923F82A4); + P(v1, v2, v3, v4, v5, v6, v7, v0, W7, 0xAB1C5ED5); + P(v0, v1, v2, v3, v4, v5, v6, v7, W8, 0xD807AA98); + P(v7, v0, v1, v2, v3, v4, v5, v6, W9, 0x12835B01); + P(v6, v7, v0, v1, v2, v3, v4, v5, W10, 0x243185BE); + P(v5, v6, v7, v0, v1, v2, v3, v4, W11, 0x550C7DC3); + P(v4, v5, v6, v7, v0, v1, v2, v3, W12, 0x72BE5D74); + P(v3, v4, v5, v6, v7, v0, v1, v2, W13, 0x80DEB1FE); + P(v2, v3, v4, v5, v6, v7, v0, v1, W14, 0x9BDC06A7); + P(v1, v2, v3, v4, v5, v6, v7, v0, W15, 0xC19BF174); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0xE49B69C1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0xEFBE4786); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x0FC19DC6); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x240CA1CC); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x2DE92C6F); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4A7484AA); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5CB0A9DC); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x76F988DA); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x983E5152); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA831C66D); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xB00327C8); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xBF597FC7); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xC6E00BF3); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD5A79147); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0x06CA6351); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x14292967); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x27B70A85); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x2E1B2138); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x4D2C6DFC); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x53380D13); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x650A7354); + P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x766A0ABB); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x81C2C92E); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x92722C85); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0xA2BFE8A1); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA81A664B); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xC24B8B70); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xC76C51A3); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xD192E819); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD6990624); + P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0xF40E3585); + P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x106AA070); + + P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x19A4C116); + P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x1E376C08); + P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x2748774C); + P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x34B0BCB5); + P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x391C0CB3); + P(v3, v4, v5, v6, v7, v0, v1, v2, 
R5, 0x4ED8AA4A); + P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5B9CCA4F); + P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x682E6FF3); + P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x748F82EE); + P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0x78A5636F); + P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0x84C87814); + P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0x8CC70208); + P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0x90BEFFFA); + P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xA4506CEB); + P(v2, v3, v4, v5, v6, v7, v0, v1, RD14, 0xBEF9A3F7); + P(v1, v2, v3, v4, v5, v6, v7, v0, RD15, 0xC67178F2); + + res.s0 = (v0 + buf.s0); + res.s1 = (v1 + buf.s1); + res.s2 = (v2 + buf.s2); + res.s3 = (v3 + buf.s3); + res.s4 = (v4 + buf.s4); + res.s5 = (v5 + buf.s5); + res.s6 = (v6 + buf.s6); + res.s7 = (v7 + buf.s7); + return (res); +} + +static inline uint8 sha256_80(uint* data,uint nonce) +{ + +uint8 buf = sha256_round1( ((uint16*)data)[0]); +uint16 in = padsha80; +in.s0 = data[16]; +in.s1 = data[17]; +in.s2 = data[18]; +in.s3 = nonce; + +return(sha256_round2(in,buf)); +} + diff --git a/miner.h b/miner.h index 45985779..ecaa6a13 100644 --- a/miner.h +++ b/miner.h @@ -733,6 +733,17 @@ static inline void flip128(void *dest_p, const void *src_p) dest[i] = swab32(src[i]); } +static inline void flip168(void *dest_p, const void *src_p) +{ + uint32_t *dest = (uint32_t *)dest_p; + const uint32_t *src = (uint32_t *)src_p; + int i; + + for (i = 0; i < 42; i++) + dest[i] = swab32(src[i]); +} + + /* For flipping to the correct endianness if necessary */ #if defined(__BIG_ENDIAN__) || defined(MIPSEB) static inline void endian_flip32(void *dest_p, const void *src_p) @@ -744,6 +755,11 @@ static inline void endian_flip128(void *dest_p, const void *src_p) { flip128(dest_p, src_p); } +static inline void endian_flip168(void *dest_p, const void *src_p) +{ + flip168(dest_p, src_p); +} + #else static inline void endian_flip32(void __maybe_unused *dest_p, const void __maybe_unused *src_p) @@ -754,8 +770,13 @@ static inline void endian_flip128(void __maybe_unused *dest_p, const void __maybe_unused *src_p) { } +static inline void +endian_flip168(void __maybe_unused *dest_p, const void __maybe_unused *src_p) +{ +} #endif + extern double cgpu_runtime(struct cgpu_info *cgpu); extern void _quit(int status); @@ -1146,8 +1167,8 @@ extern bool add_pool_details(struct pool *pool, bool live, char *url, char *user #define MAX_GPUDEVICES 16 #define MAX_DEVICES 4096 -#define MIN_INTENSITY 8 -#define MIN_INTENSITY_STR "8" +#define MIN_INTENSITY 4 +#define MIN_INTENSITY_STR "4" #define MAX_INTENSITY 31 #define MAX_INTENSITY_STR "31" #define MIN_XINTENSITY 1 @@ -1416,7 +1437,7 @@ struct pool { #define GETWORK_MODE_GBT 'G' struct work { - unsigned char data[128]; + unsigned char data[168]; unsigned char midstate[32]; unsigned char target[32]; unsigned char hash[32]; diff --git a/ocl.c b/ocl.c index 65e34e75..bd8a1527 100644 --- a/ocl.c +++ b/ocl.c @@ -36,6 +36,8 @@ #include "ocl/binary_kernel.h" #include "algorithm/neoscrypt.h" #include "algorithm/pluck.h" +#include "algorithm/yescrypt.h" +#include "algorithm/lyra2re.h" /* FIXME: only here for global config vars, replace with configuration.h * or similar as soon as config is in a struct instead of littered all @@ -414,8 +416,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg } - /////////////////////////////////// pluck - // neoscrypt TC + // pluck TC else if (!safe_cmp(cgpu->algorithm.name, "pluck") && !cgpu->opt_tc) { size_t glob_thread_count; long max_int; @@ -497,7 +498,175 @@ _clState *initCl(unsigned 
int gpu, char *name, size_t nameSize, algorithm_t *alg cgpu->thread_concurrency = glob_thread_count; applog(LOG_DEBUG, "GPU %d: computing max. global thread count to %u", gpu, (unsigned)(cgpu->thread_concurrency)); + } + + // Yescrypt TC + else if ((!safe_cmp(cgpu->algorithm.name, "yescrypt") || + !safe_cmp(algorithm->name, "yescrypt-multi")) && !cgpu->opt_tc) { + size_t glob_thread_count; + long max_int; + unsigned char type = 0; + + // determine which intensity type to use + if (cgpu->rawintensity > 0) { + glob_thread_count = cgpu->rawintensity; + max_int = glob_thread_count; + type = 2; + } + else if (cgpu->xintensity > 0) { + glob_thread_count = clState->compute_shaders * ((cgpu->algorithm.xintensity_shift) ? (1UL << (cgpu->algorithm.xintensity_shift + cgpu->xintensity)) : cgpu->xintensity); + max_int = cgpu->xintensity; + type = 1; + } + else { + glob_thread_count = 1UL << (cgpu->algorithm.intensity_shift + cgpu->intensity); + max_int = ((cgpu->dynamic) ? MAX_INTENSITY : cgpu->intensity); + } + glob_thread_count = ((glob_thread_count < cgpu->work_size) ? cgpu->work_size : glob_thread_count); + + // if TC * scratchbuf size is too big for memory... reduce to max + if ((glob_thread_count * YESCRYPT_SCRATCHBUF_SIZE) >= (uint64_t)cgpu->max_alloc) { + + /* Selected intensity will not run on this GPU. Not enough memory. + * Adapt the memory setting. */ + // depending on intensity type used, reduce the intensity until it fits into the GPU max_alloc + switch (type) { + //raw intensity + case 2: + while ((glob_thread_count * YESCRYPT_SCRATCHBUF_SIZE) > (uint64_t)cgpu->max_alloc) { + --glob_thread_count; + } + + max_int = glob_thread_count; + cgpu->rawintensity = glob_thread_count; + break; + + //x intensity + case 1: + glob_thread_count = cgpu->max_alloc / YESCRYPT_SCRATCHBUF_SIZE; + max_int = glob_thread_count / clState->compute_shaders; + + while (max_int && ((clState->compute_shaders * (1UL << max_int)) > glob_thread_count)) { + --max_int; + } + + /* Check if max_intensity is >0. */ + if (max_int < MIN_XINTENSITY) { + applog(LOG_ERR, "GPU %d: Max xintensity is below minimum.", gpu); + max_int = MIN_XINTENSITY; + } + + cgpu->xintensity = max_int; + glob_thread_count = clState->compute_shaders * (1UL << max_int); + break; + + default: + glob_thread_count = cgpu->max_alloc / YESCRYPT_SCRATCHBUF_SIZE; + while (max_int && ((1UL << max_int) & glob_thread_count) == 0) { + --max_int; + } + + /* Check if max_intensity is >0. */ + if (max_int < MIN_INTENSITY) { + applog(LOG_ERR, "GPU %d: Max intensity is below minimum.", gpu); + max_int = MIN_INTENSITY; + } + + cgpu->intensity = max_int; + glob_thread_count = 1UL << max_int; + break; + } + } + + // TC is glob thread count + cgpu->thread_concurrency = glob_thread_count; + + applog(LOG_DEBUG, "GPU %d: computing max. global thread count to %u", gpu, (unsigned)(cgpu->thread_concurrency)); + } + + // Lyra2re v2 TC + else if ( !safe_cmp(cgpu->algorithm.name, "lyra2REv2") ) { + size_t glob_thread_count; + long max_int; + unsigned char type = 0; + + // determine which intensity type to use + if (cgpu->rawintensity > 0) { + glob_thread_count = cgpu->rawintensity; + max_int = glob_thread_count; + type = 2; + } + else if (cgpu->xintensity > 0) { + glob_thread_count = clState->compute_shaders * ((cgpu->algorithm.xintensity_shift) ? 
(1UL << (cgpu->algorithm.xintensity_shift + cgpu->xintensity)) : cgpu->xintensity); + max_int = cgpu->xintensity; + type = 1; + } + else { + glob_thread_count = 1UL << (cgpu->algorithm.intensity_shift + cgpu->intensity); + max_int = ((cgpu->dynamic) ? MAX_INTENSITY : cgpu->intensity); + } + + glob_thread_count = ((glob_thread_count < cgpu->work_size) ? cgpu->work_size : glob_thread_count); + + // if TC * scratchbuf size is too big for memory... reduce to max + if ((glob_thread_count * LYRA_SCRATCHBUF_SIZE) >= (uint64_t)cgpu->max_alloc) { + + /* Selected intensity will not run on this GPU. Not enough memory. + * Adapt the memory setting. */ + // depending on intensity type used, reduce the intensity until it fits into the GPU max_alloc + switch (type) { + //raw intensity + case 2: + while ((glob_thread_count * LYRA_SCRATCHBUF_SIZE) > (uint64_t)cgpu->max_alloc) { + --glob_thread_count; + } + + max_int = glob_thread_count; + cgpu->rawintensity = glob_thread_count; + break; + + //x intensity + case 1: + glob_thread_count = cgpu->max_alloc / LYRA_SCRATCHBUF_SIZE; + max_int = glob_thread_count / clState->compute_shaders; + + while (max_int && ((clState->compute_shaders * (1UL << max_int)) > glob_thread_count)) { + --max_int; + } + + /* Check if max_intensity is >0. */ + if (max_int < MIN_XINTENSITY) { + applog(LOG_ERR, "GPU %d: Max xintensity is below minimum.", gpu); + max_int = MIN_XINTENSITY; + } + + cgpu->xintensity = max_int; + glob_thread_count = clState->compute_shaders * (1UL << max_int); + break; + + default: + glob_thread_count = cgpu->max_alloc / LYRA_SCRATCHBUF_SIZE; + while (max_int && ((1UL << max_int) & glob_thread_count) == 0) { + --max_int; + } + + /* Check if max_intensity is >0. */ + if (max_int < MIN_INTENSITY) { + applog(LOG_ERR, "GPU %d: Max intensity is below minimum.", gpu); + max_int = MIN_INTENSITY; + } + + cgpu->intensity = max_int; + glob_thread_count = 1UL << max_int; + break; + } + } + + // TC is glob thread count + cgpu->thread_concurrency = glob_thread_count; + + applog(LOG_DEBUG, "GPU %d: computing max. global thread count to %u", gpu, (unsigned)(cgpu->thread_concurrency)); } else if (!cgpu->opt_tc) { unsigned int sixtyfours; @@ -586,7 +755,10 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg } size_t bufsize; - size_t readbufsize = 128; + size_t buf1size; + size_t buf3size; + size_t buf2size; + size_t readbufsize = (!safe_cmp(algorithm->name, "credits")) ? 168 : 128; if (algorithm->rw_buffer_size < 0) { // calc buffer size for neoscrypt @@ -612,6 +784,31 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg applog(LOG_DEBUG, "pluck buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize); // scrypt/n-scrypt } + else if (!safe_cmp(algorithm->name, "yescrypt") || !safe_cmp(algorithm->name, "yescrypt-multi")) { + /* The scratch/pad-buffer needs 32kBytes memory per thread. */ + bufsize = YESCRYPT_SCRATCHBUF_SIZE * cgpu->thread_concurrency; + buf1size = PLUCK_SECBUF_SIZE * cgpu->thread_concurrency; + buf2size = 128 * 8 * 8 * cgpu->thread_concurrency; + buf3size= 8 * 8 * 4 * cgpu->thread_concurrency; + /* This is the input buffer. For yescrypt this is guaranteed to be + * 80 bytes only. */ + readbufsize = 80; + + applog(LOG_DEBUG, "yescrypt buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize); + // scrypt/n-scrypt + } + else if (!safe_cmp(algorithm->name, "lyra2REv2") ) { + /* The scratch/pad-buffer needs 32kBytes memory per thread. 
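+ * (that figure is carried over from the earlier scratchpad branches; the allocation below actually reserves LYRA_SCRATCHBUF_SIZE bytes per thread)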
*/ + bufsize = LYRA_SCRATCHBUF_SIZE * cgpu->thread_concurrency; + buf1size = 4 * 8 * cgpu->thread_concurrency; // matrix + + /* This is the input buffer. For lyra2REv2 this is guaranteed to be + * 80 bytes only. */ + readbufsize = 80; + + applog(LOG_DEBUG, "lyra2REv2 buffer sizes: %lu RW, %lu RW", (unsigned long)bufsize, (unsigned long)buf1size); + // scrypt/n-scrypt + } else { size_t ipt = (algorithm->n / cgpu->lookup_gap + (algorithm->n % cgpu->lookup_gap > 0)); bufsize = 128 * ipt * cgpu->thread_concurrency; } @@ -624,6 +821,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg } clState->padbuffer8 = NULL; + clState->buffer1 = NULL; + clState->buffer2 = NULL; + clState->buffer3 = NULL; if (bufsize > 0) { applog(LOG_DEBUG, "Creating read/write buffer sized %lu", (unsigned long)bufsize); @@ -635,6 +835,42 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg applog(LOG_WARNING, "Your settings come to %lu", (unsigned long)bufsize); } + if (!safe_cmp(algorithm->name, "yescrypt") || !safe_cmp(algorithm->name, "yescrypt-multi")) { + // need additional buffers + clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf1size, NULL, &status); + if (status != CL_SUCCESS && !clState->buffer1) { + applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status); + return NULL; + } + + clState->buffer2 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf2size, NULL, &status); + if (status != CL_SUCCESS && !clState->buffer2) { + applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer2), decrease TC or increase LG", status); + return NULL; + } + + clState->buffer3 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf3size, NULL, &status); + if (status != CL_SUCCESS && !clState->buffer3) { + applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer3), decrease TC or increase LG", status); + return NULL; + } + } + else if (!safe_cmp(algorithm->name, "lyra2REv2") ) { + // need additional buffers + clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf1size, NULL, &status); + if (status != CL_SUCCESS && !clState->buffer1) { + applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status); + return NULL; + } + } + else { + clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status); // we don't need that much; placeholder sizing + if (status != CL_SUCCESS && !clState->buffer1) { + applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status); + return NULL; + } + } + /* This buffer is weird and might work to some degree even if * the create buffer call has apparently failed, so check if we * get anything back before we call it a failure.
*/ diff --git a/ocl.h b/ocl.h index 0950d2c0..8d6467f1 100644 --- a/ocl.h +++ b/ocl.h @@ -23,7 +23,10 @@ typedef struct __clState { cl_mem CLbuffer0; cl_mem MidstateBuf; cl_mem padbuffer8; - unsigned char cldata[80]; + cl_mem buffer1; + cl_mem buffer2; + cl_mem buffer3; + unsigned char cldata[168]; bool goffset; cl_uint vwidth; size_t max_work_size; diff --git a/sgminer.c b/sgminer.c index f9e3f096..0c3fed0f 100644 --- a/sgminer.c +++ b/sgminer.c @@ -1919,6 +1919,7 @@ static void calc_midstate(struct work *work) endian_flip32(work->midstate, work->midstate); } + static struct work *make_work(void) { struct work *w = (struct work *)calloc(1, sizeof(struct work)); @@ -2260,7 +2261,9 @@ static bool gbt_decode(struct pool *pool, json_t *res_val) static bool getwork_decode(json_t *res_val, struct work *work) { - if (unlikely(!jobj_binary(res_val, "data", work->data, sizeof(work->data), true))) { + size_t worklen = 128; + worklen = ((!safe_cmp(work->pool->algorithm.name, "credits")) ? sizeof(work->data) : worklen); + if (unlikely(!jobj_binary(res_val, "data", work->data, worklen, true))) { if (opt_morenotices) applog(LOG_ERR, "%s: JSON inval data", isnull(get_pool_name(work->pool), "")); return false; @@ -3018,10 +3021,17 @@ static bool submit_upstream_work(struct work *work, CURL *curl, char *curl_err_s cgpu = get_thr_cgpu(thr_id); - endian_flip128(work->data, work->data); + if (safe_cmp(work->pool->algorithm.name, "credits")) { + endian_flip128(work->data, work->data); + } else { + endian_flip168(work->data, work->data); + } /* build hex string - Make sure to restrict to 80 bytes for Neoscrypt */ - hexstr = bin2hex(work->data, ((!safe_cmp(work->pool->algorithm.name, "neoscrypt")) ? 80 : sizeof(work->data))); + int datasize = 128; + if (!safe_cmp(work->pool->algorithm.name, "neoscrypt")) datasize = 80; + else if (!safe_cmp(work->pool->algorithm.name, "credits")) datasize = 168; + hexstr = bin2hex(work->data, datasize); /* build JSON-RPC request */ if (work->gbt) { @@ -7060,7 +7070,10 @@ void inc_hw_errors(struct thr_info *thr) /* Fills in the work nonce and builds the output data in work->hash */ static void rebuild_nonce(struct work *work, uint32_t nonce) { - uint32_t *work_nonce = (uint32_t *)(work->data + 76); + uint32_t nonce_pos = 76; + if (!safe_cmp(work->pool->algorithm.name, "credits")) nonce_pos = 140; + + uint32_t *work_nonce = (uint32_t *)(work->data + nonce_pos); *work_nonce = htole32(nonce); @@ -7076,7 +7089,10 @@ bool test_nonce(struct work *work, uint32_t nonce) rebuild_nonce(work, nonce); // for Neoscrypt, the diff1targ value is in work->target - if (!safe_cmp(work->pool->algorithm.name, "neoscrypt") || !safe_cmp(work->pool->algorithm.name, "pluck")) { + if (!safe_cmp(work->pool->algorithm.name, "neoscrypt") || !safe_cmp(work->pool->algorithm.name, "pluck") + || !safe_cmp(work->pool->algorithm.name, "yescrypt") + || !safe_cmp(work->pool->algorithm.name, "yescrypt-multi") + ) { diff1targ = ((uint32_t *)work->target)[7]; } else { diff --git a/sph/Makefile.am b/sph/Makefile.am index d80e438a..bc2f4b23 100644 --- a/sph/Makefile.am +++ b/sph/Makefile.am @@ -1,3 +1,3 @@ noinst_LIBRARIES = libsph.a -libsph_a_SOURCES = bmw.c echo.c jh.c luffa.c simd.c blake.c cubehash.c groestl.c keccak.c shavite.c skein.c sha2.c sha2big.c fugue.c hamsi.c panama.c shabal.c whirlpool.c +libsph_a_SOURCES = bmw.c echo.c jh.c luffa.c simd.c blake.c cubehash.c groestl.c keccak.c shavite.c skein.c sha2.c sha2big.c fugue.c hamsi.c panama.c shabal.c whirlpool.c sha256_Y.c diff --git a/sph/sha256_Y.c 
b/sph/sha256_Y.c new file mode 100644 index 00000000..a5d786d3 --- /dev/null +++ b/sph/sha256_Y.c @@ -0,0 +1,418 @@ +/*- + * Copyright 2005,2007,2009 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/types.h> + +#include <stdint.h> +#include <string.h> + +#include "algorithm/sysendian.h" + +#include "sph/sha256_Y.h" + +/* + * Encode a length len/4 vector of (uint32_t) into a length len vector of + * (unsigned char) in big-endian form. Assumes len is a multiple of 4. + */ +static void +be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 4; i++) + be32enc(dst + i * 4, src[i]); +} + +/* + * Decode a big-endian length len vector of (unsigned char) into a length + * len/4 vector of (uint32_t). Assumes len is a multiple of 4. + */ +static void +be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 4; i++) + dst[i] = be32dec(src + i * 4); +} + +/* Elementary functions used by SHA256 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define SHR(x, n) (x >> n) +#define ROTR(x, n) ((x >> n) | (x << (32 - n))) +#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)) + +/* SHA256 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + t0 = h + S1(e) + Ch(e, f, g) + k; \ + t1 = S0(a) + Maj(a, b, c); \ + d += t0; \ + h = t0 + t1; + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i, k) \ + RND(S[(64 - i) % 8], S[(65 - i) % 8], \ + S[(66 - i) % 8], S[(67 - i) % 8], \ + S[(68 - i) % 8], S[(69 - i) % 8], \ + S[(70 - i) % 8], S[(71 - i) % 8], \ + W[i] + k) + +/* + * SHA256 block compression function. The 256-bit state is transformed via + * the 512-bit input block to produce a new state. + */ +static void +SHA256_Transform(uint32_t * state, const unsigned char block[64]) +{ + uint32_t W[64]; + uint32_t S[8]; + uint32_t t0, t1; + int i; + /* 1. Prepare message schedule W. */ + be32dec_vect(W, block, 64); + + for (i = 16; i < 64; i++) + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + + /* 2.
+
+/*
+ * SHA256 block compression function. The 256-bit state is transformed via
+ * the 512-bit input block to produce a new state.
+ */
+static void
+SHA256_Transform(uint32_t * state, const unsigned char block[64])
+{
+    uint32_t W[64];
+    uint32_t S[8];
+    uint32_t t0, t1;
+    int i;
+
+    /* 1. Prepare message schedule W. */
+    be32dec_vect(W, block, 64);
+    for (i = 16; i < 64; i++)
+        W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
+
+    /* 2. Initialize working variables. */
+    memcpy(S, state, 32);
+
+    /* 3. Mix. */
+    RNDr(S, W, 0, 0x428a2f98);
+    RNDr(S, W, 1, 0x71374491);
+    RNDr(S, W, 2, 0xb5c0fbcf);
+    RNDr(S, W, 3, 0xe9b5dba5);
+    RNDr(S, W, 4, 0x3956c25b);
+    RNDr(S, W, 5, 0x59f111f1);
+    RNDr(S, W, 6, 0x923f82a4);
+    RNDr(S, W, 7, 0xab1c5ed5);
+    RNDr(S, W, 8, 0xd807aa98);
+    RNDr(S, W, 9, 0x12835b01);
+    RNDr(S, W, 10, 0x243185be);
+    RNDr(S, W, 11, 0x550c7dc3);
+    RNDr(S, W, 12, 0x72be5d74);
+    RNDr(S, W, 13, 0x80deb1fe);
+    RNDr(S, W, 14, 0x9bdc06a7);
+    RNDr(S, W, 15, 0xc19bf174);
+    RNDr(S, W, 16, 0xe49b69c1);
+    RNDr(S, W, 17, 0xefbe4786);
+    RNDr(S, W, 18, 0x0fc19dc6);
+    RNDr(S, W, 19, 0x240ca1cc);
+    RNDr(S, W, 20, 0x2de92c6f);
+    RNDr(S, W, 21, 0x4a7484aa);
+    RNDr(S, W, 22, 0x5cb0a9dc);
+    RNDr(S, W, 23, 0x76f988da);
+    RNDr(S, W, 24, 0x983e5152);
+    RNDr(S, W, 25, 0xa831c66d);
+    RNDr(S, W, 26, 0xb00327c8);
+    RNDr(S, W, 27, 0xbf597fc7);
+    RNDr(S, W, 28, 0xc6e00bf3);
+    RNDr(S, W, 29, 0xd5a79147);
+    RNDr(S, W, 30, 0x06ca6351);
+    RNDr(S, W, 31, 0x14292967);
+    RNDr(S, W, 32, 0x27b70a85);
+    RNDr(S, W, 33, 0x2e1b2138);
+    RNDr(S, W, 34, 0x4d2c6dfc);
+    RNDr(S, W, 35, 0x53380d13);
+    RNDr(S, W, 36, 0x650a7354);
+    RNDr(S, W, 37, 0x766a0abb);
+    RNDr(S, W, 38, 0x81c2c92e);
+    RNDr(S, W, 39, 0x92722c85);
+    RNDr(S, W, 40, 0xa2bfe8a1);
+    RNDr(S, W, 41, 0xa81a664b);
+    RNDr(S, W, 42, 0xc24b8b70);
+    RNDr(S, W, 43, 0xc76c51a3);
+    RNDr(S, W, 44, 0xd192e819);
+    RNDr(S, W, 45, 0xd6990624);
+    RNDr(S, W, 46, 0xf40e3585);
+    RNDr(S, W, 47, 0x106aa070);
+    RNDr(S, W, 48, 0x19a4c116);
+    RNDr(S, W, 49, 0x1e376c08);
+    RNDr(S, W, 50, 0x2748774c);
+    RNDr(S, W, 51, 0x34b0bcb5);
+    RNDr(S, W, 52, 0x391c0cb3);
+    RNDr(S, W, 53, 0x4ed8aa4a);
+    RNDr(S, W, 54, 0x5b9cca4f);
+    RNDr(S, W, 55, 0x682e6ff3);
+    RNDr(S, W, 56, 0x748f82ee);
+    RNDr(S, W, 57, 0x78a5636f);
+    RNDr(S, W, 58, 0x84c87814);
+    RNDr(S, W, 59, 0x8cc70208);
+    RNDr(S, W, 60, 0x90befffa);
+    RNDr(S, W, 61, 0xa4506ceb);
+    RNDr(S, W, 62, 0xbef9a3f7);
+    RNDr(S, W, 63, 0xc67178f2);
+
+    /* 4. Mix local working variables into global state */
+    for (i = 0; i < 8; i++) {
+        state[i] += S[i];
+    }
+
+    /* Clean the stack. */
+    memset(W, 0, 256);
+    memset(S, 0, 32);
+    t0 = t1 = 0;
+}
+
+static unsigned char PAD[64] = {
+    0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* Add padding and terminating bit-count. */
+static void
+SHA256_Pad(SHA256_CTX_Y * ctx)
+{
+    unsigned char len[8];
+    uint32_t r, plen;
+
+    /*
+     * Convert length to a vector of bytes -- we do this now rather
+     * than later because the length will change after we pad.
+     */
+    be32enc_vect(len, ctx->count, 8);
+
+    /* Add 1--64 bytes so that the resulting length is 56 mod 64 */
+    r = (ctx->count[1] >> 3) & 0x3f;
+    plen = (r < 56) ? (56 - r) : (120 - r);
+    SHA256_Update_Y(ctx, PAD, (size_t)plen);
+
+    /* Add the terminating bit-count */
+    SHA256_Update_Y(ctx, len, 8);
+}
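/*
 * Illustrative worked example, not part of the original patch:
 * SHA256_Pad() pads the buffered data to 56 mod 64 bytes so that the
 * 8-byte bit count lands exactly on a block boundary. With r bytes
 * buffered, plen = (r < 56) ? 56 - r : 120 - r:
 *
 *   r = 3  ("abc") -> plen = 53:  3 + 53 + 8 = 64   (one block)
 *   r = 60         -> plen = 60: 60 + 60 + 8 = 128  (two blocks)
 */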
+
+/* SHA-256 initialization. Begins a SHA-256 operation. */
+void
+SHA256_Init_Y(SHA256_CTX_Y * ctx)
+{
+    /* Zero bits processed so far */
+    ctx->count[0] = ctx->count[1] = 0;
+
+    /* Magic initialization constants */
+    ctx->state[0] = 0x6A09E667;
+    ctx->state[1] = 0xBB67AE85;
+    ctx->state[2] = 0x3C6EF372;
+    ctx->state[3] = 0xA54FF53A;
+    ctx->state[4] = 0x510E527F;
+    ctx->state[5] = 0x9B05688C;
+    ctx->state[6] = 0x1F83D9AB;
+    ctx->state[7] = 0x5BE0CD19;
+}
+
+/* Add bytes into the hash */
+void
+SHA256_Update_Y(SHA256_CTX_Y * ctx, const void *in, size_t len)
+{
+    uint32_t bitlen[2];
+    uint32_t r;
+    const unsigned char *src = in;
+
+    /* Number of bytes left in the buffer from previous updates */
+    r = (ctx->count[1] >> 3) & 0x3f;
+
+    /* Convert the length into a number of bits */
+    bitlen[1] = ((uint32_t)len) << 3;
+    bitlen[0] = (uint32_t)(len >> 29);
+
+    /* Update number of bits */
+    if ((ctx->count[1] += bitlen[1]) < bitlen[1])
+        ctx->count[0]++;
+    ctx->count[0] += bitlen[0];
+
+    /* Handle the case where we don't need to perform any transforms */
+    if (len < 64 - r) {
+        memcpy(&ctx->buf[r], src, len);
+        return;
+    }
+
+    /* Finish the current block */
+    memcpy(&ctx->buf[r], src, 64 - r);
+    SHA256_Transform(ctx->state, ctx->buf);
+    src += 64 - r;
+    len -= 64 - r;
+
+    /* Perform complete blocks */
+    while (len >= 64) {
+        SHA256_Transform(ctx->state, src);
+        src += 64;
+        len -= 64;
+    }
+
+    /* Copy left over data into buffer */
+    memcpy(ctx->buf, src, len);
+}
+
+/*
+ * SHA-256 finalization. Pads the input data, exports the hash value,
+ * and clears the context state.
+ */
+void
+SHA256_Final_Y(unsigned char digest[32], SHA256_CTX_Y * ctx)
+{
+    /* Add padding */
+    SHA256_Pad(ctx);
+
+    /* Write the hash */
+    be32enc_vect(digest, ctx->state, 32);
+
+    /* Clear the context state */
+    memset((void *)ctx, 0, sizeof(*ctx));
+}
+
+/* Initialize an HMAC-SHA256 operation with the given key. */
+void
+HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y * ctx, const void * _K, size_t Klen)
+{
+    unsigned char pad[64];
+    unsigned char khash[32];
+    const unsigned char * K = _K;
+    size_t i;
+
+    /* If Klen > 64, the key is really SHA256(K). */
+    if (Klen > 64) {
+        SHA256_Init_Y(&ctx->ictx);
+        SHA256_Update_Y(&ctx->ictx, K, Klen);
+        SHA256_Final_Y(khash, &ctx->ictx);
+        K = khash;
+        Klen = 32;
+    }
+
+    /* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
+    SHA256_Init_Y(&ctx->ictx);
+    memset(pad, 0x36, 64);
+    for (i = 0; i < Klen; i++) {
+        pad[i] ^= K[i];
+    }
+    SHA256_Update_Y(&ctx->ictx, pad, 64);
+
+    /* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
+    SHA256_Init_Y(&ctx->octx);
+    memset(pad, 0x5c, 64);
+    for (i = 0; i < Klen; i++) {
+        pad[i] ^= K[i];
+    }
+    SHA256_Update_Y(&ctx->octx, pad, 64);
+
+    /* Clean the stack. */
+    memset(khash, 0, 32);
+}
+
+/* Add bytes to the HMAC-SHA256 operation. */
+void
+HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y * ctx, const void *in, size_t len)
+{
+    /* Feed data to the inner SHA256 operation. */
+    SHA256_Update_Y(&ctx->ictx, in, len);
+}
+
+/* Finish an HMAC-SHA256 operation. */
+void
+HMAC_SHA256_Final_Y(unsigned char digest[32], HMAC_SHA256_CTX_Y * ctx)
+{
+    unsigned char ihash[32];
+
+    /* Finish the inner SHA256 operation. */
+    SHA256_Final_Y(ihash, &ctx->ictx);
+
+    /* Feed the inner hash to the outer SHA256 operation. */
+    SHA256_Update_Y(&ctx->octx, ihash, 32);
+
+    /* Finish the outer SHA256 operation. */
+    SHA256_Final_Y(digest, &ctx->octx);
+
+    /* Clean the stack. */
+    memset(ihash, 0, 32);
+}
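/*
 * Illustrative usage sketch, not part of the original patch: the
 * intended Init/Update/Final call sequence for one HMAC-SHA256 value.
 * Key and message here are arbitrary example data.
 */
static void example_hmac_sha256(void)
{
    const unsigned char key[] = "example-key";     /* hypothetical */
    const unsigned char msg[] = "example-message"; /* hypothetical */
    unsigned char mac[32];
    HMAC_SHA256_CTX_Y ctx;

    HMAC_SHA256_Init_Y(&ctx, key, sizeof(key) - 1);
    HMAC_SHA256_Update_Y(&ctx, msg, sizeof(msg) - 1);
    HMAC_SHA256_Final_Y(mac, &ctx);
    /* mac now holds HMAC-SHA256(key, msg) */
}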
+
+/**
+ * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
+ * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
+ * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
+ */
+void
+PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
+    size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
+{
+    HMAC_SHA256_CTX_Y PShctx, hctx;
+    size_t i;
+    uint8_t ivec[4];
+    uint8_t U[32];
+    uint8_t T[32];
+    uint64_t j;
+    int k;
+    size_t clen;
+
+    /* Compute HMAC state after processing P and S. */
+    HMAC_SHA256_Init_Y(&PShctx, passwd, passwdlen);
+    HMAC_SHA256_Update_Y(&PShctx, salt, saltlen);
+
+    /* Iterate through the blocks. */
+    for (i = 0; i * 32 < dkLen; i++) {
+        /* Generate INT(i + 1). */
+        be32enc(ivec, (uint32_t)(i + 1));
+
+        /* Compute U_1 = PRF(P, S || INT(i)). */
+        memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX_Y));
+        HMAC_SHA256_Update_Y(&hctx, ivec, 4);
+        HMAC_SHA256_Final_Y(U, &hctx);
+
+        /* T_i = U_1 ... */
+        memcpy(T, U, 32);
+
+        for (j = 2; j <= c; j++) {
+            /* Compute U_j. */
+            HMAC_SHA256_Init_Y(&hctx, passwd, passwdlen);
+            HMAC_SHA256_Update_Y(&hctx, U, 32);
+            HMAC_SHA256_Final_Y(U, &hctx);
+
+            /* ... xor U_j ... */
+            for (k = 0; k < 32; k++)
+                T[k] ^= U[k];
+        }
+
+        /* Copy as many bytes as necessary into buf. */
+        clen = dkLen - i * 32;
+        if (clen > 32)
+            clen = 32;
+        memcpy(&buf[i * 32], T, clen);
+    }
+
+    /* Clean PShctx, since we never called _Final on it. */
+    memset(&PShctx, 0, sizeof(HMAC_SHA256_CTX_Y));
+}
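To make the loop structure above concrete: each 32-byte output block is T_i = U_1 xor U_2 xor ... xor U_c, where U_1 = HMAC(P, S || INT(i + 1)) and U_j = HMAC(P, U_{j-1}), so a dkLen-byte key needs ceil(dkLen / 32) blocks. A hedged usage sketch; the inputs and iteration count are arbitrary examples, not values used by yescrypt:

/* Illustrative only: derive a 64-byte key (two blocks, i = 0 and 1). */
#include <stdint.h>
#include "sph/sha256_Y.h"

static void example_pbkdf2(void)
{
    const uint8_t passwd[] = "passphrase"; /* hypothetical */
    const uint8_t salt[]   = "NaCl";       /* hypothetical */
    uint8_t dk[64];

    PBKDF2_SHA256(passwd, sizeof(passwd) - 1,
                  salt, sizeof(salt) - 1,
                  1000,             /* c: example iteration count */
                  dk, sizeof(dk));  /* dkLen = 64 -> 2 blocks */
}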
diff --git a/sph/sha256_Y.h b/sph/sha256_Y.h
new file mode 100644
index 00000000..e97b81ba
--- /dev/null
+++ b/sph/sha256_Y.h
@@ -0,0 +1,63 @@
+/*-
+ * Copyright 2005,2007,2009 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/lib/libmd/sha256_Y.h,v 1.2 2006/01/17 15:35:56 phk Exp $
+ */
+
+#ifndef _SHA256_H_
+#define _SHA256_H_
+
+#include <sys/types.h>
+
+#include <stdint.h>
+
+typedef struct SHA256Context {
+    uint32_t state[8];
+    uint32_t count[2];
+    unsigned char buf[64];
+} SHA256_CTX_Y;
+
+typedef struct HMAC_SHA256Context {
+    SHA256_CTX_Y ictx;
+    SHA256_CTX_Y octx;
+} HMAC_SHA256_CTX_Y;
+
+void SHA256_Init_Y(SHA256_CTX_Y *);
+void SHA256_Update_Y(SHA256_CTX_Y *, const void *, size_t);
+void SHA256_Final_Y(unsigned char [32], SHA256_CTX_Y *);
+void HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y *, const void *, size_t);
+void HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y *, const void *, size_t);
+void HMAC_SHA256_Final_Y(unsigned char [32], HMAC_SHA256_CTX_Y *);
+
+/**
+ * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
+ * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
+ * write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
+ */
+void PBKDF2_SHA256(const uint8_t *, size_t, const uint8_t *, size_t,
+    uint64_t, uint8_t *, size_t);
+
+#endif /* !_SHA256_H_ */
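A quick way to sanity-check the new files against a known answer: a hedged test sketch (not part of this patch) using the FIPS 180-2 vector SHA-256("abc") = ba7816bf 8f01cfea 414140de 5dae2223 b00361a3 96177a9c b410ff61 f20015ad:

/* Illustrative self-test only; compile against sph/sha256_Y.c. */
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include "sph/sha256_Y.h"

int main(void)
{
    /* FIPS 180-2 test vector: SHA-256("abc") */
    static const unsigned char want[32] = {
        0xba, 0x78, 0x16, 0xbf, 0x8f, 0x01, 0xcf, 0xea,
        0x41, 0x41, 0x40, 0xde, 0x5d, 0xae, 0x22, 0x23,
        0xb0, 0x03, 0x61, 0xa3, 0x96, 0x17, 0x7a, 0x9c,
        0xb4, 0x10, 0xff, 0x61, 0xf2, 0x00, 0x15, 0xad
    };
    unsigned char got[32];
    SHA256_CTX_Y ctx;

    SHA256_Init_Y(&ctx);
    SHA256_Update_Y(&ctx, "abc", 3);
    SHA256_Final_Y(got, &ctx);
    assert(memcmp(got, want, 32) == 0);
    puts("sha256_Y: SHA-256(\"abc\") OK");
    return 0;
}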