diff --git a/Makefile.am b/Makefile.am index 89a8547a..bf68583e 100644 --- a/Makefile.am +++ b/Makefile.am @@ -70,6 +70,7 @@ sgminer_SOURCES += algorithm/whirlcoin.c algorithm/whirlcoin.h sgminer_SOURCES += algorithm/neoscrypt.c algorithm/neoscrypt.h sgminer_SOURCES += algorithm/pluck.c algorithm/pluck.h sgminer_SOURCES += algorithm/credits.c algorithm/credits.h +sgminer_SOURCES += algorithm/Lyra2RE_old.c algorithm/Lyra2RE_old.h algorithm/Lyra2_old.c algorithm/Lyra2_old.h algorithm/Sponge_old.c algorithm/Sponge_old.h sgminer_SOURCES += algorithm/Lyra2RE.c algorithm/Lyra2RE.h algorithm/Lyra2.c algorithm/Lyra2.h algorithm/Sponge.c algorithm/Sponge.h sgminer_SOURCES += algorithm/yescrypt.h algorithm/yescrypt.c algorithm/yescrypt_core.h algorithm/yescrypt-opt.c algorithm/yescryptcommon.c algorithm/sysendian.h diff --git a/algorithm.c b/algorithm.c index 8302ecd1..5430d773 100644 --- a/algorithm.c +++ b/algorithm.c @@ -31,7 +31,8 @@ #include "algorithm/fresh.h" #include "algorithm/whirlcoin.h" #include "algorithm/neoscrypt.h" -#include "algorithm/Lyra2RE.h" +#include "algorithm/Lyra2RE.h" //lyra new version +#include "algorithm/Lyra2RE_old.h" //lyra old version #include "algorithm/pluck.h" #include "algorithm/yescrypt.h" #include "algorithm/credits.h" @@ -59,6 +60,7 @@ const char *algorithm_type_str[] = { "Whirlcoin", "Neoscrypt", "Lyra2RE", + "Lyra2REv2", "pluck", "yescrypt", "yescrypt-multi" @@ -409,6 +411,62 @@ static cl_int queue_lyra2RE_kernel(struct __clState *clState, struct _dev_blk_ct return status; } +static cl_int queue_lyra2REv2_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads) +{ + cl_kernel *kernel; + unsigned int num; + cl_int status = 0; + cl_ulong le_target; + + // le_target = *(cl_uint *)(blk->work->device_target + 28); + le_target = *(cl_ulong *)(blk->work->device_target + 24); + flip80(clState->cldata, blk->work->data); + status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL); + + // blake - search + kernel = &clState->kernel; + num = 0; + // CL_SET_ARG(clState->CLbuffer0); + CL_SET_ARG(clState->buffer1); + CL_SET_ARG(blk->work->blk.ctx_a); + CL_SET_ARG(blk->work->blk.ctx_b); + CL_SET_ARG(blk->work->blk.ctx_c); + CL_SET_ARG(blk->work->blk.ctx_d); + CL_SET_ARG(blk->work->blk.ctx_e); + CL_SET_ARG(blk->work->blk.ctx_f); + CL_SET_ARG(blk->work->blk.ctx_g); + CL_SET_ARG(blk->work->blk.ctx_h); + CL_SET_ARG(blk->work->blk.cty_a); + CL_SET_ARG(blk->work->blk.cty_b); + CL_SET_ARG(blk->work->blk.cty_c); + + // keccak - search1 + kernel = clState->extra_kernels; + CL_SET_ARG_0(clState->buffer1); + // cubehash - search2 + num = 0; + CL_NEXTKERNEL_SET_ARG_0(clState->buffer1); + // lyra - search3 + num = 0; + CL_NEXTKERNEL_SET_ARG_N(0, clState->buffer1); + CL_SET_ARG_N(1, clState->padbuffer8); + // skein -search4 + num = 0; + CL_NEXTKERNEL_SET_ARG_0(clState->buffer1); + // cubehash - search5 + num = 0; + CL_NEXTKERNEL_SET_ARG_0(clState->buffer1); + // bmw - search6 + num = 0; + CL_NEXTKERNEL_SET_ARG(clState->buffer1); + CL_SET_ARG(clState->outputBuffer); + CL_SET_ARG(le_target); + + return status; +} + + + static cl_int queue_darkcoin_mod_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads) { @@ -927,7 +985,10 @@ static algorithm_settings_t algos[] = { { "fresh", ALGO_FRESH, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 4 * 16 * 4194304, 0, fresh_regenhash, queue_fresh_kernel, gen_hash, NULL}, - { "Lyra2RE", ALGO_LYRA2RE, "", 1, 256,
256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4,2 * 8 * 4194304 , 0, lyra2re_regenhash, queue_lyra2RE_kernel, gen_hash, NULL}, + { "Lyra2RE", ALGO_LYRA2RE, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4,2 * 8 * 4194304 , 0, lyra2reold_regenhash, queue_lyra2RE_kernel, gen_hash, NULL}, + + { "Lyra2REv2", ALGO_LYRA2REv2, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 6, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, lyra2re_regenhash, queue_lyra2REv2_kernel, gen_hash, append_neoscrypt_compiler_options }, + // kernels starting from this will have difficulty calculated by using fuguecoin algorithm #define A_FUGUE(a, b, c) \ @@ -1035,7 +1096,7 @@ void set_algorithm(algorithm_t* algo, const char* newname_alias) // use old nfactor if it was previously set and is different than the one set by alias if ((old_nfactor > 0) && (old_nfactor != nfactor)) nfactor = old_nfactor; - if (algo->type == ALGO_LYRA2RE) {opt_lyra = true;} + if (algo->type == ALGO_LYRA2RE || algo->type == ALGO_LYRA2REv2 ) { opt_lyra = true; } set_algorithm_nfactor(algo, nfactor); //reapply kernelfile if was set diff --git a/algorithm.h b/algorithm.h index c412af92..e24421de 100644 --- a/algorithm.h +++ b/algorithm.h @@ -28,6 +28,7 @@ typedef enum { ALGO_WHIRL, ALGO_NEOSCRYPT, ALGO_LYRA2RE, + ALGO_LYRA2REv2, ALGO_PLUCK, ALGO_YESCRYPT, ALGO_YESCRYPT_MULTI, diff --git a/algorithm/Lyra2.c b/algorithm/Lyra2.c index 412aa1f6..aa7d207e 100644 --- a/algorithm/Lyra2.c +++ b/algorithm/Lyra2.c @@ -58,6 +58,11 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * //========== Initializing the Memory Matrix and pointers to it =============// //Tries to allocate enough space for the whole memory matrix + + + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; + const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES); uint64_t *wholeMatrix = malloc(i); if (wholeMatrix == NULL) { @@ -130,16 +135,16 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * ptrWord = wholeMatrix; for (i = 0; i < nBlocksInput; i++) { absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil) - ptrWord += BLOCK_LEN_BLAKE2_SAFE_BYTES; //goes to next block of pad(pwd || salt || basil) + ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil) } //Initializes M[0] and M[1] - reducedSqueezeRow0(state, memMatrix[0]); //The locally copied password is most likely overwritten here - reducedDuplexRow1(state, memMatrix[0], memMatrix[1]); + reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here + reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols); do { //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) - reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]); + reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); //updates the value of row* (deterministically picked during Setup)) @@ -172,7 +177,7 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * //------------------------------------------------------------------------------------------ //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] - reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]); + reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); //update prev: it now points to the last row 
ever computed prev = row; diff --git a/algorithm/Lyra2.h b/algorithm/Lyra2.h index 13c7dbd3..c7908945 100644 --- a/algorithm/Lyra2.h +++ b/algorithm/Lyra2.h @@ -1,8 +1,8 @@ /** * Header file for the Lyra2 Password Hashing Scheme (PHS). - * + * * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014. - * + * * This software is hereby placed in the public domain. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS @@ -37,14 +37,6 @@ typedef unsigned char byte; #define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes #endif -#ifndef N_COLS - #define N_COLS 8 //Number of columns in the memory matrix: fixed to 64 by default -#endif - -#define ROW_LEN_INT64 (BLOCK_LEN_INT64 * N_COLS) //Total length of a row: N_COLS blocks -#define ROW_LEN_BYTES (ROW_LEN_INT64 * 8) //Number of bytes per row - - int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols); #endif /* LYRA2_H_ */ diff --git a/algorithm/Lyra2RE.c b/algorithm/Lyra2RE.c index 24b8210c..f6d36df8 100644 --- a/algorithm/Lyra2RE.c +++ b/algorithm/Lyra2RE.c @@ -36,6 +36,8 @@ #include "sph/sph_groestl.h" #include "sph/sph_skein.h" #include "sph/sph_keccak.h" +#include "sph/sph_bmw.h" +#include "sph/sph_cubehash.h" #include "Lyra2.h" /* @@ -55,34 +57,37 @@ be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len) inline void lyra2rehash(void *state, const void *input) { sph_blake256_context ctx_blake; - sph_groestl256_context ctx_groestl; + sph_bmw256_context ctx_bmw; sph_keccak256_context ctx_keccak; sph_skein256_context ctx_skein; - + sph_cubehash256_context ctx_cube; uint32_t hashA[8], hashB[8]; sph_blake256_init(&ctx_blake); sph_blake256 (&ctx_blake, input, 80); sph_blake256_close (&ctx_blake, hashA); - - - sph_keccak256_init(&ctx_keccak); sph_keccak256 (&ctx_keccak,hashA, 32); sph_keccak256_close(&ctx_keccak, hashB); - LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); + sph_cubehash256_init(&ctx_cube); + sph_cubehash256(&ctx_cube, hashB, 32); + sph_cubehash256_close(&ctx_cube, hashA); + LYRA2(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4); sph_skein256_init(&ctx_skein); - sph_skein256 (&ctx_skein, hashA, 32); - sph_skein256_close(&ctx_skein, hashB); + sph_skein256 (&ctx_skein, hashB, 32); + sph_skein256_close(&ctx_skein, hashA); + sph_cubehash256_init(&ctx_cube); + sph_cubehash256(&ctx_cube, hashA, 32); + sph_cubehash256_close(&ctx_cube, hashB); - sph_groestl256_init(&ctx_groestl); - sph_groestl256 (&ctx_groestl, hashB, 32); - sph_groestl256_close(&ctx_groestl, hashA); + sph_bmw256_init(&ctx_bmw); + sph_bmw256 (&ctx_bmw, hashB, 32); + sph_bmw256_close(&ctx_bmw, hashA); //printf("cpu hash %08x %08x %08x %08x\n",hashA[0],hashA[1],hashA[2],hashA[3]); diff --git a/algorithm/Lyra2RE.h b/algorithm/Lyra2RE.h index 8a58e747..f4698c65 100644 --- a/algorithm/Lyra2RE.h +++ b/algorithm/Lyra2RE.h @@ -2,7 +2,8 @@ #define LYRA2RE_H #include "miner.h" - +#define LYRA_SCRATCHBUF_SIZE (4 ) // matrix extended to 16 matrix[16][8][8] uint64_t or equivalent +#define LYRA_SECBUF_SIZE (4) //8 uint64 extern int lyra2re_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce); extern void lyra2re_regenhash(struct work *work); diff --git a/algorithm/Lyra2RE_old.c b/algorithm/Lyra2RE_old.c new file mode 100644 index 00000000..f1e9aee5 --- /dev/null +++ b/algorithm/Lyra2RE_old.c @@ -0,0 +1,169 @@ +/*- + * Copyright 2014 James Lovejoy + * Copyright 2014 phm + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "config.h" +#include "miner.h" + +#include +#include +#include + +#include "sph/sph_blake.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_keccak.h" +#include "Lyra2_old.h" + +/* + * Encode a length len/4 vector of (uint32_t) into a length len vector of + * (unsigned char) in big-endian form. Assumes len is a multiple of 4. + */ +static inline void +be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len) +{ + uint32_t i; + + for (i = 0; i < len; i++) + dst[i] = htobe32(src[i]); +} + + +inline void lyra2rehash_old(void *state, const void *input) +{ + sph_blake256_context ctx_blake; + sph_groestl256_context ctx_groestl; + sph_keccak256_context ctx_keccak; + sph_skein256_context ctx_skein; + + uint32_t hashA[8], hashB[8]; + + sph_blake256_init(&ctx_blake); + sph_blake256 (&ctx_blake, input, 80); + sph_blake256_close (&ctx_blake, hashA); + + + + + sph_keccak256_init(&ctx_keccak); + sph_keccak256 (&ctx_keccak,hashA, 32); + sph_keccak256_close(&ctx_keccak, hashB); + + LYRA2O(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); + + + sph_skein256_init(&ctx_skein); + sph_skein256 (&ctx_skein, hashA, 32); + sph_skein256_close(&ctx_skein, hashB); + + + sph_groestl256_init(&ctx_groestl); + sph_groestl256 (&ctx_groestl, hashB, 32); + sph_groestl256_close(&ctx_groestl, hashA); + +//printf("cpu hash %08x %08x %08x %08x\n",hashA[0],hashA[1],hashA[2],hashA[3]); + + memcpy(state, hashA, 32); +} + +static const uint32_t diff1targ = 0x0000ffff; + + +/* Used externally as confirmation of correct OCL code */ +int lyra2reold_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce) +{ + uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]); + uint32_t data[20], ohash[8]; + + be32enc_vect(data, (const uint32_t *)pdata, 19); + data[19] = htobe32(nonce); + lyra2rehash_old(ohash, data); + tmp_hash7 = be32toh(ohash[7]); + + applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx", + (long unsigned int)Htarg, + (long unsigned int)diff1targ, + (long unsigned int)tmp_hash7); + if (tmp_hash7 > diff1targ) + return -1; + if (tmp_hash7 > Htarg) + return 0; + return 1; +} + +void lyra2reold_regenhash(struct work *work) +{ + uint32_t data[20]; + uint32_t *nonce = (uint32_t
*)(work->data + 76); + uint32_t *ohash = (uint32_t *)(work->hash); + + be32enc_vect(data, (const uint32_t *)work->data, 19); + data[19] = htobe32(*nonce); + lyra2rehash_old(ohash, data); +} + +bool scanhash_lyra2reold(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate, + unsigned char *pdata, unsigned char __maybe_unused *phash1, + unsigned char __maybe_unused *phash, const unsigned char *ptarget, + uint32_t max_nonce, uint32_t *last_nonce, uint32_t n) +{ + uint32_t *nonce = (uint32_t *)(pdata + 76); + uint32_t data[20]; + uint32_t tmp_hash7; + uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]); + bool ret = false; + + be32enc_vect(data, (const uint32_t *)pdata, 19); + + while(1) { + uint32_t ostate[8]; + + *nonce = ++n; + data[19] = (n); + lyra2rehash_old(ostate, data); + tmp_hash7 = (ostate[7]); + + applog(LOG_INFO, "data7 %08lx", + (long unsigned int)data[7]); + + if (unlikely(tmp_hash7 <= Htarg)) { + ((uint32_t *)pdata)[19] = htobe32(n); + *last_nonce = n; + ret = true; + break; + } + + if (unlikely((n >= max_nonce) || thr->work_restart)) { + *last_nonce = n; + break; + } + } + + return ret; +} + + + diff --git a/algorithm/Lyra2RE_old.h b/algorithm/Lyra2RE_old.h new file mode 100644 index 00000000..0788dfb3 --- /dev/null +++ b/algorithm/Lyra2RE_old.h @@ -0,0 +1,10 @@ +#ifndef LYRA2REOLD_H +#define LYRA2REOLD_H + +#include "miner.h" + +extern int lyra2reold_test(unsigned char *pdata, const unsigned char *ptarget, + uint32_t nonce); +extern void lyra2reold_regenhash(struct work *work); + +#endif /* LYRA2RE_H */ diff --git a/algorithm/Lyra2_old.c b/algorithm/Lyra2_old.c new file mode 100644 index 00000000..f78c4903 --- /dev/null +++ b/algorithm/Lyra2_old.c @@ -0,0 +1,208 @@ +/** + * Implementation of the Lyra2 Password Hashing Scheme (PHS). + * + * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014. + * + * This software is hereby placed in the public domain. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include +#include +#include "Lyra2_old.h" +#include "Sponge_old.h" + +/** + * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords + * whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits, + * where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all + * integer parameters (treated as type "unsigned int") in the order they are provided, plus the value + * of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols).
+ * + * @param K The derived key to be output by the algorithm + * @param kLen Desired key length + * @param pwd User password + * @param pwdlen Password length + * @param salt Salt + * @param saltlen Salt length + * @param timeCost Parameter to determine the processing time (T) + * @param nRows Number or rows of the memory matrix (R) + * @param nCols Number of columns of the memory matrix (C) + * + * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation) + */ +int LYRA2O(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols) { + + //============================= Basic variables ============================// + int64_t row = 2; //index of row to be processed + int64_t prev = 1; //index of prev (last row ever computed/modified) + int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering) + int64_t tau; //Time Loop iterator + int64_t step = 1; //Visitation step (used during Setup and Wandering phases) + int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) + int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 + int64_t i; //auxiliary iteration counter + //==========================================================================/ + + //========== Initializing the Memory Matrix and pointers to it =============// + //Tries to allocate enough space for the whole memory matrix + i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES); + uint64_t *wholeMatrix = malloc(i); + if (wholeMatrix == NULL) { + return -1; + } + memset(wholeMatrix, 0, i); + + //Allocates pointers to each row of the matrix + uint64_t **memMatrix = malloc(nRows * sizeof (uint64_t*)); + if (memMatrix == NULL) { + return -1; + } + //Places the pointers in the correct positions + uint64_t *ptrWord = wholeMatrix; + for (i = 0; i < nRows; i++) { + memMatrix[i] = ptrWord; + ptrWord += ROW_LEN_INT64; + } + //==========================================================================/ + + //============= Getting the password + salt + basil padded with 10*1 ===============// + //OBS.:The memory matrix will temporarily hold the password: not for saving memory, + //but this ensures that the password copied locally will be overwritten as soon as possible + + //First, we clean enough blocks for the password, salt, basil and padding + uint64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof (uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1; + byte *ptrByte = (byte*) wholeMatrix; + memset(ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES); + + //Prepends the password + memcpy(ptrByte, pwd, pwdlen); + ptrByte += pwdlen; + + //Concatenates the salt + memcpy(ptrByte, salt, saltlen); + ptrByte += saltlen; + + //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface + memcpy(ptrByte, &kLen, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &pwdlen, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &saltlen, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &timeCost, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &nRows, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + memcpy(ptrByte, &nCols, sizeof (uint64_t)); + ptrByte += sizeof (uint64_t); + + //Now comes the padding + *ptrByte = 0x80; //first byte of padding: right after the 
password + ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix + ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block + *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block + //==========================================================================/ + + //======================= Initializing the Sponge State ====================// + //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) + uint64_t *state = malloc(16 * sizeof (uint64_t)); + if (state == NULL) { + return -1; + } + initState(state); + //==========================================================================/ + + //================================ Setup Phase =============================// + //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits + ptrWord = wholeMatrix; + for (i = 0; i < nBlocksInput; i++) { + absorbBlockBlake2SafeO(state, ptrWord); //absorbs each block of pad(pwd || salt || basil) + ptrWord += BLOCK_LEN_BLAKE2_SAFE_BYTES; //goes to next block of pad(pwd || salt || basil) + } + + //Initializes M[0] and M[1] + reducedSqueezeRow0O(state, memMatrix[0]); //The locally copied password is most likely overwritten here + reducedDuplexRow1O(state, memMatrix[0], memMatrix[1]); + + do { + //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) + reducedDuplexRowSetupO(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]); + + + //updates the value of row* (deterministically picked during Setup)) + rowa = (rowa + step) & (window - 1); + //update prev: it now points to the last row ever computed + prev = row; + //updates row: goes to the next row to be computed + row++; + + //Checks if all rows in the window where visited. + if (rowa == 0) { + step = window + gap; //changes the step: approximately doubles its value + window *= 2; //doubles the size of the re-visitation window + gap = -gap; //inverts the modifier to the step + } + + } while (row < nRows); + //==========================================================================/ + + //============================ Wandering Phase =============================// + row = 0; //Resets the visitation to the first row of the memory matrix + for (tau = 1; tau <= timeCost; tau++) { + //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1 + step = (tau % 2 == 0) ? 
-1 : nRows / 2 - 1; + do { + //Selects a pseudorandom index row* + //------------------------------------------------------------------------------------------ + //rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------------------------------------------------------ + + //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] + reducedDuplexRowO(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]); + + //update prev: it now points to the last row ever computed + prev = row; + + //updates row: goes to the next row to be computed + //------------------------------------------------------------------------------------------ + //row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------------------------------------------------------ + + } while (row != 0); + } + //==========================================================================/ + + //============================ Wrap-up Phase ===============================// + //Absorbs the last block of the memory matrix + absorbBlockO(state, memMatrix[rowa]); + + //Squeezes the key + squeezeO(state, K, kLen); + //==========================================================================/ + + //========================= Freeing the memory =============================// + free(memMatrix); + free(wholeMatrix); + + //Wiping out the sponge's internal state before freeing it + memset(state, 0, 16 * sizeof (uint64_t)); + free(state); + //==========================================================================/ + + return 0; +} diff --git a/algorithm/Lyra2_old.h b/algorithm/Lyra2_old.h new file mode 100644 index 00000000..9dbe5668 --- /dev/null +++ b/algorithm/Lyra2_old.h @@ -0,0 +1,50 @@ +/** + * Header file for the Lyra2 Password Hashing Scheme (PHS). + * + * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014. + * + * This software is hereby placed in the public domain. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef LYRA2OLD_H_ +#define LYRA2OLD_H_ + +#include + +typedef unsigned char byte; + +//Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED) +#define BLOCK_LEN_BLAKE2_SAFE_INT64 8 //512 bits (=64 bytes, =8 uint64_t) +#define BLOCK_LEN_BLAKE2_SAFE_BYTES (BLOCK_LEN_BLAKE2_SAFE_INT64 * 8) //same as above, in bytes + + +#ifdef BLOCK_LEN_BITS + #define BLOCK_LEN_INT64 (BLOCK_LEN_BITS/64) //Block length: 768 bits (=96 bytes, =12 uint64_t) + #define BLOCK_LEN_BYTES (BLOCK_LEN_BITS/8) //Block length, in bytes +#else //default block lenght: 768 bits + #define BLOCK_LEN_INT64 12 //Block length: 768 bits (=96 bytes, =12 uint64_t) + #define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes +#endif + +#ifndef N_COLS + #define N_COLS 8 //Number of columns in the memory matrix: fixed to 64 by default +#endif + +#define ROW_LEN_INT64 (BLOCK_LEN_INT64 * N_COLS) //Total length of a row: N_COLS blocks +#define ROW_LEN_BYTES (ROW_LEN_INT64 * 8) //Number of bytes per row + + +int LYRA2O(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols); + +#endif /* LYRA2_H_ */ diff --git a/algorithm/Sponge.c b/algorithm/Sponge.c index 0aa5aace..8ece6f99 100644 --- a/algorithm/Sponge.c +++ b/algorithm/Sponge.c @@ -1,9 +1,9 @@ /** - * A simple implementation of Blake2b's internal permutation + * A simple implementation of Blake2b's internal permutation * in the form of a sponge. - * + * * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014. - * + * * This software is hereby placed in the public domain. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS @@ -27,19 +27,19 @@ /** - * Initializes the Sponge State. The first 512 bits are set to zeros and the remainder + * Initializes the Sponge State. The first 512 bits are set to zeros and the remainder * receive Blake2b's IV as per Blake2b's specification. Note: Even though sponges * typically have their internal state initialized with zeros, Blake2b's G function - * has a fixed point: if the internal state and message are both filled with zeros. the - * resulting permutation will always be a block filled with zeros; this happens because - * Blake2b does not use the constants originally employed in Blake2 inside its G function, + * has a fixed point: if the internal state and message are both filled with zeros. the + * resulting permutation will always be a block filled with zeros; this happens because + * Blake2b does not use the constants originally employed in Blake2 inside its G function, * relying on the IV for avoiding possible fixed points. - * + * * @param state The 1024-bit array to be initialized */ inline void initState(uint64_t state[/*16*/]) { //First 512 bis are zeros - memset(state, 0, 64); + memset(state, 0, 64); //Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV state[8] = blake2b_IV[0]; state[9] = blake2b_IV[1]; @@ -53,7 +53,7 @@ inline void initState(uint64_t state[/*16*/]) { /** * Execute Blake2b's G function, with all 12 rounds. 
- * + * * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function */ inline static void blake2bLyra(uint64_t *v) { @@ -80,10 +80,10 @@ inline static void reducedBlake2bLyra(uint64_t *v) { } /** - * Performs a squeeze operation, using Blake2b's G function as the + * Performs a squeeze operation, using Blake2b's G function as the * internal permutation - * - * @param state The current state of the sponge + * + * @param state The current state of the sponge * @param out Array that will receive the data squeezed * @param len The number of bytes to be squeezed into the "out" array */ @@ -105,8 +105,8 @@ inline void squeeze(uint64_t *state, byte *out, unsigned int len) { /** * Performs an absorb operation for a single block (BLOCK_LEN_INT64 words * of type uint64_t), using Blake2b's G function as the internal permutation - * - * @param state The current state of the sponge + * + * @param state The current state of the sponge * @param in The block to be absorbed (BLOCK_LEN_INT64 words) */ inline void absorbBlock(uint64_t *state, const uint64_t *in) { @@ -129,15 +129,16 @@ inline void absorbBlock(uint64_t *state, const uint64_t *in) { } /** - * Performs an absorb operation for a single block (BLOCK_LEN_BLAKE2_SAFE_INT64 + * Performs an absorb operation for a single block (BLOCK_LEN_BLAKE2_SAFE_INT64 * words of type uint64_t), using Blake2b's G function as the internal permutation - * - * @param state The current state of the sponge + * + * @param state The current state of the sponge * @param in The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words) */ inline void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) { //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state - state[0] ^= in[0]; + + state[0] ^= in[0]; state[1] ^= in[1]; state[2] ^= in[2]; state[3] ^= in[3]; @@ -146,23 +147,25 @@ inline void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) { state[6] ^= in[6]; state[7] ^= in[7]; + //Applies the transformation f to the sponge's state blake2bLyra(state); + } -/** - * Performs a reduced squeeze operation for a single row, from the highest to - * the lowest index, using the reduced-round Blake2b's G function as the +/** + * Performs a reduced squeeze operation for a single row, from the highest to + * the lowest index, using the reduced-round Blake2b's G function as the * internal permutation - * - * @param state The current state of the sponge + * + * @param state The current state of the sponge * @param rowOut Row to receive the data squeezed */ -inline void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut) { - uint64_t* ptrWord = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] +inline void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut, uint64_t nCols) { + uint64_t* ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] int i; - //M[row][C-1-col] = H.reduced_squeeze() - for (i = 0; i < N_COLS; i++) { + //M[row][C-1-col] = H.reduced_squeeze() + for (i = 0; i < nCols; i++) { ptrWord[0] = state[0]; ptrWord[1] = state[1]; ptrWord[2] = state[2]; @@ -184,21 +187,21 @@ inline void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut) { } } -/** - * Performs a reduced duplex operation for a single row, from the highest to - * the lowest index, using the reduced-round Blake2b's G function as the +/** + * Performs a reduced duplex operation for a single row, from the highest to + * the lowest index, using the reduced-round Blake2b's G function as the * internal 
permutation - * - * @param state The current state of the sponge + * + * @param state The current state of the sponge * @param rowIn Row to feed the sponge * @param rowOut Row to receive the sponge's output */ -inline void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut) { +inline void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols) { uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev - uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row int i; - for (i = 0; i < N_COLS; i++) { + for (i = 0; i < nCols; i++) { //Absorbing "M[prev][col]" state[0] ^= (ptrWordIn[0]); @@ -230,8 +233,8 @@ inline void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut ptrWordOut[9] = ptrWordIn[9] ^ state[9]; ptrWordOut[10] = ptrWordIn[10] ^ state[10]; ptrWordOut[11] = ptrWordIn[11] ^ state[11]; - - + + //Input: next column (i.e., next block in sequence) ptrWordIn += BLOCK_LEN_INT64; //Output: goes to previous column @@ -240,26 +243,26 @@ inline void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut } /** - * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e., + * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e., * the wordwise addition of two columns, ignoring carries between words). The - * output of this operation, "rand", is then used to make - * "M[rowOut][(N_COLS-1)-col] = M[rowIn][col] XOR rand" and - * "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit + * output of this operation, "rand", is then used to make + * "M[rowOut][(N_COLS-1)-col] = M[rowIn][col] XOR rand" and + * "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit * rotation to the left and N_COLS is a system parameter. 
* - * @param state The current state of the sponge + * @param state The current state of the sponge * @param rowIn Row used only as input * @param rowInOut Row used as input and to receive output after rotation * @param rowOut Row receiving the output * */ -inline void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { +inline void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols) { uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* - uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row int i; - for (i = 0; i < N_COLS; i++) { + for (i = 0; i < nCols; i++) { //Absorbing "M[prev] [+] M[row*]" state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); @@ -290,7 +293,7 @@ inline void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *ro ptrWordOut[9] = ptrWordIn[9] ^ state[9]; ptrWordOut[10] = ptrWordIn[10] ^ state[10]; ptrWordOut[11] = ptrWordIn[11] ^ state[11]; - + //M[row*][col] = M[row*][col] XOR rotW(rand) ptrWordInOut[0] ^= state[11]; ptrWordInOut[1] ^= state[0]; @@ -314,26 +317,26 @@ inline void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *ro } /** - * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e., + * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e., * the wordwise addition of two columns, ignoring carries between words). The - * output of this operation, "rand", is then used to make - * "M[rowOut][col] = M[rowOut][col] XOR rand" and - * "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit + * output of this operation, "rand", is then used to make + * "M[rowOut][col] = M[rowOut][col] XOR rand" and + * "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit * rotation to the left. * - * @param state The current state of the sponge + * @param state The current state of the sponge * @param rowIn Row used only as input * @param rowInOut Row used as input and to receive output after rotation * @param rowOut Row receiving the output * */ -inline void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { +inline void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols) { uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row int i; - for (i = 0; i < N_COLS; i++) { + for (i = 0; i < nCols; i++) { //Absorbing "M[prev] [+] M[row*]" state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); @@ -392,10 +395,10 @@ inline void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOu /** * Performs a duplex operation over "M[rowInOut] [+] M[rowIn]", writing the output "rand" - * on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit + * on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit * rotation to the left. 
* - * @param state The current state of the sponge + * @param state The current state of the sponge * @param rowIn Row used only as input * @param rowInOut Row used as input and to receive output after rotation * @param rowOut Row receiving the output @@ -465,10 +468,10 @@ inline void reducedDuplexRowSetupOLD(uint64_t *state, uint64_t *rowIn, uint64_t /** * Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", writing the output "rand" - * on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit + * on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit * rotation to the left. * - * @param state The current state of the sponge + * @param state The current state of the sponge * @param rowIn Row used only as input * @param rowInOut Row used as input and to receive output after rotation * @param rowOut Row receiving the output @@ -539,10 +542,10 @@ inline void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t * /** * Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", writing the output "rand" - * on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit + * on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit * rotation to the left. * - * @param state The current state of the sponge + * @param state The current state of the sponge * @param rowIn Row used only as input * @param rowInOut Row used as input and to receive output after rotation * @param rowOut Row receiving the output @@ -668,10 +671,10 @@ inline void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t /** * Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", using the output "rand" - * to make "M[rowOut][col] = M[rowOut][col] XOR rand" and "M[rowInOut] = M[rowInOut] XOR rotW(rand)", + * to make "M[rowOut][col] = M[rowOut][col] XOR rand" and "M[rowInOut] = M[rowInOut] XOR rotW(rand)", * where rotW is a 64-bit rotation to the left. * - * @param state The current state of the sponge + * @param state The current state of the sponge * @param rowIn Row used only as input * @param rowInOut Row used as input and to receive output after rotation * @param rowOut Row receiving the output @@ -717,7 +720,7 @@ inline void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInO ptrWordOut[11] ^= state[11]; //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) - + //Goes to next block ptrWordOut += BLOCK_LEN_INT64; diff --git a/algorithm/Sponge.h b/algorithm/Sponge.h index 3fcff0d7..4ea1dc93 100644 --- a/algorithm/Sponge.h +++ b/algorithm/Sponge.h @@ -1,10 +1,10 @@ /** - * Header file for Blake2b's internal permutation in the form of a sponge. - * This code is based on the original Blake2b's implementation provided by + * Header file for Blake2b's internal permutation in the form of a sponge. + * This code is based on the original Blake2b's implementation provided by * Samuel Neves (https://blake2.net/) - * + * * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014. - * + * * This software is hereby placed in the public domain. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS @@ -78,16 +78,16 @@ void initState(uint64_t state[/*16*/]); //---- Squeezes void squeeze(uint64_t *state, unsigned char *out, unsigned int len); -void reducedSqueezeRow0(uint64_t* state, uint64_t* row); +void reducedSqueezeRow0(uint64_t* state, uint64_t* row, uint64_t nCols); //---- Absorbs void absorbBlock(uint64_t *state, const uint64_t *in); void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in); //---- Duplexes -void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut); -void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); -void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); +void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols); +void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols); +void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols); //---- Misc void printArray(unsigned char *array, unsigned int size, char *name); diff --git a/algorithm/Sponge_old.c b/algorithm/Sponge_old.c new file mode 100644 index 00000000..aa6c3017 --- /dev/null +++ b/algorithm/Sponge_old.c @@ -0,0 +1,405 @@ +/** + * A simple implementation of Blake2b's internal permutation + * in the form of a sponge. + * + * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014. + * + * This software is hereby placed in the public domain. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include +#include "Sponge_old.h" +#include "Lyra2_old.h" + + + +/** + * Initializes the Sponge State. The first 512 bits are set to zeros and the remainder + * receive Blake2b's IV as per Blake2b's specification. Note: Even though sponges + * typically have their internal state initialized with zeros, Blake2b's G function + * has a fixed point: if the internal state and message are both filled with zeros. the + * resulting permutation will always be a block filled with zeros; this happens because + * Blake2b does not use the constants originally employed in Blake2 inside its G function, + * relying on the IV for avoiding possible fixed points. + * + * @param state The 1024-bit array to be initialized + */ +inline void initStateO(uint64_t state[/*16*/]) { + //First 512 bis are zeros + memset(state, 0, 64); + //Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV + state[8] = blake2b_IV[0]; + state[9] = blake2b_IV[1]; + state[10] = blake2b_IV[2]; + state[11] = blake2b_IV[3]; + state[12] = blake2b_IV[4]; + state[13] = blake2b_IV[5]; + state[14] = blake2b_IV[6]; + state[15] = blake2b_IV[7]; +} + +/** + * Execute Blake2b's G function, with all 12 rounds. 
+ * + * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function + */ +inline static void blake2bLyra(uint64_t *v) { + ROUND_LYRA(0); + ROUND_LYRA(1); + ROUND_LYRA(2); + ROUND_LYRA(3); + ROUND_LYRA(4); + ROUND_LYRA(5); + ROUND_LYRA(6); + ROUND_LYRA(7); + ROUND_LYRA(8); + ROUND_LYRA(9); + ROUND_LYRA(10); + ROUND_LYRA(11); +} + +/** + * Executes a reduced version of Blake2b's G function with only one round + * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function + */ +inline static void reducedBlake2bLyra(uint64_t *v) { + ROUND_LYRA(0); +} + +/** + * Performs a squeeze operation, using Blake2b's G function as the + * internal permutation + * + * @param state The current state of the sponge + * @param out Array that will receive the data squeezed + * @param len The number of bytes to be squeezed into the "out" array + */ +inline void squeezeO(uint64_t *state, byte *out, unsigned int len) { + int fullBlocks = len / BLOCK_LEN_BYTES; + byte *ptr = out; + int i; + //Squeezes full blocks + for (i = 0; i < fullBlocks; i++) { + memcpy(ptr, state, BLOCK_LEN_BYTES); + blake2bLyra(state); + ptr += BLOCK_LEN_BYTES; + } + + //Squeezes remaining bytes + memcpy(ptr, state, (len % BLOCK_LEN_BYTES)); +} + +/** + * Performs an absorb operation for a single block (BLOCK_LEN_INT64 words + * of type uint64_t), using Blake2b's G function as the internal permutation + * + * @param state The current state of the sponge + * @param in The block to be absorbed (BLOCK_LEN_INT64 words) + */ +inline void absorbBlockO(uint64_t *state, const uint64_t *in) { + //XORs the first BLOCK_LEN_INT64 words of "in" with the current state + state[0] ^= in[0]; + state[1] ^= in[1]; + state[2] ^= in[2]; + state[3] ^= in[3]; + state[4] ^= in[4]; + state[5] ^= in[5]; + state[6] ^= in[6]; + state[7] ^= in[7]; + state[8] ^= in[8]; + state[9] ^= in[9]; + state[10] ^= in[10]; + state[11] ^= in[11]; + + //Applies the transformation f to the sponge's state + blake2bLyra(state); +} + +/** + * Performs an absorb operation for a single block (BLOCK_LEN_BLAKE2_SAFE_INT64 + * words of type uint64_t), using Blake2b's G function as the internal permutation + * + * @param state The current state of the sponge + * @param in The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words) + */ +inline void absorbBlockBlake2SafeO(uint64_t *state, const uint64_t *in) { + //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state + state[0] ^= in[0]; + state[1] ^= in[1]; + state[2] ^= in[2]; + state[3] ^= in[3]; + state[4] ^= in[4]; + state[5] ^= in[5]; + state[6] ^= in[6]; + state[7] ^= in[7]; + + //Applies the transformation f to the sponge's state + blake2bLyra(state); +} + +/** + * Performs a reduced squeeze operation for a single row, from the highest to + * the lowest index, using the reduced-round Blake2b's G function as the + * internal permutation + * + * @param state The current state of the sponge + * @param rowOut Row to receive the data squeezed + */ +inline void reducedSqueezeRow0O(uint64_t* state, uint64_t* rowOut) { + uint64_t* ptrWord = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1] + int i; + //M[row][C-1-col] = H.reduced_squeeze() + for (i = 0; i < N_COLS; i++) { + ptrWord[0] = state[0]; + ptrWord[1] = state[1]; + ptrWord[2] = state[2]; + ptrWord[3] = state[3]; + ptrWord[4] = state[4]; + ptrWord[5] = state[5]; + ptrWord[6] = state[6]; + ptrWord[7] = state[7]; + ptrWord[8] = state[8]; + ptrWord[9] = state[9]; + ptrWord[10] = state[10]; + ptrWord[11] 
= state[11]; + + //Goes to next block (column) that will receive the squeezed data + ptrWord -= BLOCK_LEN_INT64; + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + } +} + +/** + * Performs a reduced duplex operation for a single row, from the highest to + * the lowest index, using the reduced-round Blake2b's G function as the + * internal permutation + * + * @param state The current state of the sponge + * @param rowIn Row to feed the sponge + * @param rowOut Row to receive the sponge's output + */ +inline void reducedDuplexRow1O(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut) { + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + int i; + + for (i = 0; i < N_COLS; i++) { + + //Absorbing "M[prev][col]" + state[0] ^= (ptrWordIn[0]); + state[1] ^= (ptrWordIn[1]); + state[2] ^= (ptrWordIn[2]); + state[3] ^= (ptrWordIn[3]); + state[4] ^= (ptrWordIn[4]); + state[5] ^= (ptrWordIn[5]); + state[6] ^= (ptrWordIn[6]); + state[7] ^= (ptrWordIn[7]); + state[8] ^= (ptrWordIn[8]); + state[9] ^= (ptrWordIn[9]); + state[10] ^= (ptrWordIn[10]); + state[11] ^= (ptrWordIn[11]); + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[row][C-1-col] = M[prev][col] XOR rand + ptrWordOut[0] = ptrWordIn[0] ^ state[0]; + ptrWordOut[1] = ptrWordIn[1] ^ state[1]; + ptrWordOut[2] = ptrWordIn[2] ^ state[2]; + ptrWordOut[3] = ptrWordIn[3] ^ state[3]; + ptrWordOut[4] = ptrWordIn[4] ^ state[4]; + ptrWordOut[5] = ptrWordIn[5] ^ state[5]; + ptrWordOut[6] = ptrWordIn[6] ^ state[6]; + ptrWordOut[7] = ptrWordIn[7] ^ state[7]; + ptrWordOut[8] = ptrWordIn[8] ^ state[8]; + ptrWordOut[9] = ptrWordIn[9] ^ state[9]; + ptrWordOut[10] = ptrWordIn[10] ^ state[10]; + ptrWordOut[11] = ptrWordIn[11] ^ state[11]; + + + //Input: next column (i.e., next block in sequence) + ptrWordIn += BLOCK_LEN_INT64; + //Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } +} + +/** + * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e., + * the wordwise addition of two columns, ignoring carries between words). The + * output of this operation, "rand", is then used to make + * "M[rowOut][(N_COLS-1)-col] = M[rowIn][col] XOR rand" and + * "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit + * rotation to the left and N_COLS is a system parameter. 
+ * + * @param state The current state of the sponge + * @param rowIn Row used only as input + * @param rowInOut Row used as input and to receive output after rotation + * @param rowOut Row receiving the output + * + */ +inline void reducedDuplexRowSetupO(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row + int i; + + for (i = 0; i < N_COLS; i++) { + //Absorbing "M[prev] [+] M[row*]" + state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); + state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); + state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); + state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); + state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); + state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); + state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); + state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); + state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); + state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); + state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); + state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[row][col] = M[prev][col] XOR rand + ptrWordOut[0] = ptrWordIn[0] ^ state[0]; + ptrWordOut[1] = ptrWordIn[1] ^ state[1]; + ptrWordOut[2] = ptrWordIn[2] ^ state[2]; + ptrWordOut[3] = ptrWordIn[3] ^ state[3]; + ptrWordOut[4] = ptrWordIn[4] ^ state[4]; + ptrWordOut[5] = ptrWordIn[5] ^ state[5]; + ptrWordOut[6] = ptrWordIn[6] ^ state[6]; + ptrWordOut[7] = ptrWordIn[7] ^ state[7]; + ptrWordOut[8] = ptrWordIn[8] ^ state[8]; + ptrWordOut[9] = ptrWordIn[9] ^ state[9]; + ptrWordOut[10] = ptrWordIn[10] ^ state[10]; + ptrWordOut[11] = ptrWordIn[11] ^ state[11]; + + //M[row*][col] = M[row*][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[11]; + ptrWordInOut[1] ^= state[0]; + ptrWordInOut[2] ^= state[1]; + ptrWordInOut[3] ^= state[2]; + ptrWordInOut[4] ^= state[3]; + ptrWordInOut[5] ^= state[4]; + ptrWordInOut[6] ^= state[5]; + ptrWordInOut[7] ^= state[6]; + ptrWordInOut[8] ^= state[7]; + ptrWordInOut[9] ^= state[8]; + ptrWordInOut[10] ^= state[9]; + ptrWordInOut[11] ^= state[10]; + + //Inputs: next column (i.e., next block in sequence) + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + //Output: goes to previous column + ptrWordOut -= BLOCK_LEN_INT64; + } +} + +/** + * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e., + * the wordwise addition of two columns, ignoring carries between words). The + * output of this operation, "rand", is then used to make + * "M[rowOut][col] = M[rowOut][col] XOR rand" and + * "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit + * rotation to the left. 
+ * + * @param state The current state of the sponge + * @param rowIn Row used only as input + * @param rowInOut Row used as input and to receive output after rotation + * @param rowOut Row receiving the output + * + */ +inline void reducedDuplexRowO(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { + uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row* + uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev + uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row + int i; + + for (i = 0; i < N_COLS; i++) { + + //Absorbing "M[prev] [+] M[row*]" + state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); + state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); + state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); + state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); + state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); + state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); + state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); + state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); + state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); + state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); + state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); + state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); + + //Applies the reduced-round transformation f to the sponge's state + reducedBlake2bLyra(state); + + //M[rowOut][col] = M[rowOut][col] XOR rand + ptrWordOut[0] ^= state[0]; + ptrWordOut[1] ^= state[1]; + ptrWordOut[2] ^= state[2]; + ptrWordOut[3] ^= state[3]; + ptrWordOut[4] ^= state[4]; + ptrWordOut[5] ^= state[5]; + ptrWordOut[6] ^= state[6]; + ptrWordOut[7] ^= state[7]; + ptrWordOut[8] ^= state[8]; + ptrWordOut[9] ^= state[9]; + ptrWordOut[10] ^= state[10]; + ptrWordOut[11] ^= state[11]; + + //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand) + ptrWordInOut[0] ^= state[11]; + ptrWordInOut[1] ^= state[0]; + ptrWordInOut[2] ^= state[1]; + ptrWordInOut[3] ^= state[2]; + ptrWordInOut[4] ^= state[3]; + ptrWordInOut[5] ^= state[4]; + ptrWordInOut[6] ^= state[5]; + ptrWordInOut[7] ^= state[6]; + ptrWordInOut[8] ^= state[7]; + ptrWordInOut[9] ^= state[8]; + ptrWordInOut[10] ^= state[9]; + ptrWordInOut[11] ^= state[10]; + + //Goes to next block + ptrWordOut += BLOCK_LEN_INT64; + ptrWordInOut += BLOCK_LEN_INT64; + ptrWordIn += BLOCK_LEN_INT64; + } +} + + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +/** + Prints an array of unsigned chars + */ +void printArrayO(unsigned char *array, unsigned int size, char *name) { + int i; + printf("%s: ", name); + for (i = 0; i < size; i++) { + printf("%2x|", array[i]); + } + printf("\n"); +} + +//////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/algorithm/Sponge_old.h b/algorithm/Sponge_old.h new file mode 100644 index 00000000..f8b7de25 --- /dev/null +++ b/algorithm/Sponge_old.h @@ -0,0 +1,98 @@ +/** + * Header file for Blake2b's internal permutation in the form of a sponge. + * This code is based on the original Blake2b's implementation provided by + * Samuel Neves (https://blake2.net/) + * + * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014. + * + * This software is hereby placed in the public domain. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef SPONGEOLD_H_ +#define SPONGEOLD_H_ + +#include <stdint.h> + +#if defined(__GNUC__) +#define ALIGN __attribute__ ((aligned(32))) +#elif defined(_MSC_VER) +#define ALIGN __declspec(align(32)) +#else +#define ALIGN +#endif + + +/*Blake2b IV Array*/ +static const uint64_t blake2b_IV[8] = +{ + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +}; + +/*Blake2b's rotation*/ +static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ + return ( w >> c ) | ( w << ( 64 - c ) ); +} + +/*Blake2b's G function*/ +#define G(r,i,a,b,c,d) \ + do { \ + a = a + b; \ + d = rotr64(d ^ a, 32); \ + c = c + d; \ + b = rotr64(b ^ c, 24); \ + a = a + b; \ + d = rotr64(d ^ a, 16); \ + c = c + d; \ + b = rotr64(b ^ c, 63); \ + } while(0) + + +/*One Round of the Blake2b's compression function*/ +#define ROUND_LYRA(r) \ + G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ + G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ + G(r,2,v[ 2],v[ 6],v[10],v[14]); \ + G(r,3,v[ 3],v[ 7],v[11],v[15]); \ + G(r,4,v[ 0],v[ 5],v[10],v[15]); \ + G(r,5,v[ 1],v[ 6],v[11],v[12]); \ + G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ + G(r,7,v[ 3],v[ 4],v[ 9],v[14]); + + +//---- Housekeeping +void initStateO(uint64_t state[/*16*/]); + +//---- Squeezes +void squeezeO(uint64_t *state, unsigned char *out, unsigned int len); +void reducedSqueezeRow0O(uint64_t* state, uint64_t* row); + +//---- Absorbs +void absorbBlockO(uint64_t *state, const uint64_t *in); +void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in); + +//---- Duplexes +void reducedDuplexRow1O(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut); +void reducedDuplexRowSetupO(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); +void reducedDuplexRowO(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); + +//---- Misc +void printArrayO(unsigned char *array, unsigned int size, char *name); + +//////////////////////////////////////////////////////////////////////////////////////////////// + + +#endif /* SPONGEOLD_H_ */ diff --git a/driver-opencl.c b/driver-opencl.c index f8e68617..5b01fc96 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -258,14 +258,14 @@ char *set_gpu_threads(const char *_arg) if (nextptr == NULL) return "Invalid parameters for set_gpu_threads"; val = atoi(nextptr); - if (val < 1 || val > 10) + if (val < 1 || val > 20) // gpu_threads increase max value to 20 return "Invalid value passed to set_gpu_threads"; gpus[device++].threads = val; while ((nextptr = strtok(NULL, ",")) != NULL) { val = atoi(nextptr); - if (val < 1 || val > 10) + if (val < 1 || val > 20) // gpu_threads increase max value to 20 return "Invalid value passed to set_gpu_threads"; gpus[device++].threads = val; diff --git a/kernel/Lyra2REv2.cl b/kernel/Lyra2REv2.cl new file mode 100644 index 00000000..c1adbc37 --- /dev/null +++ b/kernel/Lyra2REv2.cl @@ -0,0 +1,525 @@ +/* + * Lyra2REv2 kernel implementation.
+ * + * ==========================(LICENSE BEGIN)============================ + * Copyright (c) 2014 djm34 + * Copyright (c) 2014 James Lovejoy + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author djm34 + */ +// typedef unsigned int uint; +#pragma OPENCL EXTENSION cl_amd_printf : enable + +#ifndef LYRA2RE_CL +#define LYRA2RE_CL + +#if __ENDIAN_LITTLE__ +#define SPH_LITTLE_ENDIAN 1 +#else +#define SPH_BIG_ENDIAN 1 +#endif + +#define SPH_UPTR sph_u64 + +typedef unsigned int sph_u32; +typedef int sph_s32; +#ifndef __OPENCL_VERSION__ +typedef unsigned long sph_u64; +typedef long sph_s64; +#else +typedef unsigned long sph_u64; +typedef long sph_s64; +#endif + + +#define SPH_64 1 +#define SPH_64_TRUE 1 + +#define SPH_C32(x) ((sph_u32)(x ## U)) +#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) + +#define SPH_C64(x) ((sph_u64)(x ## UL)) +#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) + +//#define SPH_ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) +//#define SPH_ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +//#define SPH_ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +//#define SPH_ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) + +#define SPH_ROTL32(x,n) rotate(x,(uint)n) //faster with driver 14.6 +#define SPH_ROTR32(x,n) rotate(x,(uint)(32-n)) +#define SPH_ROTL64(x,n) rotate(x,(ulong)n) +#define SPH_ROTR64(x,n) rotate(x,(ulong)(64-n)) +static inline sph_u64 ror64(sph_u64 vw, unsigned a) { + uint2 result; + uint2 v = as_uint2(vw); + unsigned n = (unsigned)(64 - a); + if (n == 32) { return as_ulong((uint2)(v.y, v.x)); } + if (n < 32) { + result.y = ((v.y << (n)) | (v.x >> (32 - n))); + result.x = ((v.x << (n)) | (v.y >> (32 - n))); + } + else { + result.y = ((v.x << (n - 32)) | (v.y >> (64 - n))); + result.x = ((v.y << (n - 32)) | (v.x >> (64 - n))); + } + return as_ulong(result); +} + +//#define SPH_ROTR64(l,n) ror64(l,n) +#define memshift 3 +#include "blake256.cl" +#include "Lyra2v2.cl" +#include "keccak1600.cl" +#include "skein256.cl" +#include "cubehash.cl" +#include "bmw256.cl" + +#define SWAP4(x) as_uint(as_uchar4(x).wzyx) +#define SWAP8(x) as_ulong(as_uchar8(x).s76543210) +//#define SWAP8(x) as_ulong(as_uchar8(x).s32107654) +#if SPH_BIG_ENDIAN + #define DEC64E(x) (x) + #define DEC64BE(x) (*(const __global sph_u64 *) (x)); + #define DEC64LE(x) SWAP8(*(const __global sph_u64 *) (x)); + #define DEC32LE(x) (*(const __global sph_u32 *) (x)); +#else + #define 
DEC64E(x) SWAP8(x) + #define DEC64BE(x) SWAP8(*(const __global sph_u64 *) (x)); + #define DEC64LE(x) (*(const __global sph_u64 *) (x)); +#define DEC32LE(x) SWAP4(*(const __global sph_u32 *) (x)); +#endif + +typedef union { + unsigned char h1[32]; + uint h4[8]; + ulong h8[4]; +} hash_t; + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search( + __global uchar* hashes, + // precalc hash from fisrt part of message + const uint h0, + const uint h1, + const uint h2, + const uint h3, + const uint h4, + const uint h5, + const uint h6, + const uint h7, + // last 12 bytes of original message + const uint in16, + const uint in17, + const uint in18 +) +{ + uint gid = get_global_id(0); + __global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + +// __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + unsigned int h[8]; + unsigned int m[16]; + unsigned int v[16]; + + +h[0]=h0; +h[1]=h1; +h[2]=h2; +h[3]=h3; +h[4]=h4; +h[5]=h5; +h[6]=h6; +h[7]=h7; +// compress 2nd round + m[0] = in16; + m[1] = in17; + m[2] = in18; + m[3] = SWAP4(gid); + + for (int i = 4; i < 16; i++) {m[i] = c_Padding[i];} + + for (int i = 0; i < 8; i++) {v[i] = h[i];} + + v[8] = c_u256[0]; + v[9] = c_u256[1]; + v[10] = c_u256[2]; + v[11] = c_u256[3]; + v[12] = c_u256[4] ^ 640; + v[13] = c_u256[5] ^ 640; + v[14] = c_u256[6]; + v[15] = c_u256[7]; + + for (int r = 0; r < 14; r++) { + GS(0, 4, 0x8, 0xC, 0x0); + GS(1, 5, 0x9, 0xD, 0x2); + GS(2, 6, 0xA, 0xE, 0x4); + GS(3, 7, 0xB, 0xF, 0x6); + GS(0, 5, 0xA, 0xF, 0x8); + GS(1, 6, 0xB, 0xC, 0xA); + GS(2, 7, 0x8, 0xD, 0xC); + GS(3, 4, 0x9, 0xE, 0xE); + } + + for (int i = 0; i < 16; i++) { + int j = i & 7; + h[j] ^= v[i];} + +for (int i=0;i<8;i++) {hash->h4[i]=SWAP4(h[i]);} + +barrier(CLK_LOCAL_MEM_FENCE); + +} + +// keccak256 + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search1(__global uchar* hashes) +{ + uint gid = get_global_id(0); + // __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + __global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + sph_u64 keccak_gpu_state[25]; + + for (int i = 0; i<25; i++) { + if (i<4) { keccak_gpu_state[i] = hash->h8[i]; } + else { keccak_gpu_state[i] = 0; } + } + keccak_gpu_state[4] = 0x0000000000000001; + keccak_gpu_state[16] = 0x8000000000000000; + + keccak_block(keccak_gpu_state); + for (int i = 0; i<4; i++) { hash->h8[i] = keccak_gpu_state[i]; } +barrier(CLK_LOCAL_MEM_FENCE); + + + +} + +// cubehash256 + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search2(__global uchar* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + + sph_u32 x0 = 0xEA2BD4B4; sph_u32 x1 = 0xCCD6F29F; sph_u32 x2 = 0x63117E71; + sph_u32 x3 = 0x35481EAE; sph_u32 x4 = 0x22512D5B; sph_u32 x5 = 0xE5D94E63; + sph_u32 x6 = 0x7E624131; sph_u32 x7 = 0xF4CC12BE; sph_u32 x8 = 0xC2D0B696; + sph_u32 x9 = 0x42AF2070; sph_u32 xa = 0xD0720C35; sph_u32 xb = 0x3361DA8C; + sph_u32 xc = 0x28CCECA4; sph_u32 xd = 0x8EF8AD83; sph_u32 xe = 0x4680AC00; + sph_u32 xf = 0x40E5FBAB; + + sph_u32 xg = 0xD89041C3; sph_u32 xh = 0x6107FBD5; + sph_u32 xi = 0x6C859D41; sph_u32 xj = 0xF0B26679; sph_u32 xk = 0x09392549; + sph_u32 xl = 0x5FA25603; sph_u32 xm = 0x65C892FD; sph_u32 xn = 0x93CB6285; + sph_u32 xo = 0x2AF2B5AE; sph_u32 xp = 0x9E4B4E60; sph_u32 xq = 0x774ABFDD; + sph_u32 xr = 
0x85254725; sph_u32 xs = 0x15815AEB; sph_u32 xt = 0x4AB6AAD6; + sph_u32 xu = 0x9CDAF8AF; sph_u32 xv = 0xD6032C0A; + + x0 ^= (hash->h4[0]); + x1 ^= (hash->h4[1]); + x2 ^= (hash->h4[2]); + x3 ^= (hash->h4[3]); + x4 ^= (hash->h4[4]); + x5 ^= (hash->h4[5]); + x6 ^= (hash->h4[6]); + x7 ^= (hash->h4[7]); + + + SIXTEEN_ROUNDS; + x0 ^= 0x80; + SIXTEEN_ROUNDS; + xv ^= 0x01; + for (int i = 0; i < 10; ++i) SIXTEEN_ROUNDS; + + hash->h4[0] = x0; + hash->h4[1] = x1; + hash->h4[2] = x2; + hash->h4[3] = x3; + hash->h4[4] = x4; + hash->h4[5] = x5; + hash->h4[6] = x6; + hash->h4[7] = x7; + + + barrier(CLK_GLOBAL_MEM_FENCE); + +} + + +/// lyra2 algo + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search3(__global uchar* hashes,__global uchar* matrix ) +{ + uint gid = get_global_id(0); + // __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + __global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + __global ulong4 *DMatrix = (__global ulong4 *)(matrix + (4 * memshift * 4 * 4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + +// uint offset = (4 * memshift * 4 * 4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))/32; + ulong4 state[4]; + + state[0].x = hash->h8[0]; //password + state[0].y = hash->h8[1]; //password + state[0].z = hash->h8[2]; //password + state[0].w = hash->h8[3]; //password + state[1] = state[0]; + state[2] = (ulong4)(0x6a09e667f3bcc908UL, 0xbb67ae8584caa73bUL, 0x3c6ef372fe94f82bUL, 0xa54ff53a5f1d36f1UL); + state[3] = (ulong4)(0x510e527fade682d1UL, 0x9b05688c2b3e6c1fUL, 0x1f83d9abfb41bd6bUL, 0x5be0cd19137e2179UL); + for (int i = 0; i<12; i++) { round_lyra(state); } + + state[0] ^= (ulong4)(0x20,0x20,0x20,0x01); + state[1] ^= (ulong4)(0x04,0x04,0x80,0x0100000000000000); + + for (int i = 0; i<12; i++) { round_lyra(state); } + + + uint ps1 = (memshift * 3); +//#pragma unroll 4 + for (int i = 0; i < 4; i++) + { + uint s1 = ps1 - memshift * i; + for (int j = 0; j < 3; j++) + (DMatrix)[j+s1] = state[j]; + + round_lyra(state); + } + + reduceDuplexf(state,DMatrix); + + reduceDuplexRowSetupf(1, 0, 2,state, DMatrix); + reduceDuplexRowSetupf(2, 1, 3, state,DMatrix); + + + uint rowa; + uint prev = 3; + for (uint i = 0; i<4; i++) { + rowa = state[0].x & 3; + reduceDuplexRowf(prev, rowa, i, state, DMatrix); + prev = i; + } + + + + uint shift = (memshift * 4 * rowa); + + for (int j = 0; j < 3; j++) + state[j] ^= (DMatrix)[j+shift]; + + for (int i = 0; i < 12; i++) + round_lyra(state); +////////////////////////////////////// + + + for (int i = 0; i<4; i++) {hash->h8[i] = ((ulong*)state)[i];} +barrier(CLK_LOCAL_MEM_FENCE); + + + +} + +//skein256 + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search4(__global uchar* hashes) +{ + uint gid = get_global_id(0); + // __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + __global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + + sph_u64 h[9]; + sph_u64 t[3]; + sph_u64 dt0,dt1,dt2,dt3; + sph_u64 p0, p1, p2, p3, p4, p5, p6, p7; + h[8] = skein_ks_parity; + + for (int i = 0; i<8; i++) { + h[i] = SKEIN_IV512_256[i]; + h[8] ^= h[i];} + + t[0]=t12[0]; + t[1]=t12[1]; + t[2]=t12[2]; + + dt0=hash->h8[0]; + dt1=hash->h8[1]; + dt2=hash->h8[2]; + dt3=hash->h8[3]; + + p0 = h[0] + dt0; + p1 = h[1] + dt1; + p2 = h[2] + dt2; + p3 = h[3] + dt3; + p4 = h[4]; + p5 = h[5] + t[0]; + p6 = h[6] + t[1]; + p7 = h[7]; + + #pragma unroll + for (int i = 1; i<19; i+=2) 
{Round_8_512(p0,p1,p2,p3,p4,p5,p6,p7,i);} + p0 ^= dt0; + p1 ^= dt1; + p2 ^= dt2; + p3 ^= dt3; + + h[0] = p0; + h[1] = p1; + h[2] = p2; + h[3] = p3; + h[4] = p4; + h[5] = p5; + h[6] = p6; + h[7] = p7; + h[8] = skein_ks_parity; + + for (int i = 0; i<8; i++) { h[8] ^= h[i]; } + + t[0] = t12[3]; + t[1] = t12[4]; + t[2] = t12[5]; + p5 += t[0]; //p5 already equal h[5] + p6 += t[1]; + + #pragma unroll + for (int i = 1; i<19; i+=2) { Round_8_512(p0, p1, p2, p3, p4, p5, p6, p7, i); } + + hash->h8[0] = p0; + hash->h8[1] = p1; + hash->h8[2] = p2; + hash->h8[3] = p3; + barrier(CLK_LOCAL_MEM_FENCE); + +} + +//cubehash + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search5(__global uchar* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + sph_u32 x0 = 0xEA2BD4B4; sph_u32 x1 = 0xCCD6F29F; sph_u32 x2 = 0x63117E71; + sph_u32 x3 = 0x35481EAE; sph_u32 x4 = 0x22512D5B; sph_u32 x5 = 0xE5D94E63; + sph_u32 x6 = 0x7E624131; sph_u32 x7 = 0xF4CC12BE; sph_u32 x8 = 0xC2D0B696; + sph_u32 x9 = 0x42AF2070; sph_u32 xa = 0xD0720C35; sph_u32 xb = 0x3361DA8C; + sph_u32 xc = 0x28CCECA4; sph_u32 xd = 0x8EF8AD83; sph_u32 xe = 0x4680AC00; + sph_u32 xf = 0x40E5FBAB; + + sph_u32 xg = 0xD89041C3; sph_u32 xh = 0x6107FBD5; + sph_u32 xi = 0x6C859D41; sph_u32 xj = 0xF0B26679; sph_u32 xk = 0x09392549; + sph_u32 xl = 0x5FA25603; sph_u32 xm = 0x65C892FD; sph_u32 xn = 0x93CB6285; + sph_u32 xo = 0x2AF2B5AE; sph_u32 xp = 0x9E4B4E60; sph_u32 xq = 0x774ABFDD; + sph_u32 xr = 0x85254725; sph_u32 xs = 0x15815AEB; sph_u32 xt = 0x4AB6AAD6; + sph_u32 xu = 0x9CDAF8AF; sph_u32 xv = 0xD6032C0A; + + x0 ^= (hash->h4[0]); + x1 ^= (hash->h4[1]); + x2 ^= (hash->h4[2]); + x3 ^= (hash->h4[3]); + x4 ^= (hash->h4[4]); + x5 ^= (hash->h4[5]); + x6 ^= (hash->h4[6]); + x7 ^= (hash->h4[7]); + + + SIXTEEN_ROUNDS; + x0 ^= 0x80; + SIXTEEN_ROUNDS; + xv ^= 0x01; + for (int i = 0; i < 10; ++i) SIXTEEN_ROUNDS; + + hash->h4[0] = x0; + hash->h4[1] = x1; + hash->h4[2] = x2; + hash->h4[3] = x3; + hash->h4[4] = x4; + hash->h4[5] = x5; + hash->h4[6] = x6; + hash->h4[7] = x7; + + + barrier(CLK_GLOBAL_MEM_FENCE); + +} + + + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search6(__global uchar* hashes, __global uint* output, const ulong target) +{ + uint gid = get_global_id(0); + __global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))); + + uint dh[16] = { + (0x40414243), (0x44454647), + (0x48494A4B), (0x4C4D4E4F), + (0x50515253), (0x54555657), + (0x58595A5B), (0x5C5D5E5F), + (0x60616263), (0x64656667), + (0x68696A6B), (0x6C6D6E6F), + (0x70717273), (0x74757677), + (0x78797A7B), (0x7C7D7E7F) + }; + uint final_s[16] = { + (0xaaaaaaa0), (0xaaaaaaa1), (0xaaaaaaa2), + (0xaaaaaaa3), (0xaaaaaaa4), (0xaaaaaaa5), + (0xaaaaaaa6), (0xaaaaaaa7), (0xaaaaaaa8), + (0xaaaaaaa9), (0xaaaaaaaa), (0xaaaaaaab), + (0xaaaaaaac), (0xaaaaaaad), (0xaaaaaaae), + (0xaaaaaaaf) + }; + + uint message[16]; + for (int i = 0; i<8; i++) message[i] = hash->h4[i]; + for (int i = 9; i<14; i++) message[i] = 0; + message[8]= 0x80; + message[14]=0x100; + message[15]=0; + + Compression256(message, dh); + Compression256(dh, final_s); + barrier(CLK_LOCAL_MEM_FENCE); + + + bool result = ( ((ulong*)final_s)[7] <= target); + if (result) { + output[atomic_inc(output + 0xFF)] = SWAP4(gid); + } + +} + + +#endif // LYRA2RE_CL \ No newline at end of file diff --git a/kernel/Lyra2v2.cl b/kernel/Lyra2v2.cl new file 
mode 100644 index 00000000..f9f9161d --- /dev/null +++ b/kernel/Lyra2v2.cl @@ -0,0 +1,184 @@ +/* +* Lyra2 kernel implementation. +* +* ==========================(LICENSE BEGIN)============================ +* Copyright (c) 2014 djm34 +* +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +* +* ===========================(LICENSE END)============================= +* +* @author djm34 +*/ + + + +#define ROTL64(x,n) rotate(x,(ulong)n) +#define ROTR64(x,n) rotate(x,(ulong)(64-n)) +#define SWAP32(x) as_ulong(as_uint2(x).s10) +#define SWAP24(x) as_ulong(as_uchar8(x).s34567012) +#define SWAP16(x) as_ulong(as_uchar8(x).s23456701) + +#define G(a,b,c,d) \ + do { \ + a += b; d ^= a; d = SWAP32(d); \ + c += d; b ^= c; b = ROTR64(b,24); \ + a += b; d ^= a; d = ROTR64(d,16); \ + c += d; b ^= c; b = ROTR64(b, 63); \ +\ + } while (0) + +#define G_old(a,b,c,d) \ + do { \ + a += b; d ^= a; d = ROTR64(d, 32); \ + c += d; b ^= c; b = ROTR64(b, 24); \ + a += b; d ^= a; d = ROTR64(d, 16); \ + c += d; b ^= c; b = ROTR64(b, 63); \ +\ + } while (0) + + +/*One Round of the Blake2b's compression function*/ + +#define round_lyra(s) \ + do { \ + G(s[0].x, s[1].x, s[2].x, s[3].x); \ + G(s[0].y, s[1].y, s[2].y, s[3].y); \ + G(s[0].z, s[1].z, s[2].z, s[3].z); \ + G(s[0].w, s[1].w, s[2].w, s[3].w); \ + G(s[0].x, s[1].y, s[2].z, s[3].w); \ + G(s[0].y, s[1].z, s[2].w, s[3].x); \ + G(s[0].z, s[1].w, s[2].x, s[3].y); \ + G(s[0].w, s[1].x, s[2].y, s[3].z); \ + } while(0) + + + +void reduceDuplexf(ulong4* state ,__global ulong4* DMatrix) +{ + + ulong4 state1[3]; + uint ps1 = 0; + uint ps2 = (memshift * 3 + memshift * 4); +//#pragma unroll 4 + for (int i = 0; i < 4; i++) + { + uint s1 = ps1 + i*memshift; + uint s2 = ps2 - i*memshift; + + for (int j = 0; j < 3; j++) state1[j] = (DMatrix)[j + s1]; + + for (int j = 0; j < 3; j++) state[j] ^= state1[j]; + round_lyra(state); + for (int j = 0; j < 3; j++) state1[j] ^= state[j]; + + for (int j = 0; j < 3; j++) (DMatrix)[j + s2] = state1[j]; + } + +} + + + +void reduceDuplexRowf(uint rowIn,uint rowInOut,uint rowOut,ulong4 * state, __global ulong4 * DMatrix) +{ + +ulong4 state1[3], state2[3]; +uint ps1 = (memshift * 4 * rowIn); +uint ps2 = (memshift * 4 * rowInOut); +uint ps3 = (memshift * 4 * rowOut); + + + for (int i = 0; i < 4; i++) + { + uint s1 = ps1 + i*memshift; + uint s2 = ps2 + i*memshift; + uint s3 = ps3 + i*memshift; + + + for (int j = 0; j < 3; j++) state1[j] = (DMatrix)[j + s1]; + + for (int j = 0; j < 3; j++) state2[j] = (DMatrix)[j + s2]; + + for (int j = 0; j < 3; 
j++) state1[j] += state2[j]; + + for (int j = 0; j < 3; j++) state[j] ^= state1[j]; + + + round_lyra(state); + + ((ulong*)state2)[0] ^= ((ulong*)state)[11]; + for (int j = 0; j < 11; j++) + ((ulong*)state2)[j + 1] ^= ((ulong*)state)[j]; + + if (rowInOut != rowOut) { + for (int j = 0; j < 3; j++) + (DMatrix)[j + s2] = state2[j]; + for (int j = 0; j < 3; j++) + (DMatrix)[j + s3] ^= state[j]; + } + else { + for (int j = 0; j < 3; j++) + state2[j] ^= state[j]; + for (int j = 0; j < 3; j++) + (DMatrix)[j + s2] = state2[j]; + } + + } + } + + + + +void reduceDuplexRowSetupf(uint rowIn, uint rowInOut, uint rowOut, ulong4 *state, __global ulong4* DMatrix) { + + ulong4 state2[3], state1[3]; + uint ps1 = (memshift * 4 * rowIn); + uint ps2 = (memshift * 4 * rowInOut); + uint ps3 = (memshift * 3 + memshift * 4 * rowOut); + + for (int i = 0; i < 4; i++) + { + uint s1 = ps1 + i*memshift; + uint s2 = ps2 + i*memshift; + uint s3 = ps3 - i*memshift; + + for (int j = 0; j < 3; j++) state1[j] = (DMatrix)[j + s1]; + + for (int j = 0; j < 3; j++) state2[j] = (DMatrix)[j + s2]; + for (int j = 0; j < 3; j++) { + ulong4 tmp = state1[j] + state2[j]; + state[j] ^= tmp; + } + round_lyra(state); + + for (int j = 0; j < 3; j++) { + state1[j] ^= state[j]; + (DMatrix)[j + s3] = state1[j]; + } + + ((ulong*)state2)[0] ^= ((ulong*)state)[11]; + for (int j = 0; j < 11; j++) + ((ulong*)state2)[j + 1] ^= ((ulong*)state)[j]; + for (int j = 0; j < 3; j++) + (DMatrix)[j + s2] = state2[j]; + } + } + diff --git a/kernel/bmw256.cl b/kernel/bmw256.cl new file mode 100644 index 00000000..9d625340 --- /dev/null +++ b/kernel/bmw256.cl @@ -0,0 +1,128 @@ +#define shl(x, n) ((x) << (n)) +#define shr(x, n) ((x) >> (n)) +//#define SHR(x, n) SHR2(x, n) +//#define SHL(x, n) SHL2(x, n) + + +#define SPH_ROTL32(x,n) rotate(x,(uint)n) +#define ss0(x) (shr((x), 1) ^ shl((x), 3) ^ SPH_ROTL32((x), 4) ^ SPH_ROTL32((x), 19)) +#define ss1(x) (shr((x), 1) ^ shl((x), 2) ^ SPH_ROTL32((x), 8) ^ SPH_ROTL32((x), 23)) +#define ss2(x) (shr((x), 2) ^ shl((x), 1) ^ SPH_ROTL32((x), 12) ^ SPH_ROTL32((x), 25)) +#define ss3(x) (shr((x), 2) ^ shl((x), 2) ^ SPH_ROTL32((x), 15) ^ SPH_ROTL32((x), 29)) +#define ss4(x) (shr((x), 1) ^ (x)) +#define ss5(x) (shr((x), 2) ^ (x)) +#define rs1(x) SPH_ROTL32((x), 3) +#define rs2(x) SPH_ROTL32((x), 7) +#define rs3(x) SPH_ROTL32((x), 13) +#define rs4(x) SPH_ROTL32((x), 16) +#define rs5(x) SPH_ROTL32((x), 19) +#define rs6(x) SPH_ROTL32((x), 23) +#define rs7(x) SPH_ROTL32((x), 27) + +/* Message expansion function 1 */ +static uint expand32_1(int i, uint *M32, uint *H, uint *Q) +{ + + return (ss1(Q[i - 16]) + ss2(Q[i - 15]) + ss3(Q[i - 14]) + ss0(Q[i - 13]) + + ss1(Q[i - 12]) + ss2(Q[i - 11]) + ss3(Q[i - 10]) + ss0(Q[i - 9]) + + ss1(Q[i - 8]) + ss2(Q[i - 7]) + ss3(Q[i - 6]) + ss0(Q[i - 5]) + + ss1(Q[i - 4]) + ss2(Q[i - 3]) + ss3(Q[i - 2]) + ss0(Q[i - 1]) + + ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16])); + +} + +/* Message expansion function 2 */ +static uint expand32_2(int i, uint *M32, uint *H, uint *Q) +{ + + return (Q[i - 16] + rs1(Q[i - 15]) + Q[i - 14] + rs2(Q[i - 13]) + + Q[i - 12] + rs3(Q[i - 11]) + Q[i - 10] + rs4(Q[i - 9]) + + Q[i - 8] + rs5(Q[i - 7]) + Q[i - 6] + rs6(Q[i - 5]) + + Q[i - 4] + rs7(Q[i - 3]) + ss4(Q[i - 2]) + ss5(Q[i - 1]) + + ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - 
SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16])); + +} + +static void Compression256(uint *M32, uint *H) +{ + + int i; + uint XL32, XH32, Q[32]; + + + Q[0] = (M32[5] ^ H[5]) - (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[13] ^ H[13]) + (M32[14] ^ H[14]); + Q[1] = (M32[6] ^ H[6]) - (M32[8] ^ H[8]) + (M32[11] ^ H[11]) + (M32[14] ^ H[14]) - (M32[15] ^ H[15]); + Q[2] = (M32[0] ^ H[0]) + (M32[7] ^ H[7]) + (M32[9] ^ H[9]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]); + Q[3] = (M32[0] ^ H[0]) - (M32[1] ^ H[1]) + (M32[8] ^ H[8]) - (M32[10] ^ H[10]) + (M32[13] ^ H[13]); + Q[4] = (M32[1] ^ H[1]) + (M32[2] ^ H[2]) + (M32[9] ^ H[9]) - (M32[11] ^ H[11]) - (M32[14] ^ H[14]); + Q[5] = (M32[3] ^ H[3]) - (M32[2] ^ H[2]) + (M32[10] ^ H[10]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]); + Q[6] = (M32[4] ^ H[4]) - (M32[0] ^ H[0]) - (M32[3] ^ H[3]) - (M32[11] ^ H[11]) + (M32[13] ^ H[13]); + Q[7] = (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[5] ^ H[5]) - (M32[12] ^ H[12]) - (M32[14] ^ H[14]); + Q[8] = (M32[2] ^ H[2]) - (M32[5] ^ H[5]) - (M32[6] ^ H[6]) + (M32[13] ^ H[13]) - (M32[15] ^ H[15]); + Q[9] = (M32[0] ^ H[0]) - (M32[3] ^ H[3]) + (M32[6] ^ H[6]) - (M32[7] ^ H[7]) + (M32[14] ^ H[14]); + Q[10] = (M32[8] ^ H[8]) - (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[7] ^ H[7]) + (M32[15] ^ H[15]); + Q[11] = (M32[8] ^ H[8]) - (M32[0] ^ H[0]) - (M32[2] ^ H[2]) - (M32[5] ^ H[5]) + (M32[9] ^ H[9]); + Q[12] = (M32[1] ^ H[1]) + (M32[3] ^ H[3]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[10] ^ H[10]); + Q[13] = (M32[2] ^ H[2]) + (M32[4] ^ H[4]) + (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[11] ^ H[11]); + Q[14] = (M32[3] ^ H[3]) - (M32[5] ^ H[5]) + (M32[8] ^ H[8]) - (M32[11] ^ H[11]) - (M32[12] ^ H[12]); + Q[15] = (M32[12] ^ H[12]) - (M32[4] ^ H[4]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[13] ^ H[13]); + + /* Diffuse the differences in every word in a bijective manner with ssi, and then add the values of the previous double pipe.*/ + Q[0] = ss0(Q[0]) + H[1]; + Q[1] = ss1(Q[1]) + H[2]; + Q[2] = ss2(Q[2]) + H[3]; + Q[3] = ss3(Q[3]) + H[4]; + Q[4] = ss4(Q[4]) + H[5]; + Q[5] = ss0(Q[5]) + H[6]; + Q[6] = ss1(Q[6]) + H[7]; + Q[7] = ss2(Q[7]) + H[8]; + Q[8] = ss3(Q[8]) + H[9]; + Q[9] = ss4(Q[9]) + H[10]; + Q[10] = ss0(Q[10]) + H[11]; + Q[11] = ss1(Q[11]) + H[12]; + Q[12] = ss2(Q[12]) + H[13]; + Q[13] = ss3(Q[13]) + H[14]; + Q[14] = ss4(Q[14]) + H[15]; + Q[15] = ss0(Q[15]) + H[0]; + + /* This is the Message expansion or f_1 in the documentation. */ + /* It has 16 rounds. */ + /* Blue Midnight Wish has two tunable security parameters. */ + /* The parameters are named EXPAND_1_ROUNDS and EXPAND_2_ROUNDS. */ + /* The following relation for these parameters should is satisfied: */ + /* EXPAND_1_ROUNDS + EXPAND_2_ROUNDS = 16 */ + + for (i = 0; i<2; i++) + Q[i + 16] = expand32_1(i + 16, M32, H, Q); + + for (i = 2; i<16; i++) + Q[i + 16] = expand32_2(i + 16, M32, H, Q); + + /* Blue Midnight Wish has two temporary cummulative variables that accumulate via XORing */ + /* 16 new variables that are prooduced in the Message Expansion part. */ + XL32 = Q[16] ^ Q[17] ^ Q[18] ^ Q[19] ^ Q[20] ^ Q[21] ^ Q[22] ^ Q[23]; + XH32 = XL32^Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31]; + + + /* This part is the function f_2 - in the documentation */ + + /* Compute the double chaining pipe for the next message block. 
*/ + H[0] = (shl(XH32, 5) ^ shr(Q[16], 5) ^ M32[0]) + (XL32 ^ Q[24] ^ Q[0]); + H[1] = (shr(XH32, 7) ^ shl(Q[17], 8) ^ M32[1]) + (XL32 ^ Q[25] ^ Q[1]); + H[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]); + H[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]); + H[4] = (shr(XH32, 3) ^ Q[20] ^ M32[4]) + (XL32 ^ Q[28] ^ Q[4]); + H[5] = (shl(XH32, 6) ^ shr(Q[21], 6) ^ M32[5]) + (XL32 ^ Q[29] ^ Q[5]); + H[6] = (shr(XH32, 4) ^ shl(Q[22], 6) ^ M32[6]) + (XL32 ^ Q[30] ^ Q[6]); + H[7] = (shr(XH32, 11) ^ shl(Q[23], 2) ^ M32[7]) + (XL32 ^ Q[31] ^ Q[7]); + + H[8] = SPH_ROTL32(H[4], 9) + (XH32 ^ Q[24] ^ M32[8]) + (shl(XL32, 8) ^ Q[23] ^ Q[8]); + H[9] = SPH_ROTL32(H[5], 10) + (XH32 ^ Q[25] ^ M32[9]) + (shr(XL32, 6) ^ Q[16] ^ Q[9]); + H[10] = SPH_ROTL32(H[6], 11) + (XH32 ^ Q[26] ^ M32[10]) + (shl(XL32, 6) ^ Q[17] ^ Q[10]); + H[11] = SPH_ROTL32(H[7], 12) + (XH32 ^ Q[27] ^ M32[11]) + (shl(XL32, 4) ^ Q[18] ^ Q[11]); + H[12] = SPH_ROTL32(H[0], 13) + (XH32 ^ Q[28] ^ M32[12]) + (shr(XL32, 3) ^ Q[19] ^ Q[12]); + H[13] = SPH_ROTL32(H[1], 14) + (XH32 ^ Q[29] ^ M32[13]) + (shr(XL32, 4) ^ Q[20] ^ Q[13]); + H[14] = SPH_ROTL32(H[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]); + H[15] = SPH_ROTL32(H[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]); + +} diff --git a/kernel/cubehash256.cl b/kernel/cubehash256.cl new file mode 100644 index 00000000..9bc4c654 --- /dev/null +++ b/kernel/cubehash256.cl @@ -0,0 +1,132 @@ +// cubehash256 +// djm34 2015 based on ccminer cubehash512 + +#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ +#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ + + +#define LROT(x, bits) rotate( x,(uint) bits) + + +#define ROTATEUPWARDS7(a) LROT(a,7) +#define ROTATEUPWARDS11(a) LROT(a,11) + +#define SWAP(a,b) { uint u = a; a = b; b = u; } + +inline void rrounds(uint x[2][2][2][2][2]) +{ + int r; + int j; + int k; + int l; + int m; + +//#pragma unroll 2 + for (r = 0; r < CUBEHASH_ROUNDS; ++r) { + + /* "add x_0jklm into x_1jklmn modulo 2^32" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 7 bits" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]); + + /* "swap x_00klm with x_01klm" */ +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[0][0][k][l][m], x[0][1][k][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jk0m with x_1jk1m" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[1][j][k][0][m], x[1][j][k][1][m]) + + /* "add x_0jklm into x_1jklm modulo 2^32" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* 
"rotate x_0jklm upwards by 11 bits" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]); + + /* "swap x_0j0lm with x_0j1lm" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + SWAP(x[0][j][0][l][m], x[0][j][1][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) +//#pragma unroll 2 + for (m = 0; m < 2; ++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jkl0 with x_1jkl1" */ +//#pragma unroll 2 + for (j = 0; j < 2; ++j) +//#pragma unroll 2 + for (k = 0; k < 2; ++k) +//#pragma unroll 2 + for (l = 0; l < 2; ++l) + SWAP(x[1][j][k][l][0], x[1][j][k][l][1]) + + } +} + + diff --git a/kernel/skein256.cl b/kernel/skein256.cl index afbe7571..a7d85faf 100644 --- a/kernel/skein256.cl +++ b/kernel/skein256.cl @@ -48,14 +48,14 @@ __constant static const sph_u64 SKEIN_IV512_256[8] = { __constant static const int ROT256[8][4] = { - 46, 36, 19, 37, - 33, 27, 14, 42, - 17, 49, 36, 39, - 44, 9, 54, 56, - 39, 30, 34, 24, - 13, 50, 10, 17, - 25, 29, 39, 43, - 8, 35, 56, 22, + {46, 36, 19, 37}, + {33, 27, 14, 42}, + {17, 49, 36, 39}, + {44, 9, 54, 56}, + {39, 30, 34, 24}, + {13, 50, 10, 17}, + {25, 29, 39, 43}, + {8, 35, 56, 22} }; __constant static const sph_u64 skein_ks_parity = 0x1BD11BDAA9FC1A22; diff --git a/miner.h b/miner.h index 737974e6..09e8d422 100644 --- a/miner.h +++ b/miner.h @@ -1156,8 +1156,8 @@ extern struct pool *add_pool(void); extern bool add_pool_details(struct pool *pool, bool live, char *url, char *user, char *pass, char *name, char *desc, char *profile, char *algo); #define MAX_GPUDEVICES 16 -#define MAX_DEVICES 4096 - +//#define MAX_DEVICES 4096 +#define MAX_DEVICES 8192 #define MIN_INTENSITY 4 #define MIN_INTENSITY_STR "4" #define MAX_INTENSITY 31 @@ -1267,6 +1267,7 @@ struct stratum_work { size_t cb_len; size_t header_len; int merkles; + double next_diff; double diff; }; diff --git a/ocl.c b/ocl.c index 685ee98a..d1e8d7e5 100644 --- a/ocl.c +++ b/ocl.c @@ -37,6 +37,7 @@ #include "algorithm/neoscrypt.h" #include "algorithm/pluck.h" #include "algorithm/yescrypt.h" +#include "algorithm/Lyra2RE.h" /* FIXME: only here for global config vars, replace with configuration.h * or similar as soon as config is in a struct instead of littered all @@ -599,6 +600,88 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg applog(LOG_DEBUG, "GPU %d: computing max. global thread count to %u", gpu, (unsigned)(cgpu->thread_concurrency)); + } + else if ( !safe_cmp(cgpu->algorithm.name, "lyra2REv2") ) { + size_t glob_thread_count; + long max_int; + unsigned char type = 0; + + // determine which intensity type to use + if (cgpu->rawintensity > 0) { + glob_thread_count = cgpu->rawintensity; + max_int = glob_thread_count; + type = 2; + } + else if (cgpu->xintensity > 0) { + glob_thread_count = clState->compute_shaders * ((cgpu->algorithm.xintensity_shift) ? (1UL << (cgpu->algorithm.xintensity_shift + cgpu->xintensity)) : cgpu->xintensity); + max_int = cgpu->xintensity; + type = 1; + } + else { + glob_thread_count = 1UL << (cgpu->algorithm.intensity_shift + cgpu->intensity); + max_int = ((cgpu->dynamic) ? 
MAX_INTENSITY : cgpu->intensity); + } + + glob_thread_count = ((glob_thread_count < cgpu->work_size) ? cgpu->work_size : glob_thread_count); + + // if TC * scratchbuf size is too big for memory... reduce to max + if ((glob_thread_count * LYRA_SCRATCHBUF_SIZE) >= (uint64_t)cgpu->max_alloc) { + + /* Selected intensity will not run on this GPU. Not enough memory. + * Adapt the memory setting. */ + // depending on intensity type used, reduce the intensity until it fits into the GPU max_alloc + switch (type) { + //raw intensity + case 2: + while ((glob_thread_count * LYRA_SCRATCHBUF_SIZE) > (uint64_t)cgpu->max_alloc) { + --glob_thread_count; + } + + max_int = glob_thread_count; + cgpu->rawintensity = glob_thread_count; + break; + + //x intensity + case 1: + glob_thread_count = cgpu->max_alloc / LYRA_SCRATCHBUF_SIZE; + max_int = glob_thread_count / clState->compute_shaders; + + while (max_int && ((clState->compute_shaders * (1UL << max_int)) > glob_thread_count)) { + --max_int; + } + + /* Check if max_intensity is >0. */ + if (max_int < MIN_XINTENSITY) { + applog(LOG_ERR, "GPU %d: Max xintensity is below minimum.", gpu); + max_int = MIN_XINTENSITY; + } + + cgpu->xintensity = max_int; + glob_thread_count = clState->compute_shaders * (1UL << max_int); + break; + + default: + glob_thread_count = cgpu->max_alloc / LYRA_SCRATCHBUF_SIZE; + while (max_int && ((1UL << max_int) & glob_thread_count) == 0) { + --max_int; + } + + /* Check if max_intensity is >0. */ + if (max_int < MIN_INTENSITY) { + applog(LOG_ERR, "GPU %d: Max intensity is below minimum.", gpu); + max_int = MIN_INTENSITY; + } + + cgpu->intensity = max_int; + glob_thread_count = 1UL << max_int; + break; + } + } + + // TC is glob thread count + cgpu->thread_concurrency = glob_thread_count; + + applog(LOG_DEBUG, "GPU %d: computing max. global thread count to %u", gpu, (unsigned)(cgpu->thread_concurrency)); } else if (!cgpu->opt_tc) { unsigned int sixtyfours; @@ -689,12 +772,17 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg if (clState->n_extra_kernels > 0) { unsigned int i; char kernel_name[9]; // max: search99 + 0x0 + char kernel_name2[10]; // max: search99 + 0x0 clState->extra_kernels = (cl_kernel *)malloc(sizeof(cl_kernel) * clState->n_extra_kernels); for (i = 0; i < clState->n_extra_kernels; i++) { - snprintf(kernel_name, 9, "%s%d", "search", i + 1); - clState->extra_kernels[i] = clCreateKernel(clState->program, kernel_name, &status); + if (i+1<100){ + snprintf(kernel_name, 9, "%s%d", "search", i + 1); + clState->extra_kernels[i] = clCreateKernel(clState->program, kernel_name, &status); + }else { + snprintf(kernel_name2, 10, "%s%d", "search", i + 1); + clState->extra_kernels[i] = clCreateKernel(clState->program, kernel_name2, &status);} if (status != CL_SUCCESS) { applog(LOG_DEBUG, "Error %d: Creating ExtraKernel #%d from program. (clCreateKernel)", status, i); return NULL; @@ -739,6 +827,21 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg + } + else if (!safe_cmp(algorithm->name, "lyra2REv2") ) { + /* The scratch/pad-buffer needs 32kBytes memory per thread. */ + bufsize = 4 * 4 * 12 * sizeof(unsigned long long) * cgpu->thread_concurrency; + buf1size = 4 * sizeof(unsigned long long) * cgpu->thread_concurrency; //matrix + + /* This is the input buffer. For yescrypt this is guaranteed to be + * 80 bytes only. 
*/ + readbufsize = 80; + + applog(LOG_DEBUG, "lyra2REv2 buffer sizes: %lu RW, %lu RW", (unsigned long)bufsize, (unsigned long)buf1size); + // scrypt/n-scrypt + + + } else if (!safe_cmp(algorithm->name, "pluck")) { /* The scratch/pad-buffer needs 32kBytes memory per thread. */ @@ -798,7 +901,25 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg } -} + } + + else if (!safe_cmp(algorithm->name, "lyra2REv2") ) { + // need additional buffers + clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf1size, NULL, &status); + if (status != CL_SUCCESS && !clState->buffer1) { + applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status); + return NULL; + } + + } + else { + clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status); // we don't need that much just tired... + if (status != CL_SUCCESS && !clState->buffer1) { + applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status); + return NULL; + } + + } /* This buffer is weird and might work to some degree even if * the create buffer call has apparently failed, so check if we @@ -808,6 +929,13 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg applog(LOG_ERR, "Error %d: clCreateBuffer (padbuffer8), decrease TC or increase LG", status); return NULL; } + +// clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize/8, NULL, &status); // we don't need that much just tired... +// if (status != CL_SUCCESS && !clState->buffer1) { +// applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status); +// return NULL; +// } + } applog(LOG_DEBUG, "Using read buffer sized %lu", (unsigned long)readbufsize); diff --git a/sgminer.c b/sgminer.c index 9f10659c..fe64a414 100644 --- a/sgminer.c +++ b/sgminer.c @@ -2265,7 +2265,12 @@ static bool gbt_decode(struct pool *pool, json_t *res_val) static bool getwork_decode(json_t *res_val, struct work *work) { - if (unlikely(!jobj_binary(res_val, "data", work->data, sizeof(work->data), true))) { + + size_t worklen = 128; + worklen = ((!safe_cmp(work->pool->algorithm.name, "credits")) ? sizeof(work->data) : worklen); + + + if (unlikely(!jobj_binary(res_val, "data", work->data, worklen, true))) { if (opt_morenotices) applog(LOG_ERR, "%s: JSON inval data", isnull(get_pool_name(work->pool), "")); return false; @@ -3030,7 +3035,11 @@ static bool submit_upstream_work(struct work *work, CURL *curl, char *curl_err_s } /* build hex string - Make sure to restrict to 80 bytes for Neoscrypt */ - hexstr = bin2hex(work->data, ((!safe_cmp(work->pool->algorithm.name, "neoscrypt")) ? 80 : sizeof(work->data))); + + int worksize_default = (!safe_cmp(work->pool->algorithm.name, "neoscrypt") ? 80 : 128); + hexstr = bin2hex(work->data, (!safe_cmp(work->pool->algorithm.name, "credits") ? 
sizeof(work->data) : worksize_default)); + /* build JSON-RPC request */ if (work->gbt) { diff --git a/util.c b/util.c index 675cdad5..756ff7f0 100644 --- a/util.c +++ b/util.c @@ -1560,6 +1560,8 @@ static bool parse_notify(struct pool *pool, json_t *val) pool->swork.nbit = nbit; pool->swork.ntime = ntime; pool->swork.clean = clean; + pool->swork.diff = pool->swork.next_diff; + alloc_len = pool->swork.cb_len = cb1_len + pool->n1_len + pool->n2size + cb2_len; pool->nonce2_offset = cb1_len + pool->n1_len; @@ -1668,8 +1670,8 @@ static bool parse_diff(struct pool *pool, json_t *val) return false; cg_wlock(&pool->data_lock); - old_diff = pool->swork.diff; - pool->swork.diff = diff; + old_diff = pool->swork.next_diff; + pool->swork.next_diff = diff; cg_wunlock(&pool->data_lock); if (old_diff != diff) { @@ -2560,7 +2562,7 @@ out: if (!pool->stratum_url) pool->stratum_url = pool->sockaddr_url; pool->stratum_active = true; - pool->swork.diff = 1; + pool->swork.next_diff = pool->swork.diff = 1; if (opt_protocol) { applog(LOG_DEBUG, "%s confirmed mining.subscribe with extranonce1 %s extran2size %d", get_pool_name(pool), pool->nonce1, pool->n2size);
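
A note on the util.c change just above: moving the value received from mining.set_difficulty into pool->swork.next_diff, and only copying it into pool->swork.diff when the next mining.notify arrives, means a difficulty change no longer retargets shares for the job that is already being mined. Below is a minimal sketch of that latching behaviour, using hypothetical struct and function names rather than sgminer's own:

/* Sketch only: illustrates the deferred-difficulty latch that parse_diff and
 * parse_notify implement above through swork.next_diff and swork.diff.
 * Names below are hypothetical, not sgminer's. */
#include <stdio.h>

struct stratum_diff {
    double diff;       /* difficulty applied to the job currently being mined */
    double next_diff;  /* difficulty announced for upcoming jobs              */
};

/* mining.set_difficulty: only record the pending value */
static void on_set_difficulty(struct stratum_diff *s, double d)
{
    s->next_diff = d;
}

/* mining.notify: the pending difficulty becomes the active one */
static void on_notify(struct stratum_diff *s)
{
    s->diff = s->next_diff;
}

int main(void)
{
    struct stratum_diff s = { .diff = 1.0, .next_diff = 1.0 }; /* as after mining.subscribe */

    on_set_difficulty(&s, 512.0);             /* arrives while the current job is live */
    printf("current job diff: %g\n", s.diff); /* still 1 */

    on_notify(&s);                            /* next job starts */
    printf("new job diff: %g\n", s.diff);     /* now 512 */
    return 0;
}

This matches the mining.subscribe hunk above, which now seeds both fields with 1 so the first notify starts from a defined value.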
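
One more note, on the lyra2REv2 buffer sizing in the ocl.c hunks earlier in the patch: per thread, the Lyra2v2 matrix used by the search3 kernel takes 4 rows x 4 columns x 12 ulong words = 1536 bytes, and each hash_t slot that the kernels address as hashes + (gid % MAX_GLOBAL_THREADS) * 4 * sizeof(ulong) takes 32 bytes; those are exactly the per-thread factors in the bufsize and buf1size expressions. The sketch below reproduces that arithmetic and the raw-intensity clamp against the device's maximum allocation; it assumes the per-thread scratch constant (LYRA_SCRATCHBUF_SIZE in the patch) equals the 1536-byte matrix footprint, which this diff does not state explicitly.

/* Sketch only: mirrors the lyra2REv2 sizing arithmetic from the ocl.c hunks.
 * The scratch constant below is an assumption (4 rows * 4 cols * 12 words * 8 bytes);
 * the patch itself takes LYRA_SCRATCHBUF_SIZE from algorithm/Lyra2RE.h. */
#include <stdint.h>
#include <stdio.h>

#define LYRA2V2_MATRIX_BYTES (4 * 4 * 12 * sizeof(unsigned long long)) /* 1536 bytes per thread */
#define LYRA2V2_HASH_BYTES   (4 * sizeof(unsigned long long))          /*   32 bytes per thread */

int main(void)
{
    uint64_t max_alloc = 256ULL << 20; /* example CL_DEVICE_MAX_MEM_ALLOC_SIZE: 256 MiB */
    uint64_t threads   = 1ULL << 20;   /* example raw intensity / thread concurrency   */

    /* raw-intensity clamp, as in the "case 2" branch: shrink until the matrix fits */
    while (threads * LYRA2V2_MATRIX_BYTES > max_alloc)
        --threads;

    uint64_t bufsize  = LYRA2V2_MATRIX_BYTES * threads; /* Lyra2 matrix scratchpad      */
    uint64_t buf1size = LYRA2V2_HASH_BYTES * threads;   /* 32-byte hash slot per thread */

    printf("threads=%llu, matrix=%llu bytes, hashes=%llu bytes\n",
           (unsigned long long)threads, (unsigned long long)bufsize,
           (unsigned long long)buf1size);
    return 0;
}

In other words, only 32 bytes per thread are needed for the 256-bit hash handed from one kernel to the next; the allocation that actually limits the thread count is the matrix buffer.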