Browse Source

new vertcoin algorithm: Lyra2REv2

kept old lyra2RE for coins still using it (careful the original lyra2
had a bug, this one is corrected only in the new version)
implemented next_diff trick for pools
djm34
djm34 10 years ago
parent
commit
3e612cf335
  1. 1
      Makefile.am
  2. 67
      algorithm.c
  3. 1
      algorithm.h
  4. 15
      algorithm/Lyra2.c
  5. 8
      algorithm/Lyra2.h
  6. 27
      algorithm/Lyra2RE.c
  7. 3
      algorithm/Lyra2RE.h
  8. 169
      algorithm/Lyra2RE_old.c
  9. 10
      algorithm/Lyra2RE_old.h
  10. 208
      algorithm/Lyra2_old.c
  11. 50
      algorithm/Lyra2_old.h
  12. 27
      algorithm/Sponge.c
  13. 8
      algorithm/Sponge.h
  14. 405
      algorithm/Sponge_old.c
  15. 98
      algorithm/Sponge_old.h
  16. 4
      driver-opencl.c
  17. 525
      kernel/Lyra2REv2.cl
  18. 184
      kernel/Lyra2v2.cl
  19. 128
      kernel/bmw256.cl
  20. 132
      kernel/cubehash256.cl
  21. 16
      kernel/skein256.cl
  22. 5
      miner.h
  23. 134
      ocl.c
  24. 13
      sgminer.c
  25. 8
      util.c

1
Makefile.am

@ -70,6 +70,7 @@ sgminer_SOURCES += algorithm/whirlcoin.c algorithm/whirlcoin.h @@ -70,6 +70,7 @@ sgminer_SOURCES += algorithm/whirlcoin.c algorithm/whirlcoin.h
sgminer_SOURCES += algorithm/neoscrypt.c algorithm/neoscrypt.h
sgminer_SOURCES += algorithm/pluck.c algorithm/pluck.h
sgminer_SOURCES += algorithm/credits.c algorithm/credits.h
sgminer_SOURCES += algorithm/Lyra2RE_old.c algorithm/Lyra2RE_old.h algorithm/Lyra2_old.c algorithm/Lyra2_old.h algorithm/Sponge_old.c algorithm/Sponge_old.h
sgminer_SOURCES += algorithm/Lyra2RE.c algorithm/Lyra2RE.h algorithm/Lyra2.c algorithm/Lyra2.h algorithm/Sponge.c algorithm/Sponge.h
sgminer_SOURCES += algorithm/yescrypt.h algorithm/yescrypt.c algorithm/yescrypt_core.h algorithm/yescrypt-opt.c algorithm/yescryptcommon.c algorithm/sysendian.h

67
algorithm.c

@ -31,7 +31,8 @@ @@ -31,7 +31,8 @@
#include "algorithm/fresh.h"
#include "algorithm/whirlcoin.h"
#include "algorithm/neoscrypt.h"
#include "algorithm/Lyra2RE.h"
#include "algorithm/Lyra2RE.h" //lyra new version
#include "algorithm/Lyra2RE_old.h" //lyra old version
#include "algorithm/pluck.h"
#include "algorithm/yescrypt.h"
#include "algorithm/credits.h"
@ -59,6 +60,7 @@ const char *algorithm_type_str[] = { @@ -59,6 +60,7 @@ const char *algorithm_type_str[] = {
"Whirlcoin",
"Neoscrypt",
"Lyra2RE",
"Lyta2REv2"
"pluck",
"yescrypt",
"yescrypt-multi"
@ -409,6 +411,62 @@ static cl_int queue_lyra2RE_kernel(struct __clState *clState, struct _dev_blk_ct @@ -409,6 +411,62 @@ static cl_int queue_lyra2RE_kernel(struct __clState *clState, struct _dev_blk_ct
return status;
}
static cl_int queue_lyra2REv2_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
{
cl_kernel *kernel;
unsigned int num;
cl_int status = 0;
cl_ulong le_target;
// le_target = *(cl_uint *)(blk->work->device_target + 28);
le_target = *(cl_ulong *)(blk->work->device_target + 24);
flip80(clState->cldata, blk->work->data);
status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL);
// blake - search
kernel = &clState->kernel;
num = 0;
// CL_SET_ARG(clState->CLbuffer0);
CL_SET_ARG(clState->buffer1);
CL_SET_ARG(blk->work->blk.ctx_a);
CL_SET_ARG(blk->work->blk.ctx_b);
CL_SET_ARG(blk->work->blk.ctx_c);
CL_SET_ARG(blk->work->blk.ctx_d);
CL_SET_ARG(blk->work->blk.ctx_e);
CL_SET_ARG(blk->work->blk.ctx_f);
CL_SET_ARG(blk->work->blk.ctx_g);
CL_SET_ARG(blk->work->blk.ctx_h);
CL_SET_ARG(blk->work->blk.cty_a);
CL_SET_ARG(blk->work->blk.cty_b);
CL_SET_ARG(blk->work->blk.cty_c);
// keccak - search1
kernel = clState->extra_kernels;
CL_SET_ARG_0(clState->buffer1);
// cubehash - search2
num = 0;
CL_NEXTKERNEL_SET_ARG_0(clState->buffer1);
// lyra - search3
num = 0;
CL_NEXTKERNEL_SET_ARG_N(0, clState->buffer1);
CL_SET_ARG_N(1, clState->padbuffer8);
// skein -search4
num = 0;
CL_NEXTKERNEL_SET_ARG_0(clState->buffer1);
// cubehash - search5
num = 0;
CL_NEXTKERNEL_SET_ARG_0(clState->buffer1);
// bmw - search6
num = 0;
CL_NEXTKERNEL_SET_ARG(clState->buffer1);
CL_SET_ARG(clState->outputBuffer);
CL_SET_ARG(le_target);
return status;
}
static cl_int queue_darkcoin_mod_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
{
@ -927,7 +985,10 @@ static algorithm_settings_t algos[] = { @@ -927,7 +985,10 @@ static algorithm_settings_t algos[] = {
{ "fresh", ALGO_FRESH, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 4 * 16 * 4194304, 0, fresh_regenhash, queue_fresh_kernel, gen_hash, NULL},
{ "Lyra2RE", ALGO_LYRA2RE, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4,2 * 8 * 4194304 , 0, lyra2re_regenhash, queue_lyra2RE_kernel, gen_hash, NULL},
{ "Lyra2RE", ALGO_LYRA2RE, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4,2 * 8 * 4194304 , 0, lyra2reold_regenhash, queue_lyra2RE_kernel, gen_hash, NULL},
{ "Lyra2REv2", ALGO_LYRA2REv2, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 6, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, lyra2re_regenhash, queue_lyra2REv2_kernel, gen_hash, append_neoscrypt_compiler_options },
// kernels starting from this will have difficulty calculated by using fuguecoin algorithm
#define A_FUGUE(a, b, c) \
@ -1035,7 +1096,7 @@ void set_algorithm(algorithm_t* algo, const char* newname_alias) @@ -1035,7 +1096,7 @@ void set_algorithm(algorithm_t* algo, const char* newname_alias)
// use old nfactor if it was previously set and is different than the one set by alias
if ((old_nfactor > 0) && (old_nfactor != nfactor))
nfactor = old_nfactor;
if (algo->type == ALGO_LYRA2RE) {opt_lyra = true;}
if (algo->type == ALGO_LYRA2RE || algo->type == ALGO_LYRA2REv2 ) { opt_lyra = true; }
set_algorithm_nfactor(algo, nfactor);
//reapply kernelfile if was set

1
algorithm.h

@ -28,6 +28,7 @@ typedef enum { @@ -28,6 +28,7 @@ typedef enum {
ALGO_WHIRL,
ALGO_NEOSCRYPT,
ALGO_LYRA2RE,
ALGO_LYRA2REv2,
ALGO_PLUCK,
ALGO_YESCRYPT,
ALGO_YESCRYPT_MULTI,

15
algorithm/Lyra2.c

@ -58,6 +58,11 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * @@ -58,6 +58,11 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *
//========== Initializing the Memory Matrix and pointers to it =============//
//Tries to allocate enough space for the whole memory matrix
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES);
uint64_t *wholeMatrix = malloc(i);
if (wholeMatrix == NULL) {
@ -130,16 +135,16 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * @@ -130,16 +135,16 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *
ptrWord = wholeMatrix;
for (i = 0; i < nBlocksInput; i++) {
absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN_BLAKE2_SAFE_BYTES; //goes to next block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil)
}
//Initializes M[0] and M[1]
reducedSqueezeRow0(state, memMatrix[0]); //The locally copied password is most likely overwritten here
reducedDuplexRow1(state, memMatrix[0], memMatrix[1]);
reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here
reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols);
do {
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]);
reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
//updates the value of row* (deterministically picked during Setup))
@ -172,7 +177,7 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void * @@ -172,7 +177,7 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *
//------------------------------------------------------------------------------------------
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]);
reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
//update prev: it now points to the last row ever computed
prev = row;

8
algorithm/Lyra2.h

@ -37,14 +37,6 @@ typedef unsigned char byte; @@ -37,14 +37,6 @@ typedef unsigned char byte;
#define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes
#endif
#ifndef N_COLS
#define N_COLS 8 //Number of columns in the memory matrix: fixed to 64 by default
#endif
#define ROW_LEN_INT64 (BLOCK_LEN_INT64 * N_COLS) //Total length of a row: N_COLS blocks
#define ROW_LEN_BYTES (ROW_LEN_INT64 * 8) //Number of bytes per row
int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols);
#endif /* LYRA2_H_ */

27
algorithm/Lyra2RE.c

@ -36,6 +36,8 @@ @@ -36,6 +36,8 @@
#include "sph/sph_groestl.h"
#include "sph/sph_skein.h"
#include "sph/sph_keccak.h"
#include "sph/sph_bmw.h"
#include "sph/sph_cubehash.h"
#include "Lyra2.h"
/*
@ -55,34 +57,37 @@ be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len) @@ -55,34 +57,37 @@ be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
inline void lyra2rehash(void *state, const void *input)
{
sph_blake256_context ctx_blake;
sph_groestl256_context ctx_groestl;
sph_bmw256_context ctx_bmw;
sph_keccak256_context ctx_keccak;
sph_skein256_context ctx_skein;
sph_cubehash256_context ctx_cube;
uint32_t hashA[8], hashB[8];
sph_blake256_init(&ctx_blake);
sph_blake256 (&ctx_blake, input, 80);
sph_blake256_close (&ctx_blake, hashA);
sph_keccak256_init(&ctx_keccak);
sph_keccak256 (&ctx_keccak,hashA, 32);
sph_keccak256_close(&ctx_keccak, hashB);
LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
sph_cubehash256_init(&ctx_cube);
sph_cubehash256(&ctx_cube, hashB, 32);
sph_cubehash256_close(&ctx_cube, hashA);
LYRA2(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4);
sph_skein256_init(&ctx_skein);
sph_skein256 (&ctx_skein, hashA, 32);
sph_skein256_close(&ctx_skein, hashB);
sph_skein256 (&ctx_skein, hashB, 32);
sph_skein256_close(&ctx_skein, hashA);
sph_cubehash256_init(&ctx_cube);
sph_cubehash256(&ctx_cube, hashA, 32);
sph_cubehash256_close(&ctx_cube, hashB);
sph_groestl256_init(&ctx_groestl);
sph_groestl256 (&ctx_groestl, hashB, 32);
sph_groestl256_close(&ctx_groestl, hashA);
sph_bmw256_init(&ctx_bmw);
sph_bmw256 (&ctx_bmw, hashB, 32);
sph_bmw256_close(&ctx_bmw, hashA);
//printf("cpu hash %08x %08x %08x %08x\n",hashA[0],hashA[1],hashA[2],hashA[3]);

3
algorithm/Lyra2RE.h

@ -2,7 +2,8 @@ @@ -2,7 +2,8 @@
#define LYRA2RE_H
#include "miner.h"
#define LYRA_SCRATCHBUF_SIZE (4 ) // matrix extended to 16 matrix[16][8][8] uint64_t or equivalent
#define LYRA_SECBUF_SIZE (4) //8 uint64
extern int lyra2re_test(unsigned char *pdata, const unsigned char *ptarget,
uint32_t nonce);
extern void lyra2re_regenhash(struct work *work);

169
algorithm/Lyra2RE_old.c

@ -0,0 +1,169 @@ @@ -0,0 +1,169 @@
/*-
* Copyright 2014 James Lovejoy
* Copyright 2014 phm
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "config.h"
#include "miner.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "sph/sph_blake.h"
#include "sph/sph_groestl.h"
#include "sph/sph_skein.h"
#include "sph/sph_keccak.h"
#include "Lyra2_old.h"
/*
* Encode a length len/4 vector of (uint32_t) into a length len vector of
* (unsigned char) in big-endian form. Assumes len is a multiple of 4.
*/
static inline void
be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
{
uint32_t i;
for (i = 0; i < len; i++)
dst[i] = htobe32(src[i]);
}
inline void lyra2rehash_old(void *state, const void *input)
{
sph_blake256_context ctx_blake;
sph_groestl256_context ctx_groestl;
sph_keccak256_context ctx_keccak;
sph_skein256_context ctx_skein;
uint32_t hashA[8], hashB[8];
sph_blake256_init(&ctx_blake);
sph_blake256 (&ctx_blake, input, 80);
sph_blake256_close (&ctx_blake, hashA);
sph_keccak256_init(&ctx_keccak);
sph_keccak256 (&ctx_keccak,hashA, 32);
sph_keccak256_close(&ctx_keccak, hashB);
LYRA2O(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
sph_skein256_init(&ctx_skein);
sph_skein256 (&ctx_skein, hashA, 32);
sph_skein256_close(&ctx_skein, hashB);
sph_groestl256_init(&ctx_groestl);
sph_groestl256 (&ctx_groestl, hashB, 32);
sph_groestl256_close(&ctx_groestl, hashA);
//printf("cpu hash %08x %08x %08x %08x\n",hashA[0],hashA[1],hashA[2],hashA[3]);
memcpy(state, hashA, 32);
}
static const uint32_t diff1targ = 0x0000ffff;
/* Used externally as confirmation of correct OCL code */
int lyra2reold_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce)
{
uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]);
uint32_t data[20], ohash[8];
be32enc_vect(data, (const uint32_t *)pdata, 19);
data[19] = htobe32(nonce);
lyra2rehash(ohash, data);
tmp_hash7 = be32toh(ohash[7]);
applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx",
(long unsigned int)Htarg,
(long unsigned int)diff1targ,
(long unsigned int)tmp_hash7);
if (tmp_hash7 > diff1targ)
return -1;
if (tmp_hash7 > Htarg)
return 0;
return 1;
}
void lyra2reold_regenhash(struct work *work)
{
uint32_t data[20];
uint32_t *nonce = (uint32_t *)(work->data + 76);
uint32_t *ohash = (uint32_t *)(work->hash);
be32enc_vect(data, (const uint32_t *)work->data, 19);
data[19] = htobe32(*nonce);
lyra2rehash(ohash, data);
}
bool scanhash_lyra2reold(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate,
unsigned char *pdata, unsigned char __maybe_unused *phash1,
unsigned char __maybe_unused *phash, const unsigned char *ptarget,
uint32_t max_nonce, uint32_t *last_nonce, uint32_t n)
{
uint32_t *nonce = (uint32_t *)(pdata + 76);
uint32_t data[20];
uint32_t tmp_hash7;
uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]);
bool ret = false;
be32enc_vect(data, (const uint32_t *)pdata, 19);
while(1) {
uint32_t ostate[8];
*nonce = ++n;
data[19] = (n);
lyra2rehash(ostate, data);
tmp_hash7 = (ostate[7]);
applog(LOG_INFO, "data7 %08lx",
(long unsigned int)data[7]);
if (unlikely(tmp_hash7 <= Htarg)) {
((uint32_t *)pdata)[19] = htobe32(n);
*last_nonce = n;
ret = true;
break;
}
if (unlikely((n >= max_nonce) || thr->work_restart)) {
*last_nonce = n;
break;
}
}
return ret;
}

10
algorithm/Lyra2RE_old.h

@ -0,0 +1,10 @@ @@ -0,0 +1,10 @@
#ifndef LYRA2REOLD_H
#define LYRA2REOLD_H
#include "miner.h"
extern int lyra2reold_test(unsigned char *pdata, const unsigned char *ptarget,
uint32_t nonce);
extern void lyra2reold_regenhash(struct work *work);
#endif /* LYRA2RE_H */

208
algorithm/Lyra2_old.c

@ -0,0 +1,208 @@ @@ -0,0 +1,208 @@
/**
* Implementation of the Lyra2 Password Hashing Scheme (PHS).
*
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
*
* This software is hereby placed in the public domain.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "Lyra2_old.h"
#include "Sponge_old.h"
/**
* Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
* whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits,
* where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all
* integer parameters (treated as type "unsigned int") in the order they are provided, plus the value
* of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols).
*
* @param K The derived key to be output by the algorithm
* @param kLen Desired key length
* @param pwd User password
* @param pwdlen Password length
* @param salt Salt
* @param saltlen Salt length
* @param timeCost Parameter to determine the processing time (T)
* @param nRows Number or rows of the memory matrix (R)
* @param nCols Number of columns of the memory matrix (C)
*
* @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
*/
int LYRA2O(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols) {
//============================= Basic variables ============================//
int64_t row = 2; //index of row to be processed
int64_t prev = 1; //index of prev (last row ever computed/modified)
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
int64_t tau; //Time Loop iterator
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
int64_t i; //auxiliary iteration counter
//==========================================================================/
//========== Initializing the Memory Matrix and pointers to it =============//
//Tries to allocate enough space for the whole memory matrix
i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES);
uint64_t *wholeMatrix = malloc(i);
if (wholeMatrix == NULL) {
return -1;
}
memset(wholeMatrix, 0, i);
//Allocates pointers to each row of the matrix
uint64_t **memMatrix = malloc(nRows * sizeof (uint64_t*));
if (memMatrix == NULL) {
return -1;
}
//Places the pointers in the correct positions
uint64_t *ptrWord = wholeMatrix;
for (i = 0; i < nRows; i++) {
memMatrix[i] = ptrWord;
ptrWord += ROW_LEN_INT64;
}
//==========================================================================/
//============= Getting the password + salt + basil padded with 10*1 ===============//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
//but this ensures that the password copied locally will be overwritten as soon as possible
//First, we clean enough blocks for the password, salt, basil and padding
uint64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof (uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
byte *ptrByte = (byte*) wholeMatrix;
memset(ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES);
//Prepends the password
memcpy(ptrByte, pwd, pwdlen);
ptrByte += pwdlen;
//Concatenates the salt
memcpy(ptrByte, salt, saltlen);
ptrByte += saltlen;
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
memcpy(ptrByte, &kLen, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy(ptrByte, &pwdlen, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy(ptrByte, &saltlen, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy(ptrByte, &timeCost, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy(ptrByte, &nRows, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy(ptrByte, &nCols, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
//Now comes the padding
*ptrByte = 0x80; //first byte of padding: right after the password
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
//==========================================================================/
//======================= Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
uint64_t *state = malloc(16 * sizeof (uint64_t));
if (state == NULL) {
return -1;
}
initState(state);
//==========================================================================/
//================================ Setup Phase =============================//
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
ptrWord = wholeMatrix;
for (i = 0; i < nBlocksInput; i++) {
absorbBlockBlake2SafeO(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN_BLAKE2_SAFE_BYTES; //goes to next block of pad(pwd || salt || basil)
}
//Initializes M[0] and M[1]
reducedSqueezeRow0O(state, memMatrix[0]); //The locally copied password is most likely overwritten here
reducedDuplexRow1O(state, memMatrix[0], memMatrix[1]);
do {
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
reducedDuplexRowSetupO(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]);
//updates the value of row* (deterministically picked during Setup))
rowa = (rowa + step) & (window - 1);
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
row++;
//Checks if all rows in the window where visited.
if (rowa == 0) {
step = window + gap; //changes the step: approximately doubles its value
window *= 2; //doubles the size of the re-visitation window
gap = -gap; //inverts the modifier to the step
}
} while (row < nRows);
//==========================================================================/
//============================ Wandering Phase =============================//
row = 0; //Resets the visitation to the first row of the memory matrix
for (tau = 1; tau <= timeCost; tau++) {
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
do {
//Selects a pseudorandom index row*
//------------------------------------------------------------------------------------------
//rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//------------------------------------------------------------------------------------------
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
reducedDuplexRowO(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]);
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
//------------------------------------------------------------------------------------------
//row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//------------------------------------------------------------------------------------------
} while (row != 0);
}
//==========================================================================/
//============================ Wrap-up Phase ===============================//
//Absorbs the last block of the memory matrix
absorbBlockO(state, memMatrix[rowa]);
//Squeezes the key
squeezeO(state, K, kLen);
//==========================================================================/
//========================= Freeing the memory =============================//
free(memMatrix);
free(wholeMatrix);
//Wiping out the sponge's internal state before freeing it
memset(state, 0, 16 * sizeof (uint64_t));
free(state);
//==========================================================================/
return 0;
}

50
algorithm/Lyra2_old.h

@ -0,0 +1,50 @@ @@ -0,0 +1,50 @@
/**
* Header file for the Lyra2 Password Hashing Scheme (PHS).
*
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
*
* This software is hereby placed in the public domain.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef LYRA2OLD_H_
#define LYRA2OLD_H_
#include <stdint.h>
typedef unsigned char byte;
//Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED)
#define BLOCK_LEN_BLAKE2_SAFE_INT64 8 //512 bits (=64 bytes, =8 uint64_t)
#define BLOCK_LEN_BLAKE2_SAFE_BYTES (BLOCK_LEN_BLAKE2_SAFE_INT64 * 8) //same as above, in bytes
#ifdef BLOCK_LEN_BITS
#define BLOCK_LEN_INT64 (BLOCK_LEN_BITS/64) //Block length: 768 bits (=96 bytes, =12 uint64_t)
#define BLOCK_LEN_BYTES (BLOCK_LEN_BITS/8) //Block length, in bytes
#else //default block lenght: 768 bits
#define BLOCK_LEN_INT64 12 //Block length: 768 bits (=96 bytes, =12 uint64_t)
#define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes
#endif
#ifndef N_COLS
#define N_COLS 8 //Number of columns in the memory matrix: fixed to 64 by default
#endif
#define ROW_LEN_INT64 (BLOCK_LEN_INT64 * N_COLS) //Total length of a row: N_COLS blocks
#define ROW_LEN_BYTES (ROW_LEN_INT64 * 8) //Number of bytes per row
int LYRA2O(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols);
#endif /* LYRA2_H_ */

27
algorithm/Sponge.c

@ -137,7 +137,8 @@ inline void absorbBlock(uint64_t *state, const uint64_t *in) { @@ -137,7 +137,8 @@ inline void absorbBlock(uint64_t *state, const uint64_t *in) {
*/
inline void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) {
//XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state
state[0] ^= in[0];
state[0] ^= in[0];
state[1] ^= in[1];
state[2] ^= in[2];
state[3] ^= in[3];
@ -146,8 +147,10 @@ inline void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) { @@ -146,8 +147,10 @@ inline void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) {
state[6] ^= in[6];
state[7] ^= in[7];
//Applies the transformation f to the sponge's state
blake2bLyra(state);
}
/**
@ -158,11 +161,11 @@ inline void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) { @@ -158,11 +161,11 @@ inline void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) {
* @param state The current state of the sponge
* @param rowOut Row to receive the data squeezed
*/
inline void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut) {
uint64_t* ptrWord = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1]
inline void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut, uint64_t nCols) {
uint64_t* ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1]
int i;
//M[row][C-1-col] = H.reduced_squeeze()
for (i = 0; i < N_COLS; i++) {
for (i = 0; i < nCols; i++) {
ptrWord[0] = state[0];
ptrWord[1] = state[1];
ptrWord[2] = state[2];
@ -193,12 +196,12 @@ inline void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut) { @@ -193,12 +196,12 @@ inline void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut) {
* @param rowIn Row to feed the sponge
* @param rowOut Row to receive the sponge's output
*/
inline void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut) {
inline void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols) {
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
int i;
for (i = 0; i < N_COLS; i++) {
for (i = 0; i < nCols; i++) {
//Absorbing "M[prev][col]"
state[0] ^= (ptrWordIn[0]);
@ -253,13 +256,13 @@ inline void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut @@ -253,13 +256,13 @@ inline void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut
* @param rowOut Row receiving the output
*
*/
inline void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
inline void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols) {
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
int i;
for (i = 0; i < N_COLS; i++) {
for (i = 0; i < nCols; i++) {
//Absorbing "M[prev] [+] M[row*]"
state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]);
state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]);
@ -327,13 +330,13 @@ inline void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *ro @@ -327,13 +330,13 @@ inline void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *ro
* @param rowOut Row receiving the output
*
*/
inline void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
inline void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols) {
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
int i;
for (i = 0; i < N_COLS; i++) {
for (i = 0; i < nCols; i++) {
//Absorbing "M[prev] [+] M[row*]"
state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]);

8
algorithm/Sponge.h

@ -78,16 +78,16 @@ void initState(uint64_t state[/*16*/]); @@ -78,16 +78,16 @@ void initState(uint64_t state[/*16*/]);
//---- Squeezes
void squeeze(uint64_t *state, unsigned char *out, unsigned int len);
void reducedSqueezeRow0(uint64_t* state, uint64_t* row);
void reducedSqueezeRow0(uint64_t* state, uint64_t* row, uint64_t nCols);
//---- Absorbs
void absorbBlock(uint64_t *state, const uint64_t *in);
void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in);
//---- Duplexes
void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut);
void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
//---- Misc
void printArray(unsigned char *array, unsigned int size, char *name);

405
algorithm/Sponge_old.c

@ -0,0 +1,405 @@ @@ -0,0 +1,405 @@
/**
* A simple implementation of Blake2b's internal permutation
* in the form of a sponge.
*
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
*
* This software is hereby placed in the public domain.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <string.h>
#include <stdio.h>
#include <time.h>
#include "Sponge_old.h"
#include "Lyra2_old.h"
/**
* Initializes the Sponge State. The first 512 bits are set to zeros and the remainder
* receive Blake2b's IV as per Blake2b's specification. <b>Note:</b> Even though sponges
* typically have their internal state initialized with zeros, Blake2b's G function
* has a fixed point: if the internal state and message are both filled with zeros. the
* resulting permutation will always be a block filled with zeros; this happens because
* Blake2b does not use the constants originally employed in Blake2 inside its G function,
* relying on the IV for avoiding possible fixed points.
*
* @param state The 1024-bit array to be initialized
*/
inline void initStateO(uint64_t state[/*16*/]) {
//First 512 bis are zeros
memset(state, 0, 64);
//Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV
state[8] = blake2b_IV[0];
state[9] = blake2b_IV[1];
state[10] = blake2b_IV[2];
state[11] = blake2b_IV[3];
state[12] = blake2b_IV[4];
state[13] = blake2b_IV[5];
state[14] = blake2b_IV[6];
state[15] = blake2b_IV[7];
}
/**
* Execute Blake2b's G function, with all 12 rounds.
*
* @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
*/
inline static void blake2bLyra(uint64_t *v) {
ROUND_LYRA(0);
ROUND_LYRA(1);
ROUND_LYRA(2);
ROUND_LYRA(3);
ROUND_LYRA(4);
ROUND_LYRA(5);
ROUND_LYRA(6);
ROUND_LYRA(7);
ROUND_LYRA(8);
ROUND_LYRA(9);
ROUND_LYRA(10);
ROUND_LYRA(11);
}
/**
* Executes a reduced version of Blake2b's G function with only one round
* @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
*/
inline static void reducedBlake2bLyra(uint64_t *v) {
ROUND_LYRA(0);
}
/**
* Performs a squeeze operation, using Blake2b's G function as the
* internal permutation
*
* @param state The current state of the sponge
* @param out Array that will receive the data squeezed
* @param len The number of bytes to be squeezed into the "out" array
*/
inline void squeezeO(uint64_t *state, byte *out, unsigned int len) {
int fullBlocks = len / BLOCK_LEN_BYTES;
byte *ptr = out;
int i;
//Squeezes full blocks
for (i = 0; i < fullBlocks; i++) {
memcpy(ptr, state, BLOCK_LEN_BYTES);
blake2bLyra(state);
ptr += BLOCK_LEN_BYTES;
}
//Squeezes remaining bytes
memcpy(ptr, state, (len % BLOCK_LEN_BYTES));
}
/**
* Performs an absorb operation for a single block (BLOCK_LEN_INT64 words
* of type uint64_t), using Blake2b's G function as the internal permutation
*
* @param state The current state of the sponge
* @param in The block to be absorbed (BLOCK_LEN_INT64 words)
*/
inline void absorbBlockO(uint64_t *state, const uint64_t *in) {
//XORs the first BLOCK_LEN_INT64 words of "in" with the current state
state[0] ^= in[0];
state[1] ^= in[1];
state[2] ^= in[2];
state[3] ^= in[3];
state[4] ^= in[4];
state[5] ^= in[5];
state[6] ^= in[6];
state[7] ^= in[7];
state[8] ^= in[8];
state[9] ^= in[9];
state[10] ^= in[10];
state[11] ^= in[11];
//Applies the transformation f to the sponge's state
blake2bLyra(state);
}
/**
* Performs an absorb operation for a single block (BLOCK_LEN_BLAKE2_SAFE_INT64
* words of type uint64_t), using Blake2b's G function as the internal permutation
*
* @param state The current state of the sponge
* @param in The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words)
*/
inline void absorbBlockBlake2SafeO(uint64_t *state, const uint64_t *in) {
//XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state
state[0] ^= in[0];
state[1] ^= in[1];
state[2] ^= in[2];
state[3] ^= in[3];
state[4] ^= in[4];
state[5] ^= in[5];
state[6] ^= in[6];
state[7] ^= in[7];
//Applies the transformation f to the sponge's state
blake2bLyra(state);
}
/**
* Performs a reduced squeeze operation for a single row, from the highest to
* the lowest index, using the reduced-round Blake2b's G function as the
* internal permutation
*
* @param state The current state of the sponge
* @param rowOut Row to receive the data squeezed
*/
inline void reducedSqueezeRow0O(uint64_t* state, uint64_t* rowOut) {
uint64_t* ptrWord = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1]
int i;
//M[row][C-1-col] = H.reduced_squeeze()
for (i = 0; i < N_COLS; i++) {
ptrWord[0] = state[0];
ptrWord[1] = state[1];
ptrWord[2] = state[2];
ptrWord[3] = state[3];
ptrWord[4] = state[4];
ptrWord[5] = state[5];
ptrWord[6] = state[6];
ptrWord[7] = state[7];
ptrWord[8] = state[8];
ptrWord[9] = state[9];
ptrWord[10] = state[10];
ptrWord[11] = state[11];
//Goes to next block (column) that will receive the squeezed data
ptrWord -= BLOCK_LEN_INT64;
//Applies the reduced-round transformation f to the sponge's state
reducedBlake2bLyra(state);
}
}
/**
* Performs a reduced duplex operation for a single row, from the highest to
* the lowest index, using the reduced-round Blake2b's G function as the
* internal permutation
*
* @param state The current state of the sponge
* @param rowIn Row to feed the sponge
* @param rowOut Row to receive the sponge's output
*/
inline void reducedDuplexRow1O(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut) {
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
int i;
for (i = 0; i < N_COLS; i++) {
//Absorbing "M[prev][col]"
state[0] ^= (ptrWordIn[0]);
state[1] ^= (ptrWordIn[1]);
state[2] ^= (ptrWordIn[2]);
state[3] ^= (ptrWordIn[3]);
state[4] ^= (ptrWordIn[4]);
state[5] ^= (ptrWordIn[5]);
state[6] ^= (ptrWordIn[6]);
state[7] ^= (ptrWordIn[7]);
state[8] ^= (ptrWordIn[8]);
state[9] ^= (ptrWordIn[9]);
state[10] ^= (ptrWordIn[10]);
state[11] ^= (ptrWordIn[11]);
//Applies the reduced-round transformation f to the sponge's state
reducedBlake2bLyra(state);
//M[row][C-1-col] = M[prev][col] XOR rand
ptrWordOut[0] = ptrWordIn[0] ^ state[0];
ptrWordOut[1] = ptrWordIn[1] ^ state[1];
ptrWordOut[2] = ptrWordIn[2] ^ state[2];
ptrWordOut[3] = ptrWordIn[3] ^ state[3];
ptrWordOut[4] = ptrWordIn[4] ^ state[4];
ptrWordOut[5] = ptrWordIn[5] ^ state[5];
ptrWordOut[6] = ptrWordIn[6] ^ state[6];
ptrWordOut[7] = ptrWordIn[7] ^ state[7];
ptrWordOut[8] = ptrWordIn[8] ^ state[8];
ptrWordOut[9] = ptrWordIn[9] ^ state[9];
ptrWordOut[10] = ptrWordIn[10] ^ state[10];
ptrWordOut[11] = ptrWordIn[11] ^ state[11];
//Input: next column (i.e., next block in sequence)
ptrWordIn += BLOCK_LEN_INT64;
//Output: goes to previous column
ptrWordOut -= BLOCK_LEN_INT64;
}
}
/**
* Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e.,
* the wordwise addition of two columns, ignoring carries between words). The
* output of this operation, "rand", is then used to make
* "M[rowOut][(N_COLS-1)-col] = M[rowIn][col] XOR rand" and
* "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit
* rotation to the left and N_COLS is a system parameter.
*
* @param state The current state of the sponge
* @param rowIn Row used only as input
* @param rowInOut Row used as input and to receive output after rotation
* @param rowOut Row receiving the output
*
*/
inline void reducedDuplexRowSetupO(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
int i;
for (i = 0; i < N_COLS; i++) {
//Absorbing "M[prev] [+] M[row*]"
state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]);
state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]);
state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]);
state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]);
state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]);
state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]);
state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]);
state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]);
state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]);
state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]);
state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]);
state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]);
//Applies the reduced-round transformation f to the sponge's state
reducedBlake2bLyra(state);
//M[row][col] = M[prev][col] XOR rand
ptrWordOut[0] = ptrWordIn[0] ^ state[0];
ptrWordOut[1] = ptrWordIn[1] ^ state[1];
ptrWordOut[2] = ptrWordIn[2] ^ state[2];
ptrWordOut[3] = ptrWordIn[3] ^ state[3];
ptrWordOut[4] = ptrWordIn[4] ^ state[4];
ptrWordOut[5] = ptrWordIn[5] ^ state[5];
ptrWordOut[6] = ptrWordIn[6] ^ state[6];
ptrWordOut[7] = ptrWordIn[7] ^ state[7];
ptrWordOut[8] = ptrWordIn[8] ^ state[8];
ptrWordOut[9] = ptrWordIn[9] ^ state[9];
ptrWordOut[10] = ptrWordIn[10] ^ state[10];
ptrWordOut[11] = ptrWordIn[11] ^ state[11];
//M[row*][col] = M[row*][col] XOR rotW(rand)
ptrWordInOut[0] ^= state[11];
ptrWordInOut[1] ^= state[0];
ptrWordInOut[2] ^= state[1];
ptrWordInOut[3] ^= state[2];
ptrWordInOut[4] ^= state[3];
ptrWordInOut[5] ^= state[4];
ptrWordInOut[6] ^= state[5];
ptrWordInOut[7] ^= state[6];
ptrWordInOut[8] ^= state[7];
ptrWordInOut[9] ^= state[8];
ptrWordInOut[10] ^= state[9];
ptrWordInOut[11] ^= state[10];
//Inputs: next column (i.e., next block in sequence)
ptrWordInOut += BLOCK_LEN_INT64;
ptrWordIn += BLOCK_LEN_INT64;
//Output: goes to previous column
ptrWordOut -= BLOCK_LEN_INT64;
}
}
/**
* Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e.,
* the wordwise addition of two columns, ignoring carries between words). The
* output of this operation, "rand", is then used to make
* "M[rowOut][col] = M[rowOut][col] XOR rand" and
* "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit
* rotation to the left.
*
* @param state The current state of the sponge
* @param rowIn Row used only as input
* @param rowInOut Row used as input and to receive output after rotation
* @param rowOut Row receiving the output
*
*/
inline void reducedDuplexRowO(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
int i;
for (i = 0; i < N_COLS; i++) {
//Absorbing "M[prev] [+] M[row*]"
state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]);
state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]);
state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]);
state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]);
state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]);
state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]);
state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]);
state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]);
state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]);
state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]);
state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]);
state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]);
//Applies the reduced-round transformation f to the sponge's state
reducedBlake2bLyra(state);
//M[rowOut][col] = M[rowOut][col] XOR rand
ptrWordOut[0] ^= state[0];
ptrWordOut[1] ^= state[1];
ptrWordOut[2] ^= state[2];
ptrWordOut[3] ^= state[3];
ptrWordOut[4] ^= state[4];
ptrWordOut[5] ^= state[5];
ptrWordOut[6] ^= state[6];
ptrWordOut[7] ^= state[7];
ptrWordOut[8] ^= state[8];
ptrWordOut[9] ^= state[9];
ptrWordOut[10] ^= state[10];
ptrWordOut[11] ^= state[11];
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
ptrWordInOut[0] ^= state[11];
ptrWordInOut[1] ^= state[0];
ptrWordInOut[2] ^= state[1];
ptrWordInOut[3] ^= state[2];
ptrWordInOut[4] ^= state[3];
ptrWordInOut[5] ^= state[4];
ptrWordInOut[6] ^= state[5];
ptrWordInOut[7] ^= state[6];
ptrWordInOut[8] ^= state[7];
ptrWordInOut[9] ^= state[8];
ptrWordInOut[10] ^= state[9];
ptrWordInOut[11] ^= state[10];
//Goes to next block
ptrWordOut += BLOCK_LEN_INT64;
ptrWordInOut += BLOCK_LEN_INT64;
ptrWordIn += BLOCK_LEN_INT64;
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/**
Prints an array of unsigned chars
*/
void printArrayO(unsigned char *array, unsigned int size, char *name) {
int i;
printf("%s: ", name);
for (i = 0; i < size; i++) {
printf("%2x|", array[i]);
}
printf("\n");
}
////////////////////////////////////////////////////////////////////////////////////////////////

98
algorithm/Sponge_old.h

@ -0,0 +1,98 @@ @@ -0,0 +1,98 @@
/**
* Header file for Blake2b's internal permutation in the form of a sponge.
* This code is based on the original Blake2b's implementation provided by
* Samuel Neves (https://blake2.net/)
*
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
*
* This software is hereby placed in the public domain.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef SPONGEOLD_H_
#define SPONGEOLD_H_
#include <stdint.h>
#if defined(__GNUC__)
#define ALIGN __attribute__ ((aligned(32)))
#elif defined(_MSC_VER)
#define ALIGN __declspec(align(32))
#else
#define ALIGN
#endif
/*Blake2b IV Array*/
static const uint64_t blake2b_IV[8] =
{
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
};
/*Blake2b's rotation*/
static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
return ( w >> c ) | ( w << ( 64 - c ) );
}
/*Blake2b's G function*/
#define G(r,i,a,b,c,d) \
do { \
a = a + b; \
d = rotr64(d ^ a, 32); \
c = c + d; \
b = rotr64(b ^ c, 24); \
a = a + b; \
d = rotr64(d ^ a, 16); \
c = c + d; \
b = rotr64(b ^ c, 63); \
} while(0)
/*One Round of the Blake2b's compression function*/
#define ROUND_LYRA(r) \
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
G(r,2,v[ 2],v[ 6],v[10],v[14]); \
G(r,3,v[ 3],v[ 7],v[11],v[15]); \
G(r,4,v[ 0],v[ 5],v[10],v[15]); \
G(r,5,v[ 1],v[ 6],v[11],v[12]); \
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
//---- Housekeeping
void initStateO(uint64_t state[/*16*/]);
//---- Squeezes
void squeezeO(uint64_t *state, unsigned char *out, unsigned int len);
void reducedSqueezeRow0O(uint64_t* state, uint64_t* row);
//---- Absorbs
void absorbBlockO(uint64_t *state, const uint64_t *in);
void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in);
//---- Duplexes
void reducedDuplexRow1O(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut);
void reducedDuplexRowSetupO(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
void reducedDuplexRowO(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
//---- Misc
void printArrayO(unsigned char *array, unsigned int size, char *name);
////////////////////////////////////////////////////////////////////////////////////////////////
#endif /* SPONGE_H_ */

4
driver-opencl.c

@ -258,14 +258,14 @@ char *set_gpu_threads(const char *_arg) @@ -258,14 +258,14 @@ char *set_gpu_threads(const char *_arg)
if (nextptr == NULL)
return "Invalid parameters for set_gpu_threads";
val = atoi(nextptr);
if (val < 1 || val > 10)
if (val < 1 || val > 20) // gpu_threads increase max value to 20
return "Invalid value passed to set_gpu_threads";
gpus[device++].threads = val;
while ((nextptr = strtok(NULL, ",")) != NULL) {
val = atoi(nextptr);
if (val < 1 || val > 10)
if (val < 1 || val > 20) // gpu_threads increase max value to 20
return "Invalid value passed to set_gpu_threads";
gpus[device++].threads = val;

525
kernel/Lyra2REv2.cl

@ -0,0 +1,525 @@ @@ -0,0 +1,525 @@
/*
* Lyra2RE kernel implementation.
*
* ==========================(LICENSE BEGIN)============================
* Copyright (c) 2014 djm34
* Copyright (c) 2014 James Lovejoy
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author djm34
*/
// typedef unsigned int uint;
#pragma OPENCL EXTENSION cl_amd_printf : enable
#ifndef LYRA2RE_CL
#define LYRA2RE_CL
#if __ENDIAN_LITTLE__
#define SPH_LITTLE_ENDIAN 1
#else
#define SPH_BIG_ENDIAN 1
#endif
#define SPH_UPTR sph_u64
typedef unsigned int sph_u32;
typedef int sph_s32;
#ifndef __OPENCL_VERSION__
typedef unsigned long sph_u64;
typedef long sph_s64;
#else
typedef unsigned long sph_u64;
typedef long sph_s64;
#endif
#define SPH_64 1
#define SPH_64_TRUE 1
#define SPH_C32(x) ((sph_u32)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#define SPH_C64(x) ((sph_u64)(x ## UL))
#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
//#define SPH_ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
//#define SPH_ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
//#define SPH_ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
//#define SPH_ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
#define SPH_ROTL32(x,n) rotate(x,(uint)n) //faster with driver 14.6
#define SPH_ROTR32(x,n) rotate(x,(uint)(32-n))
#define SPH_ROTL64(x,n) rotate(x,(ulong)n)
#define SPH_ROTR64(x,n) rotate(x,(ulong)(64-n))
static inline sph_u64 ror64(sph_u64 vw, unsigned a) {
uint2 result;
uint2 v = as_uint2(vw);
unsigned n = (unsigned)(64 - a);
if (n == 32) { return as_ulong((uint2)(v.y, v.x)); }
if (n < 32) {
result.y = ((v.y << (n)) | (v.x >> (32 - n)));
result.x = ((v.x << (n)) | (v.y >> (32 - n)));
}
else {
result.y = ((v.x << (n - 32)) | (v.y >> (64 - n)));
result.x = ((v.y << (n - 32)) | (v.x >> (64 - n)));
}
return as_ulong(result);
}
//#define SPH_ROTR64(l,n) ror64(l,n)
#define memshift 3
#include "blake256.cl"
#include "Lyra2v2.cl"
#include "keccak1600.cl"
#include "skein256.cl"
#include "cubehash.cl"
#include "bmw256.cl"
#define SWAP4(x) as_uint(as_uchar4(x).wzyx)
#define SWAP8(x) as_ulong(as_uchar8(x).s76543210)
//#define SWAP8(x) as_ulong(as_uchar8(x).s32107654)
#if SPH_BIG_ENDIAN
#define DEC64E(x) (x)
#define DEC64BE(x) (*(const __global sph_u64 *) (x));
#define DEC64LE(x) SWAP8(*(const __global sph_u64 *) (x));
#define DEC32LE(x) (*(const __global sph_u32 *) (x));
#else
#define DEC64E(x) SWAP8(x)
#define DEC64BE(x) SWAP8(*(const __global sph_u64 *) (x));
#define DEC64LE(x) (*(const __global sph_u64 *) (x));
#define DEC32LE(x) SWAP4(*(const __global sph_u32 *) (x));
#endif
typedef union {
unsigned char h1[32];
uint h4[8];
ulong h8[4];
} hash_t;
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(
__global uchar* hashes,
// precalc hash from fisrt part of message
const uint h0,
const uint h1,
const uint h2,
const uint h3,
const uint h4,
const uint h5,
const uint h6,
const uint h7,
// last 12 bytes of original message
const uint in16,
const uint in17,
const uint in18
)
{
uint gid = get_global_id(0);
__global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
// __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
unsigned int h[8];
unsigned int m[16];
unsigned int v[16];
h[0]=h0;
h[1]=h1;
h[2]=h2;
h[3]=h3;
h[4]=h4;
h[5]=h5;
h[6]=h6;
h[7]=h7;
// compress 2nd round
m[0] = in16;
m[1] = in17;
m[2] = in18;
m[3] = SWAP4(gid);
for (int i = 4; i < 16; i++) {m[i] = c_Padding[i];}
for (int i = 0; i < 8; i++) {v[i] = h[i];}
v[8] = c_u256[0];
v[9] = c_u256[1];
v[10] = c_u256[2];
v[11] = c_u256[3];
v[12] = c_u256[4] ^ 640;
v[13] = c_u256[5] ^ 640;
v[14] = c_u256[6];
v[15] = c_u256[7];
for (int r = 0; r < 14; r++) {
GS(0, 4, 0x8, 0xC, 0x0);
GS(1, 5, 0x9, 0xD, 0x2);
GS(2, 6, 0xA, 0xE, 0x4);
GS(3, 7, 0xB, 0xF, 0x6);
GS(0, 5, 0xA, 0xF, 0x8);
GS(1, 6, 0xB, 0xC, 0xA);
GS(2, 7, 0x8, 0xD, 0xC);
GS(3, 4, 0x9, 0xE, 0xE);
}
for (int i = 0; i < 16; i++) {
int j = i & 7;
h[j] ^= v[i];}
for (int i=0;i<8;i++) {hash->h4[i]=SWAP4(h[i]);}
barrier(CLK_LOCAL_MEM_FENCE);
}
// keccak256
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search1(__global uchar* hashes)
{
uint gid = get_global_id(0);
// __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
sph_u64 keccak_gpu_state[25];
for (int i = 0; i<25; i++) {
if (i<4) { keccak_gpu_state[i] = hash->h8[i]; }
else { keccak_gpu_state[i] = 0; }
}
keccak_gpu_state[4] = 0x0000000000000001;
keccak_gpu_state[16] = 0x8000000000000000;
keccak_block(keccak_gpu_state);
for (int i = 0; i<4; i++) { hash->h8[i] = keccak_gpu_state[i]; }
barrier(CLK_LOCAL_MEM_FENCE);
}
// cubehash256
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search2(__global uchar* hashes)
{
uint gid = get_global_id(0);
__global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
sph_u32 x0 = 0xEA2BD4B4; sph_u32 x1 = 0xCCD6F29F; sph_u32 x2 = 0x63117E71;
sph_u32 x3 = 0x35481EAE; sph_u32 x4 = 0x22512D5B; sph_u32 x5 = 0xE5D94E63;
sph_u32 x6 = 0x7E624131; sph_u32 x7 = 0xF4CC12BE; sph_u32 x8 = 0xC2D0B696;
sph_u32 x9 = 0x42AF2070; sph_u32 xa = 0xD0720C35; sph_u32 xb = 0x3361DA8C;
sph_u32 xc = 0x28CCECA4; sph_u32 xd = 0x8EF8AD83; sph_u32 xe = 0x4680AC00;
sph_u32 xf = 0x40E5FBAB;
sph_u32 xg = 0xD89041C3; sph_u32 xh = 0x6107FBD5;
sph_u32 xi = 0x6C859D41; sph_u32 xj = 0xF0B26679; sph_u32 xk = 0x09392549;
sph_u32 xl = 0x5FA25603; sph_u32 xm = 0x65C892FD; sph_u32 xn = 0x93CB6285;
sph_u32 xo = 0x2AF2B5AE; sph_u32 xp = 0x9E4B4E60; sph_u32 xq = 0x774ABFDD;
sph_u32 xr = 0x85254725; sph_u32 xs = 0x15815AEB; sph_u32 xt = 0x4AB6AAD6;
sph_u32 xu = 0x9CDAF8AF; sph_u32 xv = 0xD6032C0A;
x0 ^= (hash->h4[0]);
x1 ^= (hash->h4[1]);
x2 ^= (hash->h4[2]);
x3 ^= (hash->h4[3]);
x4 ^= (hash->h4[4]);
x5 ^= (hash->h4[5]);
x6 ^= (hash->h4[6]);
x7 ^= (hash->h4[7]);
SIXTEEN_ROUNDS;
x0 ^= 0x80;
SIXTEEN_ROUNDS;
xv ^= 0x01;
for (int i = 0; i < 10; ++i) SIXTEEN_ROUNDS;
hash->h4[0] = x0;
hash->h4[1] = x1;
hash->h4[2] = x2;
hash->h4[3] = x3;
hash->h4[4] = x4;
hash->h4[5] = x5;
hash->h4[6] = x6;
hash->h4[7] = x7;
barrier(CLK_GLOBAL_MEM_FENCE);
}
/// lyra2 algo
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search3(__global uchar* hashes,__global uchar* matrix )
{
uint gid = get_global_id(0);
// __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
__global ulong4 *DMatrix = (__global ulong4 *)(matrix + (4 * memshift * 4 * 4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
// uint offset = (4 * memshift * 4 * 4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))/32;
ulong4 state[4];
state[0].x = hash->h8[0]; //password
state[0].y = hash->h8[1]; //password
state[0].z = hash->h8[2]; //password
state[0].w = hash->h8[3]; //password
state[1] = state[0];
state[2] = (ulong4)(0x6a09e667f3bcc908UL, 0xbb67ae8584caa73bUL, 0x3c6ef372fe94f82bUL, 0xa54ff53a5f1d36f1UL);
state[3] = (ulong4)(0x510e527fade682d1UL, 0x9b05688c2b3e6c1fUL, 0x1f83d9abfb41bd6bUL, 0x5be0cd19137e2179UL);
for (int i = 0; i<12; i++) { round_lyra(state); }
state[0] ^= (ulong4)(0x20,0x20,0x20,0x01);
state[1] ^= (ulong4)(0x04,0x04,0x80,0x0100000000000000);
for (int i = 0; i<12; i++) { round_lyra(state); }
uint ps1 = (memshift * 3);
//#pragma unroll 4
for (int i = 0; i < 4; i++)
{
uint s1 = ps1 - memshift * i;
for (int j = 0; j < 3; j++)
(DMatrix)[j+s1] = state[j];
round_lyra(state);
}
reduceDuplexf(state,DMatrix);
reduceDuplexRowSetupf(1, 0, 2,state, DMatrix);
reduceDuplexRowSetupf(2, 1, 3, state,DMatrix);
uint rowa;
uint prev = 3;
for (uint i = 0; i<4; i++) {
rowa = state[0].x & 3;
reduceDuplexRowf(prev, rowa, i, state, DMatrix);
prev = i;
}
uint shift = (memshift * 4 * rowa);
for (int j = 0; j < 3; j++)
state[j] ^= (DMatrix)[j+shift];
for (int i = 0; i < 12; i++)
round_lyra(state);
//////////////////////////////////////
for (int i = 0; i<4; i++) {hash->h8[i] = ((ulong*)state)[i];}
barrier(CLK_LOCAL_MEM_FENCE);
}
//skein256
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search4(__global uchar* hashes)
{
uint gid = get_global_id(0);
// __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
sph_u64 h[9];
sph_u64 t[3];
sph_u64 dt0,dt1,dt2,dt3;
sph_u64 p0, p1, p2, p3, p4, p5, p6, p7;
h[8] = skein_ks_parity;
for (int i = 0; i<8; i++) {
h[i] = SKEIN_IV512_256[i];
h[8] ^= h[i];}
t[0]=t12[0];
t[1]=t12[1];
t[2]=t12[2];
dt0=hash->h8[0];
dt1=hash->h8[1];
dt2=hash->h8[2];
dt3=hash->h8[3];
p0 = h[0] + dt0;
p1 = h[1] + dt1;
p2 = h[2] + dt2;
p3 = h[3] + dt3;
p4 = h[4];
p5 = h[5] + t[0];
p6 = h[6] + t[1];
p7 = h[7];
#pragma unroll
for (int i = 1; i<19; i+=2) {Round_8_512(p0,p1,p2,p3,p4,p5,p6,p7,i);}
p0 ^= dt0;
p1 ^= dt1;
p2 ^= dt2;
p3 ^= dt3;
h[0] = p0;
h[1] = p1;
h[2] = p2;
h[3] = p3;
h[4] = p4;
h[5] = p5;
h[6] = p6;
h[7] = p7;
h[8] = skein_ks_parity;
for (int i = 0; i<8; i++) { h[8] ^= h[i]; }
t[0] = t12[3];
t[1] = t12[4];
t[2] = t12[5];
p5 += t[0]; //p5 already equal h[5]
p6 += t[1];
#pragma unroll
for (int i = 1; i<19; i+=2) { Round_8_512(p0, p1, p2, p3, p4, p5, p6, p7, i); }
hash->h8[0] = p0;
hash->h8[1] = p1;
hash->h8[2] = p2;
hash->h8[3] = p3;
barrier(CLK_LOCAL_MEM_FENCE);
}
//cubehash
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search5(__global uchar* hashes)
{
uint gid = get_global_id(0);
__global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
sph_u32 x0 = 0xEA2BD4B4; sph_u32 x1 = 0xCCD6F29F; sph_u32 x2 = 0x63117E71;
sph_u32 x3 = 0x35481EAE; sph_u32 x4 = 0x22512D5B; sph_u32 x5 = 0xE5D94E63;
sph_u32 x6 = 0x7E624131; sph_u32 x7 = 0xF4CC12BE; sph_u32 x8 = 0xC2D0B696;
sph_u32 x9 = 0x42AF2070; sph_u32 xa = 0xD0720C35; sph_u32 xb = 0x3361DA8C;
sph_u32 xc = 0x28CCECA4; sph_u32 xd = 0x8EF8AD83; sph_u32 xe = 0x4680AC00;
sph_u32 xf = 0x40E5FBAB;
sph_u32 xg = 0xD89041C3; sph_u32 xh = 0x6107FBD5;
sph_u32 xi = 0x6C859D41; sph_u32 xj = 0xF0B26679; sph_u32 xk = 0x09392549;
sph_u32 xl = 0x5FA25603; sph_u32 xm = 0x65C892FD; sph_u32 xn = 0x93CB6285;
sph_u32 xo = 0x2AF2B5AE; sph_u32 xp = 0x9E4B4E60; sph_u32 xq = 0x774ABFDD;
sph_u32 xr = 0x85254725; sph_u32 xs = 0x15815AEB; sph_u32 xt = 0x4AB6AAD6;
sph_u32 xu = 0x9CDAF8AF; sph_u32 xv = 0xD6032C0A;
x0 ^= (hash->h4[0]);
x1 ^= (hash->h4[1]);
x2 ^= (hash->h4[2]);
x3 ^= (hash->h4[3]);
x4 ^= (hash->h4[4]);
x5 ^= (hash->h4[5]);
x6 ^= (hash->h4[6]);
x7 ^= (hash->h4[7]);
SIXTEEN_ROUNDS;
x0 ^= 0x80;
SIXTEEN_ROUNDS;
xv ^= 0x01;
for (int i = 0; i < 10; ++i) SIXTEEN_ROUNDS;
hash->h4[0] = x0;
hash->h4[1] = x1;
hash->h4[2] = x2;
hash->h4[3] = x3;
hash->h4[4] = x4;
hash->h4[5] = x5;
hash->h4[6] = x6;
hash->h4[7] = x7;
barrier(CLK_GLOBAL_MEM_FENCE);
}
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search6(__global uchar* hashes, __global uint* output, const ulong target)
{
uint gid = get_global_id(0);
__global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
uint dh[16] = {
(0x40414243), (0x44454647),
(0x48494A4B), (0x4C4D4E4F),
(0x50515253), (0x54555657),
(0x58595A5B), (0x5C5D5E5F),
(0x60616263), (0x64656667),
(0x68696A6B), (0x6C6D6E6F),
(0x70717273), (0x74757677),
(0x78797A7B), (0x7C7D7E7F)
};
uint final_s[16] = {
(0xaaaaaaa0), (0xaaaaaaa1), (0xaaaaaaa2),
(0xaaaaaaa3), (0xaaaaaaa4), (0xaaaaaaa5),
(0xaaaaaaa6), (0xaaaaaaa7), (0xaaaaaaa8),
(0xaaaaaaa9), (0xaaaaaaaa), (0xaaaaaaab),
(0xaaaaaaac), (0xaaaaaaad), (0xaaaaaaae),
(0xaaaaaaaf)
};
uint message[16];
for (int i = 0; i<8; i++) message[i] = hash->h4[i];
for (int i = 9; i<14; i++) message[i] = 0;
message[8]= 0x80;
message[14]=0x100;
message[15]=0;
Compression256(message, dh);
Compression256(dh, final_s);
barrier(CLK_LOCAL_MEM_FENCE);
bool result = ( ((ulong*)final_s)[7] <= target);
if (result) {
output[atomic_inc(output + 0xFF)] = SWAP4(gid);
}
}
#endif // LYRA2RE_CL

184
kernel/Lyra2v2.cl

@ -0,0 +1,184 @@ @@ -0,0 +1,184 @@
/*
* Lyra2 kernel implementation.
*
* ==========================(LICENSE BEGIN)============================
* Copyright (c) 2014 djm34
*
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author djm34
*/
#define ROTL64(x,n) rotate(x,(ulong)n)
#define ROTR64(x,n) rotate(x,(ulong)(64-n))
#define SWAP32(x) as_ulong(as_uint2(x).s10)
#define SWAP24(x) as_ulong(as_uchar8(x).s34567012)
#define SWAP16(x) as_ulong(as_uchar8(x).s23456701)
#define G(a,b,c,d) \
do { \
a += b; d ^= a; d = SWAP32(d); \
c += d; b ^= c; b = ROTR64(b,24); \
a += b; d ^= a; d = ROTR64(d,16); \
c += d; b ^= c; b = ROTR64(b, 63); \
\
} while (0)
#define G_old(a,b,c,d) \
do { \
a += b; d ^= a; d = ROTR64(d, 32); \
c += d; b ^= c; b = ROTR64(b, 24); \
a += b; d ^= a; d = ROTR64(d, 16); \
c += d; b ^= c; b = ROTR64(b, 63); \
\
} while (0)
/*One Round of the Blake2b's compression function*/
#define round_lyra(s) \
do { \
G(s[0].x, s[1].x, s[2].x, s[3].x); \
G(s[0].y, s[1].y, s[2].y, s[3].y); \
G(s[0].z, s[1].z, s[2].z, s[3].z); \
G(s[0].w, s[1].w, s[2].w, s[3].w); \
G(s[0].x, s[1].y, s[2].z, s[3].w); \
G(s[0].y, s[1].z, s[2].w, s[3].x); \
G(s[0].z, s[1].w, s[2].x, s[3].y); \
G(s[0].w, s[1].x, s[2].y, s[3].z); \
} while(0)
void reduceDuplexf(ulong4* state ,__global ulong4* DMatrix)
{
ulong4 state1[3];
uint ps1 = 0;
uint ps2 = (memshift * 3 + memshift * 4);
//#pragma unroll 4
for (int i = 0; i < 4; i++)
{
uint s1 = ps1 + i*memshift;
uint s2 = ps2 - i*memshift;
for (int j = 0; j < 3; j++) state1[j] = (DMatrix)[j + s1];
for (int j = 0; j < 3; j++) state[j] ^= state1[j];
round_lyra(state);
for (int j = 0; j < 3; j++) state1[j] ^= state[j];
for (int j = 0; j < 3; j++) (DMatrix)[j + s2] = state1[j];
}
}
void reduceDuplexRowf(uint rowIn,uint rowInOut,uint rowOut,ulong4 * state, __global ulong4 * DMatrix)
{
ulong4 state1[3], state2[3];
uint ps1 = (memshift * 4 * rowIn);
uint ps2 = (memshift * 4 * rowInOut);
uint ps3 = (memshift * 4 * rowOut);
for (int i = 0; i < 4; i++)
{
uint s1 = ps1 + i*memshift;
uint s2 = ps2 + i*memshift;
uint s3 = ps3 + i*memshift;
for (int j = 0; j < 3; j++) state1[j] = (DMatrix)[j + s1];
for (int j = 0; j < 3; j++) state2[j] = (DMatrix)[j + s2];
for (int j = 0; j < 3; j++) state1[j] += state2[j];
for (int j = 0; j < 3; j++) state[j] ^= state1[j];
round_lyra(state);
((ulong*)state2)[0] ^= ((ulong*)state)[11];
for (int j = 0; j < 11; j++)
((ulong*)state2)[j + 1] ^= ((ulong*)state)[j];
if (rowInOut != rowOut) {
for (int j = 0; j < 3; j++)
(DMatrix)[j + s2] = state2[j];
for (int j = 0; j < 3; j++)
(DMatrix)[j + s3] ^= state[j];
}
else {
for (int j = 0; j < 3; j++)
state2[j] ^= state[j];
for (int j = 0; j < 3; j++)
(DMatrix)[j + s2] = state2[j];
}
}
}
void reduceDuplexRowSetupf(uint rowIn, uint rowInOut, uint rowOut, ulong4 *state, __global ulong4* DMatrix) {
ulong4 state2[3], state1[3];
uint ps1 = (memshift * 4 * rowIn);
uint ps2 = (memshift * 4 * rowInOut);
uint ps3 = (memshift * 3 + memshift * 4 * rowOut);
for (int i = 0; i < 4; i++)
{
uint s1 = ps1 + i*memshift;
uint s2 = ps2 + i*memshift;
uint s3 = ps3 - i*memshift;
for (int j = 0; j < 3; j++) state1[j] = (DMatrix)[j + s1];
for (int j = 0; j < 3; j++) state2[j] = (DMatrix)[j + s2];
for (int j = 0; j < 3; j++) {
ulong4 tmp = state1[j] + state2[j];
state[j] ^= tmp;
}
round_lyra(state);
for (int j = 0; j < 3; j++) {
state1[j] ^= state[j];
(DMatrix)[j + s3] = state1[j];
}
((ulong*)state2)[0] ^= ((ulong*)state)[11];
for (int j = 0; j < 11; j++)
((ulong*)state2)[j + 1] ^= ((ulong*)state)[j];
for (int j = 0; j < 3; j++)
(DMatrix)[j + s2] = state2[j];
}
}

128
kernel/bmw256.cl

@ -0,0 +1,128 @@ @@ -0,0 +1,128 @@
#define shl(x, n) ((x) << (n))
#define shr(x, n) ((x) >> (n))
//#define SHR(x, n) SHR2(x, n)
//#define SHL(x, n) SHL2(x, n)
#define SPH_ROTL32(x,n) rotate(x,(uint)n)
#define ss0(x) (shr((x), 1) ^ shl((x), 3) ^ SPH_ROTL32((x), 4) ^ SPH_ROTL32((x), 19))
#define ss1(x) (shr((x), 1) ^ shl((x), 2) ^ SPH_ROTL32((x), 8) ^ SPH_ROTL32((x), 23))
#define ss2(x) (shr((x), 2) ^ shl((x), 1) ^ SPH_ROTL32((x), 12) ^ SPH_ROTL32((x), 25))
#define ss3(x) (shr((x), 2) ^ shl((x), 2) ^ SPH_ROTL32((x), 15) ^ SPH_ROTL32((x), 29))
#define ss4(x) (shr((x), 1) ^ (x))
#define ss5(x) (shr((x), 2) ^ (x))
#define rs1(x) SPH_ROTL32((x), 3)
#define rs2(x) SPH_ROTL32((x), 7)
#define rs3(x) SPH_ROTL32((x), 13)
#define rs4(x) SPH_ROTL32((x), 16)
#define rs5(x) SPH_ROTL32((x), 19)
#define rs6(x) SPH_ROTL32((x), 23)
#define rs7(x) SPH_ROTL32((x), 27)
/* Message expansion function 1 */
static uint expand32_1(int i, uint *M32, uint *H, uint *Q)
{
return (ss1(Q[i - 16]) + ss2(Q[i - 15]) + ss3(Q[i - 14]) + ss0(Q[i - 13])
+ ss1(Q[i - 12]) + ss2(Q[i - 11]) + ss3(Q[i - 10]) + ss0(Q[i - 9])
+ ss1(Q[i - 8]) + ss2(Q[i - 7]) + ss3(Q[i - 6]) + ss0(Q[i - 5])
+ ss1(Q[i - 4]) + ss2(Q[i - 3]) + ss3(Q[i - 2]) + ss0(Q[i - 1])
+ ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16]));
}
/* Message expansion function 2 */
static uint expand32_2(int i, uint *M32, uint *H, uint *Q)
{
return (Q[i - 16] + rs1(Q[i - 15]) + Q[i - 14] + rs2(Q[i - 13])
+ Q[i - 12] + rs3(Q[i - 11]) + Q[i - 10] + rs4(Q[i - 9])
+ Q[i - 8] + rs5(Q[i - 7]) + Q[i - 6] + rs6(Q[i - 5])
+ Q[i - 4] + rs7(Q[i - 3]) + ss4(Q[i - 2]) + ss5(Q[i - 1])
+ ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16]));
}
static void Compression256(uint *M32, uint *H)
{
int i;
uint XL32, XH32, Q[32];
Q[0] = (M32[5] ^ H[5]) - (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[13] ^ H[13]) + (M32[14] ^ H[14]);
Q[1] = (M32[6] ^ H[6]) - (M32[8] ^ H[8]) + (M32[11] ^ H[11]) + (M32[14] ^ H[14]) - (M32[15] ^ H[15]);
Q[2] = (M32[0] ^ H[0]) + (M32[7] ^ H[7]) + (M32[9] ^ H[9]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]);
Q[3] = (M32[0] ^ H[0]) - (M32[1] ^ H[1]) + (M32[8] ^ H[8]) - (M32[10] ^ H[10]) + (M32[13] ^ H[13]);
Q[4] = (M32[1] ^ H[1]) + (M32[2] ^ H[2]) + (M32[9] ^ H[9]) - (M32[11] ^ H[11]) - (M32[14] ^ H[14]);
Q[5] = (M32[3] ^ H[3]) - (M32[2] ^ H[2]) + (M32[10] ^ H[10]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]);
Q[6] = (M32[4] ^ H[4]) - (M32[0] ^ H[0]) - (M32[3] ^ H[3]) - (M32[11] ^ H[11]) + (M32[13] ^ H[13]);
Q[7] = (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[5] ^ H[5]) - (M32[12] ^ H[12]) - (M32[14] ^ H[14]);
Q[8] = (M32[2] ^ H[2]) - (M32[5] ^ H[5]) - (M32[6] ^ H[6]) + (M32[13] ^ H[13]) - (M32[15] ^ H[15]);
Q[9] = (M32[0] ^ H[0]) - (M32[3] ^ H[3]) + (M32[6] ^ H[6]) - (M32[7] ^ H[7]) + (M32[14] ^ H[14]);
Q[10] = (M32[8] ^ H[8]) - (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[7] ^ H[7]) + (M32[15] ^ H[15]);
Q[11] = (M32[8] ^ H[8]) - (M32[0] ^ H[0]) - (M32[2] ^ H[2]) - (M32[5] ^ H[5]) + (M32[9] ^ H[9]);
Q[12] = (M32[1] ^ H[1]) + (M32[3] ^ H[3]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[10] ^ H[10]);
Q[13] = (M32[2] ^ H[2]) + (M32[4] ^ H[4]) + (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[11] ^ H[11]);
Q[14] = (M32[3] ^ H[3]) - (M32[5] ^ H[5]) + (M32[8] ^ H[8]) - (M32[11] ^ H[11]) - (M32[12] ^ H[12]);
Q[15] = (M32[12] ^ H[12]) - (M32[4] ^ H[4]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[13] ^ H[13]);
/* Diffuse the differences in every word in a bijective manner with ssi, and then add the values of the previous double pipe.*/
Q[0] = ss0(Q[0]) + H[1];
Q[1] = ss1(Q[1]) + H[2];
Q[2] = ss2(Q[2]) + H[3];
Q[3] = ss3(Q[3]) + H[4];
Q[4] = ss4(Q[4]) + H[5];
Q[5] = ss0(Q[5]) + H[6];
Q[6] = ss1(Q[6]) + H[7];
Q[7] = ss2(Q[7]) + H[8];
Q[8] = ss3(Q[8]) + H[9];
Q[9] = ss4(Q[9]) + H[10];
Q[10] = ss0(Q[10]) + H[11];
Q[11] = ss1(Q[11]) + H[12];
Q[12] = ss2(Q[12]) + H[13];
Q[13] = ss3(Q[13]) + H[14];
Q[14] = ss4(Q[14]) + H[15];
Q[15] = ss0(Q[15]) + H[0];
/* This is the Message expansion or f_1 in the documentation. */
/* It has 16 rounds. */
/* Blue Midnight Wish has two tunable security parameters. */
/* The parameters are named EXPAND_1_ROUNDS and EXPAND_2_ROUNDS. */
/* The following relation for these parameters should is satisfied: */
/* EXPAND_1_ROUNDS + EXPAND_2_ROUNDS = 16 */
for (i = 0; i<2; i++)
Q[i + 16] = expand32_1(i + 16, M32, H, Q);
for (i = 2; i<16; i++)
Q[i + 16] = expand32_2(i + 16, M32, H, Q);
/* Blue Midnight Wish has two temporary cummulative variables that accumulate via XORing */
/* 16 new variables that are prooduced in the Message Expansion part. */
XL32 = Q[16] ^ Q[17] ^ Q[18] ^ Q[19] ^ Q[20] ^ Q[21] ^ Q[22] ^ Q[23];
XH32 = XL32^Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31];
/* This part is the function f_2 - in the documentation */
/* Compute the double chaining pipe for the next message block. */
H[0] = (shl(XH32, 5) ^ shr(Q[16], 5) ^ M32[0]) + (XL32 ^ Q[24] ^ Q[0]);
H[1] = (shr(XH32, 7) ^ shl(Q[17], 8) ^ M32[1]) + (XL32 ^ Q[25] ^ Q[1]);
H[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]);
H[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]);
H[4] = (shr(XH32, 3) ^ Q[20] ^ M32[4]) + (XL32 ^ Q[28] ^ Q[4]);
H[5] = (shl(XH32, 6) ^ shr(Q[21], 6) ^ M32[5]) + (XL32 ^ Q[29] ^ Q[5]);
H[6] = (shr(XH32, 4) ^ shl(Q[22], 6) ^ M32[6]) + (XL32 ^ Q[30] ^ Q[6]);
H[7] = (shr(XH32, 11) ^ shl(Q[23], 2) ^ M32[7]) + (XL32 ^ Q[31] ^ Q[7]);
H[8] = SPH_ROTL32(H[4], 9) + (XH32 ^ Q[24] ^ M32[8]) + (shl(XL32, 8) ^ Q[23] ^ Q[8]);
H[9] = SPH_ROTL32(H[5], 10) + (XH32 ^ Q[25] ^ M32[9]) + (shr(XL32, 6) ^ Q[16] ^ Q[9]);
H[10] = SPH_ROTL32(H[6], 11) + (XH32 ^ Q[26] ^ M32[10]) + (shl(XL32, 6) ^ Q[17] ^ Q[10]);
H[11] = SPH_ROTL32(H[7], 12) + (XH32 ^ Q[27] ^ M32[11]) + (shl(XL32, 4) ^ Q[18] ^ Q[11]);
H[12] = SPH_ROTL32(H[0], 13) + (XH32 ^ Q[28] ^ M32[12]) + (shr(XL32, 3) ^ Q[19] ^ Q[12]);
H[13] = SPH_ROTL32(H[1], 14) + (XH32 ^ Q[29] ^ M32[13]) + (shr(XL32, 4) ^ Q[20] ^ Q[13]);
H[14] = SPH_ROTL32(H[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]);
H[15] = SPH_ROTL32(H[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]);
}

132
kernel/cubehash256.cl

@ -0,0 +1,132 @@ @@ -0,0 +1,132 @@
// cubehash256
// djm34 2015 based on ccminer cubehash512
#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */
#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */
#define LROT(x, bits) rotate( x,(uint) bits)
#define ROTATEUPWARDS7(a) LROT(a,7)
#define ROTATEUPWARDS11(a) LROT(a,11)
#define SWAP(a,b) { uint u = a; a = b; b = u; }
inline void rrounds(uint x[2][2][2][2][2])
{
int r;
int j;
int k;
int l;
int m;
//#pragma unroll 2
for (r = 0; r < CUBEHASH_ROUNDS; ++r) {
/* "add x_0jklm into x_1jklmn modulo 2^32" */
//#pragma unroll 2
for (j = 0; j < 2; ++j)
//#pragma unroll 2
for (k = 0; k < 2; ++k)
//#pragma unroll 2
for (l = 0; l < 2; ++l)
//#pragma unroll 2
for (m = 0; m < 2; ++m)
x[1][j][k][l][m] += x[0][j][k][l][m];
/* "rotate x_0jklm upwards by 7 bits" */
//#pragma unroll 2
for (j = 0; j < 2; ++j)
//#pragma unroll 2
for (k = 0; k < 2; ++k)
//#pragma unroll 2
for (l = 0; l < 2; ++l)
//#pragma unroll 2
for (m = 0; m < 2; ++m)
x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]);
/* "swap x_00klm with x_01klm" */
//#pragma unroll 2
for (k = 0; k < 2; ++k)
//#pragma unroll 2
for (l = 0; l < 2; ++l)
//#pragma unroll 2
for (m = 0; m < 2; ++m)
SWAP(x[0][0][k][l][m], x[0][1][k][l][m])
/* "xor x_1jklm into x_0jklm" */
//#pragma unroll 2
for (j = 0; j < 2; ++j)
//#pragma unroll 2
for (k = 0; k < 2; ++k)
//#pragma unroll 2
for (l = 0; l < 2; ++l)
//#pragma unroll 2
for (m = 0; m < 2; ++m)
x[0][j][k][l][m] ^= x[1][j][k][l][m];
/* "swap x_1jk0m with x_1jk1m" */
//#pragma unroll 2
for (j = 0; j < 2; ++j)
//#pragma unroll 2
for (k = 0; k < 2; ++k)
//#pragma unroll 2
for (m = 0; m < 2; ++m)
SWAP(x[1][j][k][0][m], x[1][j][k][1][m])
/* "add x_0jklm into x_1jklm modulo 2^32" */
//#pragma unroll 2
for (j = 0; j < 2; ++j)
//#pragma unroll 2
for (k = 0; k < 2; ++k)
//#pragma unroll 2
for (l = 0; l < 2; ++l)
//#pragma unroll 2
for (m = 0; m < 2; ++m)
x[1][j][k][l][m] += x[0][j][k][l][m];
/* "rotate x_0jklm upwards by 11 bits" */
//#pragma unroll 2
for (j = 0; j < 2; ++j)
//#pragma unroll 2
for (k = 0; k < 2; ++k)
//#pragma unroll 2
for (l = 0; l < 2; ++l)
//#pragma unroll 2
for (m = 0; m < 2; ++m)
x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]);
/* "swap x_0j0lm with x_0j1lm" */
//#pragma unroll 2
for (j = 0; j < 2; ++j)
//#pragma unroll 2
for (l = 0; l < 2; ++l)
//#pragma unroll 2
for (m = 0; m < 2; ++m)
SWAP(x[0][j][0][l][m], x[0][j][1][l][m])
/* "xor x_1jklm into x_0jklm" */
//#pragma unroll 2
for (j = 0; j < 2; ++j)
//#pragma unroll 2
for (k = 0; k < 2; ++k)
//#pragma unroll 2
for (l = 0; l < 2; ++l)
//#pragma unroll 2
for (m = 0; m < 2; ++m)
x[0][j][k][l][m] ^= x[1][j][k][l][m];
/* "swap x_1jkl0 with x_1jkl1" */
//#pragma unroll 2
for (j = 0; j < 2; ++j)
//#pragma unroll 2
for (k = 0; k < 2; ++k)
//#pragma unroll 2
for (l = 0; l < 2; ++l)
SWAP(x[1][j][k][l][0], x[1][j][k][l][1])
}
}

16
kernel/skein256.cl

@ -48,14 +48,14 @@ __constant static const sph_u64 SKEIN_IV512_256[8] = { @@ -48,14 +48,14 @@ __constant static const sph_u64 SKEIN_IV512_256[8] = {
__constant static const int ROT256[8][4] =
{
46, 36, 19, 37,
33, 27, 14, 42,
17, 49, 36, 39,
44, 9, 54, 56,
39, 30, 34, 24,
13, 50, 10, 17,
25, 29, 39, 43,
8, 35, 56, 22,
{46, 36, 19, 37},
{33, 27, 14, 42},
{17, 49, 36, 39},
{44, 9, 54, 56},
{39, 30, 34, 24},
{13, 50, 10, 17},
{25, 29, 39, 43},
{8, 35, 56, 22}
};
__constant static const sph_u64 skein_ks_parity = 0x1BD11BDAA9FC1A22;

5
miner.h

@ -1156,8 +1156,8 @@ extern struct pool *add_pool(void); @@ -1156,8 +1156,8 @@ extern struct pool *add_pool(void);
extern bool add_pool_details(struct pool *pool, bool live, char *url, char *user, char *pass, char *name, char *desc, char *profile, char *algo);
#define MAX_GPUDEVICES 16
#define MAX_DEVICES 4096
//#define MAX_DEVICES 4096
#define MAX_DEVICES 8192
#define MIN_INTENSITY 4
#define MIN_INTENSITY_STR "4"
#define MAX_INTENSITY 31
@ -1267,6 +1267,7 @@ struct stratum_work { @@ -1267,6 +1267,7 @@ struct stratum_work {
size_t cb_len;
size_t header_len;
int merkles;
double next_diff;
double diff;
};

134
ocl.c

@ -37,6 +37,7 @@ @@ -37,6 +37,7 @@
#include "algorithm/neoscrypt.h"
#include "algorithm/pluck.h"
#include "algorithm/yescrypt.h"
#include "algorithm/Lyra2RE.h"
/* FIXME: only here for global config vars, replace with configuration.h
* or similar as soon as config is in a struct instead of littered all
@ -599,6 +600,88 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg @@ -599,6 +600,88 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
applog(LOG_DEBUG, "GPU %d: computing max. global thread count to %u", gpu, (unsigned)(cgpu->thread_concurrency));
}
else if ( !safe_cmp(cgpu->algorithm.name, "lyra2REv2") ) {
size_t glob_thread_count;
long max_int;
unsigned char type = 0;
// determine which intensity type to use
if (cgpu->rawintensity > 0) {
glob_thread_count = cgpu->rawintensity;
max_int = glob_thread_count;
type = 2;
}
else if (cgpu->xintensity > 0) {
glob_thread_count = clState->compute_shaders * ((cgpu->algorithm.xintensity_shift) ? (1UL << (cgpu->algorithm.xintensity_shift + cgpu->xintensity)) : cgpu->xintensity);
max_int = cgpu->xintensity;
type = 1;
}
else {
glob_thread_count = 1UL << (cgpu->algorithm.intensity_shift + cgpu->intensity);
max_int = ((cgpu->dynamic) ? MAX_INTENSITY : cgpu->intensity);
}
glob_thread_count = ((glob_thread_count < cgpu->work_size) ? cgpu->work_size : glob_thread_count);
// if TC * scratchbuf size is too big for memory... reduce to max
if ((glob_thread_count * LYRA_SCRATCHBUF_SIZE) >= (uint64_t)cgpu->max_alloc) {
/* Selected intensity will not run on this GPU. Not enough memory.
* Adapt the memory setting. */
// depending on intensity type used, reduce the intensity until it fits into the GPU max_alloc
switch (type) {
//raw intensity
case 2:
while ((glob_thread_count * LYRA_SCRATCHBUF_SIZE) > (uint64_t)cgpu->max_alloc) {
--glob_thread_count;
}
max_int = glob_thread_count;
cgpu->rawintensity = glob_thread_count;
break;
//x intensity
case 1:
glob_thread_count = cgpu->max_alloc / LYRA_SCRATCHBUF_SIZE;
max_int = glob_thread_count / clState->compute_shaders;
while (max_int && ((clState->compute_shaders * (1UL << max_int)) > glob_thread_count)) {
--max_int;
}
/* Check if max_intensity is >0. */
if (max_int < MIN_XINTENSITY) {
applog(LOG_ERR, "GPU %d: Max xintensity is below minimum.", gpu);
max_int = MIN_XINTENSITY;
}
cgpu->xintensity = max_int;
glob_thread_count = clState->compute_shaders * (1UL << max_int);
break;
default:
glob_thread_count = cgpu->max_alloc / LYRA_SCRATCHBUF_SIZE;
while (max_int && ((1UL << max_int) & glob_thread_count) == 0) {
--max_int;
}
/* Check if max_intensity is >0. */
if (max_int < MIN_INTENSITY) {
applog(LOG_ERR, "GPU %d: Max intensity is below minimum.", gpu);
max_int = MIN_INTENSITY;
}
cgpu->intensity = max_int;
glob_thread_count = 1UL << max_int;
break;
}
}
// TC is glob thread count
cgpu->thread_concurrency = glob_thread_count;
applog(LOG_DEBUG, "GPU %d: computing max. global thread count to %u", gpu, (unsigned)(cgpu->thread_concurrency));
} else if (!cgpu->opt_tc) {
unsigned int sixtyfours;
@ -689,12 +772,17 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg @@ -689,12 +772,17 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
if (clState->n_extra_kernels > 0) {
unsigned int i;
char kernel_name[9]; // max: search99 + 0x0
char kernel_name2[10]; // max: search99 + 0x0
clState->extra_kernels = (cl_kernel *)malloc(sizeof(cl_kernel) * clState->n_extra_kernels);
for (i = 0; i < clState->n_extra_kernels; i++) {
snprintf(kernel_name, 9, "%s%d", "search", i + 1);
clState->extra_kernels[i] = clCreateKernel(clState->program, kernel_name, &status);
if (i+1<100){
snprintf(kernel_name, 9, "%s%d", "search", i + 1);
clState->extra_kernels[i] = clCreateKernel(clState->program, kernel_name, &status);
}else {
snprintf(kernel_name2, 10, "%s%d", "search", i + 1);
clState->extra_kernels[i] = clCreateKernel(clState->program, kernel_name2, &status);}
if (status != CL_SUCCESS) {
applog(LOG_DEBUG, "Error %d: Creating ExtraKernel #%d from program. (clCreateKernel)", status, i);
return NULL;
@ -739,6 +827,21 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg @@ -739,6 +827,21 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
}
else if (!safe_cmp(algorithm->name, "lyra2REv2") ) {
/* The scratch/pad-buffer needs 32kBytes memory per thread. */
bufsize = 4 * 4 * 12 * sizeof(unsigned long long) * cgpu->thread_concurrency;
buf1size = 4 * sizeof(unsigned long long) * cgpu->thread_concurrency; //matrix
/* This is the input buffer. For yescrypt this is guaranteed to be
* 80 bytes only. */
readbufsize = 80;
applog(LOG_DEBUG, "lyra2REv2 buffer sizes: %lu RW, %lu RW", (unsigned long)bufsize, (unsigned long)buf1size);
// scrypt/n-scrypt
}
else if (!safe_cmp(algorithm->name, "pluck")) {
/* The scratch/pad-buffer needs 32kBytes memory per thread. */
@ -798,7 +901,25 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg @@ -798,7 +901,25 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
}
}
}
else if (!safe_cmp(algorithm->name, "lyra2REv2") ) {
// need additionnal buffers
clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf1size, NULL, &status);
if (status != CL_SUCCESS && !clState->buffer1) {
applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status);
return NULL;
}
}
else {
clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status); // we don't need that much just tired...
if (status != CL_SUCCESS && !clState->buffer1) {
applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status);
return NULL;
}
}
/* This buffer is weird and might work to some degree even if
* the create buffer call has apparently failed, so check if we
@ -808,6 +929,13 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg @@ -808,6 +929,13 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
applog(LOG_ERR, "Error %d: clCreateBuffer (padbuffer8), decrease TC or increase LG", status);
return NULL;
}
// clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize/8, NULL, &status); // we don't need that much just tired...
// if (status != CL_SUCCESS && !clState->buffer1) {
// applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status);
// return NULL;
// }
}
applog(LOG_DEBUG, "Using read buffer sized %lu", (unsigned long)readbufsize);

13
sgminer.c

@ -2265,7 +2265,12 @@ static bool gbt_decode(struct pool *pool, json_t *res_val) @@ -2265,7 +2265,12 @@ static bool gbt_decode(struct pool *pool, json_t *res_val)
static bool getwork_decode(json_t *res_val, struct work *work)
{
if (unlikely(!jobj_binary(res_val, "data", work->data, sizeof(work->data), true))) {
size_t worklen = 128;
worklen = ((!safe_cmp(work->pool->algorithm.name, "credits")) ? sizeof(work->data) : worklen);
if (unlikely(!jobj_binary(res_val, "data", work->data, worklen, true))) {
if (opt_morenotices)
applog(LOG_ERR, "%s: JSON inval data", isnull(get_pool_name(work->pool), ""));
return false;
@ -3030,7 +3035,11 @@ static bool submit_upstream_work(struct work *work, CURL *curl, char *curl_err_s @@ -3030,7 +3035,11 @@ static bool submit_upstream_work(struct work *work, CURL *curl, char *curl_err_s
}
/* build hex string - Make sure to restrict to 80 bytes for Neoscrypt */
hexstr = bin2hex(work->data, ((!safe_cmp(work->pool->algorithm.name, "neoscrypt")) ? 80 : sizeof(work->data)));
int worksize_default = 128;
hexstr = bin2hex(work->data, (!safe_cmp(work->pool->algorithm.name, "neoscrypt") ? 80 : worksize_default));
hexstr = bin2hex(work->data, (!safe_cmp(work->pool->algorithm.name, "credits") ? sizeof(work->data) : worksize_default));
/* build JSON-RPC request */
if (work->gbt) {

8
util.c

@ -1560,6 +1560,8 @@ static bool parse_notify(struct pool *pool, json_t *val) @@ -1560,6 +1560,8 @@ static bool parse_notify(struct pool *pool, json_t *val)
pool->swork.nbit = nbit;
pool->swork.ntime = ntime;
pool->swork.clean = clean;
pool->swork.diff = pool->swork.next_diff;
alloc_len = pool->swork.cb_len = cb1_len + pool->n1_len + pool->n2size + cb2_len;
pool->nonce2_offset = cb1_len + pool->n1_len;
@ -1668,8 +1670,8 @@ static bool parse_diff(struct pool *pool, json_t *val) @@ -1668,8 +1670,8 @@ static bool parse_diff(struct pool *pool, json_t *val)
return false;
cg_wlock(&pool->data_lock);
old_diff = pool->swork.diff;
pool->swork.diff = diff;
old_diff = pool->swork.next_diff;
pool->swork.next_diff = diff;
cg_wunlock(&pool->data_lock);
if (old_diff != diff) {
@ -2560,7 +2562,7 @@ out: @@ -2560,7 +2562,7 @@ out:
if (!pool->stratum_url)
pool->stratum_url = pool->sockaddr_url;
pool->stratum_active = true;
pool->swork.diff = 1;
pool->swork.next_diff = pool->swork.diff = 1;
if (opt_protocol) {
applog(LOG_DEBUG, "%s confirmed mining.subscribe with extranonce1 %s extran2size %d",
get_pool_name(pool), pool->nonce1, pool->n2size);

Loading…
Cancel
Save