
Get some algos from 'master' of https://github.com/djm34/sgminer

from-djm34
elbandi committed 9 years ago
commit 580676affb
Changed files (number of changed lines in parentheses):
  1. Makefile.am (3)
  2. algorithm.c (206)
  3. algorithm.h (6)
  4. algorithm/credits.c (148)
  5. algorithm/credits.h (10)
  6. algorithm/lyra2.c (22)
  7. algorithm/lyra2.h (8)
  8. algorithm/lyra2_old.c (208)
  9. algorithm/lyra2_old.h (50)
  10. algorithm/lyra2re.c (25)
  11. algorithm/lyra2re.h (2)
  12. algorithm/lyra2re_old.c (169)
  13. algorithm/lyra2re_old.h (10)
  14. algorithm/pluck.h (2)
  15. algorithm/sponge.c (22)
  16. algorithm/sponge.h (8)
  17. algorithm/sponge_old.c (405)
  18. algorithm/sponge_old.h (98)
  19. algorithm/sysendian.h (140)
  20. algorithm/yescrypt-opt.c (1364)
  21. algorithm/yescrypt.c (128)
  22. algorithm/yescrypt.h (10)
  23. algorithm/yescrypt_core.h (376)
  24. algorithm/yescryptcommon.c (360)
  25. driver-opencl.c (13)
  26. findnonce.c (6)
  27. kernel/bmw256.cl (162)
  28. kernel/credits.cl (232)
  29. kernel/cubehash256.cl (132)
  30. kernel/lyra2rev2.cl (525)
  31. kernel/lyra2v2.cl (184)
  32. kernel/yescrypt-multi.cl (314)
  33. kernel/yescrypt.cl (253)
  34. kernel/yescrypt_essential.cl (760)
  35. miner.h (27)
  36. ocl.c (242)
  37. ocl.h (5)
  38. sgminer.c (26)
  39. sph/Makefile.am (2)
  40. sph/sha256_Y.c (418)
  41. sph/sha256_Y.h (63)

Makefile.am (3)

@@ -73,7 +73,10 @@ sgminer_SOURCES += algorithm/whirlcoin.c algorithm/whirlcoin.h
sgminer_SOURCES += algorithm/neoscrypt.c algorithm/neoscrypt.h
sgminer_SOURCES += algorithm/whirlpoolx.c algorithm/whirlpoolx.h
sgminer_SOURCES += algorithm/lyra2re.c algorithm/lyra2re.h algorithm/lyra2.c algorithm/lyra2.h algorithm/sponge.c algorithm/sponge.h
sgminer_SOURCES += algorithm/lyra2re_old.c algorithm/lyra2re_old.h algorithm/lyra2_old.c algorithm/lyra2_old.h algorithm/sponge_old.c algorithm/sponge_old.h
sgminer_SOURCES += algorithm/pluck.c algorithm/pluck.h
sgminer_SOURCES += algorithm/credits.c algorithm/credits.h
sgminer_SOURCES += algorithm/yescrypt.h algorithm/yescrypt.c algorithm/yescrypt_core.h algorithm/yescrypt-opt.c algorithm/yescryptcommon.c algorithm/sysendian.h
bin_SCRIPTS = $(top_srcdir)/kernel/*.cl

algorithm.c (206)

@@ -33,7 +33,10 @@
#include "algorithm/neoscrypt.h"
#include "algorithm/whirlpoolx.h"
#include "algorithm/lyra2re.h"
#include "algorithm/lyra2re_old.h"
#include "algorithm/pluck.h"
#include "algorithm/yescrypt.h"
#include "algorithm/credits.h"
#include "compat.h"
@@ -42,6 +45,7 @@
const char *algorithm_type_str[] = {
"Unknown",
"Credits",
"Scrypt",
"NScrypt",
"X11",
@@ -58,7 +62,10 @@ const char *algorithm_type_str[] = {
"Neoscrypt",
"WhirlpoolX",
"Lyra2RE",
"Lyra2REv2"
"Pluck"
"Yescrypt",
"Yescrypt-multi"
};
void sha256(const unsigned char *message, unsigned int len, unsigned char *digest)
@@ -184,6 +191,125 @@ static cl_int queue_neoscrypt_kernel(_clState *clState, dev_blk_ctx *blk, __mayb
return status;
}
static cl_int queue_credits_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads)
{
cl_kernel *kernel = &clState->kernel;
unsigned int num = 0;
cl_ulong le_target;
cl_int status = 0;
// le_target = (*(cl_uint *)(blk->work->device_target + 24));
le_target = (cl_ulong)le64toh(((uint64_t *)blk->work->/*device_*/target)[3]);
// le_target = (cl_uint)((uint32_t *)blk->work->target)[6];
memcpy(clState->cldata, blk->work->data, 168);
// flip168(clState->cldata, blk->work->data);
status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 168, clState->cldata, 0, NULL, NULL);
CL_SET_ARG(clState->CLbuffer0);
CL_SET_ARG(clState->outputBuffer);
CL_SET_ARG(le_target);
CL_SET_ARG(blk->work->midstate);
return status;
}
static cl_int queue_yescrypt_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads)
{
cl_kernel *kernel = &clState->kernel;
unsigned int num = 0;
cl_uint le_target;
cl_int status = 0;
// le_target = (*(cl_uint *)(blk->work->device_target + 28));
le_target = (cl_uint)le32toh(((uint32_t *)blk->work->/*device_*/target)[7]);
// le_target = (cl_uint)((uint32_t *)blk->work->target)[7];
// memcpy(clState->cldata, blk->work->data, 80);
flip80(clState->cldata, blk->work->data);
status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL);
CL_SET_ARG(clState->CLbuffer0);
CL_SET_ARG(clState->outputBuffer);
CL_SET_ARG(clState->padbuffer8);
CL_SET_ARG(clState->buffer1);
CL_SET_ARG(clState->buffer2);
CL_SET_ARG(le_target);
return status;
}
static cl_int queue_yescrypt_multikernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads)
{
// cl_kernel *kernel = &clState->kernel;
cl_kernel *kernel;
unsigned int num = 0;
cl_uint le_target;
cl_int status = 0;
// le_target = (*(cl_uint *)(blk->work->device_target + 28));
le_target = (cl_uint)le32toh(((uint32_t *)blk->work->/*device_*/target)[7]);
memcpy(clState->cldata, blk->work->data, 80);
// flip80(clState->cldata, blk->work->data);
status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL);
//pbkdf and initial sha
kernel = &clState->kernel;
CL_SET_ARG(clState->CLbuffer0);
CL_SET_ARG(clState->outputBuffer);
CL_SET_ARG(clState->padbuffer8);
CL_SET_ARG(clState->buffer1);
CL_SET_ARG(clState->buffer2);
CL_SET_ARG(clState->buffer3);
CL_SET_ARG(le_target);
//inactive kernel
num = 0;
kernel = clState->extra_kernels;
CL_SET_ARG_N(0,clState->buffer1);
CL_SET_ARG_N(1,clState->buffer2);
// CL_SET_ARG_N(3, clState->buffer3);
//mix2_2
num = 0;
CL_NEXTKERNEL_SET_ARG_N(0, clState->padbuffer8);
CL_SET_ARG_N(1,clState->buffer1);
CL_SET_ARG_N(2,clState->buffer2);
//mix2_2
//inactive kernel
num = 0;
CL_NEXTKERNEL_SET_ARG_N(0, clState->buffer1);
CL_SET_ARG_N(1, clState->buffer2);
//mix2_2
num = 0;
CL_NEXTKERNEL_SET_ARG_N(0, clState->padbuffer8);
CL_SET_ARG_N(1, clState->buffer1);
CL_SET_ARG_N(2, clState->buffer2);
//inactive kernel
num = 0;
CL_NEXTKERNEL_SET_ARG_N(0, clState->buffer1);
CL_SET_ARG_N(1, clState->buffer2);
//mix2_2
//pbkdf and finalization
num=0;
CL_NEXTKERNEL_SET_ARG(clState->CLbuffer0);
CL_SET_ARG(clState->outputBuffer);
CL_SET_ARG(clState->buffer2);
CL_SET_ARG(clState->buffer3);
CL_SET_ARG(le_target);
return status;
}
static cl_int queue_maxcoin_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
{
cl_kernel *kernel = &clState->kernel;
@@ -716,6 +842,60 @@ static cl_int queue_lyra2RE_kernel(struct __clState *clState, struct _dev_blk_ct
return status;
}
static cl_int queue_lyra2REv2_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
{
cl_kernel *kernel;
unsigned int num;
cl_int status = 0;
cl_ulong le_target;
// le_target = *(cl_uint *)(blk->work->device_target + 28);
le_target = *(cl_ulong *)(blk->work->device_target + 24);
flip80(clState->cldata, blk->work->data);
status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL);
// blake - search
kernel = &clState->kernel;
num = 0;
// CL_SET_ARG(clState->CLbuffer0);
CL_SET_ARG(clState->buffer1);
CL_SET_ARG(blk->work->blk.ctx_a);
CL_SET_ARG(blk->work->blk.ctx_b);
CL_SET_ARG(blk->work->blk.ctx_c);
CL_SET_ARG(blk->work->blk.ctx_d);
CL_SET_ARG(blk->work->blk.ctx_e);
CL_SET_ARG(blk->work->blk.ctx_f);
CL_SET_ARG(blk->work->blk.ctx_g);
CL_SET_ARG(blk->work->blk.ctx_h);
CL_SET_ARG(blk->work->blk.cty_a);
CL_SET_ARG(blk->work->blk.cty_b);
CL_SET_ARG(blk->work->blk.cty_c);
// keccak - search1
kernel = clState->extra_kernels;
CL_SET_ARG_0(clState->buffer1);
// cubehash - search2
num = 0;
CL_NEXTKERNEL_SET_ARG_0(clState->buffer1);
// lyra - search3
num = 0;
CL_NEXTKERNEL_SET_ARG_N(0, clState->buffer1);
CL_SET_ARG_N(1, clState->padbuffer8);
// skein -search4
num = 0;
CL_NEXTKERNEL_SET_ARG_0(clState->buffer1);
// cubehash - search5
num = 0;
CL_NEXTKERNEL_SET_ARG_0(clState->buffer1);
// bmw - search6
num = 0;
CL_NEXTKERNEL_SET_ARG(clState->buffer1);
CL_SET_ARG(clState->outputBuffer);
CL_SET_ARG(le_target);
return status;
}
static cl_int queue_pluck_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads)
{
cl_kernel *kernel = &clState->kernel;
@@ -757,6 +937,25 @@ static algorithm_settings_t algos[] = {
{ a, ALGO_PLUCK, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, pluck_regenhash, queue_pluck_kernel, gen_hash, append_neoscrypt_compiler_options }
A_PLUCK("pluck"),
#undef A_PLUCK
#define A_CREDITS(a) \
{ a, ALGO_CRE, "", 1, 1, 1, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, credits_regenhash, queue_credits_kernel, gen_hash, NULL}
A_CREDITS("credits"),
#undef A_CREDITS
#define A_YESCRYPT(a) \
{ a, ALGO_YESCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, yescrypt_regenhash, queue_yescrypt_kernel, gen_hash, append_neoscrypt_compiler_options}
A_YESCRYPT("yescrypt"),
#undef A_YESCRYPT
#define A_YESCRYPT_MULTI(a) \
{ a, ALGO_YESCRYPT_MULTI, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 6,-1,CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE , yescrypt_regenhash, queue_yescrypt_multikernel, gen_hash, append_neoscrypt_compiler_options}
A_YESCRYPT_MULTI("yescrypt-multi"),
#undef A_YESCRYPT_MULTI
// kernels starting from this will have difficulty calculated by using quarkcoin algorithm
#define A_QUARK(a, b) \
{ a, ALGO_QUARK, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options }
@@ -793,7 +992,10 @@ static algorithm_settings_t algos[] = {
{ "fresh", ALGO_FRESH, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 4 * 16 * 4194304, 0, fresh_regenhash, queue_fresh_kernel, gen_hash, NULL },
{ "lyra2re", ALGO_LYRA2RE, "", 1, 128, 128, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 2 * 8 * 4194304, 0, lyra2re_regenhash, queue_lyra2RE_kernel, gen_hash, NULL },
{ "lyra2re", ALGO_LYRA2RE, "", 1, 128, 128, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 4, 2 * 8 * 4194304, 0, lyra2reold_regenhash, queue_lyra2RE_kernel, gen_hash, NULL },
{ "lyra2rev2", ALGO_LYRA2REv2, "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 6, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, lyra2re_regenhash, queue_lyra2REv2_kernel, gen_hash, append_neoscrypt_compiler_options },
// kernels starting from this will have difficulty calculated by using fuguecoin algorithm
#define A_FUGUE(a, b, c) \
@@ -877,8 +1079,8 @@ static const char *lookup_algorithm_alias(const char *lookup_alias, uint8_t *nfa
ALGO_ALIAS("nist5", "talkcoin-mod");
ALGO_ALIAS("keccak", "maxcoin");
ALGO_ALIAS("whirlpool", "whirlcoin");
ALGO_ALIAS("Lyra2RE", "lyra2re");
ALGO_ALIAS("lyra2", "lyra2re");
ALGO_ALIAS("lyra2v2", "lyra2rev2");
#undef ALGO_ALIAS
#undef ALGO_ALIAS_NF
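
For readers skimming the queue_* additions above: CL_SET_ARG, CL_SET_ARG_N and CL_NEXTKERNEL_SET_ARG are sgminer's usual helpers around clSetKernelArg, with an implicitly incremented argument index (and, for the NEXTKERNEL variant, a bump of the kernel pointer along extra_kernels). A minimal expansion sketch of queue_yescrypt_kernel follows — the macro bodies are paraphrased assumptions, not text from this commit:

/* Hedged expansion sketch; "num" is the running argument index the macros hide. */
static cl_int queue_yescrypt_kernel_sketch(_clState *clState, dev_blk_ctx *blk)
{
  cl_kernel kernel = clState->kernel;
  cl_uint num = 0;
  cl_int status = 0;
  cl_uint le_target = (cl_uint)le32toh(((uint32_t *)blk->work->target)[7]);

  flip80(clState->cldata, blk->work->data);  /* byte-swap the 80-byte block header */
  status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0,
                                CL_TRUE, 0, 80, clState->cldata, 0, NULL, NULL);

  status |= clSetKernelArg(kernel, num++, sizeof(cl_mem), &clState->CLbuffer0);    /* block header  */
  status |= clSetKernelArg(kernel, num++, sizeof(cl_mem), &clState->outputBuffer); /* found nonces  */
  status |= clSetKernelArg(kernel, num++, sizeof(cl_mem), &clState->padbuffer8);   /* scratchpad    */
  status |= clSetKernelArg(kernel, num++, sizeof(cl_mem), &clState->buffer1);      /* aux buffer 1  */
  status |= clSetKernelArg(kernel, num++, sizeof(cl_mem), &clState->buffer2);      /* aux buffer 2  */
  status |= clSetKernelArg(kernel, num++, sizeof(le_target), &le_target);          /* share target  */
  return status;
}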

algorithm.h (6)

@@ -13,6 +13,7 @@
typedef enum {
ALGO_UNK,
ALGO_CRE,
ALGO_SCRYPT,
ALGO_NSCRYPT,
ALGO_X11,
@@ -29,7 +30,10 @@ typedef enum {
ALGO_NEOSCRYPT,
ALGO_WHIRLPOOLX,
ALGO_LYRA2RE,
ALGO_PLUCK
ALGO_LYRA2REv2,
ALGO_PLUCK,
ALGO_YESCRYPT,
ALGO_YESCRYPT_MULTI,
} algorithm_type_t;
extern const char *algorithm_type_str[];
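
Note that algorithm_type_str[] in algorithm.c is indexed by this enum, so the new entries have to be added to both lists in the same relative order. A trivial hedged example of the lookup, assuming the two stay in sync:

/* Hypothetical lookup; expected to print "Yescrypt" if enum and string table match. */
printf("%s\n", algorithm_type_str[ALGO_YESCRYPT]);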

algorithm/credits.c (148)

@@ -0,0 +1,148 @@
/*-
* Copyright 2015 djm34
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "config.h"
#include "miner.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "sph/sph_sha2.h"
static const uint32_t diff1targ = 0x0000ffff;
inline void credits_hash(void *state, const void *input)
{
sph_sha256_context sha1, sha2;
uint32_t hash[8], hash2[8];
sph_sha256_init(&sha1);
sph_sha256(&sha1, input, 168);
sph_sha256_close(&sha1, hash);
sph_sha256_init(&sha2);
sph_sha256(&sha2, hash, 32);
sph_sha256_close(&sha2, hash2);
memcpy(state, hash2, 32);
}
static inline void
be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
{
uint32_t i;
for (i = 0; i < len; i++)
dst[i] = htobe32(src[i]);
}
/* Used externally as confirmation of correct OCL code */
int credits_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce)
{
uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]);
uint32_t data[42], ohash[8];
printf("coming here credits test\n");
be32enc_vect(data, (const uint32_t *)pdata, 42);
data[35] = htobe32(nonce);
credits_hash((unsigned char*)data,(unsigned char*)ohash);
tmp_hash7 = be32toh(ohash[7]);
applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx",
(long unsigned int)Htarg,
(long unsigned int)diff1targ,
(long unsigned int)tmp_hash7);
if (tmp_hash7 > diff1targ)
return -1;
if (tmp_hash7 > Htarg)
return 0;
return 1;
}
void credits_regenhash(struct work *work)
{
uint32_t data[42];
uint32_t *nonce = (uint32_t *)(work->data + 140);
uint32_t *ohash = (uint32_t *)(work->hash);
be32enc_vect(data, (const uint32_t *)work->data, 42);
data[35] = htobe32(*nonce);
credits_hash((unsigned char*)ohash, (unsigned char*)data);
}
bool scanhash_credits(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate,
unsigned char *pdata, unsigned char __maybe_unused *phash1,
unsigned char __maybe_unused *phash, const unsigned char *ptarget,
uint32_t max_nonce, uint32_t *last_nonce, uint32_t n)
{
uint32_t *nonce = (uint32_t *)(pdata + 140);
uint32_t data[42];
uint32_t tmp_hash7;
uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]);
bool ret = false;
be32enc_vect(data, (const uint32_t *)pdata, 35);
while (1)
{
uint32_t ostate[8];
*nonce = ++n;
data[35] = (n);
credits_hash(ostate, data);
tmp_hash7 = (ostate[7]);
applog(LOG_INFO, "data7 %08lx", (long unsigned int)ostate[7]);
if (unlikely(tmp_hash7 <= Htarg))
{
((uint32_t *)pdata)[35] = htobe32(n);
*last_nonce = n;
ret = true;
break;
}
if (unlikely((n >= max_nonce) || thr->work_restart))
{
*last_nonce = n;
break;
}
}
return ret;
}
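
For orientation on the new Credits code path: the block header handled here is 168 bytes with the nonce at byte offset 140 (uint32 word 35), which is why queue_credits_kernel copies 168 bytes and credits_regenhash rewrites data[35]. A hedged usage sketch of the self-test helper, with placeholder header and target values:

/* Hypothetical call of credits_test(); header/target contents are dummies.      */
unsigned char header[168] = {0};  /* in sgminer this comes from work->data        */
unsigned char target[32]  = {0};  /* share target as little-endian 32-bit words   */
int rc = credits_test(header, target, 0x00000001u);
/* rc == -1: hash does not meet the diff-1 target (bad OpenCL result)
 * rc ==  0: meets diff-1 but not the share target
 * rc ==  1: valid share                                                           */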

algorithm/credits.h (10)

@@ -0,0 +1,10 @@
#ifndef CREDITS_H
#define CREDITS_H
#include "miner.h"
extern int credits_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce);
extern void credits_regenhash(struct work *work);
#endif /* CREDITS_H */

algorithm/lyra2.c (22)

@@ -58,15 +58,19 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *
//========== Initializing the Memory Matrix and pointers to it =============//
//Tries to allocate enough space for the whole memory matrix
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES);
uint64_t *wholeMatrix = (uint64_t*)malloc(i);
uint64_t *wholeMatrix = malloc(i);
if (wholeMatrix == NULL) {
return -1;
}
memset(wholeMatrix, 0, i);
//Allocates pointers to each row of the matrix
uint64_t **memMatrix = (uint64_t**)malloc(nRows * sizeof (uint64_t*));
uint64_t **memMatrix = malloc(nRows * sizeof (uint64_t*));
if (memMatrix == NULL) {
return -1;
}
@@ -118,7 +122,7 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *
//======================= Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
uint64_t *state = (uint64_t*)malloc(16 * sizeof (uint64_t));
uint64_t *state = malloc(16 * sizeof (uint64_t));
if (state == NULL) {
return -1;
}
@@ -130,16 +134,16 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *
ptrWord = wholeMatrix;
for (i = 0; i < nBlocksInput; i++) {
absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN_BLAKE2_SAFE_BYTES; //goes to next block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN_BLAKE2_SAFE_INT64; //goes to next block of pad(pwd || salt || basil)
}
//Initializes M[0] and M[1]
reducedSqueezeRow0(state, memMatrix[0]); //The locally copied password is most likely overwritten here
reducedDuplexRow1(state, memMatrix[0], memMatrix[1]);
reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here
reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols);
do {
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]);
reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
//updates the value of row* (deterministically picked during Setup))
@@ -172,7 +176,7 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *
//------------------------------------------------------------------------------------------
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]);
reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
//update prev: it now points to the last row ever computed
prev = row;
@@ -192,7 +196,7 @@ int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *
absorbBlock(state, memMatrix[rowa]);
//Squeezes the key
squeeze(state, (unsigned char*)K, kLen);
squeeze(state, K, kLen);
//==========================================================================/
//========================= Freeing the memory =============================//
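
The net effect of the lyra2.c and sponge.c hunks is that the row width in sponge.c is no longer the compile-time N_COLS but an nCols argument threaded down from LYRA2(), matching the smaller Lyra2REv2 parameters; the previous fixed-width code is kept verbatim as lyra2_old.c/sponge_old.c for the original Lyra2RE. A hedged illustration of the two call shapes used by the CPU verifiers in this commit:

/* Sketch of the two parameter sets (taken from lyra2re_old.c and lyra2re.c below). */
uint32_t hashA[8], hashB[8];
LYRA2O(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); /* Lyra2RE:   timeCost=1, nRows=8, nCols=8 */
LYRA2(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4);  /* Lyra2REv2: timeCost=1, nRows=4, nCols=4 */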

algorithm/lyra2.h (8)

@@ -37,14 +37,6 @@ typedef unsigned char byte;
#define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes
#endif
#ifndef N_COLS
#define N_COLS 8 //Number of columns in the memory matrix: fixed to 64 by default
#endif
#define ROW_LEN_INT64 (BLOCK_LEN_INT64 * N_COLS) //Total length of a row: N_COLS blocks
#define ROW_LEN_BYTES (ROW_LEN_INT64 * 8) //Number of bytes per row
int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols);
#endif /* LYRA2_H_ */

algorithm/lyra2_old.c (208)

@@ -0,0 +1,208 @@
/**
* Implementation of the Lyra2 Password Hashing Scheme (PHS).
*
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
*
* This software is hereby placed in the public domain.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "lyra2_old.h"
#include "sponge_old.h"
/**
* Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
* whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits,
* where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all
* integer parameters (treated as type "unsigned int") in the order they are provided, plus the value
* of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols).
*
* @param K The derived key to be output by the algorithm
* @param kLen Desired key length
* @param pwd User password
* @param pwdlen Password length
* @param salt Salt
* @param saltlen Salt length
* @param timeCost Parameter to determine the processing time (T)
* @param nRows Number or rows of the memory matrix (R)
* @param nCols Number of columns of the memory matrix (C)
*
* @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
*/
int LYRA2O(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols) {
//============================= Basic variables ============================//
int64_t row = 2; //index of row to be processed
int64_t prev = 1; //index of prev (last row ever computed/modified)
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
int64_t tau; //Time Loop iterator
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
int64_t i; //auxiliary iteration counter
//==========================================================================/
//========== Initializing the Memory Matrix and pointers to it =============//
//Tries to allocate enough space for the whole memory matrix
i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES);
uint64_t *wholeMatrix = malloc(i);
if (wholeMatrix == NULL) {
return -1;
}
memset(wholeMatrix, 0, i);
//Allocates pointers to each row of the matrix
uint64_t **memMatrix = malloc(nRows * sizeof (uint64_t*));
if (memMatrix == NULL) {
return -1;
}
//Places the pointers in the correct positions
uint64_t *ptrWord = wholeMatrix;
for (i = 0; i < nRows; i++) {
memMatrix[i] = ptrWord;
ptrWord += ROW_LEN_INT64;
}
//==========================================================================/
//============= Getting the password + salt + basil padded with 10*1 ===============//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
//but this ensures that the password copied locally will be overwritten as soon as possible
//First, we clean enough blocks for the password, salt, basil and padding
uint64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof (uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
byte *ptrByte = (byte*) wholeMatrix;
memset(ptrByte, 0, nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES);
//Prepends the password
memcpy(ptrByte, pwd, pwdlen);
ptrByte += pwdlen;
//Concatenates the salt
memcpy(ptrByte, salt, saltlen);
ptrByte += saltlen;
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
memcpy(ptrByte, &kLen, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy(ptrByte, &pwdlen, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy(ptrByte, &saltlen, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy(ptrByte, &timeCost, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy(ptrByte, &nRows, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
memcpy(ptrByte, &nCols, sizeof (uint64_t));
ptrByte += sizeof (uint64_t);
//Now comes the padding
*ptrByte = 0x80; //first byte of padding: right after the password
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
//==========================================================================/
//======================= Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
uint64_t *state = malloc(16 * sizeof (uint64_t));
if (state == NULL) {
return -1;
}
initStateO(state);
//==========================================================================/
//================================ Setup Phase =============================//
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
ptrWord = wholeMatrix;
for (i = 0; i < nBlocksInput; i++) {
absorbBlockBlake2SafeO(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN_BLAKE2_SAFE_BYTES; //goes to next block of pad(pwd || salt || basil)
}
//Initializes M[0] and M[1]
reducedSqueezeRow0O(state, memMatrix[0]); //The locally copied password is most likely overwritten here
reducedDuplexRow1O(state, memMatrix[0], memMatrix[1]);
do {
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
reducedDuplexRowSetupO(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]);
//updates the value of row* (deterministically picked during Setup))
rowa = (rowa + step) & (window - 1);
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
row++;
//Checks if all rows in the window where visited.
if (rowa == 0) {
step = window + gap; //changes the step: approximately doubles its value
window *= 2; //doubles the size of the re-visitation window
gap = -gap; //inverts the modifier to the step
}
} while (row < nRows);
//==========================================================================/
//============================ Wandering Phase =============================//
row = 0; //Resets the visitation to the first row of the memory matrix
for (tau = 1; tau <= timeCost; tau++) {
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
do {
//Selects a pseudorandom index row*
//------------------------------------------------------------------------------------------
//rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//------------------------------------------------------------------------------------------
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
reducedDuplexRowO(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]);
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
//------------------------------------------------------------------------------------------
//row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//------------------------------------------------------------------------------------------
} while (row != 0);
}
//==========================================================================/
//============================ Wrap-up Phase ===============================//
//Absorbs the last block of the memory matrix
absorbBlockO(state, memMatrix[rowa]);
//Squeezes the key
squeezeO(state, K, kLen);
//==========================================================================/
//========================= Freeing the memory =============================//
free(memMatrix);
free(wholeMatrix);
//Wiping out the sponge's internal state before freeing it
memset(state, 0, 16 * sizeof (uint64_t));
free(state);
//==========================================================================/
return 0;
}

algorithm/lyra2_old.h (50)

@@ -0,0 +1,50 @@
/**
* Header file for the Lyra2 Password Hashing Scheme (PHS).
*
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
*
* This software is hereby placed in the public domain.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef LYRA2OLD_H_
#define LYRA2OLD_H_
#include <stdint.h>
typedef unsigned char byte;
//Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED)
#define BLOCK_LEN_BLAKE2_SAFE_INT64 8 //512 bits (=64 bytes, =8 uint64_t)
#define BLOCK_LEN_BLAKE2_SAFE_BYTES (BLOCK_LEN_BLAKE2_SAFE_INT64 * 8) //same as above, in bytes
#ifdef BLOCK_LEN_BITS
#define BLOCK_LEN_INT64 (BLOCK_LEN_BITS/64) //Block length: 768 bits (=96 bytes, =12 uint64_t)
#define BLOCK_LEN_BYTES (BLOCK_LEN_BITS/8) //Block length, in bytes
#else //default block lenght: 768 bits
#define BLOCK_LEN_INT64 12 //Block length: 768 bits (=96 bytes, =12 uint64_t)
#define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes
#endif
#ifndef N_COLS
#define N_COLS 8 //Number of columns in the memory matrix: fixed to 64 by default
#endif
#define ROW_LEN_INT64 (BLOCK_LEN_INT64 * N_COLS) //Total length of a row: N_COLS blocks
#define ROW_LEN_BYTES (ROW_LEN_INT64 * 8) //Number of bytes per row
int LYRA2O(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols);
#endif /* LYRA2_H_ */

algorithm/lyra2re.c (25)

@@ -36,6 +36,8 @@
#include "sph/sph_groestl.h"
#include "sph/sph_skein.h"
#include "sph/sph_keccak.h"
#include "sph/sph_bmw.h"
#include "sph/sph_cubehash.h"
#include "lyra2.h"
/*
@@ -55,9 +57,10 @@ be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
inline void lyra2rehash(void *state, const void *input)
{
sph_blake256_context ctx_blake;
sph_groestl256_context ctx_groestl;
sph_bmw256_context ctx_bmw;
sph_keccak256_context ctx_keccak;
sph_skein256_context ctx_skein;
sph_cubehash256_context ctx_cube;
uint32_t hashA[8], hashB[8];
@@ -72,17 +75,23 @@ inline void lyra2rehash(void *state, const void *input)
sph_keccak256 (&ctx_keccak,hashA, 32);
sph_keccak256_close(&ctx_keccak, hashB);
LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
sph_cubehash256_init(&ctx_cube);
sph_cubehash256(&ctx_cube, hashB, 32);
sph_cubehash256_close(&ctx_cube, hashA);
LYRA2(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4);
sph_skein256_init(&ctx_skein);
sph_skein256 (&ctx_skein, hashA, 32);
sph_skein256_close(&ctx_skein, hashB);
sph_skein256_init(&ctx_skein);
sph_skein256 (&ctx_skein, hashB, 32);
sph_skein256_close(&ctx_skein, hashA);
sph_cubehash256_init(&ctx_cube);
sph_cubehash256(&ctx_cube, hashA, 32);
sph_cubehash256_close(&ctx_cube, hashB);
sph_groestl256_init(&ctx_groestl);
sph_groestl256 (&ctx_groestl, hashB, 32);
sph_groestl256_close(&ctx_groestl, hashA);
sph_bmw256_init(&ctx_bmw);
sph_bmw256 (&ctx_bmw, hashB, 32);
sph_bmw256_close(&ctx_bmw, hashA);
//printf("cpu hash %08x %08x %08x %08x\n",hashA[0],hashA[1],hashA[2],hashA[3]);
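
Because the rendered hunks above have lost their +/- markers, the resulting Lyra2REv2 chain is easy to misread. One reading of the patched lyra2rehash() — blake256 → keccak256 → cubehash256 → Lyra2(t=1, 4x4) → skein256 → cubehash256 → bmw256 — reconstructed below as a sketch, not the authoritative file contents:

/* Hedged reconstruction of the post-patch lyra2rehash() (Lyra2REv2 chain). */
static void lyra2rev2_hash_sketch(void *state, const void *input)
{
  sph_blake256_context    ctx_blake;
  sph_keccak256_context   ctx_keccak;
  sph_cubehash256_context ctx_cube;
  sph_skein256_context    ctx_skein;
  sph_bmw256_context      ctx_bmw;
  uint32_t hashA[8], hashB[8];

  sph_blake256_init(&ctx_blake);
  sph_blake256(&ctx_blake, input, 80);
  sph_blake256_close(&ctx_blake, hashA);

  sph_keccak256_init(&ctx_keccak);
  sph_keccak256(&ctx_keccak, hashA, 32);
  sph_keccak256_close(&ctx_keccak, hashB);

  sph_cubehash256_init(&ctx_cube);
  sph_cubehash256(&ctx_cube, hashB, 32);
  sph_cubehash256_close(&ctx_cube, hashA);

  LYRA2(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4);

  sph_skein256_init(&ctx_skein);
  sph_skein256(&ctx_skein, hashB, 32);
  sph_skein256_close(&ctx_skein, hashA);

  sph_cubehash256_init(&ctx_cube);
  sph_cubehash256(&ctx_cube, hashA, 32);
  sph_cubehash256_close(&ctx_cube, hashB);

  sph_bmw256_init(&ctx_bmw);
  sph_bmw256(&ctx_bmw, hashB, 32);
  sph_bmw256_close(&ctx_bmw, hashA);

  memcpy(state, hashA, 32);
}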

algorithm/lyra2re.h (2)

@@ -2,6 +2,8 @@
#define LYRA2RE_H
#include "miner.h"
#define LYRA_SCRATCHBUF_SIZE (1536) // matrix size [12][4][4] uint64_t or equivalent
#define LYRA_SECBUF_SIZE (4) // (not used)
extern int lyra2re_test(unsigned char *pdata, const unsigned char *ptarget,
uint32_t nonce);

algorithm/lyra2re_old.c (169)

@@ -0,0 +1,169 @@
/*-
* Copyright 2014 James Lovejoy
* Copyright 2014 phm
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "config.h"
#include "miner.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "sph/sph_blake.h"
#include "sph/sph_groestl.h"
#include "sph/sph_skein.h"
#include "sph/sph_keccak.h"
#include "lyra2_old.h"
/*
* Encode a length len/4 vector of (uint32_t) into a length len vector of
* (unsigned char) in big-endian form. Assumes len is a multiple of 4.
*/
static inline void
be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
{
uint32_t i;
for (i = 0; i < len; i++)
dst[i] = htobe32(src[i]);
}
inline void lyra2rehash_old(void *state, const void *input)
{
sph_blake256_context ctx_blake;
sph_groestl256_context ctx_groestl;
sph_keccak256_context ctx_keccak;
sph_skein256_context ctx_skein;
uint32_t hashA[8], hashB[8];
sph_blake256_init(&ctx_blake);
sph_blake256 (&ctx_blake, input, 80);
sph_blake256_close (&ctx_blake, hashA);
sph_keccak256_init(&ctx_keccak);
sph_keccak256 (&ctx_keccak,hashA, 32);
sph_keccak256_close(&ctx_keccak, hashB);
LYRA2O(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
sph_skein256_init(&ctx_skein);
sph_skein256 (&ctx_skein, hashA, 32);
sph_skein256_close(&ctx_skein, hashB);
sph_groestl256_init(&ctx_groestl);
sph_groestl256 (&ctx_groestl, hashB, 32);
sph_groestl256_close(&ctx_groestl, hashA);
//printf("cpu hash %08x %08x %08x %08x\n",hashA[0],hashA[1],hashA[2],hashA[3]);
memcpy(state, hashA, 32);
}
static const uint32_t diff1targ = 0x0000ffff;
/* Used externally as confirmation of correct OCL code */
int lyra2reold_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce)
{
uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]);
uint32_t data[20], ohash[8];
be32enc_vect(data, (const uint32_t *)pdata, 19);
data[19] = htobe32(nonce);
lyra2rehash_old(ohash, data);
tmp_hash7 = be32toh(ohash[7]);
applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx",
(long unsigned int)Htarg,
(long unsigned int)diff1targ,
(long unsigned int)tmp_hash7);
if (tmp_hash7 > diff1targ)
return -1;
if (tmp_hash7 > Htarg)
return 0;
return 1;
}
void lyra2reold_regenhash(struct work *work)
{
uint32_t data[20];
uint32_t *nonce = (uint32_t *)(work->data + 76);
uint32_t *ohash = (uint32_t *)(work->hash);
be32enc_vect(data, (const uint32_t *)work->data, 19);
data[19] = htobe32(*nonce);
lyra2rehash_old(ohash, data);
}
bool scanhash_lyra2reold(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate,
unsigned char *pdata, unsigned char __maybe_unused *phash1,
unsigned char __maybe_unused *phash, const unsigned char *ptarget,
uint32_t max_nonce, uint32_t *last_nonce, uint32_t n)
{
uint32_t *nonce = (uint32_t *)(pdata + 76);
uint32_t data[20];
uint32_t tmp_hash7;
uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]);
bool ret = false;
be32enc_vect(data, (const uint32_t *)pdata, 19);
while(1) {
uint32_t ostate[8];
*nonce = ++n;
data[19] = (n);
lyra2rehash_old(ostate, data);
tmp_hash7 = (ostate[7]);
applog(LOG_INFO, "data7 %08lx",
(long unsigned int)data[7]);
if (unlikely(tmp_hash7 <= Htarg)) {
((uint32_t *)pdata)[19] = htobe32(n);
*last_nonce = n;
ret = true;
break;
}
if (unlikely((n >= max_nonce) || thr->work_restart)) {
*last_nonce = n;
break;
}
}
return ret;
}

algorithm/lyra2re_old.h (10)

@ -0,0 +1,10 @@ @@ -0,0 +1,10 @@
#ifndef LYRA2REOLD_H
#define LYRA2REOLD_H
#include "miner.h"
extern int lyra2reold_test(unsigned char *pdata, const unsigned char *ptarget,
uint32_t nonce);
extern void lyra2reold_regenhash(struct work *work);
#endif /* LYRA2RE_H */

algorithm/pluck.h (2)

@@ -3,6 +3,8 @@
#include "miner.h"
#define PLUCK_SCRATCHBUF_SIZE (128 * 1024)
#define PLUCK_SECBUF_SIZE (64 * 1024)
extern int pluck_test(unsigned char *pdata, const unsigned char *ptarget,
uint32_t nonce);
extern void pluck_regenhash(struct work *work);

algorithm/sponge.c (22)

@@ -158,11 +158,11 @@ void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) {
* @param state The current state of the sponge
* @param rowOut Row to receive the data squeezed
*/
void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut) {
uint64_t* ptrWord = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1]
void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut, uint64_t nCols) {
uint64_t* ptrWord = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1]
int i;
//M[row][C-1-col] = H.reduced_squeeze()
for (i = 0; i < N_COLS; i++) {
for (i = 0; i < nCols; i++) {
ptrWord[0] = state[0];
ptrWord[1] = state[1];
ptrWord[2] = state[2];
@@ -193,12 +193,12 @@ void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut) {
* @param rowIn Row to feed the sponge
* @param rowOut Row to receive the sponge's output
*/
void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut) {
void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols) {
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
int i;
for (i = 0; i < N_COLS; i++) {
for (i = 0; i < nCols; i++) {
//Absorbing "M[prev][col]"
state[0] ^= (ptrWordIn[0]);
@@ -253,13 +253,13 @@ void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut) {
* @param rowOut Row receiving the output
*
*/
void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols) {
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
int i;
for (i = 0; i < N_COLS; i++) {
for (i = 0; i < nCols; i++) {
//Absorbing "M[prev] [+] M[row*]"
state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]);
state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]);
@@ -327,13 +327,13 @@ void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut,
* @param rowOut Row receiving the output
*
*/
void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols) {
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
int i;
for (i = 0; i < N_COLS; i++) {
for (i = 0; i < nCols; i++) {
//Absorbing "M[prev] [+] M[row*]"
state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]);

algorithm/sponge.h (8)

@@ -78,16 +78,16 @@ void initState(uint64_t state[/*16*/]);
//---- Squeezes
void squeeze(uint64_t *state, unsigned char *out, unsigned int len);
void reducedSqueezeRow0(uint64_t* state, uint64_t* row);
void reducedSqueezeRow0(uint64_t* state, uint64_t* row, uint64_t nCols);
//---- Absorbs
void absorbBlock(uint64_t *state, const uint64_t *in);
void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in);
//---- Duplexes
void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut);
void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, uint64_t nCols);
//---- Misc
void printArray(unsigned char *array, unsigned int size, char *name);

algorithm/sponge_old.c (405)

@@ -0,0 +1,405 @@
/**
* A simple implementation of Blake2b's internal permutation
* in the form of a sponge.
*
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
*
* This software is hereby placed in the public domain.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <string.h>
#include <stdio.h>
#include <time.h>
#include "sponge_old.h"
#include "lyra2_old.h"
/**
* Initializes the Sponge State. The first 512 bits are set to zeros and the remainder
* receive Blake2b's IV as per Blake2b's specification. <b>Note:</b> Even though sponges
* typically have their internal state initialized with zeros, Blake2b's G function
* has a fixed point: if the internal state and message are both filled with zeros. the
* resulting permutation will always be a block filled with zeros; this happens because
* Blake2b does not use the constants originally employed in Blake2 inside its G function,
* relying on the IV for avoiding possible fixed points.
*
* @param state The 1024-bit array to be initialized
*/
void initStateO(uint64_t state[/*16*/]) {
//First 512 bis are zeros
memset(state, 0, 64);
//Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV
state[8] = blake2b_IV[0];
state[9] = blake2b_IV[1];
state[10] = blake2b_IV[2];
state[11] = blake2b_IV[3];
state[12] = blake2b_IV[4];
state[13] = blake2b_IV[5];
state[14] = blake2b_IV[6];
state[15] = blake2b_IV[7];
}
/**
* Execute Blake2b's G function, with all 12 rounds.
*
* @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
*/
static void blake2bLyra(uint64_t *v) {
ROUND_LYRA(0);
ROUND_LYRA(1);
ROUND_LYRA(2);
ROUND_LYRA(3);
ROUND_LYRA(4);
ROUND_LYRA(5);
ROUND_LYRA(6);
ROUND_LYRA(7);
ROUND_LYRA(8);
ROUND_LYRA(9);
ROUND_LYRA(10);
ROUND_LYRA(11);
}
/**
* Executes a reduced version of Blake2b's G function with only one round
* @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
*/
static void reducedBlake2bLyra(uint64_t *v) {
ROUND_LYRA(0);
}
/**
* Performs a squeeze operation, using Blake2b's G function as the
* internal permutation
*
* @param state The current state of the sponge
* @param out Array that will receive the data squeezed
* @param len The number of bytes to be squeezed into the "out" array
*/
void squeezeO(uint64_t *state, byte *out, unsigned int len) {
int fullBlocks = len / BLOCK_LEN_BYTES;
byte *ptr = out;
int i;
//Squeezes full blocks
for (i = 0; i < fullBlocks; i++) {
memcpy(ptr, state, BLOCK_LEN_BYTES);
blake2bLyra(state);
ptr += BLOCK_LEN_BYTES;
}
//Squeezes remaining bytes
memcpy(ptr, state, (len % BLOCK_LEN_BYTES));
}
/**
* Performs an absorb operation for a single block (BLOCK_LEN_INT64 words
* of type uint64_t), using Blake2b's G function as the internal permutation
*
* @param state The current state of the sponge
* @param in The block to be absorbed (BLOCK_LEN_INT64 words)
*/
void absorbBlockO(uint64_t *state, const uint64_t *in) {
//XORs the first BLOCK_LEN_INT64 words of "in" with the current state
state[0] ^= in[0];
state[1] ^= in[1];
state[2] ^= in[2];
state[3] ^= in[3];
state[4] ^= in[4];
state[5] ^= in[5];
state[6] ^= in[6];
state[7] ^= in[7];
state[8] ^= in[8];
state[9] ^= in[9];
state[10] ^= in[10];
state[11] ^= in[11];
//Applies the transformation f to the sponge's state
blake2bLyra(state);
}
/**
* Performs an absorb operation for a single block (BLOCK_LEN_BLAKE2_SAFE_INT64
* words of type uint64_t), using Blake2b's G function as the internal permutation
*
* @param state The current state of the sponge
* @param in The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words)
*/
void absorbBlockBlake2SafeO(uint64_t *state, const uint64_t *in) {
//XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state
state[0] ^= in[0];
state[1] ^= in[1];
state[2] ^= in[2];
state[3] ^= in[3];
state[4] ^= in[4];
state[5] ^= in[5];
state[6] ^= in[6];
state[7] ^= in[7];
//Applies the transformation f to the sponge's state
blake2bLyra(state);
}
/**
* Performs a reduced squeeze operation for a single row, from the highest to
* the lowest index, using the reduced-round Blake2b's G function as the
* internal permutation
*
* @param state The current state of the sponge
* @param rowOut Row to receive the data squeezed
*/
void reducedSqueezeRow0O(uint64_t* state, uint64_t* rowOut) {
uint64_t* ptrWord = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1]
int i;
//M[row][C-1-col] = H.reduced_squeeze()
for (i = 0; i < N_COLS; i++) {
ptrWord[0] = state[0];
ptrWord[1] = state[1];
ptrWord[2] = state[2];
ptrWord[3] = state[3];
ptrWord[4] = state[4];
ptrWord[5] = state[5];
ptrWord[6] = state[6];
ptrWord[7] = state[7];
ptrWord[8] = state[8];
ptrWord[9] = state[9];
ptrWord[10] = state[10];
ptrWord[11] = state[11];
//Goes to next block (column) that will receive the squeezed data
ptrWord -= BLOCK_LEN_INT64;
//Applies the reduced-round transformation f to the sponge's state
reducedBlake2bLyra(state);
}
}
/**
* Performs a reduced duplex operation for a single row, from the highest to
* the lowest index, using the reduced-round Blake2b's G function as the
* internal permutation
*
* @param state The current state of the sponge
* @param rowIn Row to feed the sponge
* @param rowOut Row to receive the sponge's output
*/
void reducedDuplexRow1O(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut) {
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
int i;
for (i = 0; i < N_COLS; i++) {
//Absorbing "M[prev][col]"
state[0] ^= (ptrWordIn[0]);
state[1] ^= (ptrWordIn[1]);
state[2] ^= (ptrWordIn[2]);
state[3] ^= (ptrWordIn[3]);
state[4] ^= (ptrWordIn[4]);
state[5] ^= (ptrWordIn[5]);
state[6] ^= (ptrWordIn[6]);
state[7] ^= (ptrWordIn[7]);
state[8] ^= (ptrWordIn[8]);
state[9] ^= (ptrWordIn[9]);
state[10] ^= (ptrWordIn[10]);
state[11] ^= (ptrWordIn[11]);
//Applies the reduced-round transformation f to the sponge's state
reducedBlake2bLyra(state);
//M[row][C-1-col] = M[prev][col] XOR rand
ptrWordOut[0] = ptrWordIn[0] ^ state[0];
ptrWordOut[1] = ptrWordIn[1] ^ state[1];
ptrWordOut[2] = ptrWordIn[2] ^ state[2];
ptrWordOut[3] = ptrWordIn[3] ^ state[3];
ptrWordOut[4] = ptrWordIn[4] ^ state[4];
ptrWordOut[5] = ptrWordIn[5] ^ state[5];
ptrWordOut[6] = ptrWordIn[6] ^ state[6];
ptrWordOut[7] = ptrWordIn[7] ^ state[7];
ptrWordOut[8] = ptrWordIn[8] ^ state[8];
ptrWordOut[9] = ptrWordIn[9] ^ state[9];
ptrWordOut[10] = ptrWordIn[10] ^ state[10];
ptrWordOut[11] = ptrWordIn[11] ^ state[11];
//Input: next column (i.e., next block in sequence)
ptrWordIn += BLOCK_LEN_INT64;
//Output: goes to previous column
ptrWordOut -= BLOCK_LEN_INT64;
}
}
/**
* Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e.,
* the wordwise addition of two columns, ignoring carries between words). The
* output of this operation, "rand", is then used to make
* "M[rowOut][(N_COLS-1)-col] = M[rowIn][col] XOR rand" and
* "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit
* rotation to the left and N_COLS is a system parameter.
*
* @param state The current state of the sponge
* @param rowIn Row used only as input
* @param rowInOut Row used as input and to receive output after rotation
* @param rowOut Row receiving the output
*
*/
void reducedDuplexRowSetupO(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
int i;
for (i = 0; i < N_COLS; i++) {
//Absorbing "M[prev] [+] M[row*]"
state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]);
state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]);
state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]);
state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]);
state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]);
state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]);
state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]);
state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]);
state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]);
state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]);
state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]);
state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]);
//Applies the reduced-round transformation f to the sponge's state
reducedBlake2bLyra(state);
//M[row][col] = M[prev][col] XOR rand
ptrWordOut[0] = ptrWordIn[0] ^ state[0];
ptrWordOut[1] = ptrWordIn[1] ^ state[1];
ptrWordOut[2] = ptrWordIn[2] ^ state[2];
ptrWordOut[3] = ptrWordIn[3] ^ state[3];
ptrWordOut[4] = ptrWordIn[4] ^ state[4];
ptrWordOut[5] = ptrWordIn[5] ^ state[5];
ptrWordOut[6] = ptrWordIn[6] ^ state[6];
ptrWordOut[7] = ptrWordIn[7] ^ state[7];
ptrWordOut[8] = ptrWordIn[8] ^ state[8];
ptrWordOut[9] = ptrWordIn[9] ^ state[9];
ptrWordOut[10] = ptrWordIn[10] ^ state[10];
ptrWordOut[11] = ptrWordIn[11] ^ state[11];
//M[row*][col] = M[row*][col] XOR rotW(rand)
ptrWordInOut[0] ^= state[11];
ptrWordInOut[1] ^= state[0];
ptrWordInOut[2] ^= state[1];
ptrWordInOut[3] ^= state[2];
ptrWordInOut[4] ^= state[3];
ptrWordInOut[5] ^= state[4];
ptrWordInOut[6] ^= state[5];
ptrWordInOut[7] ^= state[6];
ptrWordInOut[8] ^= state[7];
ptrWordInOut[9] ^= state[8];
ptrWordInOut[10] ^= state[9];
ptrWordInOut[11] ^= state[10];
//Inputs: next column (i.e., next block in sequence)
ptrWordInOut += BLOCK_LEN_INT64;
ptrWordIn += BLOCK_LEN_INT64;
//Output: goes to previous column
ptrWordOut -= BLOCK_LEN_INT64;
}
}
/**
* Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e.,
* the wordwise addition of two columns, ignoring carries between words). The
* output of this operation, "rand", is then used to make
* "M[rowOut][col] = M[rowOut][col] XOR rand" and
* "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit
* rotation to the left.
*
* @param state The current state of the sponge
* @param rowIn Row used only as input
* @param rowInOut Row used as input and to receive output after rotation
* @param rowOut Row receiving the output
*
*/
void reducedDuplexRowO(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
int i;
for (i = 0; i < N_COLS; i++) {
//Absorbing "M[prev] [+] M[row*]"
state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]);
state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]);
state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]);
state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]);
state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]);
state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]);
state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]);
state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]);
state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]);
state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]);
state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]);
state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]);
//Applies the reduced-round transformation f to the sponge's state
reducedBlake2bLyra(state);
//M[rowOut][col] = M[rowOut][col] XOR rand
ptrWordOut[0] ^= state[0];
ptrWordOut[1] ^= state[1];
ptrWordOut[2] ^= state[2];
ptrWordOut[3] ^= state[3];
ptrWordOut[4] ^= state[4];
ptrWordOut[5] ^= state[5];
ptrWordOut[6] ^= state[6];
ptrWordOut[7] ^= state[7];
ptrWordOut[8] ^= state[8];
ptrWordOut[9] ^= state[9];
ptrWordOut[10] ^= state[10];
ptrWordOut[11] ^= state[11];
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
ptrWordInOut[0] ^= state[11];
ptrWordInOut[1] ^= state[0];
ptrWordInOut[2] ^= state[1];
ptrWordInOut[3] ^= state[2];
ptrWordInOut[4] ^= state[3];
ptrWordInOut[5] ^= state[4];
ptrWordInOut[6] ^= state[5];
ptrWordInOut[7] ^= state[6];
ptrWordInOut[8] ^= state[7];
ptrWordInOut[9] ^= state[8];
ptrWordInOut[10] ^= state[9];
ptrWordInOut[11] ^= state[10];
//Goes to next block
ptrWordOut += BLOCK_LEN_INT64;
ptrWordInOut += BLOCK_LEN_INT64;
ptrWordIn += BLOCK_LEN_INT64;
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/**
Prints an array of unsigned chars
*/
void printArrayO(unsigned char *array, unsigned int size, char *name) {
int i;
printf("%s: ", name);
for (i = 0; i < size; i++) {
printf("%2x|", array[i]);
}
printf("\n");
}
////////////////////////////////////////////////////////////////////////////////////////////////

algorithm/sponge_old.h (98)

@@ -0,0 +1,98 @@
/**
* Header file for Blake2b's internal permutation in the form of a sponge.
* This code is based on the original Blake2b's implementation provided by
* Samuel Neves (https://blake2.net/)
*
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
*
* This software is hereby placed in the public domain.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef SPONGEOLD_H_
#define SPONGEOLD_H_
#include <stdint.h>
#if defined(__GNUC__)
#define ALIGN __attribute__ ((aligned(32)))
#elif defined(_MSC_VER)
#define ALIGN __declspec(align(32))
#else
#define ALIGN
#endif
/*Blake2b IV Array*/
static const uint64_t blake2b_IV[8] =
{
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
};
/*Blake2b's rotation*/
static inline uint64_t rotr64( const uint64_t w, const unsigned c ){
return ( w >> c ) | ( w << ( 64 - c ) );
}
/*Blake2b's G function*/
#define G(r,i,a,b,c,d) \
do { \
a = a + b; \
d = rotr64(d ^ a, 32); \
c = c + d; \
b = rotr64(b ^ c, 24); \
a = a + b; \
d = rotr64(d ^ a, 16); \
c = c + d; \
b = rotr64(b ^ c, 63); \
} while(0)
/*One Round of the Blake2b's compression function*/
#define ROUND_LYRA(r) \
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
G(r,2,v[ 2],v[ 6],v[10],v[14]); \
G(r,3,v[ 3],v[ 7],v[11],v[15]); \
G(r,4,v[ 0],v[ 5],v[10],v[15]); \
G(r,5,v[ 1],v[ 6],v[11],v[12]); \
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
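/* Illustrative sketch (an assumption about the matching sponge_old.c, which is
 * not reproduced in full here): the reduced-round transformation used by the
 * duplexing functions typically applies the round macro above exactly once to
 * the 16-word state, along these lines: */
static inline void reducedBlake2bLyraSketch(uint64_t *v) {
    ROUND_LYRA(0); /* one Blake2b round over v[0..15] */
}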
//---- Housekeeping
void initStateO(uint64_t state[/*16*/]);
//---- Squeezes
void squeezeO(uint64_t *state, unsigned char *out, unsigned int len);
void reducedSqueezeRow0O(uint64_t* state, uint64_t* row);
//---- Absorbs
void absorbBlockO(uint64_t *state, const uint64_t *in);
void absorbBlockBlake2SafeO(uint64_t *state, const uint64_t *in);
//---- Duplexes
void reducedDuplexRow1O(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut);
void reducedDuplexRowSetupO(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
void reducedDuplexRowO(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
//---- Misc
void printArrayO(unsigned char *array, unsigned int size, char *name);
////////////////////////////////////////////////////////////////////////////////////////////////
#endif /* SPONGEOLD_H_ */

140
algorithm/sysendian.h

@@ -0,0 +1,140 @@
/*-
* Copyright 2007-2009 Colin Percival
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
#ifndef _SYSENDIAN_H_
#define _SYSENDIAN_H_
/* If we don't have be64enc, the <sys/endian.h> we have isn't usable. */
#if !HAVE_DECL_BE64ENC
#undef HAVE_SYS_ENDIAN_H
#endif
#ifdef HAVE_SYS_ENDIAN_H
#include <sys/endian.h>
#else
#include <stdint.h>
#if !HAVE_DECL_LE32DEC
static uint32_t le32dec(const void *pp)
{
const uint8_t *p = (uint8_t const *)pp;
return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) +
((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24));
}
#endif
#if !HAVE_DECL_BE32ENC
static void be32enc(void *pp, uint32_t x)
{
uint8_t *p = (uint8_t *)pp;
p[3] = x & 0xff;
p[2] = (x >> 8) & 0xff;
p[1] = (x >> 16) & 0xff;
p[0] = (x >> 24) & 0xff;
}
#endif
#if !HAVE_DECL_BE32DEC
static uint32_t be32dec(const void *pp)
{
const uint8_t *p = (uint8_t const *)pp;
return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) +
((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24));
}
#endif
#if !HAVE_DECL_LE32ENC
static void le32enc(void *pp, uint32_t x)
{
uint8_t *p = (uint8_t *)pp;
p[0] = x & 0xff;
p[1] = (x >> 8) & 0xff;
p[2] = (x >> 16) & 0xff;
p[3] = (x >> 24) & 0xff;
}
#endif
static uint64_t
be64dec(const void *pp)
{
const uint8_t *p = (uint8_t const *)pp;
return ((uint64_t)(p[7]) + ((uint64_t)(p[6]) << 8) +
((uint64_t)(p[5]) << 16) + ((uint64_t)(p[4]) << 24) +
((uint64_t)(p[3]) << 32) + ((uint64_t)(p[2]) << 40) +
((uint64_t)(p[1]) << 48) + ((uint64_t)(p[0]) << 56));
}
static void
be64enc(void *pp, uint64_t x)
{
uint8_t * p = (uint8_t *)pp;
p[7] = x & 0xff;
p[6] = (x >> 8) & 0xff;
p[5] = (x >> 16) & 0xff;
p[4] = (x >> 24) & 0xff;
p[3] = (x >> 32) & 0xff;
p[2] = (x >> 40) & 0xff;
p[1] = (x >> 48) & 0xff;
p[0] = (x >> 56) & 0xff;
}
static uint64_t
le64dec(const void *pp)
{
const uint8_t *p = (uint8_t const *)pp;
return ((uint64_t)(p[0]) + ((uint64_t)(p[1]) << 8) +
((uint64_t)(p[2]) << 16) + ((uint64_t)(p[3]) << 24) +
((uint64_t)(p[4]) << 32) + ((uint64_t)(p[5]) << 40) +
((uint64_t)(p[6]) << 48) + ((uint64_t)(p[7]) << 56));
}
static void
le64enc(void *pp, uint64_t x)
{
uint8_t * p = (uint8_t *)pp;
p[0] = x & 0xff;
p[1] = (x >> 8) & 0xff;
p[2] = (x >> 16) & 0xff;
p[3] = (x >> 24) & 0xff;
p[4] = (x >> 32) & 0xff;
p[5] = (x >> 40) & 0xff;
p[6] = (x >> 48) & 0xff;
p[7] = (x >> 56) & 0xff;
}
#endif /* !HAVE_SYS_ENDIAN_H */
#endif /* !_SYSENDIAN_H_ */
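/* Quick illustrative round-trip for the helpers above (hypothetical self-test;
 * never compiled because SYSENDIAN_SELFTEST is not defined anywhere): */
#ifdef SYSENDIAN_SELFTEST
#include <assert.h>
static void sysendian_selftest(void)
{
    uint8_t buf[8];
    be32enc(buf, 0x11223344);            /* stores 11 22 33 44 */
    assert(be32dec(buf) == 0x11223344);
    le64enc(buf, 0x0102030405060708ULL); /* stores 08 07 06 05 04 03 02 01 */
    assert(le64dec(buf) == 0x0102030405060708ULL);
}
#endif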

1364
algorithm/yescrypt-opt.c

File diff suppressed because it is too large

128
algorithm/yescrypt.c

@@ -0,0 +1,128 @@
/*-
* Copyright 2015 djm34
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "config.h"
#include "miner.h"
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include "algorithm/yescrypt_core.h"
static const uint32_t diff1targ = 0x0000ffff;
static inline void
be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
{
uint32_t i;
for (i = 0; i < len; i++)
dst[i] = htobe32(src[i]);
}
/* Used externally as confirmation of correct OCL code */
int yescrypt_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce)
{
uint32_t tmp_hash7, Htarg = le32toh(((const uint32_t *)ptarget)[7]);
uint32_t data[20], ohash[8];
be32enc_vect(data, (const uint32_t *)pdata, 19);
data[19] = htobe32(nonce);
yescrypt_hash((unsigned char*)data,(unsigned char*)ohash);
tmp_hash7 = be32toh(ohash[7]);
applog(LOG_DEBUG, "htarget %08lx diff1 %08lx hash %08lx",
(long unsigned int)Htarg,
(long unsigned int)diff1targ,
(long unsigned int)tmp_hash7);
if (tmp_hash7 > diff1targ)
return -1;
if (tmp_hash7 > Htarg)
return 0;
return 1;
}
void yescrypt_regenhash(struct work *work)
{
uint32_t data[20];
uint32_t *nonce = (uint32_t *)(work->data + 76);
uint32_t *ohash = (uint32_t *)(work->hash);
be32enc_vect(data, (const uint32_t *)work->data, 19);
data[19] = htobe32(*nonce);
yescrypt_hash((unsigned char*)data, (unsigned char*)ohash);
}
bool scanhash_yescrypt(struct thr_info *thr, const unsigned char __maybe_unused *pmidstate,
unsigned char *pdata, unsigned char __maybe_unused *phash1,
unsigned char __maybe_unused *phash, const unsigned char *ptarget,
uint32_t max_nonce, uint32_t *last_nonce, uint32_t n)
{
uint32_t *nonce = (uint32_t *)(pdata + 76);
uint32_t data[20];
uint32_t tmp_hash7;
uint32_t Htarg = le32toh(((const uint32_t *)ptarget)[7]);
bool ret = false;
be32enc_vect(data, (const uint32_t *)pdata, 19);
while (1)
{
uint32_t ostate[8];
*nonce = ++n;
data[19] = (n);
yescrypt_hash((unsigned char*)data, (unsigned char*)ostate);
tmp_hash7 = (ostate[7]);
applog(LOG_INFO, "data7 %08lx", (long unsigned int)data[7]);
if (unlikely(tmp_hash7 <= Htarg))
{
((uint32_t *)pdata)[19] = htobe32(n);
*last_nonce = n;
ret = true;
break;
}
if (unlikely((n >= max_nonce) || thr->work_restart))
{
*last_nonce = n;
break;
}
}
return ret;
}
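/* Illustrative sketch of how a caller might interpret yescrypt_test() above
 * (the helper below is hypothetical and not part of this commit): -1 means the
 * hash misses even the diff-1 target (likely a hardware/kernel error), 0 means
 * it meets diff 1 but not the share target, and 1 means the share is good. */
static void check_gpu_nonce_sketch(unsigned char *pdata,
    const unsigned char *ptarget, uint32_t nonce)
{
    int rc = yescrypt_test(pdata, ptarget, nonce);
    if (rc < 0)
        applog(LOG_WARNING, "GPU nonce failed the diff-1 check");
    else if (rc == 0)
        applog(LOG_DEBUG, "nonce meets diff 1 but not the share target");
    /* rc == 1: the hash is at or below the share target */
}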

10
algorithm/yescrypt.h

@@ -0,0 +1,10 @@
#ifndef YESCRYPT_H
#define YESCRYPT_H
#include "miner.h"
#define YESCRYPT_SCRATCHBUF_SIZE (128 * 2048 * 8 ) //uchar
#define YESCRYP_SECBUF_SIZE (128*64*8)
extern int yescrypt_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce);
extern void yescrypt_regenhash(struct work *work);
#endif /* YESCRYPT_H */
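/* For reference: with the N = 2048, r = 8 parameters hard-coded for this
 * algorithm (see yescrypt_hash() in yescryptcommon.c), the scrypt-family
 * scratch requirement is 128 * r * N = 128 * 8 * 2048 = 2,097,152 bytes,
 * which is what YESCRYPT_SCRATCHBUF_SIZE above provides per hash (an
 * observation based on this commit's parameters, not an additional constant). */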

376
algorithm/yescrypt_core.h

@@ -0,0 +1,376 @@
/*-
* Copyright 2009 Colin Percival
* Copyright 2013,2014 Alexander Peslyak
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* This file was originally written by Colin Percival as part of the Tarsnap
* online backup system.
*/
#ifndef _YESCRYPT_H_
#define _YESCRYPT_H_
#include <stdint.h>
#include <stdlib.h> /* for size_t */
#include <errno.h>
#ifdef __cplusplus
extern "C" {
#endif
//extern void yescrypt_hash_sp(const unsigned char *input, unsigned char *output);
extern void yescrypt_hash(const unsigned char *input, unsigned char *output);
/**
* crypto_scrypt(passwd, passwdlen, salt, saltlen, N, r, p, buf, buflen):
* Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r,
* p, buflen) and write the result into buf. The parameters r, p, and buflen
* must satisfy r * p < 2^30 and buflen <= (2^32 - 1) * 32. The parameter N
* must be a power of 2 greater than 1.
*
* Return 0 on success; or -1 on error.
*
* MT-safe as long as buf is local to the thread.
*/
extern int crypto_scrypt(const uint8_t * __passwd, size_t __passwdlen,
const uint8_t * __salt, size_t __saltlen,
uint64_t __N, uint32_t __r, uint32_t __p,
uint8_t * __buf, size_t __buflen);
/**
* Internal type used by the memory allocator. Please do not use it directly.
* Use yescrypt_shared_t and yescrypt_local_t as appropriate instead, since
* they might differ from each other in a future version.
*/
typedef struct {
void * base, * aligned;
size_t base_size, aligned_size;
} yescrypt_region_t;
/**
* Types for shared (ROM) and thread-local (RAM) data structures.
*/
typedef yescrypt_region_t yescrypt_shared1_t;
typedef struct {
yescrypt_shared1_t shared1;
uint32_t mask1;
} yescrypt_shared_t;
typedef yescrypt_region_t yescrypt_local_t;
/**
* Possible values for yescrypt_init_shared()'s flags argument.
*/
typedef enum {
YESCRYPT_SHARED_DEFAULTS = 0,
YESCRYPT_SHARED_PREALLOCATED = 0x100
} yescrypt_init_shared_flags_t;
/**
* Possible values for the flags argument of yescrypt_kdf(),
* yescrypt_gensalt_r(), yescrypt_gensalt(). These may be OR'ed together,
* except that YESCRYPT_WORM and YESCRYPT_RW are mutually exclusive.
* Please refer to the description of yescrypt_kdf() below for the meaning of
* these flags.
*/
typedef enum {
/* public */
YESCRYPT_WORM = 0,
YESCRYPT_RW = 1,
YESCRYPT_PARALLEL_SMIX = 2,
YESCRYPT_PWXFORM = 4,
/* private */
__YESCRYPT_INIT_SHARED_1 = 0x10000,
__YESCRYPT_INIT_SHARED_2 = 0x20000,
__YESCRYPT_INIT_SHARED = 0x30000
} yescrypt_flags_t;
#define YESCRYPT_KNOWN_FLAGS \
(YESCRYPT_RW | YESCRYPT_PARALLEL_SMIX | YESCRYPT_PWXFORM | \
__YESCRYPT_INIT_SHARED)
/**
* yescrypt_init_shared(shared, param, paramlen, N, r, p, flags, mask,
* buf, buflen):
* Optionally allocate memory for and initialize the shared (ROM) data
* structure. The parameters N, r, and p must satisfy the same conditions as
* with crypto_scrypt(). param and paramlen specify a local parameter with
* which the ROM is seeded. If buf is not NULL, then it is used to return
* buflen bytes of message digest for the initialized ROM (the caller may use
* this to verify that the ROM has been computed in the same way that it was on
* a previous run).
*
* Return 0 on success; or -1 on error.
*
* If bit YESCRYPT_SHARED_PREALLOCATED in flags is set, then memory for the
* ROM is assumed to have been preallocated by the caller, with
* shared->shared1.aligned being the start address of the ROM and
* shared->shared1.aligned_size being its size (which must be consistent with
* N, r, and p). This may be used e.g. when the ROM is to be placed in a SysV
* shared memory segment allocated by the caller.
*
* mask controls the frequency of ROM accesses by yescrypt_kdf(). Normally it
* should be set to 1, to interleave RAM and ROM accesses, which works well
* when both regions reside in the machine's RAM anyway. Other values may be
* used e.g. when the ROM is memory-mapped from a disk file. Recommended mask
* values are powers of 2 minus 1 or minus 2. Here's the effect of some mask
* values:
* mask value    ROM accesses in SMix 1st loop    ROM accesses in SMix 2nd loop
* 0             0                                1/2
* 1             1/2                              1/2
* 2             0                                1/4
* 3             1/4                              1/4
* 6             0                                1/8
* 7             1/8                              1/8
* 14            0                                1/16
* 15            1/16                             1/16
* 1022          0                                1/1024
* 1023          1/1024                           1/1024
*
* Actual computation of the ROM contents may be avoided, if you don't intend
* to use a ROM but need a dummy shared structure, by calling this function
* with NULL, 0, 0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0 for the
* arguments starting with param and on.
*
* MT-safe as long as shared is local to the thread.
*/
extern int yescrypt_init_shared(yescrypt_shared_t * __shared,
const uint8_t * __param, size_t __paramlen,
uint64_t __N, uint32_t __r, uint32_t __p,
yescrypt_init_shared_flags_t __flags, uint32_t __mask,
uint8_t * __buf, size_t __buflen);
/**
* yescrypt_free_shared(shared):
* Free memory that had been allocated with yescrypt_init_shared().
*
* Return 0 on success; or -1 on error.
*
* MT-safe as long as shared is local to the thread.
*/
extern int yescrypt_free_shared(yescrypt_shared_t * __shared);
/**
* yescrypt_init_local(local):
* Initialize the thread-local (RAM) data structure. Actual memory allocation
* is currently fully postponed until a call to yescrypt_kdf() or yescrypt_r().
*
* Return 0 on success; or -1 on error.
*
* MT-safe as long as local is local to the thread.
*/
extern int yescrypt_init_local(yescrypt_local_t * __local);
/**
* yescrypt_free_local(local):
* Free memory that may have been allocated for an initialized thread-local
* (RAM) data structure.
*
* Return 0 on success; or -1 on error.
*
* MT-safe as long as local is local to the thread.
*/
extern int yescrypt_free_local(yescrypt_local_t * __local);
/**
* yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen,
* N, r, p, t, flags, buf, buflen):
* Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r,
* p, buflen), or a revision of scrypt as requested by flags and shared, and
* write the result into buf. The parameters N, r, p, and buflen must satisfy
* the same conditions as with crypto_scrypt(). t controls computation time
* while not affecting peak memory usage. shared and flags may request
* special modes as described below. local is the thread-local data
* structure, which allows a memory allocation to be preserved and reused across calls,
* thereby reducing its overhead.
*
* Return 0 on success; or -1 on error.
*
* t controls computation time. t = 0 is optimal in terms of achieving the
* highest area-time for ASIC attackers. Thus, higher computation time, if
* affordable, is best achieved by increasing N rather than by increasing t.
* However, if the higher memory usage (which goes along with higher N) is not
* affordable, or if fine-tuning of the time is needed (recall that N must be a
* power of 2), then t = 1 or above may be used to increase time while staying
* at the same peak memory usage. t = 1 increases the time by 25% and
* decreases the normalized area-time to 96% of optimal. (Of course, in
* absolute terms the area-time increases with higher t. It's just that it
* would increase slightly more with higher N*r rather than with higher t.)
* t = 2 increases the time by another 20% and decreases the normalized
* area-time to 89% of optimal. Thus, these two values are reasonable to use
* for fine-tuning. Values of t higher than 2 result in further increase in
* time while reducing the efficiency much further (e.g., down to around 50% of
* optimal for t = 5, which runs 3 to 4 times slower than t = 0, with exact
* numbers varying by the flags settings).
*
* Classic scrypt is available by setting t = 0 and flags to YESCRYPT_WORM and
* passing a dummy shared structure (see the description of
* yescrypt_init_shared() above for how to produce one). In this mode, the
* thread-local memory region (RAM) is first sequentially written to and then
* randomly read from. This algorithm is friendly towards time-memory
* tradeoffs (TMTO), available both to defenders (albeit not in this
* implementation) and to attackers.
*
* Setting YESCRYPT_RW adds extra random reads and writes to the thread-local
* memory region (RAM), which makes TMTO a lot less efficient. This may be
* used to slow down the kinds of attackers who would otherwise benefit from
* classic scrypt's efficient TMTO. Since classic scrypt's TMTO allows not
* only for the tradeoff, but also for a decrease of attacker's area-time (by
* up to a constant factor), setting YESCRYPT_RW substantially increases the
* cost of attacks in area-time terms as well. Yet another benefit of it is
* that optimal area-time is reached at an earlier time than with classic
* scrypt, and t = 0 actually corresponds to this earlier completion time,
* resulting in quicker hash computations (and thus in higher request rate
* capacity). Due to these properties, YESCRYPT_RW should almost always be
* set, except when compatibility with classic scrypt or TMTO-friendliness are
* desired.
*
* YESCRYPT_PARALLEL_SMIX moves parallelism that is present with p > 1 to a
* lower level as compared to where it is in classic scrypt. This reduces
* flexibility for efficient computation (for both attackers and defenders) by
* requiring that, short of resorting to TMTO, the full amount of memory be
* allocated as needed for the specified p, regardless of whether that
* parallelism is actually being fully made use of or not. (For comparison, a
* single instance of classic scrypt may be computed in less memory without any
* CPU time overhead, but in more real time, by not making full use of the
* parallelism.) This may be desirable when the defender has enough memory
* with sufficiently low latency and high bandwidth for efficient full parallel
* execution, yet the required memory size is high enough that some likely
* attackers might end up being forced to choose between using higher latency
* memory than they could use otherwise (waiting for data longer) or using TMTO
* (waiting for data more times per one hash computation). The area-time cost
* for other kinds of attackers (who would use the same memory type and TMTO
* factor or no TMTO either way) remains roughly the same, given the same
* running time for the defender. In the TMTO-friendly YESCRYPT_WORM mode, as
* long as the defender has enough memory that is just as fast as the smaller
* per-thread regions would be, doesn't expect to ever need greater
* flexibility (except possibly via TMTO), and doesn't need backwards
* compatibility with classic scrypt, there are no other serious drawbacks to
* this setting. In the YESCRYPT_RW mode, which is meant to discourage TMTO,
* this new approach to parallelization makes TMTO less inefficient. (This is
* an unfortunate side-effect of avoiding some random writes, as we have to in
* order to allow for parallel threads to access a common memory region without
* synchronization overhead.) Thus, in this mode this setting poses an extra
* tradeoff of its own (higher area-time cost for a subset of attackers vs.
* better TMTO resistance). Setting YESCRYPT_PARALLEL_SMIX also changes the
* way the running time is to be controlled from N*r*p (for classic scrypt) to
* N*r (in this modification). All of this applies only when p > 1. For
* p = 1, this setting is a no-op.
*
* Passing a real shared structure, with ROM contents previously computed by
* yescrypt_init_shared(), enables the use of ROM and requires YESCRYPT_RW for
* the thread-local RAM region. In order to allow for initialization of the
* ROM to be split into a separate program, the shared->shared1.aligned and
* shared->shared1.aligned_size fields may be set by the caller of
* yescrypt_kdf() manually rather than with yescrypt_init_shared().
*
* local must be initialized with yescrypt_init_local().
*
* MT-safe as long as local and buf are local to the thread.
*/
extern int yescrypt_kdf(const yescrypt_shared_t * __shared,
yescrypt_local_t * __local,
const uint8_t * __passwd, size_t __passwdlen,
const uint8_t * __salt, size_t __saltlen,
uint64_t __N, uint32_t __r, uint32_t __p, uint32_t __t,
yescrypt_flags_t __flags,
uint8_t * __buf, size_t __buflen);
/**
* yescrypt_r(shared, local, passwd, passwdlen, setting, buf, buflen):
* Compute and encode an scrypt or enhanced scrypt hash of passwd given the
* parameters and salt value encoded in setting. If the shared structure is
* not dummy, a ROM is used and YESCRYPT_RW is required. Otherwise, whether to
* use the YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff
* discouraging modification) is determined by the setting string. shared and
* local must be initialized as described above for yescrypt_kdf(). buf must
* be large enough (as indicated by buflen) to hold the encoded hash string.
*
* Return the encoded hash string on success; or NULL on error.
*
* MT-safe as long as local and buf are local to the thread.
*/
extern uint8_t * yescrypt_r(const yescrypt_shared_t * __shared,
yescrypt_local_t * __local,
const uint8_t * __passwd, size_t __passwdlen,
const uint8_t * __setting,
uint8_t * __buf, size_t __buflen);
/**
* yescrypt(passwd, setting):
* Compute and encode an scrypt or enhanced scrypt hash of passwd given the
* parameters and salt value encoded in setting. Whether to use the
* YESCRYPT_WORM (classic scrypt) or YESCRYPT_RW (time-memory tradeoff
* discouraging modification) is determined by the setting string.
*
* Return the encoded hash string on success; or NULL on error.
*
* This is a crypt(3)-like interface, which is simpler to use than
* yescrypt_r(), but it is not MT-safe, it does not allow for the use of a ROM,
* and it is slower than yescrypt_r() for repeated calls because it allocates
* and frees memory on each call.
*
* MT-unsafe.
*/
extern uint8_t * yescrypt(const uint8_t * __passwd, const uint8_t * __setting);
/**
* yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen, buf, buflen):
* Generate a setting string for use with yescrypt_r() and yescrypt() by
* encoding into it the parameters N_log2 (which is to be set to base 2
* logarithm of the desired value for N), r, p, flags, and a salt given by src
* (of srclen bytes). buf must be large enough (as indicated by buflen) to
* hold the setting string.
*
* Return the setting string on success; or NULL on error.
*
* MT-safe as long as buf is local to the thread.
*/
extern uint8_t * yescrypt_gensalt_r(
uint32_t __N_log2, uint32_t __r, uint32_t __p,
yescrypt_flags_t __flags,
const uint8_t * __src, size_t __srclen,
uint8_t * __buf, size_t __buflen);
/**
* yescrypt_gensalt(N_log2, r, p, flags, src, srclen):
* Generate a setting string for use with yescrypt_r() and yescrypt(). This
* function is the same as yescrypt_gensalt_r() except that it uses a static
* buffer and thus is not MT-safe.
*
* Return the setting string on success; or NULL on error.
*
* MT-unsafe.
*/
extern uint8_t * yescrypt_gensalt(
uint32_t __N_log2, uint32_t __r, uint32_t __p,
yescrypt_flags_t __flags,
const uint8_t * __src, size_t __srclen);
#ifdef __cplusplus
}
#endif
#endif /* !_YESCRYPT_H_ */
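/* Minimal usage sketch of the API documented above (illustrative only; the
 * function name and the password/salt/parameter values are examples, not taken
 * from this commit). It follows the "dummy shared structure" pattern described
 * for yescrypt_init_shared(): */
static int yescrypt_kdf_example(uint8_t dk[32])
{
    yescrypt_shared_t shared;
    yescrypt_local_t local;
    int rc;

    if (yescrypt_init_shared(&shared, NULL, 0,
        0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0))
        return -1;
    if (yescrypt_init_local(&local)) {
        yescrypt_free_shared(&shared);
        return -1;
    }
    /* N = 2048, r = 8, p = 1, t = 0; YESCRYPT_RW as recommended above, and
     * PWXFORM to match the YESCRYPT_FLAGS used in yescryptcommon.c. */
    rc = yescrypt_kdf(&shared, &local,
        (const uint8_t *)"password", 8, (const uint8_t *)"salt", 4,
        2048, 8, 1, 0, YESCRYPT_RW | YESCRYPT_PWXFORM, dk, 32);
    yescrypt_free_local(&local);
    yescrypt_free_shared(&shared);
    return rc;
}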

360
algorithm/yescryptcommon.c

@@ -0,0 +1,360 @@
/*-
* Copyright 2013,2014 Alexander Peslyak
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "algorithm/yescrypt_core.h"
#define BYTES2CHARS(bytes) \
((((bytes) * 8) + 5) / 6)
#define HASH_SIZE 32 /* bytes */
#define HASH_LEN BYTES2CHARS(HASH_SIZE) /* base-64 chars */
#define YESCRYPT_FLAGS (YESCRYPT_RW | YESCRYPT_PWXFORM)
static const char * const itoa64 =
"./0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
static uint8_t * encode64_uint32(uint8_t * dst, size_t dstlen,
uint32_t src, uint32_t srcbits)
{
uint32_t bit;
for (bit = 0; bit < srcbits; bit += 6) {
if (dstlen < 1)
return NULL;
*dst++ = itoa64[src & 0x3f];
dstlen--;
src >>= 6;
}
return dst;
}
static uint8_t * encode64(uint8_t * dst, size_t dstlen,
const uint8_t * src, size_t srclen)
{
size_t i;
for (i = 0; i < srclen; ) {
uint8_t * dnext;
uint32_t value = 0, bits = 0;
do {
value |= (uint32_t)src[i++] << bits;
bits += 8;
} while (bits < 24 && i < srclen);
dnext = encode64_uint32(dst, dstlen, value, bits);
if (!dnext)
return NULL;
dstlen -= dnext - dst;
dst = dnext;
}
return dst;
}
static int decode64_one(uint32_t * dst, uint8_t src)
{
const char * ptr = strchr(itoa64, src);
if (ptr) {
*dst = ptr - itoa64;
return 0;
}
*dst = 0;
return -1;
}
static const uint8_t * decode64_uint32(uint32_t * dst, uint32_t dstbits,
const uint8_t * src)
{
uint32_t bit;
uint32_t value;
value = 0;
for (bit = 0; bit < dstbits; bit += 6) {
uint32_t one;
if (decode64_one(&one, *src)) {
*dst = 0;
return NULL;
}
src++;
value |= one << bit;
}
*dst = value;
return src;
}
uint8_t *
yescrypt_r(const yescrypt_shared_t * shared, yescrypt_local_t * local,
const uint8_t * passwd, size_t passwdlen,
const uint8_t * setting,
uint8_t * buf, size_t buflen)
{
uint8_t hash[HASH_SIZE];
const uint8_t * src, * salt;
uint8_t * dst;
size_t prefixlen, saltlen, need;
uint8_t version;
uint64_t N;
uint32_t r, p;
yescrypt_flags_t flags = YESCRYPT_WORM;
fflush(stdout);
if (setting[0] != '$' || setting[1] != '7')
{
fflush(stdout);
return NULL;
}
fflush(stdout);
src = setting + 2;
fflush(stdout);
switch ((version = *src)) {
case '$':
fflush(stdout);
break;
case 'X':
src++;
flags = YESCRYPT_RW;
fflush(stdout);
break;
default:
{
fflush(stdout);
return NULL;
}
}
fflush(stdout);
if (*src != '$') {
uint32_t decoded_flags;
if (decode64_one(&decoded_flags, *src))
{
fflush(stdout);
return NULL;
}
flags = decoded_flags;
if (*++src != '$')
{
fflush(stdout);
return NULL;
}
}
src++;
{
uint32_t N_log2;
if (decode64_one(&N_log2, *src))
{
return NULL;
}
src++;
N = (uint64_t)1 << N_log2;
}
src = decode64_uint32(&r, 30, src);
if (!src)
{
return NULL;
}
src = decode64_uint32(&p, 30, src);
if (!src)
{
return NULL;
}
prefixlen = src - setting;
salt = src;
src = (uint8_t *)strrchr((char *)salt, '$');
if (src)
saltlen = src - salt;
else
saltlen = strlen((char *)salt);
need = prefixlen + saltlen + 1 + HASH_LEN + 1;
if (need > buflen || need < saltlen)
{
fflush(stdout);
return NULL;
}
fflush(stdout);
if (yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen,
N, r, p, 0, flags, hash, sizeof(hash)))
{
fflush(stdout);
return NULL;
}
dst = buf;
memcpy(dst, setting, prefixlen + saltlen);
dst += prefixlen + saltlen;
*dst++ = '$';
dst = encode64(dst, buflen - (dst - buf), hash, sizeof(hash));
/* Could zeroize hash[] here, but yescrypt_kdf() doesn't zeroize its
* memory allocations yet anyway. */
if (!dst || dst >= buf + buflen) /* Can't happen */
{
return NULL;
}
*dst = 0; /* NUL termination */
fflush(stdout);
return buf;
}
uint8_t *
yescrypt(const uint8_t * passwd, const uint8_t * setting)
{
static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1 + HASH_LEN + 1];
yescrypt_shared_t shared;
yescrypt_local_t local;
uint8_t * retval;
if (yescrypt_init_shared(&shared, NULL, 0,
0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0))
return NULL;
if (yescrypt_init_local(&local)) {
yescrypt_free_shared(&shared);
return NULL;
}
retval = yescrypt_r(&shared, &local,
passwd, 80, setting, buf, sizeof(buf));
// printf("hashse='%s'\n", (char *)retval);
if (yescrypt_free_local(&local)) {
yescrypt_free_shared(&shared);
return NULL;
}
if (yescrypt_free_shared(&shared))
return NULL;
return retval;
}
uint8_t *
yescrypt_gensalt_r(uint32_t N_log2, uint32_t r, uint32_t p,
yescrypt_flags_t flags,
const uint8_t * src, size_t srclen,
uint8_t * buf, size_t buflen)
{
uint8_t * dst;
size_t prefixlen = 3 + 1 + 5 + 5;
size_t saltlen = BYTES2CHARS(srclen);
size_t need;
if (p == 1)
flags &= ~YESCRYPT_PARALLEL_SMIX;
if (flags) {
if (flags & ~0x3f)
return NULL;
prefixlen++;
if (flags != YESCRYPT_RW)
prefixlen++;
}
need = prefixlen + saltlen + 1;
if (need > buflen || need < saltlen || saltlen < srclen)
return NULL;
if (N_log2 > 63 || ((uint64_t)r * (uint64_t)p >= (1U << 30)))
return NULL;
dst = buf;
*dst++ = '$';
*dst++ = '7';
if (flags) {
*dst++ = 'X'; /* eXperimental, subject to change */
if (flags != YESCRYPT_RW)
*dst++ = itoa64[flags];
}
*dst++ = '$';
*dst++ = itoa64[N_log2];
dst = encode64_uint32(dst, buflen - (dst - buf), r, 30);
if (!dst) /* Can't happen */
return NULL;
dst = encode64_uint32(dst, buflen - (dst - buf), p, 30);
if (!dst) /* Can't happen */
return NULL;
dst = encode64(dst, buflen - (dst - buf), src, srclen);
if (!dst || dst >= buf + buflen) /* Can't happen */
return NULL;
*dst = 0; /* NUL termination */
return buf;
}
uint8_t *
yescrypt_gensalt(uint32_t N_log2, uint32_t r, uint32_t p,
yescrypt_flags_t flags,
const uint8_t * src, size_t srclen)
{
static uint8_t buf[4 + 1 + 5 + 5 + BYTES2CHARS(32) + 1];
return yescrypt_gensalt_r(N_log2, r, p, flags, src, srclen,
buf, sizeof(buf));
}
static int
yescrypt_bsty(const uint8_t * passwd, size_t passwdlen,
const uint8_t * salt, size_t saltlen, uint64_t N, uint32_t r, uint32_t p,
uint8_t * buf, size_t buflen)
{
static __thread int initialized = 0;
static __thread yescrypt_shared_t shared;
static __thread yescrypt_local_t local;
// static __declspec(thread) int initialized = 0;
// static __declspec(thread) yescrypt_shared_t shared;
// static __declspec(thread) yescrypt_local_t local;
int retval;
if (!initialized) {
/* "shared" could in fact be shared, but it's simpler to keep it private
* along with "local". It's dummy and tiny anyway. */
if (yescrypt_init_shared(&shared, NULL, 0,
0, 0, 0, YESCRYPT_SHARED_DEFAULTS, 0, NULL, 0))
return -1;
if (yescrypt_init_local(&local)) {
yescrypt_free_shared(&shared);
return -1;
}
initialized = 1;
}
retval = yescrypt_kdf(&shared, &local,
passwd, passwdlen, salt, saltlen, N, r, p, 0, YESCRYPT_FLAGS,
buf, buflen);
return retval;
}
void yescrypt_hash(const unsigned char *input, unsigned char *output)
{
yescrypt_bsty((const uint8_t *)input, 80, (const uint8_t *) input, 80, 2048, 8, 1, (uint8_t *)output, 32);
}
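/* Illustrative use of the setting-string helpers above (hypothetical example;
 * the values are not taken from this commit). Note that in this file yescrypt()
 * passes a fixed passwdlen of 80, so the password buffer must be at least
 * 80 bytes: */
static void yescrypt_string_example(void)
{
    uint8_t header[80] = { 0 };    /* e.g. an 80-byte block header */
    uint8_t saltbytes[16] = { 0 }; /* example salt material */
    uint8_t *setting, *hash;

    setting = yescrypt_gensalt(11, 8, 1, /* N = 2^11, r = 8, p = 1 */
        YESCRYPT_RW | YESCRYPT_PWXFORM, saltbytes, sizeof(saltbytes));
    if (!setting)
        return;
    hash = yescrypt(header, setting); /* NUL-terminated "$7X..." string, or NULL */
    (void)hash;
}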

13
driver-opencl.c

@@ -257,14 +257,14 @@ char *set_gpu_threads(const char *_arg)
if (nextptr == NULL)
return "Invalid parameters for set_gpu_threads";
val = atoi(nextptr);
if (val < 1 || val > 10)
if (val < 1 || val > 20) // gpu_threads: max value raised to 20
return "Invalid value passed to set_gpu_threads";
gpus[device++].threads = val;
while ((nextptr = strtok(NULL, ",")) != NULL) {
val = atoi(nextptr);
if (val < 1 || val > 10)
if (val < 1 || val > 20) // gpu_threads: max value raised to 20
return "Invalid value passed to set_gpu_threads";
gpus[device++].threads = val;
@@ -1472,6 +1472,9 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
}
applog(LOG_DEBUG, "GPU %d found something?", gpu->device_id);
postcalc_hash_async(thr, work, thrdata->res);
// postcalc_hash(thr);
// submit_tested_work(thr, work);
// submit_work_async(work);
memset(thrdata->res, 0, buffersize);
/* This finish flushes the writebuffer set with CL_FALSE in clEnqueueWriteBuffer */
clFinish(clState->commandQueue);
@@ -1493,6 +1496,12 @@ static void opencl_thread_shutdown(struct thr_info *thr)
clFinish(clState->commandQueue);
clReleaseMemObject(clState->outputBuffer);
clReleaseMemObject(clState->CLbuffer0);
if (clState->buffer1)
clReleaseMemObject(clState->buffer1);
if (clState->buffer2)
clReleaseMemObject(clState->buffer2);
if (clState->buffer3)
clReleaseMemObject(clState->buffer3);
if (clState->padbuffer8)
clReleaseMemObject(clState->padbuffer8);
clReleaseKernel(clState->kernel);

6
findnonce.c

@@ -214,6 +214,7 @@ static void *postcalc_hash(void *userdata)
void postcalc_hash_async(struct thr_info *thr, struct work *work, uint32_t *res)
{
struct pc_data *pcd = (struct pc_data *)malloc(sizeof(struct pc_data));
int buffersize;
@@ -225,8 +226,7 @@ void postcalc_hash_async(struct thr_info *thr, struct work *work, uint32_t *res)
pcd->thr = thr;
pcd->work = copy_work(work);
buffersize = BUFFERSIZE;
memcpy(&pcd->res, res, buffersize);
memcpy(&pcd->res, res, buffersize);
if (pthread_create(&pcd->pth, NULL, postcalc_hash, (void *)pcd)) {
applog(LOG_ERR, "Failed to create postcalc_hash thread");
@@ -366,4 +366,4 @@ void precalc_hash_blake256(dev_blk_ctx *blk, uint32_t *state, uint32_t *data)
blk->cty_a = data[16];
blk->cty_b = data[17];
blk->cty_c = data[18];
}
}

162
kernel/bmw256.cl

@@ -0,0 +1,162 @@
/*
* bmw256 kernel implementation.
*
* ==========================(LICENSE BEGIN)============================
* Copyright (c) 2015 djm34
*
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author djm34
*/
#define shl(x, n) ((x) << (n))
#define shr(x, n) ((x) >> (n))
//#define SHR(x, n) SHR2(x, n)
//#define SHL(x, n) SHL2(x, n)
#define SPH_ROTL32(x,n) rotate(x,(uint)n)
#define ss0(x) (shr((x), 1) ^ shl((x), 3) ^ SPH_ROTL32((x), 4) ^ SPH_ROTL32((x), 19))
#define ss1(x) (shr((x), 1) ^ shl((x), 2) ^ SPH_ROTL32((x), 8) ^ SPH_ROTL32((x), 23))
#define ss2(x) (shr((x), 2) ^ shl((x), 1) ^ SPH_ROTL32((x), 12) ^ SPH_ROTL32((x), 25))
#define ss3(x) (shr((x), 2) ^ shl((x), 2) ^ SPH_ROTL32((x), 15) ^ SPH_ROTL32((x), 29))
#define ss4(x) (shr((x), 1) ^ (x))
#define ss5(x) (shr((x), 2) ^ (x))
#define rs1(x) SPH_ROTL32((x), 3)
#define rs2(x) SPH_ROTL32((x), 7)
#define rs3(x) SPH_ROTL32((x), 13)
#define rs4(x) SPH_ROTL32((x), 16)
#define rs5(x) SPH_ROTL32((x), 19)
#define rs6(x) SPH_ROTL32((x), 23)
#define rs7(x) SPH_ROTL32((x), 27)
/* Message expansion function 1 */
uint expand32_1(int i, uint *M32, uint *H, uint *Q)
{
return (ss1(Q[i - 16]) + ss2(Q[i - 15]) + ss3(Q[i - 14]) + ss0(Q[i - 13])
+ ss1(Q[i - 12]) + ss2(Q[i - 11]) + ss3(Q[i - 10]) + ss0(Q[i - 9])
+ ss1(Q[i - 8]) + ss2(Q[i - 7]) + ss3(Q[i - 6]) + ss0(Q[i - 5])
+ ss1(Q[i - 4]) + ss2(Q[i - 3]) + ss3(Q[i - 2]) + ss0(Q[i - 1])
+ ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16]));
}
/* Message expansion function 2 */
uint expand32_2(int i, uint *M32, uint *H, uint *Q)
{
return (Q[i - 16] + rs1(Q[i - 15]) + Q[i - 14] + rs2(Q[i - 13])
+ Q[i - 12] + rs3(Q[i - 11]) + Q[i - 10] + rs4(Q[i - 9])
+ Q[i - 8] + rs5(Q[i - 7]) + Q[i - 6] + rs6(Q[i - 5])
+ Q[i - 4] + rs7(Q[i - 3]) + ss4(Q[i - 2]) + ss5(Q[i - 1])
+ ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16]));
}
void Compression256(uint *M32, uint *H)
{
int i;
uint XL32, XH32, Q[32];
Q[0] = (M32[5] ^ H[5]) - (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[13] ^ H[13]) + (M32[14] ^ H[14]);
Q[1] = (M32[6] ^ H[6]) - (M32[8] ^ H[8]) + (M32[11] ^ H[11]) + (M32[14] ^ H[14]) - (M32[15] ^ H[15]);
Q[2] = (M32[0] ^ H[0]) + (M32[7] ^ H[7]) + (M32[9] ^ H[9]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]);
Q[3] = (M32[0] ^ H[0]) - (M32[1] ^ H[1]) + (M32[8] ^ H[8]) - (M32[10] ^ H[10]) + (M32[13] ^ H[13]);
Q[4] = (M32[1] ^ H[1]) + (M32[2] ^ H[2]) + (M32[9] ^ H[9]) - (M32[11] ^ H[11]) - (M32[14] ^ H[14]);
Q[5] = (M32[3] ^ H[3]) - (M32[2] ^ H[2]) + (M32[10] ^ H[10]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]);
Q[6] = (M32[4] ^ H[4]) - (M32[0] ^ H[0]) - (M32[3] ^ H[3]) - (M32[11] ^ H[11]) + (M32[13] ^ H[13]);
Q[7] = (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[5] ^ H[5]) - (M32[12] ^ H[12]) - (M32[14] ^ H[14]);
Q[8] = (M32[2] ^ H[2]) - (M32[5] ^ H[5]) - (M32[6] ^ H[6]) + (M32[13] ^ H[13]) - (M32[15] ^ H[15]);
Q[9] = (M32[0] ^ H[0]) - (M32[3] ^ H[3]) + (M32[6] ^ H[6]) - (M32[7] ^ H[7]) + (M32[14] ^ H[14]);
Q[10] = (M32[8] ^ H[8]) - (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[7] ^ H[7]) + (M32[15] ^ H[15]);
Q[11] = (M32[8] ^ H[8]) - (M32[0] ^ H[0]) - (M32[2] ^ H[2]) - (M32[5] ^ H[5]) + (M32[9] ^ H[9]);
Q[12] = (M32[1] ^ H[1]) + (M32[3] ^ H[3]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[10] ^ H[10]);
Q[13] = (M32[2] ^ H[2]) + (M32[4] ^ H[4]) + (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[11] ^ H[11]);
Q[14] = (M32[3] ^ H[3]) - (M32[5] ^ H[5]) + (M32[8] ^ H[8]) - (M32[11] ^ H[11]) - (M32[12] ^ H[12]);
Q[15] = (M32[12] ^ H[12]) - (M32[4] ^ H[4]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[13] ^ H[13]);
/* Diffuse the differences in every word in a bijective manner with ssi, and then add the values of the previous double pipe.*/
Q[0] = ss0(Q[0]) + H[1];
Q[1] = ss1(Q[1]) + H[2];
Q[2] = ss2(Q[2]) + H[3];
Q[3] = ss3(Q[3]) + H[4];
Q[4] = ss4(Q[4]) + H[5];
Q[5] = ss0(Q[5]) + H[6];
Q[6] = ss1(Q[6]) + H[7];
Q[7] = ss2(Q[7]) + H[8];
Q[8] = ss3(Q[8]) + H[9];
Q[9] = ss4(Q[9]) + H[10];
Q[10] = ss0(Q[10]) + H[11];
Q[11] = ss1(Q[11]) + H[12];
Q[12] = ss2(Q[12]) + H[13];
Q[13] = ss3(Q[13]) + H[14];
Q[14] = ss4(Q[14]) + H[15];
Q[15] = ss0(Q[15]) + H[0];
/* This is the Message expansion or f_1 in the documentation. */
/* It has 16 rounds. */
/* Blue Midnight Wish has two tunable security parameters. */
/* The parameters are named EXPAND_1_ROUNDS and EXPAND_2_ROUNDS. */
/* The following relation for these parameters should be satisfied: */
/* EXPAND_1_ROUNDS + EXPAND_2_ROUNDS = 16 */
#pragma unroll
for (i = 0; i<2; i++)
Q[i + 16] = expand32_1(i + 16, M32, H, Q);
#pragma unroll
for (i = 2; i<16; i++)
Q[i + 16] = expand32_2(i + 16, M32, H, Q);
/* Blue Midnight Wish has two temporary cumulative variables that accumulate via XORing */
/* 16 new variables that are produced in the Message Expansion part. */
XL32 = Q[16] ^ Q[17] ^ Q[18] ^ Q[19] ^ Q[20] ^ Q[21] ^ Q[22] ^ Q[23];
XH32 = XL32^Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31];
/* This part is the function f_2 - in the documentation */
/* Compute the double chaining pipe for the next message block. */
H[0] = (shl(XH32, 5) ^ shr(Q[16], 5) ^ M32[0]) + (XL32 ^ Q[24] ^ Q[0]);
H[1] = (shr(XH32, 7) ^ shl(Q[17], 8) ^ M32[1]) + (XL32 ^ Q[25] ^ Q[1]);
H[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]);
H[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]);
H[4] = (shr(XH32, 3) ^ Q[20] ^ M32[4]) + (XL32 ^ Q[28] ^ Q[4]);
H[5] = (shl(XH32, 6) ^ shr(Q[21], 6) ^ M32[5]) + (XL32 ^ Q[29] ^ Q[5]);
H[6] = (shr(XH32, 4) ^ shl(Q[22], 6) ^ M32[6]) + (XL32 ^ Q[30] ^ Q[6]);
H[7] = (shr(XH32, 11) ^ shl(Q[23], 2) ^ M32[7]) + (XL32 ^ Q[31] ^ Q[7]);
H[8] = SPH_ROTL32(H[4], 9) + (XH32 ^ Q[24] ^ M32[8]) + (shl(XL32, 8) ^ Q[23] ^ Q[8]);
H[9] = SPH_ROTL32(H[5], 10) + (XH32 ^ Q[25] ^ M32[9]) + (shr(XL32, 6) ^ Q[16] ^ Q[9]);
H[10] = SPH_ROTL32(H[6], 11) + (XH32 ^ Q[26] ^ M32[10]) + (shl(XL32, 6) ^ Q[17] ^ Q[10]);
H[11] = SPH_ROTL32(H[7], 12) + (XH32 ^ Q[27] ^ M32[11]) + (shl(XL32, 4) ^ Q[18] ^ Q[11]);
H[12] = SPH_ROTL32(H[0], 13) + (XH32 ^ Q[28] ^ M32[12]) + (shr(XL32, 3) ^ Q[19] ^ Q[12]);
H[13] = SPH_ROTL32(H[1], 14) + (XH32 ^ Q[29] ^ M32[13]) + (shr(XL32, 4) ^ Q[20] ^ Q[13]);
H[14] = SPH_ROTL32(H[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]);
H[15] = SPH_ROTL32(H[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]);
}

232
kernel/credits.cl

@@ -0,0 +1,232 @@
/*
* "credits" kernel implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2015 djm34
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author djm34
*/
#if !defined(cl_khr_byte_addressable_store)
#error "Device does not support unaligned stores"
#endif
#define ROL32(x, n) rotate(x, (uint) n)
#define SWAP32(a) (as_uint(as_uchar4(a).wzyx))
#define SWAP64(x) as_ulong(as_uchar8(x).s32107654) // byte-swaps each 32-bit half, not the full 64-bit word
#define SHR(x, n) ((x) >> n)
#define S0(x) (ROL32(x, 25) ^ ROL32(x, 14) ^ SHR(x, 3))
#define S1(x) (ROL32(x, 15) ^ ROL32(x, 13) ^ SHR(x, 10))
#define S2(x) (ROL32(x, 30) ^ ROL32(x, 19) ^ ROL32(x, 10))
#define S3(x) (ROL32(x, 26) ^ ROL32(x, 21) ^ ROL32(x, 7))
#define P(a,b,c,d,e,f,g,h,x,K) \
{ \
temp1 = h + S3(e) + F1(e,f,g) + (K + x); \
d += temp1; h = temp1 + S2(a) + F0(a,b,c); \
}
#define F0(y, x, z) bitselect(z, y, z ^ x)
#define F1(x, y, z) bitselect(z, y, x)
#define R0 (W0 = S1(W14) + W9 + S0(W1) + W0)
#define R1 (W1 = S1(W15) + W10 + S0(W2) + W1)
#define R2 (W2 = S1(W0) + W11 + S0(W3) + W2)
#define R3 (W3 = S1(W1) + W12 + S0(W4) + W3)
#define R4 (W4 = S1(W2) + W13 + S0(W5) + W4)
#define R5 (W5 = S1(W3) + W14 + S0(W6) + W5)
#define R6 (W6 = S1(W4) + W15 + S0(W7) + W6)
#define R7 (W7 = S1(W5) + W0 + S0(W8) + W7)
#define R8 (W8 = S1(W6) + W1 + S0(W9) + W8)
#define R9 (W9 = S1(W7) + W2 + S0(W10) + W9)
#define R10 (W10 = S1(W8) + W3 + S0(W11) + W10)
#define R11 (W11 = S1(W9) + W4 + S0(W12) + W11)
#define R12 (W12 = S1(W10) + W5 + S0(W13) + W12)
#define R13 (W13 = S1(W11) + W6 + S0(W14) + W13)
#define R14 (W14 = S1(W12) + W7 + S0(W15) + W14)
#define R15 (W15 = S1(W13) + W8 + S0(W0) + W15)
#define RD14 (S1(W12) + W7 + S0(W15) + W14)
#define RD15 (S1(W13) + W8 + S0(W0) + W15)
/// generic sha transform
inline uint8 sha256_Transform(uint16 data, uint8 state)
{
uint temp1;
uint8 res = state;
uint W0 = data.s0;
uint W1 = data.s1;
uint W2 = data.s2;
uint W3 = data.s3;
uint W4 = data.s4;
uint W5 = data.s5;
uint W6 = data.s6;
uint W7 = data.s7;
uint W8 = data.s8;
uint W9 = data.s9;
uint W10 = data.sA;
uint W11 = data.sB;
uint W12 = data.sC;
uint W13 = data.sD;
uint W14 = data.sE;
uint W15 = data.sF;
#define v0 res.s0
#define v1 res.s1
#define v2 res.s2
#define v3 res.s3
#define v4 res.s4
#define v5 res.s5
#define v6 res.s6
#define v7 res.s7
P(v0, v1, v2, v3, v4, v5, v6, v7, W0, 0x428A2F98);
P(v7, v0, v1, v2, v3, v4, v5, v6, W1, 0x71374491);
P(v6, v7, v0, v1, v2, v3, v4, v5, W2, 0xB5C0FBCF);
P(v5, v6, v7, v0, v1, v2, v3, v4, W3, 0xE9B5DBA5);
P(v4, v5, v6, v7, v0, v1, v2, v3, W4, 0x3956C25B);
P(v3, v4, v5, v6, v7, v0, v1, v2, W5, 0x59F111F1);
P(v2, v3, v4, v5, v6, v7, v0, v1, W6, 0x923F82A4);
P(v1, v2, v3, v4, v5, v6, v7, v0, W7, 0xAB1C5ED5);
P(v0, v1, v2, v3, v4, v5, v6, v7, W8, 0xD807AA98);
P(v7, v0, v1, v2, v3, v4, v5, v6, W9, 0x12835B01);
P(v6, v7, v0, v1, v2, v3, v4, v5, W10, 0x243185BE);
P(v5, v6, v7, v0, v1, v2, v3, v4, W11, 0x550C7DC3);
P(v4, v5, v6, v7, v0, v1, v2, v3, W12, 0x72BE5D74);
P(v3, v4, v5, v6, v7, v0, v1, v2, W13, 0x80DEB1FE);
P(v2, v3, v4, v5, v6, v7, v0, v1, W14, 0x9BDC06A7);
P(v1, v2, v3, v4, v5, v6, v7, v0, W15, 0xC19BF174);
P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0xE49B69C1);
P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0xEFBE4786);
P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x0FC19DC6);
P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x240CA1CC);
P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x2DE92C6F);
P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4A7484AA);
P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5CB0A9DC);
P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x76F988DA);
P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x983E5152);
P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA831C66D);
P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xB00327C8);
P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xBF597FC7);
P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xC6E00BF3);
P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD5A79147);
P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0x06CA6351);
P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x14292967);
P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x27B70A85);
P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x2E1B2138);
P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x4D2C6DFC);
P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x53380D13);
P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x650A7354);
P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x766A0ABB);
P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x81C2C92E);
P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x92722C85);
P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0xA2BFE8A1);
P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA81A664B);
P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xC24B8B70);
P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xC76C51A3);
P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xD192E819);
P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD6990624);
P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0xF40E3585);
P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x106AA070);
P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x19A4C116);
P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x1E376C08);
P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x2748774C);
P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x34B0BCB5);
P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x391C0CB3);
P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4ED8AA4A);
P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5B9CCA4F);
P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x682E6FF3);
P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x748F82EE);
P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0x78A5636F);
P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0x84C87814);
P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0x8CC70208);
P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0x90BEFFFA);
P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xA4506CEB);
P(v2, v3, v4, v5, v6, v7, v0, v1, RD14, 0xBEF9A3F7);
P(v1, v2, v3, v4, v5, v6, v7, v0, RD15, 0xC67178F2);
#undef v0
#undef v1
#undef v2
#undef v3
#undef v4
#undef v5
#undef v6
#undef v7
return (res + state);
}
static __constant uint8 H256 = {
0x6A09E667, 0xBB67AE85, 0x3C6EF372,
0xA54FF53A, 0x510E527F, 0x9B05688C,
0x1F83D9AB, 0x5BE0CD19
};
static __constant uint8 pad_data =
{
0x00000000, 0x00000000, 0x80000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000540
};
static __constant uint8 pad_state =
{
0x80000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000100
};
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(__global const uchar* restrict input, __global uint* restrict output,const ulong target, uint8 midstate )
{
uint nonce = get_global_id(0);
uint16 in;
uint8 state1;
in.lo = ((__global const uint8 *)input)[4];
in.hi = pad_data;
in.hi.s0 = ((__global const uint *)input)[40];
in.hi.s1 = ((__global const uint *)input)[41];
in.s3 = nonce;
state1 = sha256_Transform(in, midstate);
in.lo = state1;
in.hi = pad_state;
state1 = sha256_Transform(in,H256);
if (SWAP64(state1.s67) <= target)
output[atomic_inc(output + 0xFF)] = nonce;
}
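/* Host-side reading of the result buffer written above (illustrative sketch in
 * comment form; submit_or_verify() is hypothetical): candidate nonces are
 * appended from index 0 upward and the running count lives at index 0xFF, the
 * atomic_inc target, so the CPU side would do roughly:
 *
 *   uint count = res[0xFF];
 *   for (uint i = 0; i < count && i < 0xFF; i++)
 *       submit_or_verify(res[i]);   // regenerate and check the hash on the CPU first
 */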

132
kernel/cubehash256.cl

@@ -0,0 +1,132 @@
// cubehash256
// djm34 2015 based on ccminer cubehash512
#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */
#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */
#define LROT(x, bits) rotate( x,(uint) bits)
#define ROTATEUPWARDS7(a) LROT(a,7)
#define ROTATEUPWARDS11(a) LROT(a,11)
#define SWAP(a,b) { uint u = a; a = b; b = u; }
inline void rrounds(uint x[2][2][2][2][2])
{
int r;
int j;
int k;
int l;
int m;
//#pragma unroll 2
for (r = 0; r < CUBEHASH_ROUNDS; ++r) {
/* "add x_0jklm into x_1jklmn modulo 2^32" */
//#pragma unroll 2
for (j = 0; j < 2; ++j)
//#pragma unroll 2
for (k = 0; k < 2; ++k)
//#pragma unroll 2
for (l = 0; l < 2; ++l)
//#pragma unroll 2
for (m = 0; m < 2; ++m)
x[1][j][k][l][m] += x[0][j][k][l][m];
/* "rotate x_0jklm upwards by 7 bits" */
//#pragma unroll 2
for (j = 0; j < 2; ++j)
//#pragma unroll 2
for (k = 0; k < 2; ++k)
//#pragma unroll 2
for (l = 0; l < 2; ++l)
//#pragma unroll 2
for (m = 0; m < 2; ++m)
x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]);
/* "swap x_00klm with x_01klm" */
//#pragma unroll 2
for (k = 0; k < 2; ++k)
//#pragma unroll 2
for (l = 0; l < 2; ++l)
//#pragma unroll 2
for (m = 0; m < 2; ++m)
SWAP(x[0][0][k][l][m], x[0][1][k][l][m])
/* "xor x_1jklm into x_0jklm" */
//#pragma unroll 2
for (j = 0; j < 2; ++j)
//#pragma unroll 2
for (k = 0; k < 2; ++k)
//#pragma unroll 2
for (l = 0; l < 2; ++l)
//#pragma unroll 2
for (m = 0; m < 2; ++m)
x[0][j][k][l][m] ^= x[1][j][k][l][m];
/* "swap x_1jk0m with x_1jk1m" */
//#pragma unroll 2
for (j = 0; j < 2; ++j)
//#pragma unroll 2
for (k = 0; k < 2; ++k)
//#pragma unroll 2
for (m = 0; m < 2; ++m)
SWAP(x[1][j][k][0][m], x[1][j][k][1][m])
/* "add x_0jklm into x_1jklm modulo 2^32" */
//#pragma unroll 2
for (j = 0; j < 2; ++j)
//#pragma unroll 2
for (k = 0; k < 2; ++k)
//#pragma unroll 2
for (l = 0; l < 2; ++l)
//#pragma unroll 2
for (m = 0; m < 2; ++m)
x[1][j][k][l][m] += x[0][j][k][l][m];
/* "rotate x_0jklm upwards by 11 bits" */
//#pragma unroll 2
for (j = 0; j < 2; ++j)
//#pragma unroll 2
for (k = 0; k < 2; ++k)
//#pragma unroll 2
for (l = 0; l < 2; ++l)
//#pragma unroll 2
for (m = 0; m < 2; ++m)
x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]);
/* "swap x_0j0lm with x_0j1lm" */
//#pragma unroll 2
for (j = 0; j < 2; ++j)
//#pragma unroll 2
for (l = 0; l < 2; ++l)
//#pragma unroll 2
for (m = 0; m < 2; ++m)
SWAP(x[0][j][0][l][m], x[0][j][1][l][m])
/* "xor x_1jklm into x_0jklm" */
//#pragma unroll 2
for (j = 0; j < 2; ++j)
//#pragma unroll 2
for (k = 0; k < 2; ++k)
//#pragma unroll 2
for (l = 0; l < 2; ++l)
//#pragma unroll 2
for (m = 0; m < 2; ++m)
x[0][j][k][l][m] ^= x[1][j][k][l][m];
/* "swap x_1jkl0 with x_1jkl1" */
//#pragma unroll 2
for (j = 0; j < 2; ++j)
//#pragma unroll 2
for (k = 0; k < 2; ++k)
//#pragma unroll 2
for (l = 0; l < 2; ++l)
SWAP(x[1][j][k][l][0], x[1][j][k][l][1])
}
}

525
kernel/lyra2rev2.cl

@@ -0,0 +1,525 @@
/*
* Lyra2RE kernel implementation.
*
* ==========================(LICENSE BEGIN)============================
* Copyright (c) 2014 djm34
* Copyright (c) 2014 James Lovejoy
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author djm34
*/
// typedef unsigned int uint;
#pragma OPENCL EXTENSION cl_amd_printf : enable
#ifndef LYRA2RE_CL
#define LYRA2RE_CL
#if __ENDIAN_LITTLE__
#define SPH_LITTLE_ENDIAN 1
#else
#define SPH_BIG_ENDIAN 1
#endif
#define SPH_UPTR sph_u64
typedef unsigned int sph_u32;
typedef int sph_s32;
#ifndef __OPENCL_VERSION__
typedef unsigned long sph_u64;
typedef long sph_s64;
#else
typedef unsigned long sph_u64;
typedef long sph_s64;
#endif
#define SPH_64 1
#define SPH_64_TRUE 1
#define SPH_C32(x) ((sph_u32)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#define SPH_C64(x) ((sph_u64)(x ## UL))
#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF))
//#define SPH_ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
//#define SPH_ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
//#define SPH_ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
//#define SPH_ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
#define SPH_ROTL32(x,n) rotate(x,(uint)n) //faster with driver 14.6
#define SPH_ROTR32(x,n) rotate(x,(uint)(32-n))
#define SPH_ROTL64(x,n) rotate(x,(ulong)n)
#define SPH_ROTR64(x,n) rotate(x,(ulong)(64-n))
static inline sph_u64 ror64(sph_u64 vw, unsigned a) {
uint2 result;
uint2 v = as_uint2(vw);
unsigned n = (unsigned)(64 - a);
if (n == 32) { return as_ulong((uint2)(v.y, v.x)); }
if (n < 32) {
result.y = ((v.y << (n)) | (v.x >> (32 - n)));
result.x = ((v.x << (n)) | (v.y >> (32 - n)));
}
else {
result.y = ((v.x << (n - 32)) | (v.y >> (64 - n)));
result.x = ((v.y << (n - 32)) | (v.x >> (64 - n)));
}
return as_ulong(result);
}
//#define SPH_ROTR64(l,n) ror64(l,n)
#define memshift 3
#include "blake256.cl"
#include "lyra2v2.cl"
#include "keccak1600.cl"
#include "skein256.cl"
#include "cubehash.cl"
#include "bmw256.cl"
#define SWAP4(x) as_uint(as_uchar4(x).wzyx)
#define SWAP8(x) as_ulong(as_uchar8(x).s76543210)
//#define SWAP8(x) as_ulong(as_uchar8(x).s32107654)
#if SPH_BIG_ENDIAN
#define DEC64E(x) (x)
#define DEC64BE(x) (*(const __global sph_u64 *) (x));
#define DEC64LE(x) SWAP8(*(const __global sph_u64 *) (x));
#define DEC32LE(x) (*(const __global sph_u32 *) (x));
#else
#define DEC64E(x) SWAP8(x)
#define DEC64BE(x) SWAP8(*(const __global sph_u64 *) (x));
#define DEC64LE(x) (*(const __global sph_u64 *) (x));
#define DEC32LE(x) SWAP4(*(const __global sph_u32 *) (x));
#endif
typedef union {
unsigned char h1[32];
uint h4[8];
ulong h8[4];
} hash_t;
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(
__global uchar* hashes,
// precalculated hash from the first part of the message
const uint h0,
const uint h1,
const uint h2,
const uint h3,
const uint h4,
const uint h5,
const uint h6,
const uint h7,
// last 12 bytes of original message
const uint in16,
const uint in17,
const uint in18
)
{
uint gid = get_global_id(0);
__global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
// __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
unsigned int h[8];
unsigned int m[16];
unsigned int v[16];
h[0]=h0;
h[1]=h1;
h[2]=h2;
h[3]=h3;
h[4]=h4;
h[5]=h5;
h[6]=h6;
h[7]=h7;
// compress 2nd round
m[0] = in16;
m[1] = in17;
m[2] = in18;
m[3] = SWAP4(gid);
for (int i = 4; i < 16; i++) {m[i] = c_Padding[i];}
for (int i = 0; i < 8; i++) {v[i] = h[i];}
v[8] = c_u256[0];
v[9] = c_u256[1];
v[10] = c_u256[2];
v[11] = c_u256[3];
v[12] = c_u256[4] ^ 640;
v[13] = c_u256[5] ^ 640;
v[14] = c_u256[6];
v[15] = c_u256[7];
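// 14 rounds of the BLAKE-256 compression (4 column steps then 4 diagonal steps per round); counter t = 640 bits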
for (int r = 0; r < 14; r++) {
GS(0, 4, 0x8, 0xC, 0x0);
GS(1, 5, 0x9, 0xD, 0x2);
GS(2, 6, 0xA, 0xE, 0x4);
GS(3, 7, 0xB, 0xF, 0x6);
GS(0, 5, 0xA, 0xF, 0x8);
GS(1, 6, 0xB, 0xC, 0xA);
GS(2, 7, 0x8, 0xD, 0xC);
GS(3, 4, 0x9, 0xE, 0xE);
}
for (int i = 0; i < 16; i++) {
int j = i & 7;
h[j] ^= v[i];}
for (int i=0;i<8;i++) {hash->h4[i]=SWAP4(h[i]);}
barrier(CLK_LOCAL_MEM_FENCE);
}
// keccak256
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search1(__global uchar* hashes)
{
uint gid = get_global_id(0);
// __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
sph_u64 keccak_gpu_state[25];
for (int i = 0; i<25; i++) {
if (i<4) { keccak_gpu_state[i] = hash->h8[i]; }
else { keccak_gpu_state[i] = 0; }
}
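// Keccak-256 padding: domain byte 0x01 right after the 32-byte message, final 0x80 at the end of the 136-byte rate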
keccak_gpu_state[4] = 0x0000000000000001;
keccak_gpu_state[16] = 0x8000000000000000;
keccak_block(keccak_gpu_state);
for (int i = 0; i<4; i++) { hash->h8[i] = keccak_gpu_state[i]; }
barrier(CLK_LOCAL_MEM_FENCE);
}
// cubehash256
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search2(__global uchar* hashes)
{
uint gid = get_global_id(0);
__global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
sph_u32 x0 = 0xEA2BD4B4; sph_u32 x1 = 0xCCD6F29F; sph_u32 x2 = 0x63117E71;
sph_u32 x3 = 0x35481EAE; sph_u32 x4 = 0x22512D5B; sph_u32 x5 = 0xE5D94E63;
sph_u32 x6 = 0x7E624131; sph_u32 x7 = 0xF4CC12BE; sph_u32 x8 = 0xC2D0B696;
sph_u32 x9 = 0x42AF2070; sph_u32 xa = 0xD0720C35; sph_u32 xb = 0x3361DA8C;
sph_u32 xc = 0x28CCECA4; sph_u32 xd = 0x8EF8AD83; sph_u32 xe = 0x4680AC00;
sph_u32 xf = 0x40E5FBAB;
sph_u32 xg = 0xD89041C3; sph_u32 xh = 0x6107FBD5;
sph_u32 xi = 0x6C859D41; sph_u32 xj = 0xF0B26679; sph_u32 xk = 0x09392549;
sph_u32 xl = 0x5FA25603; sph_u32 xm = 0x65C892FD; sph_u32 xn = 0x93CB6285;
sph_u32 xo = 0x2AF2B5AE; sph_u32 xp = 0x9E4B4E60; sph_u32 xq = 0x774ABFDD;
sph_u32 xr = 0x85254725; sph_u32 xs = 0x15815AEB; sph_u32 xt = 0x4AB6AAD6;
sph_u32 xu = 0x9CDAF8AF; sph_u32 xv = 0xD6032C0A;
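// x0..xv hold the CubeHash-256 initial state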
x0 ^= (hash->h4[0]);
x1 ^= (hash->h4[1]);
x2 ^= (hash->h4[2]);
x3 ^= (hash->h4[3]);
x4 ^= (hash->h4[4]);
x5 ^= (hash->h4[5]);
x6 ^= (hash->h4[6]);
x7 ^= (hash->h4[7]);
SIXTEEN_ROUNDS;
x0 ^= 0x80;
SIXTEEN_ROUNDS;
xv ^= 0x01;
for (int i = 0; i < 10; ++i) SIXTEEN_ROUNDS;
hash->h4[0] = x0;
hash->h4[1] = x1;
hash->h4[2] = x2;
hash->h4[3] = x3;
hash->h4[4] = x4;
hash->h4[5] = x5;
hash->h4[6] = x6;
hash->h4[7] = x7;
barrier(CLK_GLOBAL_MEM_FENCE);
}
/// lyra2 algo
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search3(__global uchar* hashes,__global uchar* matrix )
{
uint gid = get_global_id(0);
// __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
__global ulong4 *DMatrix = (__global ulong4 *)(matrix + (4 * memshift * 4 * 4 * 8 * (get_global_id(0) % MAX_GLOBAL_THREADS)));
// uint offset = (4 * memshift * 4 * 4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS))/32;
ulong4 state[4];
state[0].x = hash->h8[0]; //password
state[0].y = hash->h8[1]; //password
state[0].z = hash->h8[2]; //password
state[0].w = hash->h8[3]; //password
state[1] = state[0];
state[2] = (ulong4)(0x6a09e667f3bcc908UL, 0xbb67ae8584caa73bUL, 0x3c6ef372fe94f82bUL, 0xa54ff53a5f1d36f1UL);
state[3] = (ulong4)(0x510e527fade682d1UL, 0x9b05688c2b3e6c1fUL, 0x1f83d9abfb41bd6bUL, 0x5be0cd19137e2179UL);
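// sponge state: the 32-byte input as both password and salt in the lower half, Blake2b IV in the upper half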
for (int i = 0; i<12; i++) { round_lyra(state); }
state[0] ^= (ulong4)(0x20,0x20,0x20,0x01);
state[1] ^= (ulong4)(0x04,0x04,0x80,0x0100000000000000);
for (int i = 0; i<12; i++) { round_lyra(state); }
uint ps1 = (memshift * 3);
//#pragma unroll 4
for (int i = 0; i < 4; i++)
{
uint s1 = ps1 - memshift * i;
for (int j = 0; j < 3; j++)
(DMatrix)[j+s1] = state[j];
round_lyra(state);
}
reduceDuplexf(state,DMatrix);
reduceDuplexRowSetupf(1, 0, 2,state, DMatrix);
reduceDuplexRowSetupf(2, 1, 3, state,DMatrix);
uint rowa;
uint prev = 3;
for (uint i = 0; i<4; i++) {
rowa = state[0].x & 3;
reduceDuplexRowf(prev, rowa, i, state, DMatrix);
prev = i;
}
uint shift = (memshift * 4 * rowa);
for (int j = 0; j < 3; j++)
state[j] ^= (DMatrix)[j+shift];
for (int i = 0; i < 12; i++)
round_lyra(state);
//////////////////////////////////////
for (int i = 0; i<4; i++) {hash->h8[i] = ((ulong*)state)[i];}
barrier(CLK_LOCAL_MEM_FENCE);
}
//skein256
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search4(__global uchar* hashes)
{
uint gid = get_global_id(0);
// __global hash_t *hash = &(hashes[gid-get_global_offset(0)]);
__global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
sph_u64 h[9];
sph_u64 t[3];
sph_u64 dt0,dt1,dt2,dt3;
sph_u64 p0, p1, p2, p3, p4, p5, p6, p7;
h[8] = skein_ks_parity;
for (int i = 0; i<8; i++) {
h[i] = SKEIN_IV512_256[i];
h[8] ^= h[i];}
t[0]=t12[0];
t[1]=t12[1];
t[2]=t12[2];
dt0=hash->h8[0];
dt1=hash->h8[1];
dt2=hash->h8[2];
dt3=hash->h8[3];
p0 = h[0] + dt0;
p1 = h[1] + dt1;
p2 = h[2] + dt2;
p3 = h[3] + dt3;
p4 = h[4];
p5 = h[5] + t[0];
p6 = h[6] + t[1];
p7 = h[7];
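// Skein-512-256 first block: 72 rounds (9 x Round_8_512) over the 32-byte message, with subkey injections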
#pragma unroll
for (int i = 1; i<19; i+=2) {Round_8_512(p0,p1,p2,p3,p4,p5,p6,p7,i);}
p0 ^= dt0;
p1 ^= dt1;
p2 ^= dt2;
p3 ^= dt3;
h[0] = p0;
h[1] = p1;
h[2] = p2;
h[3] = p3;
h[4] = p4;
h[5] = p5;
h[6] = p6;
h[7] = p7;
h[8] = skein_ks_parity;
for (int i = 0; i<8; i++) { h[8] ^= h[i]; }
t[0] = t12[3];
t[1] = t12[4];
t[2] = t12[5];
p5 += t[0]; //p5 already equal h[5]
p6 += t[1];
#pragma unroll
for (int i = 1; i<19; i+=2) { Round_8_512(p0, p1, p2, p3, p4, p5, p6, p7, i); }
hash->h8[0] = p0;
hash->h8[1] = p1;
hash->h8[2] = p2;
hash->h8[3] = p3;
barrier(CLK_LOCAL_MEM_FENCE);
}
//cubehash
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search5(__global uchar* hashes)
{
uint gid = get_global_id(0);
__global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
sph_u32 x0 = 0xEA2BD4B4; sph_u32 x1 = 0xCCD6F29F; sph_u32 x2 = 0x63117E71;
sph_u32 x3 = 0x35481EAE; sph_u32 x4 = 0x22512D5B; sph_u32 x5 = 0xE5D94E63;
sph_u32 x6 = 0x7E624131; sph_u32 x7 = 0xF4CC12BE; sph_u32 x8 = 0xC2D0B696;
sph_u32 x9 = 0x42AF2070; sph_u32 xa = 0xD0720C35; sph_u32 xb = 0x3361DA8C;
sph_u32 xc = 0x28CCECA4; sph_u32 xd = 0x8EF8AD83; sph_u32 xe = 0x4680AC00;
sph_u32 xf = 0x40E5FBAB;
sph_u32 xg = 0xD89041C3; sph_u32 xh = 0x6107FBD5;
sph_u32 xi = 0x6C859D41; sph_u32 xj = 0xF0B26679; sph_u32 xk = 0x09392549;
sph_u32 xl = 0x5FA25603; sph_u32 xm = 0x65C892FD; sph_u32 xn = 0x93CB6285;
sph_u32 xo = 0x2AF2B5AE; sph_u32 xp = 0x9E4B4E60; sph_u32 xq = 0x774ABFDD;
sph_u32 xr = 0x85254725; sph_u32 xs = 0x15815AEB; sph_u32 xt = 0x4AB6AAD6;
sph_u32 xu = 0x9CDAF8AF; sph_u32 xv = 0xD6032C0A;
x0 ^= (hash->h4[0]);
x1 ^= (hash->h4[1]);
x2 ^= (hash->h4[2]);
x3 ^= (hash->h4[3]);
x4 ^= (hash->h4[4]);
x5 ^= (hash->h4[5]);
x6 ^= (hash->h4[6]);
x7 ^= (hash->h4[7]);
SIXTEEN_ROUNDS;
x0 ^= 0x80;
SIXTEEN_ROUNDS;
xv ^= 0x01;
for (int i = 0; i < 10; ++i) SIXTEEN_ROUNDS;
hash->h4[0] = x0;
hash->h4[1] = x1;
hash->h4[2] = x2;
hash->h4[3] = x3;
hash->h4[4] = x4;
hash->h4[5] = x5;
hash->h4[6] = x6;
hash->h4[7] = x7;
barrier(CLK_GLOBAL_MEM_FENCE);
}
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search6(__global uchar* hashes, __global uint* output, const ulong target)
{
uint gid = get_global_id(0);
__global hash_t *hash = (__global hash_t *)(hashes + (4 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
uint dh[16] = {
0x40414243, 0x44454647,
0x48494A4B, 0x4C4D4E4F,
0x50515253, 0x54555657,
0x58595A5B, 0x5C5D5E5F,
0x60616263, 0x64656667,
0x68696A6B, 0x6C6D6E6F,
0x70717273, 0x74757677,
0x78797A7B, 0x7C7D7E7F
};
uint final_s[16] = {
0xaaaaaaa0, 0xaaaaaaa1, 0xaaaaaaa2,
0xaaaaaaa3, 0xaaaaaaa4, 0xaaaaaaa5,
0xaaaaaaa6, 0xaaaaaaa7, 0xaaaaaaa8,
0xaaaaaaa9, 0xaaaaaaaa, 0xaaaaaaab,
0xaaaaaaac, 0xaaaaaaad, 0xaaaaaaae,
0xaaaaaaaf
};
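// dh = BMW-256 IV; final_s = the constant chaining value used for BMW's final compression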
uint message[16];
for (int i = 0; i<8; i++) message[i] = hash->h4[i];
for (int i = 9; i<14; i++) message[i] = 0;
message[8]= 0x80;
message[14]=0x100;
message[15]=0;
Compression256(message, dh);
Compression256(dh, final_s);
barrier(CLK_LOCAL_MEM_FENCE);
bool result = ( ((ulong*)final_s)[7] <= target);
if (result) {
output[atomic_inc(output + 0xFF)] = SWAP4(gid);
}
}
#endif // LYRA2RE_CL

184
kernel/lyra2v2.cl

@@ -0,0 +1,184 @@
/*
* Lyra2 kernel implementation.
*
* ==========================(LICENSE BEGIN)============================
* Copyright (c) 2014 djm34
*
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author djm34
*/
#define ROTL64(x,n) rotate(x,(ulong)n)
#define ROTR64(x,n) rotate(x,(ulong)(64-n))
#define SWAP32(x) as_ulong(as_uint2(x).s10)
#define SWAP24(x) as_ulong(as_uchar8(x).s34567012)
#define SWAP16(x) as_ulong(as_uchar8(x).s23456701)
#define G(a,b,c,d) \
do { \
a += b; d ^= a; d = SWAP32(d); \
c += d; b ^= c; b = ROTR64(b,24); \
a += b; d ^= a; d = ROTR64(d,16); \
c += d; b ^= c; b = ROTR64(b, 63); \
\
} while (0)
#define G_old(a,b,c,d) \
do { \
a += b; d ^= a; d = ROTR64(d, 32); \
c += d; b ^= c; b = ROTR64(b, 24); \
a += b; d ^= a; d = ROTR64(d, 16); \
c += d; b ^= c; b = ROTR64(b, 63); \
\
} while (0)
/*One Round of the Blake2b's compression function*/
#define round_lyra(s) \
do { \
G(s[0].x, s[1].x, s[2].x, s[3].x); \
G(s[0].y, s[1].y, s[2].y, s[3].y); \
G(s[0].z, s[1].z, s[2].z, s[3].z); \
G(s[0].w, s[1].w, s[2].w, s[3].w); \
G(s[0].x, s[1].y, s[2].z, s[3].w); \
G(s[0].y, s[1].z, s[2].w, s[3].x); \
G(s[0].z, s[1].w, s[2].x, s[3].y); \
G(s[0].w, s[1].x, s[2].y, s[3].z); \
} while(0)
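// reduced-round duplexing between the sponge state and rows of the DMatrix: one round_lyra per column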
void reduceDuplexf(ulong4* state ,__global ulong4* DMatrix)
{
ulong4 state1[3];
uint ps1 = 0;
uint ps2 = (memshift * 3 + memshift * 4);
//#pragma unroll 4
for (int i = 0; i < 4; i++)
{
uint s1 = ps1 + i*memshift;
uint s2 = ps2 - i*memshift;
for (int j = 0; j < 3; j++) state1[j] = (DMatrix)[j + s1];
for (int j = 0; j < 3; j++) state[j] ^= state1[j];
round_lyra(state);
for (int j = 0; j < 3; j++) state1[j] ^= state[j];
for (int j = 0; j < 3; j++) (DMatrix)[j + s2] = state1[j];
}
}
void reduceDuplexRowf(uint rowIn,uint rowInOut,uint rowOut,ulong4 * state, __global ulong4 * DMatrix)
{
ulong4 state1[3], state2[3];
uint ps1 = (memshift * 4 * rowIn);
uint ps2 = (memshift * 4 * rowInOut);
uint ps3 = (memshift * 4 * rowOut);
for (int i = 0; i < 4; i++)
{
uint s1 = ps1 + i*memshift;
uint s2 = ps2 + i*memshift;
uint s3 = ps3 + i*memshift;
for (int j = 0; j < 3; j++) state1[j] = (DMatrix)[j + s1];
for (int j = 0; j < 3; j++) state2[j] = (DMatrix)[j + s2];
for (int j = 0; j < 3; j++) state1[j] += state2[j];
for (int j = 0; j < 3; j++) state[j] ^= state1[j];
round_lyra(state);
((ulong*)state2)[0] ^= ((ulong*)state)[11];
for (int j = 0; j < 11; j++)
((ulong*)state2)[j + 1] ^= ((ulong*)state)[j];
if (rowInOut != rowOut) {
for (int j = 0; j < 3; j++)
(DMatrix)[j + s2] = state2[j];
for (int j = 0; j < 3; j++)
(DMatrix)[j + s3] ^= state[j];
}
else {
for (int j = 0; j < 3; j++)
state2[j] ^= state[j];
for (int j = 0; j < 3; j++)
(DMatrix)[j + s2] = state2[j];
}
}
}
void reduceDuplexRowSetupf(uint rowIn, uint rowInOut, uint rowOut, ulong4 *state, __global ulong4* DMatrix) {
ulong4 state2[3], state1[3];
uint ps1 = (memshift * 4 * rowIn);
uint ps2 = (memshift * 4 * rowInOut);
uint ps3 = (memshift * 3 + memshift * 4 * rowOut);
for (int i = 0; i < 4; i++)
{
uint s1 = ps1 + i*memshift;
uint s2 = ps2 + i*memshift;
uint s3 = ps3 - i*memshift;
for (int j = 0; j < 3; j++) state1[j] = (DMatrix)[j + s1];
for (int j = 0; j < 3; j++) state2[j] = (DMatrix)[j + s2];
for (int j = 0; j < 3; j++) {
ulong4 tmp = state1[j] + state2[j];
state[j] ^= tmp;
}
round_lyra(state);
for (int j = 0; j < 3; j++) {
state1[j] ^= state[j];
(DMatrix)[j + s3] = state1[j];
}
((ulong*)state2)[0] ^= ((ulong*)state)[11];
for (int j = 0; j < 11; j++)
((ulong*)state2)[j + 1] ^= ((ulong*)state)[j];
for (int j = 0; j < 3; j++)
(DMatrix)[j + s2] = state2[j];
}
}

314
kernel/yescrypt-multi.cl

@@ -0,0 +1,314 @@
/*
* "yescrypt" kernel implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2015 djm34
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author djm34
*/
#if !defined(cl_khr_byte_addressable_store)
#error "Device does not support unaligned stores"
#endif
#include "yescrypt_essential.cl"
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(__global const uchar* restrict input, __global uint* restrict output, __global uchar *padcache, __global uchar* buff1, __global uchar* buff2, __global uchar* buff3, const uint target)
{
__global ulong16 *hashbuffer = (__global ulong16 *)(padcache + (2048 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
__global ulong16 *prevstate = (__global ulong16 *)(buff1 + (64 * 128 * sizeof(ulong)*(get_global_id(0) % MAX_GLOBAL_THREADS)));
__global uint8 *sha256tokeep = (__global uint8 *)(buff3 + (8 * sizeof(uint)*(get_global_id(0) % MAX_GLOBAL_THREADS)));
__global ulong16 *Bdev = (__global ulong16 *)(buff2 + (8 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
uint nonce = (get_global_id(0));
uint data[20];
uint16 in;
uint8 state1, state2;
// uint8 sha256tokeep;
// ulong16 Bdev[8]; // will require an additional buffer
((uint16 *)data)[0] = ((__global const uint16 *)input)[0];
((uint4 *)data)[4] = ((__global const uint4 *)input)[4];
// for (int i = 0; i<20; i++) { data[i] = SWAP32(data[i]); }
// if (nonce == 10) { printf("data %08x %08x\n", data[0], data[1]); }
uint8 passwd = sha256_80(data, nonce);
//pbkdf
in.lo = pad1.lo ^ passwd;
in.hi = pad1.hi;
state1 = sha256_Transform(in, H256);
in.lo = pad2.lo ^ passwd;
in.hi = pad2.hi;
state2 = sha256_Transform(in, H256);
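// pad1/pad2 are the HMAC-SHA256 ipad/opad blocks; state1/state2 are the inner and outer hash states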
in = ((uint16*)data)[0];
state1 = sha256_Transform(in, state1);
#pragma unroll 1
for (int i = 0; i<8; i++)
{
uint16 result;
in = pad3;
in.s0 = data[16];
in.s1 = data[17];
in.s2 = data[18];
in.s3 = nonce;
in.s4 = 4 * i + 1;
in.lo = sha256_Transform(in, state1);
in.hi = pad4;
result.lo = swapvec(sha256_Transform(in, state2));
if (i == 0) sha256tokeep[0] = result.lo;
in = pad3;
in.s0 = data[16];
in.s1 = data[17];
in.s2 = data[18];
in.s3 = nonce;
in.s4 = 4 * i + 2;
in.lo = sha256_Transform(in, state1);
in.hi = pad4;
result.hi = swapvec(sha256_Transform(in, state2));
Bdev[i].lo = as_ulong8(shuffle(result));
// Bdev[i].lo = as_ulong8(result);
in = pad3;
in.s0 = data[16];
in.s1 = data[17];
in.s2 = data[18];
in.s3 = nonce;
in.s4 = 4 * i + 3;
in.lo = sha256_Transform(in, state1);
in.hi = pad4;
result.lo = swapvec(sha256_Transform(in, state2));
in = pad3;
in.s0 = data[16];
in.s1 = data[17];
in.s2 = data[18];
in.s3 = nonce;
in.s4 = 4 * i + 4;
in.lo = sha256_Transform(in, state1);
in.hi = pad4;
result.hi = swapvec(sha256_Transform(in, state2));
Bdev[i].hi = as_ulong8(shuffle(result));
// Bdev[i].hi = as_ulong8(result);
}
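// the loop above is the PBKDF2-HMAC-SHA256 expansion: 32 output blocks (8 iterations x 4) fill the 1 KiB B buffer (Bdev)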
//mixing1
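// Bdev[0] seeds the 64-block prevstate region, which later serves as the two 4 KiB pwxform S-boxes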
prevstate[0] = Bdev[0];
Bdev[0] = blockmix_salsa8_small2(Bdev[0]);
prevstate[1] = Bdev[0];
Bdev[0] = blockmix_salsa8_small2(Bdev[0]);
uint n = 1;
#pragma unroll 1
for (uint i = 2; i < 64; i++)
{
prevstate[i] = Bdev[0];
if ((i&(i - 1)) == 0) n = n << 1;
uint j = as_uint2(Bdev[0].hi.s0).x & (n - 1);
j += i - n;
Bdev[0] ^= prevstate[j];
Bdev[0] = blockmix_salsa8_small2(Bdev[0]);
}
}
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search1(__global uchar *buffer1, __global uchar *buffer2)
{
}
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search2(__global uchar *padcache, __global uchar *buff1, __global uchar *buff2)
{
__global ulong16 *hashbuffer = (__global ulong16 *)(padcache + (2048 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
__global ulong16* prevstate = (__global ulong16 *)(buff1 + (64 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
__global ulong16 *Bdev = (__global ulong16 *)(buff2 + (8 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
for (int i = 0; i<8; i++)
hashbuffer[i] = Bdev[i];
blockmix_pwxform((__global ulong8*)Bdev, prevstate);
for (int i = 0; i<8; i++)
hashbuffer[i + 8] = Bdev[i];
blockmix_pwxform((__global ulong8*)Bdev, prevstate);
int n = 1;
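// SMix first loop: sequentially write 2048 x 1 KiB blocks to the scratchpad, XORing in an earlier block selected from the low bits of Bdev[7]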
#pragma unroll 1
for (int i = 2; i < 2048; i ++)
{
for (int k = 0; k<8; k++)
(hashbuffer + 8 * i)[k] = Bdev[k];
if ((i&(i - 1)) == 0) n = n << 1;
uint j = as_uint2(Bdev[7].hi.s0).x & (n - 1);
j += i - n;
for (int k = 0; k < 8; k++)
Bdev[k] ^= (hashbuffer + 8 * j)[k];
blockmix_pwxform((__global ulong8*)Bdev, prevstate);
}
}
/*
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search3(__global uchar *buffer1, __global uchar *buffer2)
{
}
*/
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search3(__global uchar *padcache, __global uchar *buff1, __global uchar *buff2)
{
__global ulong16 *hashbuffer = (__global ulong16 *)(padcache + (2048 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
__global ulong16* prevstate = (__global ulong16 *)(buff1 + (64 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
__global ulong16 *Bdev = (__global ulong16 *)(buff2 + (8 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
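// SMix second loop: 684 wandering iterations over the 2 MiB scratchpad; all but the last two iterations also write back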
#pragma unroll 1
for (int z = 0; z < 684; z++)
{
uint j = as_uint2(Bdev[7].hi.s0).x & 2047;
for (int k = 0; k < 8; k++)
Bdev[k] ^= (hashbuffer + 8 * j)[k];
if (z<682)
for (int k = 0; k<8; k++)
(hashbuffer + 8 * j)[k] = Bdev[k];
blockmix_pwxform((__global ulong8*)Bdev, prevstate);
////
}
}
/*
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search5(__global uchar *buffer1, __global uchar *buffer2)
{
}
*/
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search4(__global const uchar* restrict input, __global uint* restrict output, __global uchar *buff2,__global uchar* buff3, const uint target)
{
__global ulong16 *Bdev = (__global ulong16 *)(buff2 + (8 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
__global uint8 *sha256tokeep = (__global uint8 *)(buff3 + (8 * sizeof(uint)*(get_global_id(0) % MAX_GLOBAL_THREADS)));
uint nonce = (get_global_id(0));
uint data[20];
((uint16 *)data)[0] = ((__global const uint16 *)input)[0];
((uint4 *)data)[4] = ((__global const uint4 *)input)[4];
// for (int i = 0; i<20; i++) { data[i] = SWAP32(data[i]); }
uint8 swpass = swapvec(sha256tokeep[0]);
uint16 in;
uint8 state1,state2;
in.lo = pad1.lo ^ swpass;
in.hi = pad1.hi;
state1 = sha256_Transform(in, H256);
in.lo = pad2.lo ^ swpass;
in.hi = pad2.hi;
state2 = sha256_Transform(in, H256);
#pragma unroll 1
for (int i = 0; i<8; i++) {
in = unshuffle(Bdev[i].lo);
in = swapvec16(in);
state1 = sha256_Transform(in, state1);
in = unshuffle(Bdev[i].hi);
in = swapvec16(in);
state1 = sha256_Transform(in, state1);
}
in = pad5;
state1 = sha256_Transform(in, state1);
in.lo = state1;
in.hi = pad4;
uint8 res = sha256_Transform(in, state2);
//hmac and final sha
in.lo = pad1.lo ^ res;
in.hi = pad1.hi;
state1 = sha256_Transform(in, H256);
in.lo = pad2.lo ^ res;
in.hi = pad2.hi;
state2 = sha256_Transform(in, H256);
in = ((uint16*)data)[0];
state1 = sha256_Transform(in, state1);
in = padsha80;
in.s0 = data[16];
in.s1 = data[17];
in.s2 = data[18];
in.s3 = get_global_id(0);
in.sf = 0x480;
state1 = sha256_Transform(in, state1);
in.lo = state1;
in.hi = pad4;
state1 = sha256_Transform(in, state2);
// state2 = H256;
in.lo = state1;
in.hi = pad4;
in.sf = 0x100;
res = sha256_Transform(in, H256);
if (SWAP32(res.s7) <= (target))
output[atomic_inc(output + 0xFF)] = (nonce);
}

253
kernel/yescrypt.cl

@@ -0,0 +1,253 @@
/*
* "yescrypt" kernel implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2015 djm34
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author djm34
*/
#if !defined(cl_khr_byte_addressable_store)
#error "Device does not support unaligned stores"
#endif
#include "yescrypt_essential.cl"
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(__global const uchar* restrict input, __global uint* restrict output, __global uchar *padcache, __global uchar* buff1, __global uchar* buff2, const uint target)
{
__global ulong16 *hashbuffer = (__global ulong16 *)(padcache + (2048 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
__global ulong16 *prevstate = (__global ulong16 *)(buff1 + (64 * 128 * sizeof(ulong)*(get_global_id(0) % MAX_GLOBAL_THREADS)));
__global ulong16 *Bdev = (__global ulong16 *)(buff2 + (8 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
uint nonce = (get_global_id(0));
uint data[20];
uint16 in;
uint8 state1, state2;
uint8 sha256tokeep;
((uint16 *)data)[0] = ((__global const uint16 *)input)[0];
((uint4 *)data)[4] = ((__global const uint4 *)input)[4];
for (int i = 0; i<20; i++) { data[i] = SWAP32(data[i]); }
// if (nonce == 10) { printf("data %08x %08x\n", data[0], data[1]); }
uint8 passwd = sha256_80(data, nonce);
//pbkdf
in.lo = pad1.lo ^ passwd;
in.hi = pad1.hi;
state1 = sha256_Transform(in, H256);
in.lo = pad2.lo ^ passwd;
in.hi = pad2.hi;
state2 = sha256_Transform(in, H256);
in = ((uint16*)data)[0];
state1 = sha256_Transform(in, state1);
#pragma unroll 1
for (int i = 0; i<8; i++)
{
uint16 result;
in = pad3;
in.s0 = data[16];
in.s1 = data[17];
in.s2 = data[18];
in.s3 = nonce;
in.s4 = 4 * i + 1;
in.lo = sha256_Transform(in, state1);
in.hi = pad4;
result.lo = swapvec(sha256_Transform(in, state2));
if (i == 0) sha256tokeep = result.lo;
in = pad3;
in.s0 = data[16];
in.s1 = data[17];
in.s2 = data[18];
in.s3 = nonce;
in.s4 = 4 * i + 2;
in.lo = sha256_Transform(in, state1);
in.hi = pad4;
result.hi = swapvec(sha256_Transform(in, state2));
Bdev[i].lo = as_ulong8(shuffle(result));
in = pad3;
in.s0 = data[16];
in.s1 = data[17];
in.s2 = data[18];
in.s3 = nonce;
in.s4 = 4 * i + 3;
in.lo = sha256_Transform(in, state1);
in.hi = pad4;
result.lo = swapvec(sha256_Transform(in, state2));
in = pad3;
in.s0 = data[16];
in.s1 = data[17];
in.s2 = data[18];
in.s3 = nonce;
in.s4 = 4 * i + 4;
in.lo = sha256_Transform(in, state1);
in.hi = pad4;
result.hi = swapvec(sha256_Transform(in, state2));
Bdev[i].hi = as_ulong8(shuffle(result));
}
//mixing1
prevstate[0] = Bdev[0];
Bdev[0] = blockmix_salsa8_small2(Bdev[0]);
prevstate[1] = Bdev[0];
Bdev[0] = blockmix_salsa8_small2(Bdev[0]);
uint n = 1;
#pragma unroll 1
for (uint i = 2; i < 64; i++)
{
prevstate[i] = Bdev[0];
if ((i&(i - 1)) == 0) n = n << 1;
uint j = as_uint2(Bdev[0].hi.s0).x & (n - 1);
j += i - n;
Bdev[0] ^= prevstate[j];
Bdev[0] = blockmix_salsa8_small2(Bdev[0]);
}
for (int i = 0; i<8; i++)
hashbuffer[i] = Bdev[i];
blockmix_pwxform((__global ulong8*)Bdev, prevstate);
for (int i = 0; i<8; i++)
hashbuffer[i + 8] = Bdev[i];
blockmix_pwxform((__global ulong8*)Bdev, prevstate);
n = 1;
#pragma unroll 1
for (int i = 2; i < 2048; i++)
{
for (int k = 0; k<8; k++)
(hashbuffer + 8 * i)[k] = Bdev[k];
if ((i&(i - 1)) == 0) n = n << 1;
uint j = as_uint2(Bdev[7].hi.s0).x & (n - 1);
j += i - n;
for (int k = 0; k < 8; k++)
Bdev[k] ^= (hashbuffer + 8 * j)[k];
blockmix_pwxform((__global ulong8*)Bdev, prevstate);
}
#pragma unroll 1
for (int z = 0; z < 684; z++)
{
uint j = as_uint2(Bdev[7].hi.s0).x & 2047;
for (int k = 0; k < 8; k++)
Bdev[k] ^= (hashbuffer + 8 * j)[k];
if (z<682)
for (int k = 0; k<8; k++)
(hashbuffer + 8 * j)[k] = Bdev[k];
blockmix_pwxform((__global ulong8*)Bdev, prevstate);
////
}
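// output stage: HMAC-SHA256 over the mixed B, then a final HMAC over the 80-byte header and one more SHA-256; res.s7 is compared against the target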
uint8 swpass = swapvec(sha256tokeep);
// uint16 in;
// uint8 state1, state2;
in.lo = pad1.lo ^ swpass;
in.hi = pad1.hi;
state1 = sha256_Transform(in, H256);
in.lo = pad2.lo ^ swpass;
in.hi = pad2.hi;
state2 = sha256_Transform(in, H256);
#pragma unroll 1
for (int i = 0; i<8; i++) {
in = unshuffle(Bdev[i].lo);
in = swapvec16(in);
state1 = sha256_Transform(in, state1);
in = unshuffle(Bdev[i].hi);
in = swapvec16(in);
state1 = sha256_Transform(in, state1);
}
in = pad5;
state1 = sha256_Transform(in, state1);
in.lo = state1;
in.hi = pad4;
uint8 res = sha256_Transform(in, state2);
//hmac and final sha
in.lo = pad1.lo ^ res;
in.hi = pad1.hi;
state1 = sha256_Transform(in, H256);
in.lo = pad2.lo ^ res;
in.hi = pad2.hi;
state2 = sha256_Transform(in, H256);
in = ((uint16*)data)[0];
state1 = sha256_Transform(in, state1);
in = padsha80;
in.s0 = data[16];
in.s1 = data[17];
in.s2 = data[18];
in.s3 = get_global_id(0);
in.sf = 0x480;
state1 = sha256_Transform(in, state1);
in.lo = state1;
in.hi = pad4;
state1 = sha256_Transform(in, state2);
// state2 = H256;
in.lo = state1;
in.hi = pad4;
in.sf = 0x100;
res = sha256_Transform(in, H256);
if (SWAP32(res.s7) <= (target))
output[atomic_inc(output + 0xFF)] = (nonce);
}

760
kernel/yescrypt_essential.cl

@@ -0,0 +1,760 @@
/*
* "yescrypt" kernel implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2015 djm34
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author djm34
*/
#define ROL32(x, n) rotate(x, (uint) n)
#define SWAP32(a) (as_uint(as_uchar4(a).wzyx))
//#define ROL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
#define HASH_MEMORY 4096
#define SALSA(a,b,c,d) do { \
t =a+d; b^=ROL32(t, 7U); \
t =b+a; c^=ROL32(t, 9U); \
t =c+b; d^=ROL32(t, 13U); \
t =d+c; a^=ROL32(t, 18U); \
} while(0)
#define SALSA_CORE(state) do { \
\
SALSA(state.s0,state.s4,state.s8,state.sc); \
SALSA(state.s5,state.s9,state.sd,state.s1); \
SALSA(state.sa,state.se,state.s2,state.s6); \
SALSA(state.sf,state.s3,state.s7,state.sb); \
SALSA(state.s0,state.s1,state.s2,state.s3); \
SALSA(state.s5,state.s6,state.s7,state.s4); \
SALSA(state.sa,state.sb,state.s8,state.s9); \
SALSA(state.sf,state.sc,state.sd,state.se); \
} while(0)
#define uSALSA_CORE(state) do { \
\
SALSA(state.s0,state.s4,state.s8,state.sc); \
SALSA(state.s1,state.s5,state.s9,state.sd); \
SALSA(state.s2,state.s6,state.sa,state.se); \
SALSA(state.s3,state.s7,state.sb,state.sf); \
SALSA(state.s0,state.sd,state.sa,state.s7); \
SALSA(state.s1,state.se,state.sb,state.s4); \
SALSA(state.s2,state.sf,state.s8,state.s5); \
SALSA(state.s3,state.sc,state.s9,state.s6); \
} while(0)
#define unshuffle(state) (as_uint16(state).s0da741eb852fc963)
#define shuffle(state) (as_uint16(state).s05af49e38d27c16b)
static __constant uint16 pad1 =
{
0x36363636, 0x36363636, 0x36363636, 0x36363636,
0x36363636, 0x36363636, 0x36363636, 0x36363636,
0x36363636, 0x36363636, 0x36363636, 0x36363636,
0x36363636, 0x36363636, 0x36363636, 0x36363636
};
static __constant uint16 pad2 =
{
0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c,
0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c,
0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c,
0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c, 0x5c5c5c5c
};
static __constant uint16 pad5 =
{
0x00000001, 0x80000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00002220
};
static __constant uint16 pad3 =
{
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x80000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x000004a0
};
static __constant uint16 padsha80 =
{
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x80000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000280
};
static __constant uint8 pad4 =
{
0x80000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000300
};
static __constant uint8 H256 = {
0x6A09E667, 0xBB67AE85, 0x3C6EF372,
0xA54FF53A, 0x510E527F, 0x9B05688C,
0x1F83D9AB, 0x5BE0CD19
};
inline uint8 swapvec(uint8 buf)
{
uint8 vec;
vec.s0 = SWAP32(buf.s0);
vec.s1 = SWAP32(buf.s1);
vec.s2 = SWAP32(buf.s2);
vec.s3 = SWAP32(buf.s3);
vec.s4 = SWAP32(buf.s4);
vec.s5 = SWAP32(buf.s5);
vec.s6 = SWAP32(buf.s6);
vec.s7 = SWAP32(buf.s7);
return vec;
}
inline uint16 swapvec16(uint16 buf)
{
uint16 vec;
vec.s0 = SWAP32(buf.s0);
vec.s1 = SWAP32(buf.s1);
vec.s2 = SWAP32(buf.s2);
vec.s3 = SWAP32(buf.s3);
vec.s4 = SWAP32(buf.s4);
vec.s5 = SWAP32(buf.s5);
vec.s6 = SWAP32(buf.s6);
vec.s7 = SWAP32(buf.s7);
vec.s8 = SWAP32(buf.s8);
vec.s9 = SWAP32(buf.s9);
vec.sa = SWAP32(buf.sa);
vec.sb = SWAP32(buf.sb);
vec.sc = SWAP32(buf.sc);
vec.sd = SWAP32(buf.sd);
vec.se = SWAP32(buf.se);
vec.sf = SWAP32(buf.sf);
return vec;
}
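// Salsa20/8 core: four double rounds (uSALSA_CORE) plus the feed-forward addition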
ulong8 salsa20_8(uint16 Bx)
{
uint t;
uint16 st = Bx;
uSALSA_CORE(st);
uSALSA_CORE(st);
uSALSA_CORE(st);
uSALSA_CORE(st);
return(as_ulong8(st + Bx));
}
ulong8 salsa20_8n(uint16 Bx)
{
uint t;
uint16 st = Bx;
SALSA_CORE(st);
SALSA_CORE(st);
SALSA_CORE(st);
SALSA_CORE(st);
return(as_ulong8(st + Bx));
}
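// scrypt BlockMix for r = 1: the two 64-byte halves of the 128-byte block are chained through Salsa20/8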
ulong16 blockmix_salsa8_small2(ulong16 Bin)
{
ulong8 X = Bin.hi;
X ^= Bin.lo;
X = salsa20_8(as_uint16(X));
Bin.lo = X;
X ^= Bin.hi;
X = salsa20_8(as_uint16(X));
Bin.hi = X;
return(Bin);
}
/*
uint16 salsa20_8_2(uint16 Bx)
{
uint t;
uint16 st = Bx;
uSALSA_CORE(st);
uSALSA_CORE(st);
uSALSA_CORE(st);
uSALSA_CORE(st);
return(st + Bx);
}
ulong16 blockmix_salsa8_small2(ulong16 Bin)
{
uint16 X = as_uint16(Bin.hi);
X ^= as_uint16(Bin.lo);
X = salsa20_8_2(as_uint16(X));
Bin.lo = as_ulong8(X);
X ^= as_uint16(Bin.hi);
X = salsa20_8_2(as_uint16(X));
Bin.hi = as_ulong8(X);
return(Bin);
}
*/
inline ulong2 madd4long2(uint4 a, uint4 b)
{
uint4 result;
result.x = a.x*a.y + b.x;
result.y = b.y + mad_hi(a.x, a.y, b.x);
result.z = a.z*a.w + b.z;
result.w = b.w + mad_hi(a.z, a.w, b.z);
return as_ulong2(result);
}
inline ulong2 madd4long3(uint4 a, ulong2 b)
{
ulong2 result;
result.x = (ulong)a.x*(ulong)a.y + b.x;
result.y = (ulong)a.z*(ulong)a.w + b.y;
return result;
}
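// pwxform: 6 rounds of two-lane 32x32->64 multiply-add plus S-box lookups (S0 at prevstate, S1 4 KiB above) on each 16-byte lane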
inline ulong8 block_pwxform_long_old(ulong8 Bout, __global ulong16 *prevstate)
{
ulong2 vec = Bout.lo.lo;
for (int i = 0; i < 6; i++)
{
ulong2 p0, p1;
uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF);
p0 = ((__global ulong2*)(prevstate ))[x.x];
vec = madd4long3(as_uint4(vec), p0);
p1 = ((__global ulong2*)(prevstate + 32))[x.y];
vec ^= p1;
}
Bout.lo.lo = vec;
vec = Bout.lo.hi;
for (int i = 0; i < 6; i++)
{
ulong2 p0, p1;
uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF);
p0 = ((__global ulong2*)(prevstate))[x.x];
vec = madd4long3(as_uint4(vec), p0);
p1 = ((__global ulong2*)(prevstate + 32))[x.y];
vec ^= p1;
}
Bout.lo.hi = vec;
vec = Bout.hi.lo;
for (int i = 0; i < 6; i++)
{
ulong2 p0, p1;
uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF);
p0 = ((__global ulong2*)(prevstate))[x.x];
vec = madd4long3(as_uint4(vec), p0);
p1 = ((__global ulong2*)(prevstate + 32))[x.y];
vec ^= p1;
}
Bout.hi.lo = vec;
vec = Bout.hi.hi;
for (int i = 0; i < 6; i++)
{
ulong2 p0, p1;
uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF);
p0 = ((__global ulong2*)(prevstate))[x.x];
vec = madd4long3(as_uint4(vec), p0);
p1 = ((__global ulong2*)(prevstate + 32))[x.y];
vec ^= p1;
}
Bout.hi.hi = vec;
return(Bout);
}
inline ulong8 block_pwxform_long(ulong8 Bout, __global ulong2 *prevstate)
{
ulong2 vec = Bout.lo.lo;
for (int i = 0; i < 6; i++)
{
ulong2 p0, p1;
uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF);
p0 = prevstate[x.x];
vec = madd4long3(as_uint4(vec), p0);
p1 = (prevstate + 32*8)[x.y];
vec ^= p1;
}
Bout.lo.lo = vec;
vec = Bout.lo.hi;
for (int i = 0; i < 6; i++)
{
ulong2 p0, p1;
uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF);
p0 = prevstate[x.x];
vec = madd4long3(as_uint4(vec), p0);
p1 = (prevstate + 32 * 8)[x.y];
vec ^= p1;
}
Bout.lo.hi = vec;
vec = Bout.hi.lo;
for (int i = 0; i < 6; i++)
{
ulong2 p0, p1;
uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF);
p0 = prevstate[x.x];
vec = madd4long3(as_uint4(vec), p0);
p1 = (prevstate + 32 * 8)[x.y];
vec ^= p1;
}
Bout.hi.lo = vec;
vec = Bout.hi.hi;
for (int i = 0; i < 6; i++)
{
ulong2 p0, p1;
uint2 x = as_uint2((vec.x >> 4) & 0x000000FF000000FF);
p0 = prevstate[x.x];
vec = madd4long3(as_uint4(vec), p0);
p1 = (prevstate + 32 * 8)[x.y];
vec ^= p1;
}
Bout.hi.hi = vec;
return(Bout);
}
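// BlockMix_pwxform over a 1 KiB block: XOR-chain the sixteen 64-byte sub-blocks through pwxform, then Salsa20/8 on the last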
inline void blockmix_pwxform(__global ulong8 *Bin, __global ulong16 *prevstate)
{
Bin[0] ^= Bin[15];
Bin[0] = block_pwxform_long_old(Bin[0], prevstate);
#pragma unroll 1
for (int i = 1; i < 16; i++)
{
Bin[i] ^= Bin[i - 1];
Bin[i] = block_pwxform_long_old(Bin[i], prevstate);
}
Bin[15] = salsa20_8(as_uint16(Bin[15]));
}
#define SHR(x, n) ((x) >> n)
#define S0(x) (ROL32(x, 25) ^ ROL32(x, 14) ^ SHR(x, 3))
#define S1(x) (ROL32(x, 15) ^ ROL32(x, 13) ^ SHR(x, 10))
#define S2(x) (ROL32(x, 30) ^ ROL32(x, 19) ^ ROL32(x, 10))
#define S3(x) (ROL32(x, 26) ^ ROL32(x, 21) ^ ROL32(x, 7))
#define P(a,b,c,d,e,f,g,h,x,K) \
{ \
temp1 = h + S3(e) + F1(e,f,g) + (K + x); \
d += temp1; h = temp1 + S2(a) + F0(a,b,c); \
}
#define PLAST(a,b,c,d,e,f,g,h,x,K) \
{ \
d += h + S3(e) + F1(e,f,g) + (x + K); \
}
#define F0(y, x, z) bitselect(z, y, z ^ x)
#define F1(x, y, z) bitselect(z, y, x)
#define R0 (W0 = S1(W14) + W9 + S0(W1) + W0)
#define R1 (W1 = S1(W15) + W10 + S0(W2) + W1)
#define R2 (W2 = S1(W0) + W11 + S0(W3) + W2)
#define R3 (W3 = S1(W1) + W12 + S0(W4) + W3)
#define R4 (W4 = S1(W2) + W13 + S0(W5) + W4)
#define R5 (W5 = S1(W3) + W14 + S0(W6) + W5)
#define R6 (W6 = S1(W4) + W15 + S0(W7) + W6)
#define R7 (W7 = S1(W5) + W0 + S0(W8) + W7)
#define R8 (W8 = S1(W6) + W1 + S0(W9) + W8)
#define R9 (W9 = S1(W7) + W2 + S0(W10) + W9)
#define R10 (W10 = S1(W8) + W3 + S0(W11) + W10)
#define R11 (W11 = S1(W9) + W4 + S0(W12) + W11)
#define R12 (W12 = S1(W10) + W5 + S0(W13) + W12)
#define R13 (W13 = S1(W11) + W6 + S0(W14) + W13)
#define R14 (W14 = S1(W12) + W7 + S0(W15) + W14)
#define R15 (W15 = S1(W13) + W8 + S0(W0) + W15)
#define RD14 (S1(W12) + W7 + S0(W15) + W14)
#define RD15 (S1(W13) + W8 + S0(W0) + W15)
/// generic sha transform
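// one SHA-256 compression: 64 rounds with the message schedule expanded in place (R0..R15), Davies-Meyer feed-forward at the end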
inline uint8 sha256_Transform(uint16 data, uint8 state)
{
uint temp1;
uint8 res = state;
uint W0 = data.s0;
uint W1 = data.s1;
uint W2 = data.s2;
uint W3 = data.s3;
uint W4 = data.s4;
uint W5 = data.s5;
uint W6 = data.s6;
uint W7 = data.s7;
uint W8 = data.s8;
uint W9 = data.s9;
uint W10 = data.sA;
uint W11 = data.sB;
uint W12 = data.sC;
uint W13 = data.sD;
uint W14 = data.sE;
uint W15 = data.sF;
#define v0 res.s0
#define v1 res.s1
#define v2 res.s2
#define v3 res.s3
#define v4 res.s4
#define v5 res.s5
#define v6 res.s6
#define v7 res.s7
P(v0, v1, v2, v3, v4, v5, v6, v7, W0, 0x428A2F98);
P(v7, v0, v1, v2, v3, v4, v5, v6, W1, 0x71374491);
P(v6, v7, v0, v1, v2, v3, v4, v5, W2, 0xB5C0FBCF);
P(v5, v6, v7, v0, v1, v2, v3, v4, W3, 0xE9B5DBA5);
P(v4, v5, v6, v7, v0, v1, v2, v3, W4, 0x3956C25B);
P(v3, v4, v5, v6, v7, v0, v1, v2, W5, 0x59F111F1);
P(v2, v3, v4, v5, v6, v7, v0, v1, W6, 0x923F82A4);
P(v1, v2, v3, v4, v5, v6, v7, v0, W7, 0xAB1C5ED5);
P(v0, v1, v2, v3, v4, v5, v6, v7, W8, 0xD807AA98);
P(v7, v0, v1, v2, v3, v4, v5, v6, W9, 0x12835B01);
P(v6, v7, v0, v1, v2, v3, v4, v5, W10, 0x243185BE);
P(v5, v6, v7, v0, v1, v2, v3, v4, W11, 0x550C7DC3);
P(v4, v5, v6, v7, v0, v1, v2, v3, W12, 0x72BE5D74);
P(v3, v4, v5, v6, v7, v0, v1, v2, W13, 0x80DEB1FE);
P(v2, v3, v4, v5, v6, v7, v0, v1, W14, 0x9BDC06A7);
P(v1, v2, v3, v4, v5, v6, v7, v0, W15, 0xC19BF174);
P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0xE49B69C1);
P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0xEFBE4786);
P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x0FC19DC6);
P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x240CA1CC);
P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x2DE92C6F);
P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4A7484AA);
P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5CB0A9DC);
P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x76F988DA);
P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x983E5152);
P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA831C66D);
P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xB00327C8);
P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xBF597FC7);
P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xC6E00BF3);
P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD5A79147);
P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0x06CA6351);
P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x14292967);
P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x27B70A85);
P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x2E1B2138);
P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x4D2C6DFC);
P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x53380D13);
P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x650A7354);
P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x766A0ABB);
P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x81C2C92E);
P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x92722C85);
P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0xA2BFE8A1);
P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA81A664B);
P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xC24B8B70);
P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xC76C51A3);
P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xD192E819);
P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD6990624);
P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0xF40E3585);
P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x106AA070);
P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x19A4C116);
P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x1E376C08);
P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x2748774C);
P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x34B0BCB5);
P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x391C0CB3);
P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4ED8AA4A);
P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5B9CCA4F);
P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x682E6FF3);
P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x748F82EE);
P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0x78A5636F);
P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0x84C87814);
P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0x8CC70208);
P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0x90BEFFFA);
P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xA4506CEB);
P(v2, v3, v4, v5, v6, v7, v0, v1, RD14, 0xBEF9A3F7);
P(v1, v2, v3, v4, v5, v6, v7, v0, RD15, 0xC67178F2);
#undef v0
#undef v1
#undef v2
#undef v3
#undef v4
#undef v5
#undef v6
#undef v7
return (res+state);
}
static inline uint8 sha256_round1(uint16 data)
{
uint temp1;
uint8 res;
uint W0 = data.s0;
uint W1 = data.s1;
uint W2 = data.s2;
uint W3 = data.s3;
uint W4 = data.s4;
uint W5 = data.s5;
uint W6 = data.s6;
uint W7 = data.s7;
uint W8 = data.s8;
uint W9 = data.s9;
uint W10 = data.sA;
uint W11 = data.sB;
uint W12 = data.sC;
uint W13 = data.sD;
uint W14 = data.sE;
uint W15 = data.sF;
uint v0 = 0x6A09E667;
uint v1 = 0xBB67AE85;
uint v2 = 0x3C6EF372;
uint v3 = 0xA54FF53A;
uint v4 = 0x510E527F;
uint v5 = 0x9B05688C;
uint v6 = 0x1F83D9AB;
uint v7 = 0x5BE0CD19;
P(v0, v1, v2, v3, v4, v5, v6, v7, W0, 0x428A2F98);
P(v7, v0, v1, v2, v3, v4, v5, v6, W1, 0x71374491);
P(v6, v7, v0, v1, v2, v3, v4, v5, W2, 0xB5C0FBCF);
P(v5, v6, v7, v0, v1, v2, v3, v4, W3, 0xE9B5DBA5);
P(v4, v5, v6, v7, v0, v1, v2, v3, W4, 0x3956C25B);
P(v3, v4, v5, v6, v7, v0, v1, v2, W5, 0x59F111F1);
P(v2, v3, v4, v5, v6, v7, v0, v1, W6, 0x923F82A4);
P(v1, v2, v3, v4, v5, v6, v7, v0, W7, 0xAB1C5ED5);
P(v0, v1, v2, v3, v4, v5, v6, v7, W8, 0xD807AA98);
P(v7, v0, v1, v2, v3, v4, v5, v6, W9, 0x12835B01);
P(v6, v7, v0, v1, v2, v3, v4, v5, W10, 0x243185BE);
P(v5, v6, v7, v0, v1, v2, v3, v4, W11, 0x550C7DC3);
P(v4, v5, v6, v7, v0, v1, v2, v3, W12, 0x72BE5D74);
P(v3, v4, v5, v6, v7, v0, v1, v2, W13, 0x80DEB1FE);
P(v2, v3, v4, v5, v6, v7, v0, v1, W14, 0x9BDC06A7);
P(v1, v2, v3, v4, v5, v6, v7, v0, W15, 0xC19BF174);
P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0xE49B69C1);
P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0xEFBE4786);
P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x0FC19DC6);
P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x240CA1CC);
P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x2DE92C6F);
P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4A7484AA);
P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5CB0A9DC);
P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x76F988DA);
P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x983E5152);
P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA831C66D);
P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xB00327C8);
P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xBF597FC7);
P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xC6E00BF3);
P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD5A79147);
P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0x06CA6351);
P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x14292967);
P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x27B70A85);
P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x2E1B2138);
P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x4D2C6DFC);
P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x53380D13);
P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x650A7354);
P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x766A0ABB);
P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x81C2C92E);
P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x92722C85);
P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0xA2BFE8A1);
P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA81A664B);
P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xC24B8B70);
P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xC76C51A3);
P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xD192E819);
P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD6990624);
P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0xF40E3585);
P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x106AA070);
P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x19A4C116);
P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x1E376C08);
P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x2748774C);
P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x34B0BCB5);
P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x391C0CB3);
P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4ED8AA4A);
P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5B9CCA4F);
P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x682E6FF3);
P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x748F82EE);
P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0x78A5636F);
P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0x84C87814);
P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0x8CC70208);
P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0x90BEFFFA);
P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xA4506CEB);
P(v2, v3, v4, v5, v6, v7, v0, v1, RD14, 0xBEF9A3F7);
P(v1, v2, v3, v4, v5, v6, v7, v0, RD15, 0xC67178F2);
res.s0 = v0 + 0x6A09E667;
res.s1 = v1 + 0xBB67AE85;
res.s2 = v2 + 0x3C6EF372;
res.s3 = v3 + 0xA54FF53A;
res.s4 = v4 + 0x510E527F;
res.s5 = v5 + 0x9B05688C;
res.s6 = v6 + 0x1F83D9AB;
res.s7 = v7 + 0x5BE0CD19;
return (res);
}
static inline uint8 sha256_round2(uint16 data,uint8 buf)
{
uint temp1;
uint8 res;
uint W0 = data.s0;
uint W1 = data.s1;
uint W2 = data.s2;
uint W3 = data.s3;
uint W4 = data.s4;
uint W5 = data.s5;
uint W6 = data.s6;
uint W7 = data.s7;
uint W8 = data.s8;
uint W9 = data.s9;
uint W10 = data.sA;
uint W11 = data.sB;
uint W12 = data.sC;
uint W13 = data.sD;
uint W14 = data.sE;
uint W15 = data.sF;
uint v0 = buf.s0;
uint v1 = buf.s1;
uint v2 = buf.s2;
uint v3 = buf.s3;
uint v4 = buf.s4;
uint v5 = buf.s5;
uint v6 = buf.s6;
uint v7 = buf.s7;
P(v0, v1, v2, v3, v4, v5, v6, v7, W0, 0x428A2F98);
P(v7, v0, v1, v2, v3, v4, v5, v6, W1, 0x71374491);
P(v6, v7, v0, v1, v2, v3, v4, v5, W2, 0xB5C0FBCF);
P(v5, v6, v7, v0, v1, v2, v3, v4, W3, 0xE9B5DBA5);
P(v4, v5, v6, v7, v0, v1, v2, v3, W4, 0x3956C25B);
P(v3, v4, v5, v6, v7, v0, v1, v2, W5, 0x59F111F1);
P(v2, v3, v4, v5, v6, v7, v0, v1, W6, 0x923F82A4);
P(v1, v2, v3, v4, v5, v6, v7, v0, W7, 0xAB1C5ED5);
P(v0, v1, v2, v3, v4, v5, v6, v7, W8, 0xD807AA98);
P(v7, v0, v1, v2, v3, v4, v5, v6, W9, 0x12835B01);
P(v6, v7, v0, v1, v2, v3, v4, v5, W10, 0x243185BE);
P(v5, v6, v7, v0, v1, v2, v3, v4, W11, 0x550C7DC3);
P(v4, v5, v6, v7, v0, v1, v2, v3, W12, 0x72BE5D74);
P(v3, v4, v5, v6, v7, v0, v1, v2, W13, 0x80DEB1FE);
P(v2, v3, v4, v5, v6, v7, v0, v1, W14, 0x9BDC06A7);
P(v1, v2, v3, v4, v5, v6, v7, v0, W15, 0xC19BF174);
P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0xE49B69C1);
P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0xEFBE4786);
P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x0FC19DC6);
P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x240CA1CC);
P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x2DE92C6F);
P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4A7484AA);
P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5CB0A9DC);
P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x76F988DA);
P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x983E5152);
P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA831C66D);
P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xB00327C8);
P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xBF597FC7);
P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xC6E00BF3);
P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD5A79147);
P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0x06CA6351);
P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x14292967);
P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x27B70A85);
P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x2E1B2138);
P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x4D2C6DFC);
P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x53380D13);
P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x650A7354);
P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x766A0ABB);
P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x81C2C92E);
P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x92722C85);
P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0xA2BFE8A1);
P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0xA81A664B);
P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0xC24B8B70);
P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0xC76C51A3);
P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0xD192E819);
P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xD6990624);
P(v2, v3, v4, v5, v6, v7, v0, v1, R14, 0xF40E3585);
P(v1, v2, v3, v4, v5, v6, v7, v0, R15, 0x106AA070);
P(v0, v1, v2, v3, v4, v5, v6, v7, R0, 0x19A4C116);
P(v7, v0, v1, v2, v3, v4, v5, v6, R1, 0x1E376C08);
P(v6, v7, v0, v1, v2, v3, v4, v5, R2, 0x2748774C);
P(v5, v6, v7, v0, v1, v2, v3, v4, R3, 0x34B0BCB5);
P(v4, v5, v6, v7, v0, v1, v2, v3, R4, 0x391C0CB3);
P(v3, v4, v5, v6, v7, v0, v1, v2, R5, 0x4ED8AA4A);
P(v2, v3, v4, v5, v6, v7, v0, v1, R6, 0x5B9CCA4F);
P(v1, v2, v3, v4, v5, v6, v7, v0, R7, 0x682E6FF3);
P(v0, v1, v2, v3, v4, v5, v6, v7, R8, 0x748F82EE);
P(v7, v0, v1, v2, v3, v4, v5, v6, R9, 0x78A5636F);
P(v6, v7, v0, v1, v2, v3, v4, v5, R10, 0x84C87814);
P(v5, v6, v7, v0, v1, v2, v3, v4, R11, 0x8CC70208);
P(v4, v5, v6, v7, v0, v1, v2, v3, R12, 0x90BEFFFA);
P(v3, v4, v5, v6, v7, v0, v1, v2, R13, 0xA4506CEB);
P(v2, v3, v4, v5, v6, v7, v0, v1, RD14, 0xBEF9A3F7);
P(v1, v2, v3, v4, v5, v6, v7, v0, RD15, 0xC67178F2);
res.s0 = (v0 + buf.s0);
res.s1 = (v1 + buf.s1);
res.s2 = (v2 + buf.s2);
res.s3 = (v3 + buf.s3);
res.s4 = (v4 + buf.s4);
res.s5 = (v5 + buf.s5);
res.s6 = (v6 + buf.s6);
res.s7 = (v7 + buf.s7);
return (res);
}
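// sha256_80 below hashes the 80-byte header: full first block with the standard IV, then the 16-byte tail plus nonce padded via padsha80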
static inline uint8 sha256_80(uint* data,uint nonce)
{
uint8 buf = sha256_round1( ((uint16*)data)[0]);
uint16 in = padsha80;
in.s0 = data[16];
in.s1 = data[17];
in.s2 = data[18];
in.s3 = nonce;
return(sha256_round2(in,buf));
}

27
miner.h

@@ -733,6 +733,17 @@ static inline void flip128(void *dest_p, const void *src_p)
dest[i] = swab32(src[i]);
}
static inline void flip168(void *dest_p, const void *src_p)
{
uint32_t *dest = (uint32_t *)dest_p;
const uint32_t *src = (uint32_t *)src_p;
int i;
for (i = 0; i < 42; i++)
dest[i] = swab32(src[i]);
}
/* For flipping to the correct endianness if necessary */
#if defined(__BIG_ENDIAN__) || defined(MIPSEB)
static inline void endian_flip32(void *dest_p, const void *src_p)
@@ -744,6 +755,11 @@ static inline void endian_flip128(void *dest_p, const void *src_p)
{
flip128(dest_p, src_p);
}
static inline void endian_flip168(void *dest_p, const void *src_p)
{
flip168(dest_p, src_p);
}
#else
static inline void
endian_flip32(void __maybe_unused *dest_p, const void __maybe_unused *src_p)
@@ -754,8 +770,13 @@ static inline void
endian_flip128(void __maybe_unused *dest_p, const void __maybe_unused *src_p)
{
}
static inline void
endian_flip168(void __maybe_unused *dest_p, const void __maybe_unused *src_p)
{
}
#endif
extern double cgpu_runtime(struct cgpu_info *cgpu);
extern void _quit(int status);
@@ -1146,8 +1167,8 @@ extern bool add_pool_details(struct pool *pool, bool live, char *url, char *user
#define MAX_GPUDEVICES 16
#define MAX_DEVICES 4096
-#define MIN_INTENSITY 8
-#define MIN_INTENSITY_STR "8"
+#define MIN_INTENSITY 4
+#define MIN_INTENSITY_STR "4"
#define MAX_INTENSITY 31
#define MAX_INTENSITY_STR "31"
#define MIN_XINTENSITY 1
@@ -1416,7 +1437,7 @@ struct pool {
#define GETWORK_MODE_GBT 'G'
struct work {
-unsigned char data[128];
+unsigned char data[168];
unsigned char midstate[32];
unsigned char target[32];
unsigned char hash[32];

242
ocl.c

@@ -36,6 +36,8 @@
#include "ocl/binary_kernel.h"
#include "algorithm/neoscrypt.h"
#include "algorithm/pluck.h"
#include "algorithm/yescrypt.h"
#include "algorithm/lyra2re.h"
/* FIXME: only here for global config vars, replace with configuration.h
* or similar as soon as config is in a struct instead of littered all
@@ -414,8 +416,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
}
/////////////////////////////////// pluck
// neoscrypt TC
// pluck TC
else if (!safe_cmp(cgpu->algorithm.name, "pluck") && !cgpu->opt_tc) {
size_t glob_thread_count;
long max_int;
@@ -497,7 +498,175 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
cgpu->thread_concurrency = glob_thread_count;
applog(LOG_DEBUG, "GPU %d: computing max. global thread count to %u", gpu, (unsigned)(cgpu->thread_concurrency));
}
// Yescrypt TC
else if ((!safe_cmp(cgpu->algorithm.name, "yescrypt") ||
!safe_cmp(algorithm->name, "yescrypt-multi")) && !cgpu->opt_tc) {
size_t glob_thread_count;
long max_int;
unsigned char type = 0;
// determine which intensity type to use
if (cgpu->rawintensity > 0) {
glob_thread_count = cgpu->rawintensity;
max_int = glob_thread_count;
type = 2;
}
else if (cgpu->xintensity > 0) {
glob_thread_count = clState->compute_shaders * ((cgpu->algorithm.xintensity_shift) ? (1UL << (cgpu->algorithm.xintensity_shift + cgpu->xintensity)) : cgpu->xintensity);
max_int = cgpu->xintensity;
type = 1;
}
else {
glob_thread_count = 1UL << (cgpu->algorithm.intensity_shift + cgpu->intensity);
max_int = ((cgpu->dynamic) ? MAX_INTENSITY : cgpu->intensity);
}
glob_thread_count = ((glob_thread_count < cgpu->work_size) ? cgpu->work_size : glob_thread_count);
// if TC * scratchbuf size is too big for memory... reduce to max
if ((glob_thread_count * YESCRYPT_SCRATCHBUF_SIZE) >= (uint64_t)cgpu->max_alloc) {
/* Selected intensity will not run on this GPU. Not enough memory.
* Adapt the memory setting. */
// depending on intensity type used, reduce the intensity until it fits into the GPU max_alloc
switch (type) {
//raw intensity
case 2:
while ((glob_thread_count * YESCRYPT_SCRATCHBUF_SIZE) > (uint64_t)cgpu->max_alloc) {
--glob_thread_count;
}
max_int = glob_thread_count;
cgpu->rawintensity = glob_thread_count;
break;
//x intensity
case 1:
glob_thread_count = cgpu->max_alloc / YESCRYPT_SCRATCHBUF_SIZE;
max_int = glob_thread_count / clState->compute_shaders;
while (max_int && ((clState->compute_shaders * (1UL << max_int)) > glob_thread_count)) {
--max_int;
}
/* Check if max_intensity is >0. */
if (max_int < MIN_XINTENSITY) {
applog(LOG_ERR, "GPU %d: Max xintensity is below minimum.", gpu);
max_int = MIN_XINTENSITY;
}
cgpu->xintensity = max_int;
glob_thread_count = clState->compute_shaders * (1UL << max_int);
break;
default:
glob_thread_count = cgpu->max_alloc / YESCRYPT_SCRATCHBUF_SIZE;
while (max_int && ((1UL << max_int) & glob_thread_count) == 0) {
--max_int;
}
/* Check if max_intensity is >0. */
if (max_int < MIN_INTENSITY) {
applog(LOG_ERR, "GPU %d: Max intensity is below minimum.", gpu);
max_int = MIN_INTENSITY;
}
cgpu->intensity = max_int;
glob_thread_count = 1UL << max_int;
break;
}
}
// TC is glob thread count
cgpu->thread_concurrency = glob_thread_count;
applog(LOG_DEBUG, "GPU %d: computing max. global thread count to %u", gpu, (unsigned)(cgpu->thread_concurrency));
}
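Editorial aside (the lyra2REv2 branch below applies the same clamp, only with LYRA_SCRATCHBUF_SIZE): a worked example of the memory clamp above with made-up numbers. YESCRYPT_SCRATCHBUF_SIZE is defined in algorithm/yescrypt.h; the 32 KiB stand-in below is only an assumption for the arithmetic.

#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_SCRATCHBUF_SIZE 32768ULL   /* assumed stand-in for YESCRYPT_SCRATCHBUF_SIZE */

int main(void)
{
    uint64_t max_alloc = 2ULL << 30;       /* pretend the GPU allows 2 GiB per allocation */
    long max_int = 18;                     /* requested intensity: 2^18 threads */
    size_t glob_thread_count = (size_t)1 << max_int;

    if ((uint64_t)glob_thread_count * EXAMPLE_SCRATCHBUF_SIZE >= max_alloc) {
        /* same walk-down as the plain-intensity (default) case above;
         * the MIN_INTENSITY floor check is omitted for brevity */
        glob_thread_count = (size_t)(max_alloc / EXAMPLE_SCRATCHBUF_SIZE);
        while (max_int && (((1UL << max_int) & glob_thread_count) == 0))
            --max_int;
        glob_thread_count = (size_t)1 << max_int;
    }
    printf("clamped to intensity %ld -> %zu threads\n", max_int, glob_thread_count);
    return 0;
}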
// Lyra2re v2 TC
else if ( !safe_cmp(cgpu->algorithm.name, "lyra2REv2") ) {
size_t glob_thread_count;
long max_int;
unsigned char type = 0;
// determine which intensity type to use
if (cgpu->rawintensity > 0) {
glob_thread_count = cgpu->rawintensity;
max_int = glob_thread_count;
type = 2;
}
else if (cgpu->xintensity > 0) {
glob_thread_count = clState->compute_shaders * ((cgpu->algorithm.xintensity_shift) ? (1UL << (cgpu->algorithm.xintensity_shift + cgpu->xintensity)) : cgpu->xintensity);
max_int = cgpu->xintensity;
type = 1;
}
else {
glob_thread_count = 1UL << (cgpu->algorithm.intensity_shift + cgpu->intensity);
max_int = ((cgpu->dynamic) ? MAX_INTENSITY : cgpu->intensity);
}
glob_thread_count = ((glob_thread_count < cgpu->work_size) ? cgpu->work_size : glob_thread_count);
// if TC * scratchbuf size is too big for memory... reduce to max
if ((glob_thread_count * LYRA_SCRATCHBUF_SIZE) >= (uint64_t)cgpu->max_alloc) {
/* Selected intensity will not run on this GPU. Not enough memory.
* Adapt the memory setting. */
// depending on intensity type used, reduce the intensity until it fits into the GPU max_alloc
switch (type) {
//raw intensity
case 2:
while ((glob_thread_count * LYRA_SCRATCHBUF_SIZE) > (uint64_t)cgpu->max_alloc) {
--glob_thread_count;
}
max_int = glob_thread_count;
cgpu->rawintensity = glob_thread_count;
break;
//x intensity
case 1:
glob_thread_count = cgpu->max_alloc / LYRA_SCRATCHBUF_SIZE;
max_int = glob_thread_count / clState->compute_shaders;
while (max_int && ((clState->compute_shaders * (1UL << max_int)) > glob_thread_count)) {
--max_int;
}
/* Check if max_intensity is >0. */
if (max_int < MIN_XINTENSITY) {
applog(LOG_ERR, "GPU %d: Max xintensity is below minimum.", gpu);
max_int = MIN_XINTENSITY;
}
cgpu->xintensity = max_int;
glob_thread_count = clState->compute_shaders * (1UL << max_int);
break;
default:
glob_thread_count = cgpu->max_alloc / LYRA_SCRATCHBUF_SIZE;
while (max_int && ((1UL << max_int) & glob_thread_count) == 0) {
--max_int;
}
/* Check if max_intensity is >0. */
if (max_int < MIN_INTENSITY) {
applog(LOG_ERR, "GPU %d: Max intensity is below minimum.", gpu);
max_int = MIN_INTENSITY;
}
cgpu->intensity = max_int;
glob_thread_count = 1UL << max_int;
break;
}
}
// TC is glob thread count
cgpu->thread_concurrency = glob_thread_count;
applog(LOG_DEBUG, "GPU %d: computing max. global thread count to %u", gpu, (unsigned)(cgpu->thread_concurrency));
}
else if (!cgpu->opt_tc) {
unsigned int sixtyfours;
@@ -586,7 +755,10 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
}
size_t bufsize;
size_t readbufsize = 128;
size_t buf1size;
size_t buf3size;
size_t buf2size;
size_t readbufsize = (!safe_cmp(algorithm->name, "credits")) ? 168 : 128;
if (algorithm->rw_buffer_size < 0) {
// calc buffer size for neoscrypt
@@ -612,6 +784,31 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
applog(LOG_DEBUG, "pluck buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
// scrypt/n-scrypt
}
else if (!safe_cmp(algorithm->name, "yescrypt") || !safe_cmp(algorithm->name, "yescrypt-multi")) {
/* The yescrypt scratch/pad-buffer needs YESCRYPT_SCRATCHBUF_SIZE bytes per thread. */
bufsize = YESCRYPT_SCRATCHBUF_SIZE * cgpu->thread_concurrency;
buf1size = PLUCK_SECBUF_SIZE * cgpu->thread_concurrency;
buf2size = 128 * 8 * 8 * cgpu->thread_concurrency;
buf3size = 8 * 8 * 4 * cgpu->thread_concurrency;
/* This is the input buffer. For yescrypt this is guaranteed to be
* 80 bytes only. */
readbufsize = 80;
applog(LOG_DEBUG, "yescrypt buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
// scrypt/n-scrypt
}
else if (!safe_cmp(algorithm->name, "lyra2REv2") ) {
/* The lyra2REv2 scratch/pad-buffer needs LYRA_SCRATCHBUF_SIZE bytes per thread. */
bufsize = LYRA_SCRATCHBUF_SIZE * cgpu->thread_concurrency;
buf1size = 4 * 8 * cgpu->thread_concurrency; // matrix
/* This is the input buffer. For lyra2REv2 this is guaranteed to be
* 80 bytes only. */
readbufsize = 80;
applog(LOG_DEBUG, "lyra2REv2 buffer sizes: %lu RW, %lu RW", (unsigned long)bufsize, (unsigned long)buf1size);
// scrypt/n-scrypt
}
else {
size_t ipt = (algorithm->n / cgpu->lookup_gap + (algorithm->n % cgpu->lookup_gap > 0));
bufsize = 128 * ipt * cgpu->thread_concurrency;
@@ -624,6 +821,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
}
clState->padbuffer8 = NULL;
clState->buffer1 = NULL;
clState->buffer2 = NULL;
clState->buffer3 = NULL;
if (bufsize > 0) {
applog(LOG_DEBUG, "Creating read/write buffer sized %lu", (unsigned long)bufsize);
@@ -635,6 +835,42 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
applog(LOG_WARNING, "Your settings come to %lu", (unsigned long)bufsize);
}
if (!safe_cmp(algorithm->name, "yescrypt") || !safe_cmp(algorithm->name, "yescrypt-multi")) {
// need additional buffers
clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf1size, NULL, &status);
if (status != CL_SUCCESS && !clState->buffer1) {
applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status);
return NULL;
}
clState->buffer2 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf2size, NULL, &status);
if (status != CL_SUCCESS && !clState->buffer2) {
applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer2), decrease TC or increase LG", status);
return NULL;
}
clState->buffer3 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf3size, NULL, &status);
if (status != CL_SUCCESS && !clState->buffer3) {
applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer3), decrease TC or increase LG", status);
return NULL;
}
}
else if (!safe_cmp(algorithm->name, "lyra2REv2") ) {
// need additional buffers
clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, buf1size, NULL, &status);
if (status != CL_SUCCESS && !clState->buffer1) {
applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status);
return NULL;
}
}
else {
clState->buffer1 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status); // allocating the full bufsize is more than this path strictly needs, but kept simple
if (status != CL_SUCCESS && !clState->buffer1) {
applog(LOG_DEBUG, "Error %d: clCreateBuffer (buffer1), decrease TC or increase LG", status);
return NULL;
}
}
/* This buffer is weird and might work to some degree even if
* the create buffer call has apparently failed, so check if we
* get anything back before we call it a failure. */
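Editorial aside: the yescrypt path above requests four read/write buffers whose sizes all scale with thread concurrency, which is why the error messages suggest lowering the TC. A rough footprint sketch; YESCRYPT_SCRATCHBUF_SIZE and PLUCK_SECBUF_SIZE live in the algorithm headers, so the per-thread figures below are assumptions made only for the arithmetic.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint64_t tc      = 4096;           /* example thread concurrency */
    const uint64_t scratch = 32768;          /* assumed YESCRYPT_SCRATCHBUF_SIZE */
    const uint64_t secbuf  = 262144;         /* assumed PLUCK_SECBUF_SIZE */

    uint64_t bufsize  = scratch * tc;        /* padbuffer8 */
    uint64_t buf1size = secbuf * tc;         /* buffer1 */
    uint64_t buf2size = 128ULL * 8 * 8 * tc; /* buffer2, factors from the diff */
    uint64_t buf3size = 8ULL * 8 * 4 * tc;   /* buffer3, factors from the diff */

    printf("yescrypt RW buffers at TC=%llu: %llu MiB total\n",
           (unsigned long long)tc,
           (unsigned long long)((bufsize + buf1size + buf2size + buf3size) >> 20));
    return 0;
}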

5
ocl.h

@@ -23,7 +23,10 @@ typedef struct __clState {
cl_mem CLbuffer0;
cl_mem MidstateBuf;
cl_mem padbuffer8;
unsigned char cldata[80];
cl_mem buffer1;
cl_mem buffer2;
cl_mem buffer3;
unsigned char cldata[168];
bool goffset;
cl_uint vwidth;
size_t max_work_size;

26
sgminer.c

@@ -1919,6 +1919,7 @@ static void calc_midstate(struct work *work)
endian_flip32(work->midstate, work->midstate);
}
static struct work *make_work(void)
{
struct work *w = (struct work *)calloc(1, sizeof(struct work));
@@ -2260,7 +2261,9 @@ static bool gbt_decode(struct pool *pool, json_t *res_val)
static bool getwork_decode(json_t *res_val, struct work *work)
{
if (unlikely(!jobj_binary(res_val, "data", work->data, sizeof(work->data), true))) {
size_t worklen = 128;
worklen = ((!safe_cmp(work->pool->algorithm.name, "credits")) ? sizeof(work->data) : worklen);
if (unlikely(!jobj_binary(res_val, "data", work->data, worklen, true))) {
if (opt_morenotices)
applog(LOG_ERR, "%s: JSON inval data", isnull(get_pool_name(work->pool), ""));
return false;
@@ -3018,10 +3021,17 @@ static bool submit_upstream_work(struct work *work, CURL *curl, char *curl_err_s
cgpu = get_thr_cgpu(thr_id);
endian_flip128(work->data, work->data);
if (safe_cmp(work->pool->algorithm.name, "credits")) {
endian_flip128(work->data, work->data);
} else {
endian_flip168(work->data, work->data);
}
/* build hex string - Make sure to restrict to 80 bytes for Neoscrypt */
hexstr = bin2hex(work->data, ((!safe_cmp(work->pool->algorithm.name, "neoscrypt")) ? 80 : sizeof(work->data)));
int datasize = 128;
if (!safe_cmp(work->pool->algorithm.name, "neoscrypt")) datasize = 80;
else if (!safe_cmp(work->pool->algorithm.name, "credits")) datasize = 168;
hexstr = bin2hex(work->data, datasize);
/* build JSON-RPC request */
if (work->gbt) {
@@ -7060,7 +7070,10 @@ void inc_hw_errors(struct thr_info *thr)
/* Fills in the work nonce and builds the output data in work->hash */
static void rebuild_nonce(struct work *work, uint32_t nonce)
{
uint32_t *work_nonce = (uint32_t *)(work->data + 76);
uint32_t nonce_pos = 76;
if (!safe_cmp(work->pool->algorithm.name, "credits")) nonce_pos = 140;
uint32_t *work_nonce = (uint32_t *)(work->data + nonce_pos);
*work_nonce = htole32(nonce);
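Editorial sketch, not part of the commit: the offset switch rebuild_nonce() now performs -- byte 76 for the classic 128-byte header, byte 140 for the 168-byte Credits header (offsets as in the hunk above; the helper name is illustrative).

#include <stdint.h>
#include <string.h>

static void place_nonce(unsigned char *data, int is_credits, uint32_t nonce_le)
{
    size_t pos = is_credits ? 140 : 76;               /* matches nonce_pos above */
    memcpy(data + pos, &nonce_le, sizeof(nonce_le));  /* nonce already htole32()'d */
}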
@@ -7076,7 +7089,10 @@ bool test_nonce(struct work *work, uint32_t nonce)
rebuild_nonce(work, nonce);
// for Neoscrypt, the diff1targ value is in work->target
if (!safe_cmp(work->pool->algorithm.name, "neoscrypt") || !safe_cmp(work->pool->algorithm.name, "pluck")) {
if (!safe_cmp(work->pool->algorithm.name, "neoscrypt") || !safe_cmp(work->pool->algorithm.name, "pluck")
|| !safe_cmp(work->pool->algorithm.name, "yescrypt")
|| !safe_cmp(work->pool->algorithm.name, "yescrypt-multi")
) {
diff1targ = ((uint32_t *)work->target)[7];
}
else {

2
sph/Makefile.am

@@ -1,3 +1,3 @@
noinst_LIBRARIES = libsph.a
libsph_a_SOURCES = bmw.c echo.c jh.c luffa.c simd.c blake.c cubehash.c groestl.c keccak.c shavite.c skein.c sha2.c sha2big.c fugue.c hamsi.c panama.c shabal.c whirlpool.c
libsph_a_SOURCES = bmw.c echo.c jh.c luffa.c simd.c blake.c cubehash.c groestl.c keccak.c shavite.c skein.c sha2.c sha2big.c fugue.c hamsi.c panama.c shabal.c whirlpool.c sha256_Y.c

418
sph/sha256_Y.c

@@ -0,0 +1,418 @@
/*-
* Copyright 2005,2007,2009 Colin Percival
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/types.h>
#include <stdint.h>
#include <string.h>
#include "algorithm/sysendian.h"
#include "sph/sha256_Y.h"
/*
* Encode a length len/4 vector of (uint32_t) into a length len vector of
* (unsigned char) in big-endian form. Assumes len is a multiple of 4.
*/
static void
be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len)
{
size_t i;
for (i = 0; i < len / 4; i++)
be32enc(dst + i * 4, src[i]);
}
/*
* Decode a big-endian length len vector of (unsigned char) into a length
* len/4 vector of (uint32_t). Assumes len is a multiple of 4.
*/
static void
be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len)
{
size_t i;
for (i = 0; i < len / 4; i++)
dst[i] = be32dec(src + i * 4);
}
/* Elementary functions used by SHA256 */
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define SHR(x, n) (x >> n)
#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
/* SHA256 round function */
#define RND(a, b, c, d, e, f, g, h, k) \
t0 = h + S1(e) + Ch(e, f, g) + k; \
t1 = S0(a) + Maj(a, b, c); \
d += t0; \
h = t0 + t1;
/* Adjusted round function for rotating state */
#define RNDr(S, W, i, k) \
RND(S[(64 - i) % 8], S[(65 - i) % 8], \
S[(66 - i) % 8], S[(67 - i) % 8], \
S[(68 - i) % 8], S[(69 - i) % 8], \
S[(70 - i) % 8], S[(71 - i) % 8], \
W[i] + k)
/*
* SHA256 block compression function. The 256-bit state is transformed via
* the 512-bit input block to produce a new state.
*/
static void
SHA256_Transform(uint32_t * state, const unsigned char block[64])
{
uint32_t W[64];
uint32_t S[8];
uint32_t t0, t1;
int i;
/* 1. Prepare message schedule W. */
be32dec_vect(W, block, 64);
for (i = 16; i < 64; i++)
W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
/* 2. Initialize working variables. */
memcpy(S, state, 32);
/* 3. Mix. */
RNDr(S, W, 0, 0x428a2f98);
RNDr(S, W, 1, 0x71374491);
RNDr(S, W, 2, 0xb5c0fbcf);
RNDr(S, W, 3, 0xe9b5dba5);
RNDr(S, W, 4, 0x3956c25b);
RNDr(S, W, 5, 0x59f111f1);
RNDr(S, W, 6, 0x923f82a4);
RNDr(S, W, 7, 0xab1c5ed5);
RNDr(S, W, 8, 0xd807aa98);
RNDr(S, W, 9, 0x12835b01);
RNDr(S, W, 10, 0x243185be);
RNDr(S, W, 11, 0x550c7dc3);
RNDr(S, W, 12, 0x72be5d74);
RNDr(S, W, 13, 0x80deb1fe);
RNDr(S, W, 14, 0x9bdc06a7);
RNDr(S, W, 15, 0xc19bf174);
RNDr(S, W, 16, 0xe49b69c1);
RNDr(S, W, 17, 0xefbe4786);
RNDr(S, W, 18, 0x0fc19dc6);
RNDr(S, W, 19, 0x240ca1cc);
RNDr(S, W, 20, 0x2de92c6f);
RNDr(S, W, 21, 0x4a7484aa);
RNDr(S, W, 22, 0x5cb0a9dc);
RNDr(S, W, 23, 0x76f988da);
RNDr(S, W, 24, 0x983e5152);
RNDr(S, W, 25, 0xa831c66d);
RNDr(S, W, 26, 0xb00327c8);
RNDr(S, W, 27, 0xbf597fc7);
RNDr(S, W, 28, 0xc6e00bf3);
RNDr(S, W, 29, 0xd5a79147);
RNDr(S, W, 30, 0x06ca6351);
RNDr(S, W, 31, 0x14292967);
RNDr(S, W, 32, 0x27b70a85);
RNDr(S, W, 33, 0x2e1b2138);
RNDr(S, W, 34, 0x4d2c6dfc);
RNDr(S, W, 35, 0x53380d13);
RNDr(S, W, 36, 0x650a7354);
RNDr(S, W, 37, 0x766a0abb);
RNDr(S, W, 38, 0x81c2c92e);
RNDr(S, W, 39, 0x92722c85);
RNDr(S, W, 40, 0xa2bfe8a1);
RNDr(S, W, 41, 0xa81a664b);
RNDr(S, W, 42, 0xc24b8b70);
RNDr(S, W, 43, 0xc76c51a3);
RNDr(S, W, 44, 0xd192e819);
RNDr(S, W, 45, 0xd6990624);
RNDr(S, W, 46, 0xf40e3585);
RNDr(S, W, 47, 0x106aa070);
RNDr(S, W, 48, 0x19a4c116);
RNDr(S, W, 49, 0x1e376c08);
RNDr(S, W, 50, 0x2748774c);
RNDr(S, W, 51, 0x34b0bcb5);
RNDr(S, W, 52, 0x391c0cb3);
RNDr(S, W, 53, 0x4ed8aa4a);
RNDr(S, W, 54, 0x5b9cca4f);
RNDr(S, W, 55, 0x682e6ff3);
RNDr(S, W, 56, 0x748f82ee);
RNDr(S, W, 57, 0x78a5636f);
RNDr(S, W, 58, 0x84c87814);
RNDr(S, W, 59, 0x8cc70208);
RNDr(S, W, 60, 0x90befffa);
RNDr(S, W, 61, 0xa4506ceb);
RNDr(S, W, 62, 0xbef9a3f7);
RNDr(S, W, 63, 0xc67178f2);
/* 4. Mix local working variables into global state */
for (i = 0; i < 8; i++) {
state[i] += S[i];
}
/* Clean the stack. */
memset(W, 0, 256);
memset(S, 0, 32);
t0 = t1 = 0;
}
static unsigned char PAD[64] = {
0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
/* Add padding and terminating bit-count. */
static void
SHA256_Pad(SHA256_CTX_Y * ctx)
{
unsigned char len[8];
uint32_t r, plen;
/*
* Convert length to a vector of bytes -- we do this now rather
* than later because the length will change after we pad.
*/
be32enc_vect(len, ctx->count, 8);
/* Add 1--64 bytes so that the resulting length is 56 mod 64 */
r = (ctx->count[1] >> 3) & 0x3f;
plen = (r < 56) ? (56 - r) : (120 - r);
SHA256_Update_Y(ctx, PAD, (size_t)plen);
/* Add the terminating bit-count */
SHA256_Update_Y(ctx, len, 8);
}
/* SHA-256 initialization. Begins a SHA-256 operation. */
void
SHA256_Init_Y(SHA256_CTX_Y * ctx)
{
/* Zero bits processed so far */
ctx->count[0] = ctx->count[1] = 0;
/* Magic initialization constants */
ctx->state[0] = 0x6A09E667;
ctx->state[1] = 0xBB67AE85;
ctx->state[2] = 0x3C6EF372;
ctx->state[3] = 0xA54FF53A;
ctx->state[4] = 0x510E527F;
ctx->state[5] = 0x9B05688C;
ctx->state[6] = 0x1F83D9AB;
ctx->state[7] = 0x5BE0CD19;
}
/* Add bytes into the hash */
void
SHA256_Update_Y(SHA256_CTX_Y * ctx, const void *in, size_t len)
{
uint32_t bitlen[2];
uint32_t r;
const unsigned char *src = in;
/* Number of bytes left in the buffer from previous updates */
r = (ctx->count[1] >> 3) & 0x3f;
/* Convert the length into a number of bits */
bitlen[1] = ((uint32_t)len) << 3;
bitlen[0] = (uint32_t)(len >> 29);
/* Update number of bits */
if ((ctx->count[1] += bitlen[1]) < bitlen[1])
ctx->count[0]++;
ctx->count[0] += bitlen[0];
/* Handle the case where we don't need to perform any transforms */
if (len < 64 - r) {
memcpy(&ctx->buf[r], src, len);
return;
}
/* Finish the current block */
memcpy(&ctx->buf[r], src, 64 - r);
SHA256_Transform(ctx->state, ctx->buf);
src += 64 - r;
len -= 64 - r;
/* Perform complete blocks */
while (len >= 64) {
SHA256_Transform(ctx->state, src);
src += 64;
len -= 64;
}
/* Copy left over data into buffer */
memcpy(ctx->buf, src, len);
}
/*
* SHA-256 finalization. Pads the input data, exports the hash value,
* and clears the context state.
*/
void
SHA256_Final_Y(unsigned char digest[32], SHA256_CTX_Y * ctx)
{
/* Add padding */
SHA256_Pad(ctx);
/* Write the hash */
be32enc_vect(digest, ctx->state, 32);
/* Clear the context state */
memset((void *)ctx, 0, sizeof(*ctx));
}
/* Initialize an HMAC-SHA256 operation with the given key. */
void
HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y * ctx, const void * _K, size_t Klen)
{
unsigned char pad[64];
unsigned char khash[32];
const unsigned char * K = _K;
size_t i;
/* If Klen > 64, the key is really SHA256(K). */
if (Klen > 64) {
SHA256_Init_Y(&ctx->ictx);
SHA256_Update_Y(&ctx->ictx, K, Klen);
SHA256_Final_Y(khash, &ctx->ictx);
K = khash;
Klen = 32;
}
/* Inner SHA256 operation is SHA256(K xor [block of 0x36] || data). */
SHA256_Init_Y(&ctx->ictx);
memset(pad, 0x36, 64);
for (i = 0; i < Klen; i++) {
pad[i] ^= K[i];
}
SHA256_Update_Y(&ctx->ictx, pad, 64);
/* Outer SHA256 operation is SHA256(K xor [block of 0x5c] || hash). */
SHA256_Init_Y(&ctx->octx);
memset(pad, 0x5c, 64);
for (i = 0; i < Klen; i++)
{
pad[i] ^= K[i];
}
SHA256_Update_Y(&ctx->octx, pad, 64);
/* Clean the stack. */
memset(khash, 0, 32);
}
/* Add bytes to the HMAC-SHA256 operation. */
void
HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y * ctx, const void *in, size_t len)
{
/* Feed data to the inner SHA256 operation. */
SHA256_Update_Y(&ctx->ictx, in, len);
}
/* Finish an HMAC-SHA256 operation. */
void
HMAC_SHA256_Final_Y(unsigned char digest[32], HMAC_SHA256_CTX_Y * ctx)
{
unsigned char ihash[32];
/* Finish the inner SHA256 operation. */
SHA256_Final_Y(ihash, &ctx->ictx);
/* Feed the inner hash to the outer SHA256 operation. */
SHA256_Update_Y(&ctx->octx, ihash, 32);
/* Finish the outer SHA256 operation. */
SHA256_Final_Y(digest, &ctx->octx);
/* Clean the stack. */
memset(ihash, 0, 32);
}
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
void
PBKDF2_SHA256(const uint8_t * passwd, size_t passwdlen, const uint8_t * salt,
size_t saltlen, uint64_t c, uint8_t * buf, size_t dkLen)
{
HMAC_SHA256_CTX_Y PShctx, hctx;
size_t i;
uint8_t ivec[4];
uint8_t U[32];
uint8_t T[32];
uint64_t j;
int k;
size_t clen;
/* Compute HMAC state after processing P and S. */
HMAC_SHA256_Init_Y(&PShctx, passwd, passwdlen);
HMAC_SHA256_Update_Y(&PShctx, salt, saltlen);
/* Iterate through the blocks. */
for (i = 0; i * 32 < dkLen; i++) {
/* Generate INT(i + 1). */
be32enc(ivec, (uint32_t)(i + 1));
/* Compute U_1 = PRF(P, S || INT(i)). */
memcpy(&hctx, &PShctx, sizeof(HMAC_SHA256_CTX_Y));
HMAC_SHA256_Update_Y(&hctx, ivec, 4);
HMAC_SHA256_Final_Y(U, &hctx);
/* T_i = U_1 ... */
memcpy(T, U, 32);
for (j = 2; j <= c; j++) {
/* Compute U_j. */
HMAC_SHA256_Init_Y(&hctx, passwd, passwdlen);
HMAC_SHA256_Update_Y(&hctx, U, 32);
HMAC_SHA256_Final_Y(U, &hctx);
/* ... xor U_j ... */
for (k = 0; k < 32; k++)
T[k] ^= U[k];
}
/* Copy as many bytes as necessary into buf. */
clen = dkLen - i * 32;
if (clen > 32)
clen = 32;
memcpy(&buf[i * 32], T, clen);
}
/* Clean PShctx, since we never called _Final on it. */
memset(&PShctx, 0, sizeof(HMAC_SHA256_CTX_Y));
}
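Editorial usage sketch for the PBKDF2_SHA256() helper above. The inputs are made up; within the miner this routine is presumably driven by the yescrypt code added elsewhere in this commit.

#include <stdint.h>
#include <stdio.h>
#include "sph/sha256_Y.h"

int main(void)
{
    const uint8_t passwd[] = "example password";
    const uint8_t salt[]   = "example salt";
    uint8_t dk[64];
    size_t i;

    /* 2048 iterations, 64-byte derived key -- arbitrary demo parameters */
    PBKDF2_SHA256(passwd, sizeof(passwd) - 1, salt, sizeof(salt) - 1,
                  2048, dk, sizeof(dk));

    for (i = 0; i < sizeof(dk); i++)
        printf("%02x", dk[i]);
    printf("\n");
    return 0;
}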

63
sph/sha256_Y.h

@@ -0,0 +1,63 @@
/*-
* Copyright 2005,2007,2009 Colin Percival
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD: src/lib/libmd/sha256_Y.h,v 1.2 2006/01/17 15:35:56 phk Exp $
*/
#ifndef _SHA256_H_
#define _SHA256_H_
#include <sys/types.h>
#include <stdint.h>
typedef struct SHA256Context {
uint32_t state[8];
uint32_t count[2];
unsigned char buf[64];
} SHA256_CTX_Y;
typedef struct HMAC_SHA256Context {
SHA256_CTX_Y ictx;
SHA256_CTX_Y octx;
} HMAC_SHA256_CTX_Y;
void SHA256_Init_Y(SHA256_CTX_Y *);
void SHA256_Update_Y(SHA256_CTX_Y *, const void *, size_t);
void SHA256_Final_Y(unsigned char [32], SHA256_CTX_Y *);
void HMAC_SHA256_Init_Y(HMAC_SHA256_CTX_Y *, const void *, size_t);
void HMAC_SHA256_Update_Y(HMAC_SHA256_CTX_Y *, const void *, size_t);
void HMAC_SHA256_Final_Y(unsigned char [32], HMAC_SHA256_CTX_Y *);
/**
* PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
* Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
* write the output to buf. The value dkLen must be at most 32 * (2^32 - 1).
*/
void PBKDF2_SHA256(const uint8_t *, size_t, const uint8_t *, size_t,
uint64_t, uint8_t *, size_t);
#endif /* !_SHA256_H_ */