Browse Source

Add Lyra2 algo, based on Vertcoin published code

Seems to be djm34's work; I recognize the code style ;)

Code was cleaned/indented and adapted to my fork...

Only usable on the test pool until 16 December 2014!
master
Tanguy Pruvot 10 years ago
parent
commit
c5b349e079
  1. 0
      Algo256/blake256.cu
  2. 250
      Algo256/cuda_blake256.cu
  3. 6
      Algo256/cuda_fugue256.cu
  4. 309
      Algo256/cuda_groestl256.cu
  5. 174
      Algo256/cuda_keccak256.cu
  6. 196
      Algo256/cuda_skein256.cu
  7. 0
      Algo256/keccak256.cu
  8. 15
      Makefile.am
  9. 10
      README.txt
  10. 8
      ccminer.cpp
  11. 24
      ccminer.vcxproj
  12. 58
      ccminer.vcxproj.filters
  13. 97
      cuda_helper.h
  14. 211
      lyra2/Lyra2.c
  15. 50
      lyra2/Lyra2.h
  16. 755
      lyra2/Sponge.c
  17. 108
      lyra2/Sponge.h
  18. 536
      lyra2/cuda_lyra2.cu
  19. 133
      lyra2/lyra2RE.cu
  20. 5
      miner.h
  21. 14
      util.cpp

0
blake32.cu → Algo256/blake256.cu

250
Algo256/cuda_blake256.cu

@ -0,0 +1,250 @@
/**
* Blake-256 Cuda Kernel (Tested on SM 5.0)
*
* Tanguy Pruvot - Nov. 2014
*/
extern "C" {
#include "sph/sph_blake.h"
}
#include "cuda_helper.h"
#include <memory.h>
/**
 * Byte-swap the two 32-bit halves of a 64-bit value independently.
 *
 * Each half is passed through cuda_swab32() and the two results are
 * recombined with MAKE_ULONGLONG in the same (low, high) argument order,
 * so the halves themselves are not exchanged.
 */
static __device__ uint64_t cuda_swab32ll(uint64_t x)
{
    const uint32_t swapped_lo = cuda_swab32(_LOWORD(x));
    const uint32_t swapped_hi = cuda_swab32(_HIWORD(x));
    return MAKE_ULONGLONG(swapped_lo, swapped_hi);
}
/* 80-byte block header (20 words), uploaded by blake256_cpu_setBlock_80().
 * The kernel only reads words 16..18 and supplies the nonce itself. */
__constant__ static uint32_t c_data[20];
/* Device copy of c_sigma, uploaded by blake256_cpu_init(). */
__constant__ static uint32_t sigma[16][16];
/* Host message-word permutation table, indexed [round][position].
 * Rows 10..15 repeat rows 0..5; the compression loops in this file
 * run r = 0..13, so only the first 14 rows are actually read. */
static uint32_t c_sigma[16][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};
/* Initial chaining value; blake256_cpu_setBlock_80() starts from it. */
static const uint32_t c_IV256[8] = {
0x6A09E667, 0xBB67AE85,
0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C,
0x1F83D9AB, 0x5BE0CD19
};
/* Despite the "cpu_" prefix this lives in device constant memory: it holds
 * the chaining value after the host has compressed the first 64 bytes. */
__device__ __constant__ static uint32_t cpu_h[8];
/* Device copy of c_u256, uploaded by blake256_cpu_init(). */
__device__ __constant__ static uint32_t u256[16];
/* Host round-constant table: XORed with message words inside the G
 * function and used (words 0..7) to fill v[8..15] before the rounds. */
static const uint32_t c_u256[16] = {
0x243F6A88, 0x85A308D3,
0x13198A2E, 0x03707344,
0xA4093822, 0x299F31D0,
0x082EFA98, 0xEC4E6C89,
0x452821E6, 0x38D01377,
0xBE5466CF, 0x34E90C6C,
0xC0AC29B7, 0xC97C50DD,
0x3F84D5B5, 0xB5470917
};
/*
 * Device-side G function: mixes the four state words v[a], v[b], v[c], v[d]
 * with the two message words selected by sigma[r][x] and sigma[r][x+1].
 *
 * The macro is not self-contained: it expects `v` (state), `m` (message
 * words) and `r` (round index) to exist in the expanding scope, and reads
 * the device tables `sigma` and `u256`.
 * The first rotation is written as a left rotate by 16; on a 32-bit word
 * that is the same as the right rotate by 16 used in hostGS.
 */
#define GS2(a,b,c,d,x) { \
const uint32_t idx1 = sigma[r][x]; \
const uint32_t idx2 = sigma[r][x+1]; \
v[a] += (m[idx1] ^ u256[idx2]) + v[b]; \
v[d] = SPH_ROTL32(v[d] ^ v[a], 16); \
v[c] += v[d]; \
v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \
\
v[a] += (m[idx2] ^ u256[idx1]) + v[b]; \
v[d] = SPH_ROTR32(v[d] ^ v[a], 8); \
v[c] += v[d]; \
v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \
}
/* Left-rotate helper kept for reference only (disabled). */
//#define ROTL32(x, n) ((x) << (n)) | ((x) >> (32 - (n)))
/* 32-bit right rotate; only called here with n = 16, 12, 8 and 7. */
#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
/*
 * Host-side twin of GS2: same mixing steps, but it reads the host tables
 * c_sigma / c_u256 and uses the local ROTR32 macro.
 * Like GS2 it expects `v`, `m` and `r` in the expanding scope.
 */
#define hostGS(a,b,c,d,x) { \
const uint32_t idx1 = c_sigma[r][x]; \
const uint32_t idx2 = c_sigma[r][x+1]; \
v[a] += (m[idx1] ^ c_u256[idx2]) + v[b]; \
v[d] = ROTR32(v[d] ^ v[a], 16); \
v[c] += v[d]; \
v[b] = ROTR32(v[b] ^ v[c], 12); \
\
v[a] += (m[idx2] ^ c_u256[idx1]) + v[b]; \
v[d] = ROTR32(v[d] ^ v[a], 8); \
v[c] += v[d]; \
v[b] = ROTR32(v[b] ^ v[c], 7); \
}
/*
 * Constant tail of the second message block (header bytes 64-80 plus
 * padding). It never changes between nonces, so it is stored once.
 * blake256_compress2nd() only reads entries 4..15; entries 0..3 are
 * placeholders because those message words come from the header/nonce.
 * The last word is 640 (= 80 * 8), presumably the message length in bits.
 */
__device__ __constant__ static const uint32_t c_Padding[16] = {
0, 0, 0, 0,
0x80000000, 0, 0, 0,
0, 0, 0, 0,
0, 1, 0, 640,
};
/**
 * Host-side Blake-256 compression of one 16-word message block.
 *
 * Used to pre-compute the chaining value for the first 64 bytes of the
 * header, so the GPU only has to process the second block.
 *
 * @param h     chaining value (8 words), updated in place
 * @param block 16 message words to absorb
 * @param T0    counter value XORed into v[12] and v[13]
 */
__host__ __forceinline__
static void blake256_compress1st(uint32_t *h, const uint32_t *block, const uint32_t T0)
{
    uint32_t m[16];
    uint32_t v[16];

    /* message words, then state = chaining value || first 8 constants */
    memcpy(m, block, sizeof(m));
    memcpy(&v[0], h, 8 * sizeof(uint32_t));
    memcpy(&v[8], c_u256, 8 * sizeof(uint32_t));
    v[12] ^= T0;
    v[13] ^= T0;

    /* hostGS reads `v`, `m` and the loop variable `r` by name */
    for (int r = 0; r < 14; r++) {
        /* column step */
        hostGS(0, 4, 0x8, 0xC, 0x0);
        hostGS(1, 5, 0x9, 0xD, 0x2);
        hostGS(2, 6, 0xA, 0xE, 0x4);
        hostGS(3, 7, 0xB, 0xF, 0x6);
        /* diagonal step */
        hostGS(0, 5, 0xA, 0xF, 0x8);
        hostGS(1, 6, 0xB, 0xC, 0xA);
        hostGS(2, 7, 0x8, 0xD, 0xC);
        hostGS(3, 4, 0x9, 0xE, 0xE);
    }

    /* fold both halves of the working state back into the chaining value */
    for (int j = 0; j < 8; j++)
        h[j] ^= v[j] ^ v[j + 8];
}
/**
 * Device-side Blake-256 compression of the second (final) header block.
 *
 * Only the first four message words vary (they come from `block`); the
 * remaining twelve are taken from the constant table c_Padding.
 *
 * @param h     chaining value (8 words), updated in place
 * @param block the 4 variable message words
 * @param T0    counter value XORed into v[12] and v[13]
 */
__device__ __forceinline__
static void blake256_compress2nd(uint32_t *h, const uint32_t *block, const uint32_t T0)
{
    uint32_t m[16];
    uint32_t v[16];

    /* words 0..3 from the caller, words 4..15 from the fixed padding */
    #pragma unroll
    for (int w = 0; w < 16; w++)
        m[w] = (w < 4) ? block[w] : c_Padding[w];

    /* state = chaining value || first 8 constants, counter mixed in */
    #pragma unroll
    for (int w = 0; w < 8; w++) {
        v[w] = h[w];
        v[w + 8] = u256[w];
    }
    v[12] ^= T0;
    v[13] ^= T0;

    /* GS2 reads `v`, `m` and the loop variable `r` by name */
    #pragma unroll 14
    for (int r = 0; r < 14; r++) {
        /* column step */
        GS2(0, 4, 0x8, 0xC, 0x0);
        GS2(1, 5, 0x9, 0xD, 0x2);
        GS2(2, 6, 0xA, 0xE, 0x4);
        GS2(3, 7, 0xB, 0xF, 0x6);
        /* diagonal step */
        GS2(0, 5, 0xA, 0xF, 0x8);
        GS2(1, 6, 0xB, 0xC, 0xA);
        GS2(2, 7, 0x8, 0xD, 0xC);
        GS2(3, 4, 0x9, 0xE, 0xE);
    }

    /* fold both halves of the working state back into the chaining value */
    #pragma unroll
    for (int w = 0; w < 8; w++)
        h[w] ^= v[w] ^ v[w + 8];
}
/**
 * Blake-256 of an 80-byte header, one nonce per thread.
 *
 * Launch: 1D grid of 1D blocks (at most 256 threads per block); threads
 * whose global index is >= `threads` do nothing.
 * Input : cpu_h (chaining value after the first block) and c_data[16..18],
 *         both in constant memory; the fourth word is startNonce + thread.
 * Output: four 64-bit words per thread, stored with a stride of `threads`
 *         (word i of thread t lands at Hash[i * threads + t]).
 */
__global__ __launch_bounds__(256,3)
void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t * Hash)
{
    const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
    if (thread >= threads)
        return;

    /* start from the chaining value pre-computed on the host */
    uint32_t h[8];
    #pragma unroll
    for (int w = 0; w < 8; w++)
        h[w] = cpu_h[w];

    /* last three header words plus this thread's nonce */
    uint32_t input[4];
    input[0] = c_data[16];
    input[1] = c_data[17];
    input[2] = c_data[18];
    input[3] = startNonce + thread;

    blake256_compress2nd(h, input, 640);

    #pragma unroll
    for (int w = 0; w < 4; w++) {
        const uint64_t pair = MAKE_ULONGLONG(h[2 * w], h[2 * w + 1]);
        Hash[w * threads + thread] = cuda_swab32ll(pair);
    }
}
/**
 * Host launcher for blake256_gpu_hash_80.
 *
 * Uses 256 threads per block and enough blocks to cover `threads` nonces,
 * then hands over to MyStreamSynchronize with the caller's order / thr_id.
 */
__host__
void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order)
{
    const int tpb = 256;
    /* ceil-div so a partial last block is still launched */
    dim3 grid((threads + tpb - 1) / tpb);
    dim3 block(tpb);

    blake256_gpu_hash_80 <<<grid, block>>> (threads, startNonce, Hash);

    MyStreamSynchronize(NULL, order, thr_id);
}
/**
 * Prepare the constant data for a new 80-byte block header.
 *
 * Compresses the first 64 bytes on the host (counter 512) and uploads the
 * resulting chaining value to cpu_h, then uploads the full 20-word header
 * to c_data for the kernel.
 *
 * @param pdata header words; at least 20 words are read
 */
__host__
void blake256_cpu_setBlock_80(uint32_t *pdata)
{
    uint32_t header[20];
    uint32_t midstate[8];

    memcpy(header, pdata, sizeof(header));
    memcpy(midstate, c_IV256, sizeof(midstate));

    blake256_compress1st(midstate, pdata, 512);

    cudaMemcpyToSymbol(cpu_h, midstate, sizeof(midstate), 0, cudaMemcpyHostToDevice);
    cudaMemcpyToSymbol(c_data, header, sizeof(header), 0, cudaMemcpyHostToDevice);
}
/**
 * One-time setup: copy the host round constants and the sigma permutation
 * table into their constant-memory counterparts.
 * Both parameters are currently unused.
 */
__host__
void blake256_cpu_init(int thr_id, int threads)
{
    (void)thr_id;
    (void)threads;

    cudaMemcpyToSymbol(u256, c_u256, sizeof(u256), 0, cudaMemcpyHostToDevice);
    cudaMemcpyToSymbol(sigma, c_sigma, sizeof(sigma), 0, cudaMemcpyHostToDevice);
}

6
cuda_fugue256.cu → Algo256/cuda_fugue256.cu

@ -571,7 +571,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
for(int i=0;i<30;i++) for(int i=0;i<30;i++)
sc[i] = GPUstate[i]; sc[i] = GPUstate[i];
uint32_t nounce = startNounce + thread; // muss noch ermittelt werden uint32_t nounce = startNounce + thread; // muss noch ermittelt werden
uint32_t q; uint32_t q;
@ -687,7 +687,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
int i; int i;
bool rc = true; bool rc = true;
for (i = 7; i >= 0; i--) { for (i = 7; i >= 0; i--) {
if (hash[i] > pTarget[i]) { if (hash[i] > pTarget[i]) {
rc = false; rc = false;
@ -730,7 +730,7 @@ void fugue256_cpu_init(int thr_id, int threads)
// Speicher für alle Ergebnisse belegen // Speicher für alle Ergebnisse belegen
cudaMalloc(&d_fugue256_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads); cudaMalloc(&d_fugue256_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads);
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
} }
__host__ void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn) __host__ void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn)

309
Algo256/cuda_groestl256.cu

@ -0,0 +1,309 @@
#include <memory.h>
#include "cuda_helper.h"
/* Per-thr_id result buffers, both sized by groestl256_cpu_init():
 *   d_gnounce - allocated with cudaMallocHost (host memory, despite the d_ prefix)
 *   d_GNonce  - allocated with cudaMalloc (device memory, written by the kernel) */
uint32_t *d_gnounce[8];
uint32_t *d_GNonce[8];
/* Target uploaded by groestl256_setTarget(); the kernel compares word 7 only. */
__constant__ uint32_t pTarget[8];
/* Append the U suffix to a literal and cast it to uint32_t. */
#define SPH_C32(x) ((uint32_t)(x ## U))
/* Truncate to 32 bits. */
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
/* Reverse the byte order of a 32-bit literal. */
#define C32e(x) \
((SPH_C32(x) >> 24) \
| ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \
| ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \
| ((SPH_C32(x) << 24) & SPH_C32(0xFF000000)))
/* Round constants for column index j and round r:
 * permutation P touches only the "up" word, permutation Q flips every bit
 * of the "up" word and mixes j and r into the top byte of the "dn" word. */
#define PC32up(j, r) ((uint32_t)((j) + (r)))
#define PC32dn(j, r) 0
#define QC32up(j, r) 0xFFFFFFFF
#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
/* Byte extraction via __byte_perm; the plain-C equivalent follows each macro. */
#define B32_0(x) __byte_perm(x, 0, 0x4440)
// equivalent to ((x) & 0xFF)
#define B32_1(x) __byte_perm(x, 0, 0x4441)
// equivalent to (((x) >> 8) & 0xFF)
#define B32_2(x) __byte_perm(x, 0, 0x4442)
// equivalent to (((x) >> 16) & 0xFF)
#define B32_3(x) __byte_perm(x, 0, 0x4443)
// equivalent to ((x) >> 24)
/*
 * Lookup-table access selection.
 * The eight 256-entry tables are addressed either through `mixtabs`
 * (a shared-memory array laid out as T0up, T0dn, T1up, ... at 256-word
 * offsets) or through the texture references t?up2 / t?dn2.
 * NOTE(review): MAXWELL_OR_FERMI is hard-coded to 1 and USE_SHARED is set
 * to 1 in both branches, so with this file as written only the all-shared
 * variant is ever compiled.
 */
#define MAXWELL_OR_FERMI 1
#if MAXWELL_OR_FERMI
#define USE_SHARED 1
// Maxwell and Fermi cards get the best speed with SHARED access it seems.
#if USE_SHARED
#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
#define T1up(x) (*((uint32_t*)mixtabs + (512+(x))))
#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x))))
#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x))))
#else
#define T0up(x) tex1Dfetch(t0up2, x)
#define T0dn(x) tex1Dfetch(t0dn2, x)
#define T1up(x) tex1Dfetch(t1up2, x)
#define T1dn(x) tex1Dfetch(t1dn2, x)
#define T2up(x) tex1Dfetch(t2up2, x)
#define T2dn(x) tex1Dfetch(t2dn2, x)
#define T3up(x) tex1Dfetch(t3up2, x)
#define T3dn(x) tex1Dfetch(t3dn2, x)
#endif
#else
#define USE_SHARED 1
// a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5!
#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
#define T0dn(x) tex1Dfetch(t0dn2, x)
#define T1up(x) tex1Dfetch(t1up2, x)
#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
#define T2up(x) tex1Dfetch(t2up2, x)
#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
#define T3dn(x) tex1Dfetch(t3dn2, x)
#endif
/* One texture reference per lookup table. groestl256_cpu_init() binds each
 * of them to a device copy of the matching T?up_cpu / T?dn_cpu host table;
 * the kernel reads them to fill shared memory. */
texture<unsigned int, 1, cudaReadModeElementType> t0up2;
texture<unsigned int, 1, cudaReadModeElementType> t0dn2;
texture<unsigned int, 1, cudaReadModeElementType> t1up2;
texture<unsigned int, 1, cudaReadModeElementType> t1dn2;
texture<unsigned int, 1, cudaReadModeElementType> t2up2;
texture<unsigned int, 1, cudaReadModeElementType> t2dn2;
texture<unsigned int, 1, cudaReadModeElementType> t3up2;
texture<unsigned int, 1, cudaReadModeElementType> t3dn2;
/*
 * Compute one output column (two 32-bit words t[d0], t[d1]) of a round.
 * Each word is the XOR of eight table lookups: byte 0..3 of a[b0]..a[b3]
 * and byte 0..3 of a[b4]..a[b7]; t[d1] uses the same bytes with the
 * "up" and "dn" tables exchanged.
 * Expects a local array `t` in the expanding scope, and - through the
 * T?up / T?dn macros - `mixtabs` when shared-memory access is selected.
 */
#define RSTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \
t[d0] = T0up(B32_0(a[b0])) \
^ T1up(B32_1(a[b1])) \
^ T2up(B32_2(a[b2])) \
^ T3up(B32_3(a[b3])) \
^ T0dn(B32_0(a[b4])) \
^ T1dn(B32_1(a[b5])) \
^ T2dn(B32_2(a[b6])) \
^ T3dn(B32_3(a[b7])); \
t[d1] = T0dn(B32_0(a[b0])) \
^ T1dn(B32_1(a[b1])) \
^ T2dn(B32_2(a[b2])) \
^ T3dn(B32_3(a[b3])) \
^ T0up(B32_0(a[b4])) \
^ T1up(B32_1(a[b5])) \
^ T2up(B32_2(a[b6])) \
^ T3up(B32_3(a[b7])); \
} while (0)
/* Host lookup tables defined in another translation unit.
 * groestl256_cpu_init() copies 256 words from each one to the device. */
extern uint32_t T0up_cpu[];
extern uint32_t T0dn_cpu[];
extern uint32_t T1up_cpu[];
extern uint32_t T1dn_cpu[];
extern uint32_t T2up_cpu[];
extern uint32_t T2dn_cpu[];
extern uint32_t T3up_cpu[];
extern uint32_t T3dn_cpu[];
/**
 * Permutation P: 10 rounds over the 16-word state `a`, in place.
 *
 * @param thread  unused
 * @param a       16-word state, overwritten with the permuted state
 * @param mixtabs lookup tables, referenced by name from the T?up / T?dn
 *                macros when shared-memory access is selected
 */
__device__ __forceinline__
void groestl256_perm_P(int thread,uint32_t *a, char *mixtabs)
{
    #pragma unroll 10
    for (int r = 0; r < 10; r++)
    {
        uint32_t t[16];

        /* round constant: only the even ("up") words are touched,
           column c gets (c * 0x10) + r */
        #pragma unroll
        for (int c = 0; c < 8; c++)
            a[2 * c] ^= PC32up(c << 4, r);

        /* one RSTT per output column; results are collected in t[] */
        RSTT(0x0, 0x1, a, 0x0, 0x2, 0x4, 0x6, 0x9, 0xB, 0xD, 0xF);
        RSTT(0x2, 0x3, a, 0x2, 0x4, 0x6, 0x8, 0xB, 0xD, 0xF, 0x1);
        RSTT(0x4, 0x5, a, 0x4, 0x6, 0x8, 0xA, 0xD, 0xF, 0x1, 0x3);
        RSTT(0x6, 0x7, a, 0x6, 0x8, 0xA, 0xC, 0xF, 0x1, 0x3, 0x5);
        RSTT(0x8, 0x9, a, 0x8, 0xA, 0xC, 0xE, 0x1, 0x3, 0x5, 0x7);
        RSTT(0xA, 0xB, a, 0xA, 0xC, 0xE, 0x0, 0x3, 0x5, 0x7, 0x9);
        RSTT(0xC, 0xD, a, 0xC, 0xE, 0x0, 0x2, 0x5, 0x7, 0x9, 0xB);
        RSTT(0xE, 0xF, a, 0xE, 0x0, 0x2, 0x4, 0x7, 0x9, 0xB, 0xD);

        /* the new state replaces the old one only after all columns are done */
        #pragma unroll 16
        for (int w = 0; w < 16; w++)
            a[w] = t[w];
    }
}
/**
 * Permutation Q: 10 rounds over the 16-word state `a`, in place.
 *
 * @param thread  unused
 * @param a       16-word state, overwritten with the permuted state
 * @param mixtabs lookup tables, referenced by name from the T?up / T?dn
 *                macros when shared-memory access is selected
 */
__device__ __forceinline__
void groestl256_perm_Q(int thread, uint32_t *a, char *mixtabs)
{
    #pragma unroll
    for (int r = 0; r < 10; r++)
    {
        uint32_t t[16];

        /* round constant: every word is touched; column c uses j = c * 0x10 */
        #pragma unroll
        for (int c = 0; c < 8; c++) {
            a[2 * c] ^= QC32up(c << 4, r);
            a[2 * c + 1] ^= QC32dn(c << 4, r);
        }

        /* one RSTT per output column; results are collected in t[] */
        RSTT(0x0, 0x1, a, 0x2, 0x6, 0xA, 0xE, 0x1, 0x5, 0x9, 0xD);
        RSTT(0x2, 0x3, a, 0x4, 0x8, 0xC, 0x0, 0x3, 0x7, 0xB, 0xF);
        RSTT(0x4, 0x5, a, 0x6, 0xA, 0xE, 0x2, 0x5, 0x9, 0xD, 0x1);
        RSTT(0x6, 0x7, a, 0x8, 0xC, 0x0, 0x4, 0x7, 0xB, 0xF, 0x3);
        RSTT(0x8, 0x9, a, 0xA, 0xE, 0x2, 0x6, 0x9, 0xD, 0x1, 0x5);
        RSTT(0xA, 0xB, a, 0xC, 0x0, 0x4, 0x8, 0xB, 0xF, 0x3, 0x7);
        RSTT(0xC, 0xD, a, 0xE, 0x2, 0x6, 0xA, 0xD, 0x1, 0x5, 0x9);
        RSTT(0xE, 0xF, a, 0x0, 0x4, 0x8, 0xC, 0xF, 0x3, 0x7, 0xB);

        /* the new state replaces the old one only after all columns are done */
        #pragma unroll
        for (int w = 0; w < 16; w++)
            a[w] = t[w];
    }
}
/**
 * Groestl-256 over a 32-byte input per thread, with an early target test.
 *
 * Launch: 1D grid of 1D blocks, at most 256 threads per block. When
 * USE_SHARED is set the launch must provide 8 * 256 * sizeof(uint32_t)
 * bytes of dynamic shared memory for the lookup tables.
 *
 * Input : four 64-bit words per thread, read from outputHash with a
 *         stride of `threads` (word k of thread t is outputHash[k*threads+t]).
 * Output: if the last state word is <= pTarget[7], the thread's nonce
 *         (startNounce + thread) is written to nonceVector[0].
 *         NOTE(review): the store is not atomic, so when several threads
 *         qualify in the same launch any one of their nonces may be kept.
 */
__global__ __launch_bounds__(256,1)
void groestl256_gpu_hash32(int threads, uint32_t startNounce, uint64_t *outputHash, uint32_t *nonceVector)
{
#if USE_SHARED
    /* every thread of the block copies one entry of each table, then all
       threads meet at the barrier before anyone reads the tables */
    extern __shared__ char mixtabs[];
    if (threadIdx.x < 256) {
        *((uint32_t*)mixtabs + (threadIdx.x)) = tex1Dfetch(t0up2, threadIdx.x);
        *((uint32_t*)mixtabs + (256 + threadIdx.x)) = tex1Dfetch(t0dn2, threadIdx.x);
        *((uint32_t*)mixtabs + (512 + threadIdx.x)) = tex1Dfetch(t1up2, threadIdx.x);
        *((uint32_t*)mixtabs + (768 + threadIdx.x)) = tex1Dfetch(t1dn2, threadIdx.x);
        *((uint32_t*)mixtabs + (1024 + threadIdx.x)) = tex1Dfetch(t2up2, threadIdx.x);
        *((uint32_t*)mixtabs + (1280 + threadIdx.x)) = tex1Dfetch(t2dn2, threadIdx.x);
        *((uint32_t*)mixtabs + (1536 + threadIdx.x)) = tex1Dfetch(t3up2, threadIdx.x);
        *((uint32_t*)mixtabs + (1792 + threadIdx.x)) = tex1Dfetch(t3dn2, threadIdx.x);
    }
    __syncthreads();
#endif
    int thread = (blockDim.x * blockIdx.x + threadIdx.x);
    if (thread < threads)
    {
        // GROESTL
        uint32_t message[16];
        uint32_t state[16];

        /* words 0..7: the 32-byte input; 8..15: fixed padding */
        #pragma unroll
        for (int k = 0; k<4; k++)
            LOHI(message[2*k], message[2*k+1], outputHash[k*threads+thread]);
        #pragma unroll
        for (int k = 9; k<15; k++)
            message[k] = 0;
        message[8] = 0x80;
        message[15] = 0x01000000;

        /* state = message XOR initial value (only word 15 of it is non-zero) */
        #pragma unroll 16
        for (int u = 0; u<16; u++)
            state[u] = message[u];
        state[15] ^= 0x10000;

        /* compression: P on the state, Q on the message.
           Both build variants must use the same pair of permutations;
           the non-shared variant used to run P on the message as well. */
#if USE_SHARED
        groestl256_perm_P(thread, state, mixtabs);
        state[15] ^= 0x10000;
        groestl256_perm_Q(thread, message, mixtabs);
#else
        groestl256_perm_P(thread, state, NULL);
        state[15] ^= 0x10000;
        groestl256_perm_Q(thread, message, NULL);
#endif
        #pragma unroll 16
        for (int u = 0; u<16; u++) state[u] ^= message[u];

        /* output transform: state ^= P(state); only the last two words are needed */
        #pragma unroll 16
        for (int u = 0; u<16; u++) message[u] = state[u];
#if USE_SHARED
        groestl256_perm_P(thread, message, mixtabs);
#else
        groestl256_perm_P(thread, message, NULL);
#endif
        state[14] ^= message[14];
        state[15] ^= message[15];

        uint32_t nonce = startNounce + thread;
        if (state[15] <= pTarget[7]) {
            nonceVector[0] = nonce;
        }
    }
}
/*
 * Allocate a device buffer `texmem` of `texsize` bytes, fill it from the
 * host array `texsource` and bind it to the texture reference `texname`
 * (unnormalised coordinates, point filtering, clamped addressing).
 * The macro declares `texmem` in the expanding scope, so each use needs a
 * distinct name.
 * The last line deliberately has no line continuation: a trailing backslash
 * there would pull the next source line into the macro body.
 */
#define texDef(texname, texmem, texsource, texsize) \
    unsigned int *texmem; \
    cudaMalloc(&texmem, texsize); \
    cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
    texname.normalized = 0; \
    texname.filterMode = cudaFilterModePoint; \
    texname.addressMode[0] = cudaAddressModeClamp; \
    { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
      cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); }
/**
 * One-time setup for the Groestl-256 kernel.
 *
 * Uploads the eight 256-entry lookup tables and binds them to their
 * texture references, then allocates the per-thr_id nonce buffers
 * (one device word, one host word from cudaMallocHost).
 * `threads` is unused.
 */
__host__
void groestl256_cpu_init(int thr_id, int threads)
{
    const size_t tableBytes = sizeof(uint32_t) * 256;

    // Initialise the textures with the texDef macro
    texDef(t0up2, d_T0up, T0up_cpu, tableBytes);
    texDef(t0dn2, d_T0dn, T0dn_cpu, tableBytes);
    texDef(t1up2, d_T1up, T1up_cpu, tableBytes);
    texDef(t1dn2, d_T1dn, T1dn_cpu, tableBytes);
    texDef(t2up2, d_T2up, T2up_cpu, tableBytes);
    texDef(t2dn2, d_T2dn, T2dn_cpu, tableBytes);
    texDef(t3up2, d_T3up, T3up_cpu, tableBytes);
    texDef(t3dn2, d_T3dn, T3dn_cpu, tableBytes);

    cudaMalloc(&d_GNonce[thr_id], sizeof(uint32_t));
    cudaMallocHost(&d_gnounce[thr_id], sizeof(uint32_t));
}
/**
 * Run the Groestl-256 kernel over `threads` inputs and fetch the result.
 *
 * @param thr_id        selects the per-thread nonce buffers
 * @param threads       number of inputs / nonces to process
 * @param startNounce   nonce of the first input
 * @param d_outputHash  device buffer holding the 32-byte inputs
 * @param order         passed through to MyStreamSynchronize
 * @return the nonce stored by the kernel, or 0xffffffff if no thread
 *         stored one (the device word is preset to that value)
 */
__host__
uint32_t groestl256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{
    /* preset the device result word to 0xffffffff ("nothing found") */
    cudaMemset(d_GNonce[thr_id], 0xff, sizeof(uint32_t));

    const int threadsperblock = 256;
    // compute how many thread blocks are needed (ceil-div)
    dim3 grid((threads + threadsperblock-1)/threadsperblock);
    dim3 block(threadsperblock);
#if USE_SHARED
    /* room for the eight 256-entry lookup tables */
    size_t shared_size = 8 * 256 * sizeof(uint32_t);
#else
    size_t shared_size = 0;
#endif
    groestl256_gpu_hash32<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash, d_GNonce[thr_id]);
    MyStreamSynchronize(NULL, order, thr_id);

    cudaMemcpy(d_gnounce[thr_id], d_GNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
    /* cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its
       documented replacement */
    cudaDeviceSynchronize();

    return *d_gnounce[thr_id];
}
/**
 * Upload the 8-word target to the constant-memory array pTarget.
 *
 * @param pTargetIn host pointer to at least 8 uint32_t words
 */
__host__
void groestl256_setTarget(const void *pTargetIn)
{
    const size_t targetBytes = sizeof(pTarget); /* 8 words */
    cudaMemcpyToSymbol(pTarget, pTargetIn, targetBytes, 0, cudaMemcpyHostToDevice);
}

174
keccak/cuda_keccak256.cu → Algo256/cuda_keccak256.cu

@ -27,11 +27,81 @@ uint32_t *d_KNonce[8];
__constant__ uint32_t pTarget[8]; __constant__ uint32_t pTarget[8];
__constant__ uint64_t keccak_round_constants[24]; __constant__ uint64_t keccak_round_constants[24];
__constant__ uint64_t c_PaddedMessage80[10]; // padded message (80 bytes + padding) __constant__ uint64_t c_PaddedMessage80[10]; // padded message (80 bytes + padding?)
#if __CUDA_ARCH__ >= 350
__device__ __forceinline__
static void keccak_blockv35(uint2 *s, const uint64_t *keccak_round_constants)
{
size_t i;
uint2 t[5], u[5], v, w;
#pragma unroll
for (i = 0; i < 24; i++) {
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
u[0] = t[4] ^ ROL2(t[1], 1);
u[1] = t[0] ^ ROL2(t[2], 1);
u[2] = t[1] ^ ROL2(t[3], 1);
u[3] = t[2] ^ ROL2(t[4], 1);
u[4] = t[3] ^ ROL2(t[0], 1);
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];
/* rho pi: b[..] = rotl(a[..], ..) */
v = s[1];
s[1] = ROL2(s[6], 44);
s[6] = ROL2(s[9], 20);
s[9] = ROL2(s[22], 61);
s[22] = ROL2(s[14], 39);
s[14] = ROL2(s[20], 18);
s[20] = ROL2(s[2], 62);
s[2] = ROL2(s[12], 43);
s[12] = ROL2(s[13], 25);
s[13] = ROL2(s[19], 8);
s[19] = ROL2(s[23], 56);
s[23] = ROL2(s[15], 41);
s[15] = ROL2(s[4], 27);
s[4] = ROL2(s[24], 14);
s[24] = ROL2(s[21], 2);
s[21] = ROL2(s[8], 55);
s[8] = ROL2(s[16], 45);
s[16] = ROL2(s[5], 36);
s[5] = ROL2(s[3], 28);
s[3] = ROL2(s[18], 21);
s[18] = ROL2(s[17], 15);
s[17] = ROL2(s[11], 10);
s[11] = ROL2(s[7], 6);
s[7] = ROL2(s[10], 3);
s[10] = ROL2(v, 1);
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w;
v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w;
v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w;
v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w;
v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w;
static __device__ __forceinline__ /* iota: a[0,0] ^= round constant */
void keccak_block(uint64_t *s, const uint64_t *keccak_round_constants) { s[0] ^= vectorize(keccak_round_constants[i]);
}
}
#endif
__device__ __forceinline__
static void keccak_blockv30(uint64_t *s, const uint64_t *keccak_round_constants)
{
size_t i; size_t i;
uint64_t t[5], u[5], v, w; uint64_t t[5], u[5], v, w;
@ -109,14 +179,16 @@ void keccak256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash,
//#pragma unroll 25 //#pragma unroll 25
for (int i=0; i<25; i++) { for (int i=0; i<25; i++) {
if(i<9) {keccak_gpu_state[i] = c_PaddedMessage80[i];} if (i < 9)
else {keccak_gpu_state[i] = 0;} keccak_gpu_state[i] = c_PaddedMessage80[i];
else
keccak_gpu_state[i] = 0;
} }
keccak_gpu_state[9]=REPLACE_HIWORD(c_PaddedMessage80[9],cuda_swab32(nounce)); keccak_gpu_state[9] = REPLACE_HIWORD(c_PaddedMessage80[9], cuda_swab32(nounce));
keccak_gpu_state[10]=0x0000000000000001; keccak_gpu_state[10] = 0x0000000000000001;
keccak_gpu_state[16]=0x8000000000000000; keccak_gpu_state[16] = 0x8000000000000000;
keccak_block(keccak_gpu_state,keccak_round_constants); keccak_blockv30(keccak_gpu_state, keccak_round_constants);
bool rc = false; bool rc = false;
if (keccak_gpu_state[3] <= ((uint64_t*)pTarget)[3]) {rc = true;} if (keccak_gpu_state[3] <= ((uint64_t*)pTarget)[3]) {rc = true;}
@ -125,18 +197,7 @@ void keccak256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash,
if(resNounce[0] > nounce) if(resNounce[0] > nounce)
resNounce[0] = nounce; resNounce[0] = nounce;
} }
} //thread }
}
void keccak256_cpu_init(int thr_id, int threads)
{
CUDA_SAFE_CALL(cudaMemcpyToSymbol(keccak_round_constants,
host_keccak_round_constants,
sizeof(host_keccak_round_constants),
0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], sizeof(uint32_t)));
CUDA_SAFE_CALL(cudaMallocHost(&d_nounce[thr_id], 1*sizeof(uint32_t)));
} }
__host__ __host__
@ -161,6 +222,66 @@ uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, ui
return result; return result;
} }
/* Build a uint2 value from two words (first goes to .x, second to .y).
 * MSVC gets a plain brace list, other compilers a compound literal. */
#ifdef _MSC_VER
#define UINT2(a, b) { a, b }
#else
#define UINT2(a, b) (uint2) { a, b }
#endif
/**
 * Keccak-256 of a 32-byte input per thread, computed in place.
 *
 * Launch: 1D grid of 1D blocks, at most 256 threads per block; threads
 * whose global index is >= `threads` do nothing.
 * Data  : four 64-bit words per thread, stored with a stride of `threads`
 *         (word i of thread t is outputHash[i*threads+t]); the same four
 *         slots receive the result.
 * `startNounce` is unused.
 *
 * On __CUDA_ARCH__ >= 350 the state is kept as uint2 pairs and processed
 * by keccak_blockv35; otherwise as uint64_t by keccak_blockv30.
 * The uint2 values are built with the CUDA built-in make_uint2() rather
 * than a compound literal / brace assignment, which is not portable C++.
 */
__global__ __launch_bounds__(256,3)
void keccak256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash)
{
    int thread = (blockDim.x * blockIdx.x + threadIdx.x);
    if (thread < threads)
    {
#if __CUDA_ARCH__ >= 350 /* tpr: to double check if faster on SM5+ */
        uint2 keccak_gpu_state[25];
        #pragma unroll 25
        for (int i = 0; i<25; i++) {
            if (i < 4)
                keccak_gpu_state[i] = vectorize(outputHash[i*threads+thread]);
            else
                keccak_gpu_state[i] = make_uint2(0, 0);
        }
        /* padding: same 64-bit values as the uint64_t branch below
           (.x is the low word, .y the high word) */
        keccak_gpu_state[4] = make_uint2(1, 0);
        keccak_gpu_state[16] = make_uint2(0, 0x80000000);
        keccak_blockv35(keccak_gpu_state, keccak_round_constants);
        #pragma unroll 4
        for (int i=0; i<4;i++)
            outputHash[i*threads+thread]=devectorize(keccak_gpu_state[i]);
#else
        uint64_t keccak_gpu_state[25];
        #pragma unroll 25
        for (int i = 0; i<25; i++) {
            if (i<4)
                keccak_gpu_state[i] = outputHash[i*threads+thread];
            else
                keccak_gpu_state[i] = 0;
        }
        /* padding words */
        keccak_gpu_state[4] = 0x0000000000000001;
        keccak_gpu_state[16] = 0x8000000000000000;
        keccak_blockv30(keccak_gpu_state, keccak_round_constants);
        #pragma unroll 4
        for (int i = 0; i<4; i++)
            outputHash[i*threads + thread] = keccak_gpu_state[i];
#endif
    }
}
/**
 * Host launcher for keccak256_gpu_hash_32.
 *
 * Uses 256 threads per block and enough blocks to cover `threads` inputs,
 * then hands over to MyStreamSynchronize with the caller's order / thr_id.
 */
__host__
void keccak256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{
    const int tpb = 256;
    /* ceil-div so a partial last block is still launched */
    const int blocks = (threads + tpb - 1) / tpb;
    dim3 grid(blocks);
    dim3 block(tpb);

    keccak256_gpu_hash_32 <<<grid, block>>> (threads, startNounce, d_outputHash);

    MyStreamSynchronize(NULL, order, thr_id);
}
__host__ __host__
void keccak256_setBlock_80(void *pdata,const void *pTargetIn) void keccak256_setBlock_80(void *pdata,const void *pTargetIn)
{ {
@ -168,4 +289,13 @@ void keccak256_setBlock_80(void *pdata,const void *pTargetIn)
memcpy(PaddedMessage, pdata, 80); memcpy(PaddedMessage, pdata, 80);
CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, pTargetIn, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, pTargetIn, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 10*sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 10*sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
} }
/**
 * One-time setup for the Keccak-256 kernels.
 *
 * Uploads the round constants to constant memory and allocates the
 * per-thr_id nonce buffers (one device word, one host word from
 * cudaMallocHost). `threads` is unused.
 */
__host__
void keccak256_cpu_init(int thr_id, int threads)
{
    const size_t nonceBytes = sizeof(uint32_t);

    CUDA_SAFE_CALL(cudaMemcpyToSymbol(keccak_round_constants,
        host_keccak_round_constants,
        sizeof(host_keccak_round_constants),
        0, cudaMemcpyHostToDevice));

    CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], nonceBytes));
    CUDA_SAFE_CALL(cudaMallocHost(&d_nounce[thr_id], nonceBytes));
}

196
Algo256/cuda_skein256.cu

@ -0,0 +1,196 @@
#include <memory.h>
#include "cuda_helper.h"
/* 64-bit form of the initial value, kept for reference only (disabled).
 * Each entry matches the { low, high } pair in vSKEIN_IV512_256 below. */
#if 0
static __constant__ uint64_t SKEIN_IV512_256[8] = {
0xCCD044A12FDB3E13, 0xE83590301A79A9EB,
0x55AEA0614F816E6F, 0x2A2767A4AE9B94DB,
0xEC06025E74DD7683, 0xE7A436CDC4746251,
0xC36FBAF9393AD185, 0x3EEDBA1833EDFC13
};
#endif
/* Initial chaining value as uint2 pairs: .x = low 32 bits, .y = high 32 bits. */
static __constant__ uint2 vSKEIN_IV512_256[8] = {
{ 0x2FDB3E13, 0xCCD044A1 },
{ 0x1A79A9EB, 0xE8359030 },
{ 0x4F816E6F, 0x55AEA061 },
{ 0xAE9B94DB, 0x2A2767A4 },
{ 0x74DD7683, 0xEC06025E },
{ 0xC4746251, 0xE7A436CD },
{ 0x393AD185, 0xC36FBAF9 },
{ 0x33EDFC13, 0x3EEDBA18 }
};
/* Rotation amounts, indexed [round mod 8][word pair]; read by Round512v35. */
static __constant__ int ROT256[8][4] =
{
46,36, 19, 37,
33,27, 14, 42,
17,49, 36, 39,
44, 9, 54, 56,
39,30, 34, 24,
13,50, 10, 17,
25,29, 39, 43,
8, 35, 56, 22,
};
/* Seed for the extra key-schedule word h[8]; the kernel XORs h[0..7] into it. */
static __constant__ uint2 skein_ks_parity = { 0xA9FC1A22,0x1BD11BDA};
/* Tweak words as uint2 pairs: entries 0..2 are used for the first pass of
 * the kernel, entries 3..5 for the second pass. */
static __constant__ uint2 t12[6] = {
{ 0x20, 0 },
{ 0, 0xf0000000 },
{ 0x20, 0xf0000000 },
{ 0x08, 0 },
{ 0, 0xff000000 },
{ 0x08, 0xff000000 }
};
/* 64-bit form of the tweak words, kept for reference only (disabled). */
#if 0
static __constant__ uint64_t t12_30[6] = {
0x20,
0xf000000000000000,
0xf000000000000020,
0x08,
0xff00000000000000,
0xff00000000000008
};
#endif
/**
 * One mixing round over four word pairs.
 *
 * For each pair (even, odd): even += odd; odd = rotl(odd, rot) ^ even,
 * with the rotation amounts taken from row `ROT` of ROT256.
 * Callers permute the words by passing the references in a different order.
 */
static __forceinline__ __device__
void Round512v35(uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, int ROT)
{
    /* pair (p0, p1) */
    p0 += p1;
    p1 = ROL2(p1, ROT256[ROT][0]);
    p1 ^= p0;

    /* pair (p2, p3) */
    p2 += p3;
    p3 = ROL2(p3, ROT256[ROT][1]);
    p3 ^= p2;

    /* pair (p4, p5) */
    p4 += p5;
    p5 = ROL2(p5, ROT256[ROT][2]);
    p5 ^= p4;

    /* pair (p6, p7) */
    p6 += p7;
    p7 = ROL2(p7, ROT256[ROT][3]);
    p7 ^= p6;
}
/**
 * Add sub-key number `s` to the eight state words:
 * word k gets ks[(s + k) % 9]; words 5 and 6 additionally get the tweak
 * words ts[s % 3] and ts[(s + 1) % 3], word 7 gets the sub-key index.
 */
static __forceinline__ __device__
void skein256_inject_key_v35(uint2 *ks, uint2 *ts,
    uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3,
    uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, int s)
{
    p0 += ks[(s + 0) % 9];
    p1 += ks[(s + 1) % 9];
    p2 += ks[(s + 2) % 9];
    p3 += ks[(s + 3) % 9];
    p4 += ks[(s + 4) % 9];
    p5 += ks[(s + 5) % 9] + ts[(s + 0) % 3];
    p6 += ks[(s + 6) % 9] + ts[(s + 1) % 3];
    p7 += ks[(s + 7) % 9] + make_uint2(s, 0);
}

/**
 * Eight rounds of the block cipher: four rounds, sub-key R, four more
 * rounds, sub-key R + 1. The word order handed to Round512v35 changes from
 * round to round and is back to the natural order at each key injection.
 *
 * @param ks 9-entry key schedule
 * @param ts 3-entry tweak schedule
 * @param R  index of the first of the two sub-keys to inject
 */
static __forceinline__ __device__
void Round_8_512v35(uint2 *ks, uint2 *ts,
uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3,
uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, int R)
{
    Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 0);
    Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 1);
    Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 2);
    Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 3);
    skein256_inject_key_v35(ks, ts, p0, p1, p2, p3, p4, p5, p6, p7, R);

    Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 4);
    Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 5);
    Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 6);
    Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 7);
    skein256_inject_key_v35(ks, ts, p0, p1, p2, p3, p4, p5, p6, p7, R + 1);
}
/**
 * Skein (256-bit output) of a 32-byte input per thread, computed in place.
 *
 * Launch: 1D grid of 1D blocks, at most 256 threads per block; threads
 * whose global index is >= `threads` do nothing.
 * Data  : four 64-bit words per thread, stored with a stride of `threads`
 *         (word i of thread t is outputHash[i*threads+t]); the same four
 *         slots receive the result.
 * `startNounce` is unused.
 *
 * Two passes of the block function are run: the first absorbs the four
 * input words with tweak t12[0..2], the second runs with tweak t12[3..5]
 * and no message words.
 */
__global__ __launch_bounds__(256,3)
void skein256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash)
{
    int thread = (blockDim.x * blockIdx.x + threadIdx.x);
    if (thread >= threads)
        return;

    uint2 h[9];
    uint2 t[3];
    uint2 dt0, dt1, dt2, dt3;
    uint2 p0, p1, p2, p3, p4, p5, p6, p7;

    /* key schedule for pass 1: IV plus the parity word */
    h[8] = skein_ks_parity;
    for (int i = 0; i < 8; i++) {
        h[i] = vSKEIN_IV512_256[i];
        h[8] ^= h[i];
    }
    t[0] = t12[0];
    t[1] = t12[1];
    t[2] = t12[2];

    /* load the four input words as (low, high) pairs */
    LOHI(dt0.x, dt0.y, outputHash[thread]);
    LOHI(dt1.x, dt1.y, outputHash[threads+thread]);
    LOHI(dt2.x, dt2.y, outputHash[2*threads+thread]);
    LOHI(dt3.x, dt3.y, outputHash[3*threads+thread]);

    /* initial key injection (sub-key 0); message words 4..7 are zero */
    p0 = h[0] + dt0;
    p1 = h[1] + dt1;
    p2 = h[2] + dt2;
    p3 = h[3] + dt3;
    p4 = h[4];
    p5 = h[5] + t[0];
    p6 = h[6] + t[1];
    p7 = h[7];

    #pragma unroll
    for (int i = 1; i < 19; i += 2) {
        Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, i);
    }

    /* feed-forward of the message words */
    p0 ^= dt0;
    p1 ^= dt1;
    p2 ^= dt2;
    p3 ^= dt3;

    /* key schedule for pass 2: the result of pass 1 plus the parity word */
    h[0] = p0;
    h[1] = p1;
    h[2] = p2;
    h[3] = p3;
    h[4] = p4;
    h[5] = p5;
    h[6] = p6;
    h[7] = p7;
    h[8] = skein_ks_parity;
    #pragma unroll 8
    for (int i = 0; i < 8; i++) {
        h[8] ^= h[i];
    }
    t[0] = t12[3];
    t[1] = t12[4];
    t[2] = t12[5];

    /* initial key injection of pass 2: p0..p7 already equal h[0..7],
       so only the tweak words have to be added */
    p5 += t[0];
    p6 += t[1];

    #pragma unroll
    for (int i = 1; i < 19; i += 2) {
        Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, i);
    }

    outputHash[thread] = devectorize(p0);
    outputHash[threads+thread] = devectorize(p1);
    outputHash[2*threads+thread] = devectorize(p2);
    outputHash[3*threads+thread] = devectorize(p3);
}
/**
 * Placeholder init hook: the Skein-256 kernel keeps all its constants in
 * statically initialised constant memory, so there is nothing to set up.
 */
__host__
void skein256_cpu_init(int thr_id, int threads)
{
    /* intentionally empty */
    (void)thr_id;
    (void)threads;
}
/**
 * Host launcher for skein256_gpu_hash_32.
 *
 * Uses 256 threads per block and enough blocks to cover `threads` inputs,
 * then hands over to MyStreamSynchronize with the caller's order / thr_id.
 */
__host__
void skein256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{
    const int tpb = 256;
    /* ceil-div so a partial last block is still launched */
    const int blocks = (threads + tpb - 1) / tpb;
    dim3 grid(blocks);
    dim3 block(tpb);

    skein256_gpu_hash_32<<<grid, block>>>(threads, startNounce, d_outputHash);

    MyStreamSynchronize(NULL, order, thr_id);
}

0
keccak/keccak256.cu → Algo256/keccak256.cu

15
Makefile.am

@ -10,11 +10,11 @@ EXTRA_DIST = autogen.sh README.txt LICENSE.txt \
cudaminer.sln cudaminer.vcxproj cudaminer.vcxproj.filters \ cudaminer.sln cudaminer.vcxproj cudaminer.vcxproj.filters \
compat/gettimeofday.c compat/getopt/getopt_long.c cpuminer-config.h.in compat/gettimeofday.c compat/getopt/getopt_long.c cpuminer-config.h.in
SUBDIRS = compat SUBDIRS = compat
bin_PROGRAMS = ccminer bin_PROGRAMS = ccminer
ccminer_SOURCES = elist.h miner.h compat.h \ ccminer_SOURCES = elist.h miner.h compat.h \
compat/inttypes.h compat/stdbool.h compat/unistd.h \ compat/inttypes.h compat/stdbool.h compat/unistd.h \
compat/sys/time.h compat/getopt/getopt.h \ compat/sys/time.h compat/getopt/getopt.h \
crc32.c hefty1.c scrypt.c \ crc32.c hefty1.c scrypt.c \
@ -27,17 +27,20 @@ ccminer_SOURCES = elist.h miner.h compat.h \
heavy/cuda_hefty1.cu heavy/cuda_hefty1.h \ heavy/cuda_hefty1.cu heavy/cuda_hefty1.h \
heavy/cuda_keccak512.cu heavy/cuda_keccak512.h \ heavy/cuda_keccak512.cu heavy/cuda_keccak512.h \
heavy/cuda_sha256.cu heavy/cuda_sha256.h \ heavy/cuda_sha256.cu heavy/cuda_sha256.h \
keccak/cuda_keccak256.cu keccak/keccak256.cu \ fuguecoin.cpp Algo256/cuda_fugue256.cu sph/fugue.c uint256.h \
fuguecoin.cpp cuda_fugue256.cu sph/fugue.c sph/sph_fugue.h uint256.h \
groestlcoin.cpp cuda_groestlcoin.cu cuda_groestlcoin.h \ groestlcoin.cpp cuda_groestlcoin.cu cuda_groestlcoin.h \
myriadgroestl.cpp cuda_myriadgroestl.cu \ myriadgroestl.cpp cuda_myriadgroestl.cu \
lyra2/Lyra2.c lyra2/Sponge.c \
lyra2/lyra2RE.cu lyra2/cuda_lyra2.cu \
Algo256/cuda_blake256.cu Algo256/cuda_groestl256.cu Algo256/cuda_keccak256.cu Algo256/cuda_skein256.cu \
Algo256/blake256.cu Algo256/keccak256.cu \
JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \ JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \
JHA/cuda_jha_compactionTest.cu cuda_checkhash.cu \ JHA/cuda_jha_compactionTest.cu cuda_checkhash.cu \
quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \ quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \
quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu \ quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu \
quark/quarkcoin.cu quark/animecoin.cu \ quark/quarkcoin.cu quark/animecoin.cu \
quark/cuda_quark_compactionTest.cu \ quark/cuda_quark_compactionTest.cu \
cuda_nist5.cu blake32.cu pentablake.cu \ cuda_nist5.cu pentablake.cu \
sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \ sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \
sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \ sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \ sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \

10
README.txt

@ -1,5 +1,5 @@
ccMiner release 1.5.0-tpruvot (27 Nov 2014) - "Extra nonce" ccMiner release 1.5.1-tpruvot (16 Dec 2014) - "Vertcoin Lyra2"
--------------------------------------------------------------- ---------------------------------------------------------------
*************************************************************** ***************************************************************
@ -38,6 +38,7 @@ Keccak (Maxcoin)
Deep, Doom and Qubit Deep, Doom and Qubit
Pentablake (Blake 512 x5) Pentablake (Blake 512 x5)
S3 (OneCoin) S3 (OneCoin)
Lyra2RE (new VertCoin algo)
where some of these coins have a VERY NOTABLE nVidia advantage where some of these coins have a VERY NOTABLE nVidia advantage
over competing AMD (OpenCL Only) implementations. over competing AMD (OpenCL Only) implementations.
@ -68,6 +69,7 @@ its command line interface and options.
jackpot use to mine Jackpotcoin jackpot use to mine Jackpotcoin
keccak use to mine Maxcoin keccak use to mine Maxcoin
luffa use to mine Doomcoin luffa use to mine Doomcoin
lyra2 use to mine Vertcoin
mjollnir use to mine Mjollnircoin mjollnir use to mine Mjollnircoin
myr-gr use to mine Myriad-Groest myr-gr use to mine Myriad-Groest
nist5 use to mine TalkCoin nist5 use to mine TalkCoin
@ -169,6 +171,12 @@ features.
>>> RELEASE HISTORY <<< >>> RELEASE HISTORY <<<
Dec. 2014 v1.5.1 (not released yet!)
Add lyra2 algo for Vertcoin (Release is 16 Dec 2014)
Multiple shares support (2 for the moment)
X11 optimisations (From klaust and sp-hash)
HTML5 WebSocket api compatibility (see api/websocket.htm)
Nov. 27th 2014 v1.5.0 Nov. 27th 2014 v1.5.0
Upgrade compat jansson to 2.6 (for windows) Upgrade compat jansson to 2.6 (for windows)
Add pool mining.set_extranonce support Add pool mining.set_extranonce support

8
ccminer.cpp

@ -138,6 +138,7 @@ enum sha_algos {
ALGO_KECCAK, ALGO_KECCAK,
ALGO_JACKPOT, ALGO_JACKPOT,
ALGO_LUFFA_DOOM, ALGO_LUFFA_DOOM,
ALGO_LYRA,
ALGO_MJOLLNIR, /* Hefty hash */ ALGO_MJOLLNIR, /* Hefty hash */
ALGO_MYR_GR, ALGO_MYR_GR,
ALGO_NIST5, ALGO_NIST5,
@ -167,6 +168,7 @@ static const char *algo_names[] = {
"keccak", "keccak",
"jackpot", "jackpot",
"luffa", "luffa",
"lyra2",
"mjollnir", "mjollnir",
"myr-gr", "myr-gr",
"nist5", "nist5",
@ -272,6 +274,7 @@ Options:\n\
jackpot Jackpot\n\ jackpot Jackpot\n\
keccak Keccak-256 (Maxcoin)\n\ keccak Keccak-256 (Maxcoin)\n\
luffa Doomcoin\n\ luffa Doomcoin\n\
lyra2 VertCoin\n\
mjollnir Mjollnircoin\n\ mjollnir Mjollnircoin\n\
myr-gr Myriad-Groestl\n\ myr-gr Myriad-Groestl\n\
nist5 NIST5 (TalkCoin)\n\ nist5 NIST5 (TalkCoin)\n\
@ -1255,6 +1258,11 @@ static void *miner_thread(void *userdata)
max_nonce, &hashes_done); max_nonce, &hashes_done);
break; break;
case ALGO_LYRA:
rc = scanhash_lyra(thr_id, work.data, work.target,
max_nonce, &hashes_done);
break;
case ALGO_NIST5: case ALGO_NIST5:
rc = scanhash_nist5(thr_id, work.data, work.target, rc = scanhash_nist5(thr_id, work.data, work.target,
max_nonce, &hashes_done); max_nonce, &hashes_done);

24
ccminer.vcxproj

@ -105,7 +105,7 @@
<MaxRegCount>80</MaxRegCount> <MaxRegCount>80</MaxRegCount>
<PtxAsOptionV>true</PtxAsOptionV> <PtxAsOptionV>true</PtxAsOptionV>
<Keep>false</Keep> <Keep>false</Keep>
<CodeGeneration>compute_50,sm_50</CodeGeneration> <CodeGeneration>compute_30,sm_30;compute_50,sm_50</CodeGeneration>
</CudaCompile> </CudaCompile>
</ItemDefinitionGroup> </ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
@ -173,7 +173,7 @@
<MaxRegCount>80</MaxRegCount> <MaxRegCount>80</MaxRegCount>
<PtxAsOptionV>true</PtxAsOptionV> <PtxAsOptionV>true</PtxAsOptionV>
<Keep>false</Keep> <Keep>false</Keep>
<CodeGeneration>compute_30,sm_30;compute_50,sm_50;</CodeGeneration> <CodeGeneration>compute_50,sm_50;</CodeGeneration>
<AdditionalOptions>--ptxas-options="-O2" %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions>--ptxas-options="-O2" %(AdditionalOptions)</AdditionalOptions>
<Defines> <Defines>
</Defines> </Defines>
@ -257,6 +257,8 @@
<Optimization Condition="'$(Configuration)'=='Release'">Full</Optimization> <Optimization Condition="'$(Configuration)'=='Release'">Full</Optimization>
<AdditionalOptions>/Tp %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions>/Tp %(AdditionalOptions)</AdditionalOptions>
</ClCompile> </ClCompile>
<ClCompile Include="lyra2\Lyra2.c" />
<ClCompile Include="lyra2\Sponge.c" />
<ClCompile Include="sph\aes_helper.c" /> <ClCompile Include="sph\aes_helper.c" />
<ClCompile Include="sph\blake.c" /> <ClCompile Include="sph\blake.c" />
<ClCompile Include="sph\bmw.c" /> <ClCompile Include="sph\bmw.c" />
@ -330,13 +332,15 @@
<ClInclude Include="sph\sph_whirlpool.h" /> <ClInclude Include="sph\sph_whirlpool.h" />
<ClInclude Include="uint256.h" /> <ClInclude Include="uint256.h" />
</ItemGroup> </ItemGroup>
<ItemGroup>
<ClInclude Include="lyra2\Lyra2.h" />
<ClInclude Include="lyra2\Sponge.h" />
</ItemGroup>
<ItemGroup> <ItemGroup>
<CudaCompile Include="cuda.cpp" /> <CudaCompile Include="cuda.cpp" />
<CudaCompile Include="bitslice_transformations_quad.cu"> <CudaCompile Include="bitslice_transformations_quad.cu">
<ExcludedFromBuild>true</ExcludedFromBuild> <ExcludedFromBuild>true</ExcludedFromBuild>
</CudaCompile> </CudaCompile>
<CudaCompile Include="cuda_fugue256.cu">
</CudaCompile>
<CudaCompile Include="cuda_groestlcoin.cu"> <CudaCompile Include="cuda_groestlcoin.cu">
</CudaCompile> </CudaCompile>
<CudaCompile Include="cuda_myriadgroestl.cu"> <CudaCompile Include="cuda_myriadgroestl.cu">
@ -369,15 +373,19 @@
</CudaCompile> </CudaCompile>
<CudaCompile Include="JHA\jackpotcoin.cu"> <CudaCompile Include="JHA\jackpotcoin.cu">
</CudaCompile> </CudaCompile>
<CudaCompile Include="blake32.cu"> <CudaCompile Include="Algo256\blake256.cu">
<MaxRegCount>64</MaxRegCount> <MaxRegCount>64</MaxRegCount>
<AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options="-dlcm=cg" %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options="-dlcm=cg" %(AdditionalOptions)</AdditionalOptions>
<FastMath>true</FastMath> <FastMath>true</FastMath>
</CudaCompile> </CudaCompile>
<CudaCompile Include="keccak\cuda_keccak256.cu"> <CudaCompile Include="Algo256\keccak256.cu" />
<CudaCompile Include="Algo256\cuda_blake256.cu" />
<CudaCompile Include="Algo256\cuda_fugue256.cu" />
<CudaCompile Include="Algo256\cuda_groestl256.cu" />
<CudaCompile Include="Algo256\cuda_keccak256.cu">
<MaxRegCount>92</MaxRegCount> <MaxRegCount>92</MaxRegCount>
</CudaCompile> </CudaCompile>
<CudaCompile Include="keccak\keccak256.cu" /> <CudaCompile Include="Algo256\cuda_skein256.cu" />
<CudaCompile Include="pentablake.cu"> <CudaCompile Include="pentablake.cu">
<MaxRegCount>80</MaxRegCount> <MaxRegCount>80</MaxRegCount>
<AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options="-dlcm=cg" %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options="-dlcm=cg" %(AdditionalOptions)</AdditionalOptions>
@ -418,6 +426,8 @@
</CudaCompile> </CudaCompile>
<CudaCompile Include="qubit\qubit_luffa512.cu"> <CudaCompile Include="qubit\qubit_luffa512.cu">
</CudaCompile> </CudaCompile>
<CudaCompile Include="lyra2\lyra2RE.cu" />
<CudaCompile Include="lyra2\cuda_lyra2.cu" />
<CudaCompile Include="x11\cuda_x11_aes.cu"> <CudaCompile Include="x11\cuda_x11_aes.cu">
<ExcludedFromBuild>true</ExcludedFromBuild> <ExcludedFromBuild>true</ExcludedFromBuild>
</CudaCompile> </CudaCompile>

58
ccminer.vcxproj.filters

@ -61,12 +61,15 @@
<Filter Include="Source Files\jansson"> <Filter Include="Source Files\jansson">
<UniqueIdentifier>{17b56151-79ec-4a32-bac3-9d94ae7f68fe}</UniqueIdentifier> <UniqueIdentifier>{17b56151-79ec-4a32-bac3-9d94ae7f68fe}</UniqueIdentifier>
</Filter> </Filter>
<Filter Include="Source Files\CUDA\keccak">
<UniqueIdentifier>{9762c92c-9677-4044-8292-ff6ba4bfdd89}</UniqueIdentifier>
</Filter>
<Filter Include="Header Files\compat\nvapi"> <Filter Include="Header Files\compat\nvapi">
<UniqueIdentifier>{ef6f9983-bda5-4fb2-adfa-ac4f29b74f25}</UniqueIdentifier> <UniqueIdentifier>{ef6f9983-bda5-4fb2-adfa-ac4f29b74f25}</UniqueIdentifier>
</Filter> </Filter>
<Filter Include="Source Files\CUDA\Algo256">
<UniqueIdentifier>{9762c92c-9677-4044-8292-ff6ba4bfdd89}</UniqueIdentifier>
</Filter>
<Filter Include="Header Files\lyra2">
<UniqueIdentifier>{2ff6e4ce-7c92-4cb2-a3ad-c331e94fd81d}</UniqueIdentifier>
</Filter>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<ClCompile Include="compat\jansson\dump.c"> <ClCompile Include="compat\jansson\dump.c">
@ -213,6 +216,12 @@
<ClCompile Include="compat\jansson\error.c"> <ClCompile Include="compat\jansson\error.c">
<Filter>Source Files\jansson</Filter> <Filter>Source Files\jansson</Filter>
</ClCompile> </ClCompile>
<ClCompile Include="lyra2\Lyra2.c">
<Filter>Source Files\sph</Filter>
</ClCompile>
<ClCompile Include="lyra2\Sponge.c">
<Filter>Source Files\sph</Filter>
</ClCompile>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<ClInclude Include="compat.h"> <ClInclude Include="compat.h">
@ -347,14 +356,17 @@
<ClInclude Include="compat\jansson\jansson_config.h"> <ClInclude Include="compat\jansson\jansson_config.h">
<Filter>Header Files\compat</Filter> <Filter>Header Files\compat</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="lyra2\Lyra2.h">
<Filter>Header Files\lyra2</Filter>
</ClInclude>
<ClInclude Include="lyra2\Sponge.h">
<Filter>Header Files\lyra2</Filter>
</ClInclude>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<CudaCompile Include="cuda.cpp"> <CudaCompile Include="cuda.cpp">
<Filter>Source Files\CUDA</Filter> <Filter>Source Files\CUDA</Filter>
</CudaCompile> </CudaCompile>
<CudaCompile Include="cuda_fugue256.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="cuda_groestlcoin.cu"> <CudaCompile Include="cuda_groestlcoin.cu">
<Filter>Source Files\CUDA</Filter> <Filter>Source Files\CUDA</Filter>
</CudaCompile> </CudaCompile>
@ -505,20 +517,38 @@
<CudaCompile Include="x17\x17.cu"> <CudaCompile Include="x17\x17.cu">
<Filter>Source Files\CUDA\x17</Filter> <Filter>Source Files\CUDA\x17</Filter>
</CudaCompile> </CudaCompile>
<CudaCompile Include="blake32.cu"> <CudaCompile Include="pentablake.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="x11\s3.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
<CudaCompile Include="Algo256\blake256.cu">
<Filter>Source Files\CUDA</Filter> <Filter>Source Files\CUDA</Filter>
</CudaCompile> </CudaCompile>
<CudaCompile Include="pentablake.cu"> <CudaCompile Include="Algo256\keccak256.cu">
<Filter>Source Files\CUDA</Filter> <Filter>Source Files\CUDA</Filter>
</CudaCompile> </CudaCompile>
<CudaCompile Include="keccak\cuda_keccak256.cu"> <CudaCompile Include="Algo256\cuda_blake256.cu">
<Filter>Source Files\CUDA\keccak</Filter> <Filter>Source Files\CUDA\Algo256</Filter>
</CudaCompile> </CudaCompile>
<CudaCompile Include="keccak\keccak256.cu"> <CudaCompile Include="Algo256\cuda_fugue256.cu">
<Filter>Source Files\CUDA\keccak</Filter> <Filter>Source Files\CUDA\Algo256</Filter>
</CudaCompile> </CudaCompile>
<CudaCompile Include="x11\s3.cu"> <CudaCompile Include="Algo256\cuda_groestl256.cu">
<Filter>Source Files\CUDA\x11</Filter> <Filter>Source Files\CUDA\Algo256</Filter>
</CudaCompile>
<CudaCompile Include="Algo256\cuda_keccak256.cu">
<Filter>Source Files\CUDA\Algo256</Filter>
</CudaCompile>
<CudaCompile Include="Algo256\cuda_skein256.cu">
<Filter>Source Files\CUDA\Algo256</Filter>
</CudaCompile>
<CudaCompile Include="lyra2\cuda_lyra2.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="lyra2\lyra2RE.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile> </CudaCompile>
</ItemGroup> </ItemGroup>
</Project> </Project>

97
cuda_helper.h

@ -355,7 +355,7 @@ uint64_t ROTL64(const uint64_t x, const int offset)
"setp.lt.u32 p, %2, 32;\n\t" "setp.lt.u32 p, %2, 32;\n\t"
"@!p mov.b64 %0, {vl,vh};\n\t" "@!p mov.b64 %0, {vl,vh};\n\t"
"@p mov.b64 %0, {vh,vl};\n\t" "@p mov.b64 %0, {vh,vl};\n\t"
"}" "}"
: "=l"(res) : "l"(x) , "r"(offset) : "=l"(res) : "l"(x) , "r"(offset)
); );
return res; return res;
@ -378,4 +378,99 @@ uint64_t SWAPDWORDS(uint64_t value)
#endif #endif
} }
/* lyra2 - int2 operators */
/**
 * Splits the 64-bit value x into two 32-bit registers using a single
 * PTX "mov.b64 {lo,hi}, x" unpack.
 *
 * @param lo receives the first element of the unpacked pair
 * @param hi receives the second element of the unpacked pair
 * @param x  64-bit value to split
 */
__device__ __forceinline__
void LOHI(uint32_t &lo, uint32_t &hi, uint64_t x) {
asm("mov.b64 {%0,%1},%2; \n\t"
: "=r"(lo), "=r"(hi) : "l"(x));
}
/* Packs a uint2 back into one 64-bit value via MAKE_ULONGLONG(v.x, v.y) (macro defined outside this section). */
static __device__ __forceinline__ uint64_t devectorize(uint2 v) { return MAKE_ULONGLONG(v.x, v.y); }
/* Splits a 64-bit value into a uint2: LOHI() fills result.x and result.y from v. */
static __device__ __forceinline__ uint2 vectorize(uint64_t v) {
uint2 result;
LOHI(result.x, result.y, v);
return result;
}
/* Component-wise bitwise operators on uint2: each one applies the 32-bit operation to .x and .y independently. */
/* XOR of both components */
static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) { return make_uint2(a.x ^ b.x, a.y ^ b.y); }
/* AND of both components */
static __device__ __forceinline__ uint2 operator& (uint2 a, uint2 b) { return make_uint2(a.x & b.x, a.y & b.y); }
/* OR of both components */
static __device__ __forceinline__ uint2 operator| (uint2 a, uint2 b) { return make_uint2(a.x | b.x, a.y | b.y); }
/* bitwise complement of both components */
static __device__ __forceinline__ uint2 operator~ (uint2 a) { return make_uint2(~a.x, ~a.y); }
/* in-place XOR, implemented with operator^ above */
static __device__ __forceinline__ void operator^= (uint2 &a, uint2 b) { a = a ^ b; }
/**
 * Addition of two uint2 values with carry propagation from .x into .y:
 * "add.cc" adds the .x components and records the carry, "addc" adds the
 * .y components plus that carry. The carry out of .y is discarded.
 */
static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b)
{
uint2 result;
asm("{\n\t"
"add.cc.u32 %0,%2,%4; \n\t"
"addc.u32 %1,%3,%5; \n\t"
"}\n\t"
: "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
return result;
}
/* In-place addition, implemented with the carry-propagating uint2 operator+. */
static __device__ __forceinline__ void operator+= (uint2 &a, uint2 b) { a = a + b; }
/**
 * 64-bit multiplication of two uint2 values, keeping only the low 64 bits
 * of the product (same result as the uint64_t "*" operator).
 *
 * With a = a.x + a.y * 2^32 and b = b.x + b.y * 2^32 (x = low word):
 *   low  word = lo(a.x * b.x)
 *   high word = hi(a.x * b.x) + lo(a.y * b.x) + lo(a.x * b.y)
 * The a.y * b.y term only contributes to bits >= 64 and is dropped.
 *
 * The previous inline PTX added lo(a.y * b.y) plus a stale carry instead of
 * lo(a.x * b.y), which gave a wrong high word whenever b.y != 0.
 */
static __device__ __forceinline__ uint2 operator* (uint2 a, uint2 b)
{
	uint2 result;
	result.x = a.x * b.x;
	// 32-bit wrap-around of the sum is intended: carries out of bit 63 are discarded
	result.y = __umulhi(a.x, b.x) + a.y * b.x + a.x * b.y;
	return result;
}
// uint2 method
#if __CUDA_ARCH__ >= 350
/**
 * Rotate-right of a uint2 by "offset" bits, built from two PTX funnel shifts
 * (shf.r.wrap.b32). The operand order is swapped between the two branches so
 * that offsets below 32 and offsets of 32 or more each pick the right source
 * word for every output half.
 * NOTE(review): presumably a.x is the low and a.y the high 32-bit word, and
 * offset is expected in 0..63 - confirm against callers.
 */
__device__ __inline__ uint2 ROR2(const uint2 a, const int offset) {
uint2 result;
if (offset < 32) {
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
}
else {
// same funnel shifts with the two source words exchanged
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
}
return result;
}
#else
/**
 * Portable rotate-right of a uint2 treated as one 64-bit value
 * (v.x = low word, v.y = high word), for targets without the funnel-shift path.
 *
 * The previous version rotated each 32-bit half on its own and shifted
 * 32-bit operands by (64 - n), which is not a 64-bit rotation and is
 * undefined for shift counts >= 32.
 *
 * @param v value to rotate
 * @param n rotation amount in bits; reduced modulo 64
 */
__device__ __inline__ uint2 ROR2(const uint2 v, const int n) {
	const uint64_t x = ((uint64_t) v.y << 32) | (uint64_t) v.x;
	const int s = n & 63;
	// s == 0 is special-cased: "x << 64" would be undefined
	const uint64_t r = (s == 0) ? x : ((x >> s) | (x << (64 - s)));
	uint2 result;
	result.x = (uint32_t) r;
	result.y = (uint32_t) (r >> 32);
	return result;
}
#endif
#if __CUDA_ARCH__ >= 350
/**
 * Rotate-left of a uint2 by "offset" bits, built from two PTX funnel shifts
 * (shf.l.wrap.b32). The operand order is swapped between the two branches so
 * that offsets of 32 or more and offsets below 32 each pick the right source
 * word for every output half.
 * NOTE(review): presumably a.x is the low and a.y the high 32-bit word, and
 * offset is expected in 0..63 - confirm against callers.
 */
__inline__ __device__ uint2 ROL2(const uint2 a, const int offset) {
uint2 result;
if (offset >= 32) {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
}
else {
// same funnel shifts with the two source words exchanged
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
}
return result;
}
#else
/**
 * Portable rotate-left of a uint2 treated as one 64-bit value
 * (v.x = low word, v.y = high word), for targets without the funnel-shift path.
 *
 * The previous version rotated each 32-bit half on its own and shifted
 * 32-bit operands by (64 - n), which is not a 64-bit rotation and is
 * undefined for shift counts >= 32.
 *
 * @param v value to rotate
 * @param n rotation amount in bits; reduced modulo 64
 */
__inline__ __device__ uint2 ROL2(const uint2 v, const int n) {
	const uint64_t x = ((uint64_t) v.y << 32) | (uint64_t) v.x;
	const int s = n & 63;
	// s == 0 is special-cased: "x >> 64" would be undefined
	const uint64_t r = (s == 0) ? x : ((x << s) | (x >> (64 - s)));
	uint2 result;
	result.x = (uint32_t) r;
	result.y = (uint32_t) (r >> 32);
	return result;
}
#endif
#endif // #ifndef CUDA_HELPER_H #endif // #ifndef CUDA_HELPER_H

211
lyra2/Lyra2.c

@ -0,0 +1,211 @@
/**
* Implementation of the Lyra2 Password Hashing Scheme (PHS).
*
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
*
* This software is hereby placed in the public domain.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "Lyra2.h"
#include "Sponge.h"
/**
* Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
* whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits,
* where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all
* integer parameters (treated as type "unsigned int") in the order they are provided, plus the value
* of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols).
*
* @param K The derived key to be output by the algorithm
* @param kLen Desired key length
* @param pwd User password
* @param pwdlen Password length
* @param salt Salt
* @param saltlen Salt length
* @param timeCost Parameter to determine the processing time (T)
* @param nRows Number of rows of the memory matrix (R)
* @param nCols Number of columns of the memory matrix (C)
*
* @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
*/
/*
 * Derives kLen bytes into K from pwd/salt using a nRows-row memory matrix.
 * nCols is only mixed into the "basil"; the row geometry itself comes from the
 * compile-time ROW_LEN_* constants.
 * Returns 0 on success, -1 if any allocation fails (nothing is leaked then).
 */
int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols)
{
	//============================= Basic variables ============================//
	int64_t row = 2; //index of row to be processed
	int64_t prev = 1; //index of prev (last row ever computed/modified)
	int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
	int64_t tau; //Time Loop iterator
	int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
	int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
	int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
	int64_t i; //auxiliary iteration counter
	//==========================================================================/

	//========== Initializing the Memory Matrix and pointers to it =============//
	//Tries to allocate enough space for the whole memory matrix
	i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES);
	uint64_t *wholeMatrix = (uint64_t*) malloc((size_t) i);
	if (wholeMatrix == NULL) {
		return -1;
	}
	memset(wholeMatrix, 0, (size_t) i);

	//Allocates pointers to each row of the matrix
	uint64_t **memMatrix = (uint64_t**) malloc((size_t) nRows * sizeof(uint64_t*));
	if (memMatrix == NULL) {
		free(wholeMatrix); //do not leak the matrix on failure
		return -1;
	}
	//Places the pointers in the correct positions
	uint64_t *ptrWord = wholeMatrix;
	for (i = 0; i < (int64_t) nRows; i++) {
		memMatrix[i] = ptrWord;
		ptrWord += ROW_LEN_INT64;
	}
	//==========================================================================/

	//============= Getting the password + salt + basil padded with 10*1 ===============//
	//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
	//but this ensures that the password copied locally will be overwritten as soon as possible

	//First, we clean enough blocks for the password, salt, basil and padding
	uint64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof (uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
	byte *ptrByte = (byte*) wholeMatrix;
	memset(ptrByte, 0, (size_t) nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES);

	//Prepends the password
	memcpy(ptrByte, pwd, (size_t) pwdlen);
	ptrByte += pwdlen;

	//Concatenates the salt
	memcpy(ptrByte, salt, (size_t) saltlen);
	ptrByte += saltlen;

	//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
	memcpy(ptrByte, &kLen, sizeof (uint64_t));
	ptrByte += sizeof (uint64_t);
	memcpy(ptrByte, &pwdlen, sizeof (uint64_t));
	ptrByte += sizeof (uint64_t);
	memcpy(ptrByte, &saltlen, sizeof (uint64_t));
	ptrByte += sizeof (uint64_t);
	memcpy(ptrByte, &timeCost, sizeof (uint64_t));
	ptrByte += sizeof (uint64_t);
	memcpy(ptrByte, &nRows, sizeof (uint64_t));
	ptrByte += sizeof (uint64_t);
	memcpy(ptrByte, &nCols, sizeof (uint64_t));
	ptrByte += sizeof (uint64_t);

	//Now comes the padding
	*ptrByte = 0x80; //first byte of padding: right after the basil
	ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
	ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
	*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
	//==========================================================================/

	//======================= Initializing the Sponge State ====================//
	//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
	uint64_t *state = (uint64_t*) malloc(16 * sizeof (uint64_t));
	if (state == NULL) {
		free(memMatrix); //do not leak the earlier allocations on failure
		free(wholeMatrix);
		return -1;
	}
	initState(state);
	//==========================================================================/

	//================================ Setup Phase =============================//
	//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
	ptrWord = wholeMatrix;
	for (i = 0; i < (int64_t) nBlocksInput; i++) {
		absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
		//NOTE(review): ptrWord is a uint64_t*, so this advances by BLOCK_LEN_BLAKE2_SAFE_BYTES
		//*words*, not bytes. Intentionally left as-is: changing the stride changes the
		//resulting hash for multi-block inputs - confirm against the hash expected by
		//the network/GPU code before touching it.
		ptrWord += BLOCK_LEN_BLAKE2_SAFE_BYTES; //goes to next block of pad(pwd || salt || basil)
	}

	//Initializes M[0] and M[1]
	reducedSqueezeRow0(state, memMatrix[0]); //The locally copied password is most likely overwritten here
	reducedDuplexRow1(state, memMatrix[0], memMatrix[1]);

	do {
		//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
		reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]);

		//updates the value of row* (deterministically picked during Setup))
		rowa = (rowa + step) & (window - 1);
		//update prev: it now points to the last row ever computed
		prev = row;
		//updates row: goes to the next row to be computed
		row++;

		//Checks if all rows in the window where visited.
		if (rowa == 0) {
			step = window + gap; //changes the step: approximately doubles its value
			window *= 2; //doubles the size of the re-visitation window
			gap = -gap; //inverts the modifier to the step
		}
	} while (row < (int64_t) nRows);
	//==========================================================================/

	//============================ Wandering Phase =============================//
	row = 0; //Resets the visitation to the first row of the memory matrix
	for (tau = 1; tau <= (int64_t) timeCost; tau++) {
		//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
		step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
		do {
			//Selects a pseudorandom index row*
			//------------------------------------------------------------------------------------------
			//rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
			rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
			//------------------------------------------------------------------------------------------

			//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
			reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]);

			//update prev: it now points to the last row ever computed
			prev = row;

			//updates row: goes to the next row to be computed
			//------------------------------------------------------------------------------------------
			//row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
			row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
			//------------------------------------------------------------------------------------------

		} while (row != 0);
	}
	//==========================================================================/

	//============================ Wrap-up Phase ===============================//
	//Absorbs the last block of the memory matrix
	absorbBlock(state, memMatrix[rowa]);

	//Squeezes the key
	squeeze(state, K, (size_t) kLen);
	//==========================================================================/

	//========================= Freeing the memory =============================//
	free(memMatrix);
	free(wholeMatrix);

	//Wiping out the sponge's internal state before freeing it
	memset(state, 0, 16 * sizeof (uint64_t));
	free(state);
	//==========================================================================/

	return 0;
}

50
lyra2/Lyra2.h

@ -0,0 +1,50 @@
/**
* Header file for the Lyra2 Password Hashing Scheme (PHS).
*
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
*
* This software is hereby placed in the public domain.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef LYRA2_H_
#define LYRA2_H_
#include <stdint.h>
typedef unsigned char byte;
//Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED)
#define BLOCK_LEN_BLAKE2_SAFE_INT64 8 //512 bits (=64 bytes, =8 uint64_t)
#define BLOCK_LEN_BLAKE2_SAFE_BYTES (BLOCK_LEN_BLAKE2_SAFE_INT64 * 8) //same as above, in bytes
#ifdef BLOCK_LEN_BITS
#define BLOCK_LEN_INT64 (BLOCK_LEN_BITS/64) //Block length: 768 bits (=96 bytes, =12 uint64_t)
#define BLOCK_LEN_BYTES (BLOCK_LEN_BITS/8) //Block length, in bytes
#else //default block length: 768 bits
#define BLOCK_LEN_INT64 12 //Block length: 768 bits (=96 bytes, =12 uint64_t)
#define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes
#endif
#ifndef N_COLS
#define N_COLS 8 //Number of columns in the memory matrix: fixed to 8 by default
#endif
#define ROW_LEN_INT64 (BLOCK_LEN_INT64 * N_COLS) //Total length of a row: N_COLS blocks
#define ROW_LEN_BYTES (ROW_LEN_INT64 * 8) //Number of bytes per row
int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols);
#endif /* LYRA2_H_ */

755
lyra2/Sponge.c

@ -0,0 +1,755 @@
/**
* A simple implementation of Blake2b's internal permutation
* in the form of a sponge.
*
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
*
* This software is hereby placed in the public domain.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <string.h>
#include <stdio.h>
#include <time.h>
#include "Sponge.h"
#include "Lyra2.h"
/**
* Initializes the Sponge State. The first 512 bits are set to zeros and the remainder
* receive Blake2b's IV as per Blake2b's specification. <b>Note:</b> Even though sponges
* typically have their internal state initialized with zeros, Blake2b's G function
* has a fixed point: if the internal state and message are both filled with zeros. the
* resulting permutation will always be a block filled with zeros; this happens because
* Blake2b does not use the constants originally employed in Blake2 inside its G function,
* relying on the IV for avoiding possible fixed points.
*
* @param state The 1024-bit array to be initialized
*/
void initState(uint64_t state[/*16*/]) {
	int k;
	//Words 0..7 (the first 512 bits) start out as zeros
	memset(state, 0, 8 * sizeof (uint64_t));
	//Words 8..15 receive the eight words of the Blake2b IV
	for (k = 0; k < 8; k++) {
		state[8 + k] = blake2b_IV[k];
	}
}
/**
* Execute Blake2b's G function, with all 12 rounds.
*
* @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
*/
/*
 * Applies the ROUND_LYRA macro twelve times (arguments 0..11) to the state.
 * ROUND_LYRA is defined outside this section; it takes no pointer argument,
 * so it presumably refers to the parameter by its name "v" - do not rename it.
 */
__inline static void blake2bLyra(uint64_t *v) {
ROUND_LYRA(0);
ROUND_LYRA(1);
ROUND_LYRA(2);
ROUND_LYRA(3);
ROUND_LYRA(4);
ROUND_LYRA(5);
ROUND_LYRA(6);
ROUND_LYRA(7);
ROUND_LYRA(8);
ROUND_LYRA(9);
ROUND_LYRA(10);
ROUND_LYRA(11);
}
/**
* Executes a reduced version of Blake2b's G function with only one round
* @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
*/
/*
 * Reduced permutation: a single ROUND_LYRA(0) on the state. The macro is
 * defined outside this section and presumably refers to the parameter by its
 * name "v" - do not rename it.
 */
__inline static void reducedBlake2bLyra(uint64_t *v) {
ROUND_LYRA(0);
}
/**
* Performs a squeeze operation, using Blake2b's G function as the
* internal permutation
*
* @param state The current state of the sponge
* @param out Array that will receive the data squeezed
* @param len The number of bytes to be squeezed into the "out" array
*/
void squeeze(uint64_t *state, byte *out, unsigned int len) {
	unsigned int remaining = len; //bytes still to be produced
	byte *dst = out;

	//Emits whole blocks, permuting the state after each one
	while (remaining >= BLOCK_LEN_BYTES) {
		memcpy(dst, state, BLOCK_LEN_BYTES);
		blake2bLyra(state);
		dst += BLOCK_LEN_BYTES;
		remaining -= BLOCK_LEN_BYTES;
	}

	//Emits the trailing partial block (possibly zero bytes), without a further permutation
	memcpy(dst, state, remaining);
}
/**
* Performs an absorb operation for a single block (BLOCK_LEN_INT64 words
* of type uint64_t), using Blake2b's G function as the internal permutation
*
* @param state The current state of the sponge
* @param in The block to be absorbed (BLOCK_LEN_INT64 words)
*/
void absorbBlock(uint64_t *state, const uint64_t *in) {
	int w;
	//Mixes the 12 input words (one block at the default block length) into the state
	for (w = 0; w < 12; w++) {
		state[w] ^= in[w];
	}
	//Full-round permutation of the sponge's state
	blake2bLyra(state);
}
/**
* Performs an absorb operation for a single block (BLOCK_LEN_BLAKE2_SAFE_INT64
* words of type uint64_t), using Blake2b's G function as the internal permutation
*
* @param state The current state of the sponge
* @param in The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words)
*/
void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) {
//XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state
state[0] ^= in[0];
state[1] ^= in[1];
state[2] ^= in[2];
state[3] ^= in[3];
state[4] ^= in[4];
state[5] ^= in[5];
state[6] ^= in[6];
state[7] ^= in[7];
//Applies the transformation f to the sponge's state
blake2bLyra(state);
/*
for(int i = 0; i<16; i++) {
printf(" final state %d %08x %08x in %08x %08x\n", i, (uint32_t)(state[i] & 0xFFFFFFFFULL), (uint32_t)(state[i] >> 32),
(uint32_t)(in[i] & 0xFFFFFFFFULL), (uint32_t)(in[i] >> 32));
}
*/
}
/**
 * Fills one matrix row with reduced-round squeeze output. Columns are
 * written from the highest index (N_COLS-1) down to column 0; after each
 * column is written the state is permuted once with reducedBlake2bLyra().
 *
 * @param state  The current state of the sponge (modified in place)
 * @param rowOut Row to receive the data squeezed
 *               (N_COLS blocks spaced BLOCK_LEN_INT64 words apart)
 */
void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut) {
    int col;
    int word;
    for (col = 0; col < N_COLS; col++) {
        /* M[row][C-1-col] = H.reduced_squeeze() */
        uint64_t* block = rowOut + (N_COLS - 1 - col) * BLOCK_LEN_INT64;
        for (word = 0; word < 12; word++) {
            block[word] = state[word];
        }
        /* Debug dump kept from the original (disabled):
        for (word = 0; word<12; word++) {
            printf(" after reducedSqueezeRow0 %d %08x %08x in %08x %08x\n", word, (uint32_t)(block[word] & 0xFFFFFFFFULL), (uint32_t)(block[word] >> 32),
            (uint32_t)(state[word] & 0xFFFFFFFFULL), (uint32_t)(state[word] >> 32));
        }
        */
        /* Reduced-round transformation f before the next (lower) column */
        reducedBlake2bLyra(state);
    }
}
/**
 * Reduced duplex over a single input row. For each column, taken in
 * increasing order, the 12-word block M[prev][col] is XORed into the state,
 * the state is permuted with reducedBlake2bLyra(), and the block
 * "M[prev][col] XOR rand" is stored at M[row][N_COLS-1-col], where rand is
 * state[0..11] after the permutation.
 *
 * @param state  The current state of the sponge (modified in place)
 * @param rowIn  Row to feed the sponge (read left to right)
 * @param rowOut Row to receive the sponge's output (written right to left)
 */
void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut) {
    int col;
    int word;
    for (col = 0; col < N_COLS; col++) {
        uint64_t* prevBlock = rowIn + col * BLOCK_LEN_INT64;                    /* M[prev][col] */
        uint64_t* rowBlock = rowOut + (N_COLS - 1 - col) * BLOCK_LEN_INT64;     /* M[row][C-1-col] */
        /* Absorb M[prev][col] */
        for (word = 0; word < 12; word++) {
            state[word] ^= prevBlock[word];
        }
        /* Reduced-round transformation f */
        reducedBlake2bLyra(state);
        /* M[row][C-1-col] = M[prev][col] XOR rand */
        for (word = 0; word < 12; word++) {
            rowBlock[word] = prevBlock[word] ^ state[word];
        }
    }
}
/**
 * Duplexing step over "M[rowInOut][col] [+] M[rowIn][col]" (wordwise 64-bit
 * addition, no carry between words). For each column, in increasing order:
 *   - the sum is XORed into state[0..11] and the state is permuted with
 *     reducedBlake2bLyra(); call the resulting state[0..11] "rand";
 *   - M[rowOut][N_COLS-1-col] = M[rowIn][col] XOR rand;
 *   - M[rowInOut][col] ^= rand rotated by one word, i.e. word k receives
 *     rand[k-1] and word 0 receives rand[11].
 *
 * @param state    The current state of the sponge (modified in place)
 * @param rowIn    Row used only as input
 * @param rowInOut Row used as input and to receive output after rotation
 * @param rowOut   Row receiving the output (written right to left)
 *
 */
void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
    int col;
    int word;
    for (col = 0; col < N_COLS; col++) {
        uint64_t* prevBlock = rowIn + col * BLOCK_LEN_INT64;                    /* M[prev][col] */
        uint64_t* starBlock = rowInOut + col * BLOCK_LEN_INT64;                 /* M[row*][col] */
        uint64_t* rowBlock = rowOut + (N_COLS - 1 - col) * BLOCK_LEN_INT64;     /* M[row][C-1-col] */
        /* Absorb "M[prev] [+] M[row*]" */
        for (word = 0; word < 12; word++) {
            state[word] ^= (prevBlock[word] + starBlock[word]);
        }
        /* Reduced-round transformation f */
        reducedBlake2bLyra(state);
        /* M[row][C-1-col] = M[prev][col] XOR rand */
        for (word = 0; word < 12; word++) {
            rowBlock[word] = prevBlock[word] ^ state[word];
        }
        /* M[row*][col] ^= rotW(rand): word k takes rand[k-1], word 0 takes rand[11] */
        for (word = 0; word < 12; word++) {
            starBlock[word] ^= state[(word + 11) % 12];
        }
    }
}
/**
 * Duplexing step over "M[rowInOut][col] [+] M[rowIn][col]" (wordwise 64-bit
 * addition, no carry between words). For each column, in increasing order:
 *   - the sum is XORed into state[0..11] and the state is permuted with
 *     reducedBlake2bLyra(); call the resulting state[0..11] "rand";
 *   - M[rowOut][col] ^= rand;
 *   - M[rowInOut][col] ^= rand rotated by one word, i.e. word k receives
 *     rand[k-1] and word 0 receives rand[11].
 * The rowOut update of a column is completed before its rowInOut update.
 *
 * @param state    The current state of the sponge (modified in place)
 * @param rowIn    Row used only as input
 * @param rowInOut Row used as input and to receive output after rotation
 * @param rowOut   Row receiving the output
 *
 */
void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
    int col;
    int word;
    for (col = 0; col < N_COLS; col++) {
        uint64_t* prevBlock = rowIn + col * BLOCK_LEN_INT64;        /* M[prev][col] */
        uint64_t* starBlock = rowInOut + col * BLOCK_LEN_INT64;     /* M[row*][col] */
        uint64_t* rowBlock = rowOut + col * BLOCK_LEN_INT64;        /* M[row][col] */
        /* Absorb "M[prev] [+] M[row*]" */
        for (word = 0; word < 12; word++) {
            state[word] ^= (prevBlock[word] + starBlock[word]);
        }
        /* Reduced-round transformation f */
        reducedBlake2bLyra(state);
        /* M[rowOut][col] ^= rand */
        for (word = 0; word < 12; word++) {
            rowBlock[word] ^= state[word];
        }
        /* M[rowInOut][col] ^= rotW(rand): word k takes rand[k-1], word 0 takes rand[11] */
        for (word = 0; word < 12; word++) {
            starBlock[word] ^= state[(word + 11) % 12];
        }
    }
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/**
* Performs a duplex operation over "M[rowInOut] [+] M[rowIn]", writing the output "rand"
* on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit
* rotation to the left.
*
* @param state The current state of the sponge
* @param rowIn Row used only as input
* @param rowInOut Row used as input and to receive output after rotation
* @param rowOut Row receiving the output
*
*/
/*
inline void reducedDuplexRowSetupOLD(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
int i;
for (i = 0; i < N_COLS; i++) {
//Absorbing "M[rowInOut] XOR M[rowIn]"
state[0] ^= ptrWordInOut[0] ^ ptrWordIn[0];
state[1] ^= ptrWordInOut[1] ^ ptrWordIn[1];
state[2] ^= ptrWordInOut[2] ^ ptrWordIn[2];
state[3] ^= ptrWordInOut[3] ^ ptrWordIn[3];
state[4] ^= ptrWordInOut[4] ^ ptrWordIn[4];
state[5] ^= ptrWordInOut[5] ^ ptrWordIn[5];
state[6] ^= ptrWordInOut[6] ^ ptrWordIn[6];
state[7] ^= ptrWordInOut[7] ^ ptrWordIn[7];
state[8] ^= ptrWordInOut[8] ^ ptrWordIn[8];
state[9] ^= ptrWordInOut[9] ^ ptrWordIn[9];
state[10] ^= ptrWordInOut[10] ^ ptrWordIn[10];
state[11] ^= ptrWordInOut[11] ^ ptrWordIn[11];
//Applies the reduced-round transformation f to the sponge's state
reducedBlake2bLyra(state);
//M[row][col] = rand
ptrWordOut[0] = state[0];
ptrWordOut[1] = state[1];
ptrWordOut[2] = state[2];
ptrWordOut[3] = state[3];
ptrWordOut[4] = state[4];
ptrWordOut[5] = state[5];
ptrWordOut[6] = state[6];
ptrWordOut[7] = state[7];
ptrWordOut[8] = state[8];
ptrWordOut[9] = state[9];
ptrWordOut[10] = state[10];
ptrWordOut[11] = state[11];
//M[row*][col] = M[row*][col] XOR rotW(rand)
ptrWordInOut[0] ^= state[10];
ptrWordInOut[1] ^= state[11];
ptrWordInOut[2] ^= state[0];
ptrWordInOut[3] ^= state[1];
ptrWordInOut[4] ^= state[2];
ptrWordInOut[5] ^= state[3];
ptrWordInOut[6] ^= state[4];
ptrWordInOut[7] ^= state[5];
ptrWordInOut[8] ^= state[6];
ptrWordInOut[9] ^= state[7];
ptrWordInOut[10] ^= state[8];
ptrWordInOut[11] ^= state[9];
//Goes to next column (i.e., next block in sequence)
ptrWordInOut += BLOCK_LEN_INT64;
ptrWordIn += BLOCK_LEN_INT64;
ptrWordOut += BLOCK_LEN_INT64;
}
}
*/
/**
* Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", writing the output "rand"
* on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit
* rotation to the left.
*
* @param state The current state of the sponge
* @param rowIn Row used only as input
* @param rowInOut Row used as input and to receive output after rotation
* @param rowOut Row receiving the output
*
*/
/*
inline void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
int i;
for (i = 0; i < N_COLS; i++) {
//Absorbing "M[rowInOut] XOR M[rowIn]"
state[0] ^= ptrWordInOut[0] + ptrWordIn[0];
state[1] ^= ptrWordInOut[1] + ptrWordIn[1];
state[2] ^= ptrWordInOut[2] + ptrWordIn[2];
state[3] ^= ptrWordInOut[3] + ptrWordIn[3];
state[4] ^= ptrWordInOut[4] + ptrWordIn[4];
state[5] ^= ptrWordInOut[5] + ptrWordIn[5];
state[6] ^= ptrWordInOut[6] + ptrWordIn[6];
state[7] ^= ptrWordInOut[7] + ptrWordIn[7];
state[8] ^= ptrWordInOut[8] + ptrWordIn[8];
state[9] ^= ptrWordInOut[9] + ptrWordIn[9];
state[10] ^= ptrWordInOut[10] + ptrWordIn[10];
state[11] ^= ptrWordInOut[11] + ptrWordIn[11];
//Applies the reduced-round transformation f to the sponge's state
reducedBlake2bLyra(state);
//M[row*][col] = M[row*][col] XOR rotW(rand)
ptrWordInOut[0] ^= state[10];
ptrWordInOut[1] ^= state[11];
ptrWordInOut[2] ^= state[0];
ptrWordInOut[3] ^= state[1];
ptrWordInOut[4] ^= state[2];
ptrWordInOut[5] ^= state[3];
ptrWordInOut[6] ^= state[4];
ptrWordInOut[7] ^= state[5];
ptrWordInOut[8] ^= state[6];
ptrWordInOut[9] ^= state[7];
ptrWordInOut[10] ^= state[8];
ptrWordInOut[11] ^= state[9];
//M[row][col] = rand
ptrWordOut[0] = state[0] ^ ptrWordIn[0];
ptrWordOut[1] = state[1] ^ ptrWordIn[1];
ptrWordOut[2] = state[2] ^ ptrWordIn[2];
ptrWordOut[3] = state[3] ^ ptrWordIn[3];
ptrWordOut[4] = state[4] ^ ptrWordIn[4];
ptrWordOut[5] = state[5] ^ ptrWordIn[5];
ptrWordOut[6] = state[6] ^ ptrWordIn[6];
ptrWordOut[7] = state[7] ^ ptrWordIn[7];
ptrWordOut[8] = state[8] ^ ptrWordIn[8];
ptrWordOut[9] = state[9] ^ ptrWordIn[9];
ptrWordOut[10] = state[10] ^ ptrWordIn[10];
ptrWordOut[11] = state[11] ^ ptrWordIn[11];
//Goes to next column (i.e., next block in sequence)
ptrWordInOut += BLOCK_LEN_INT64;
ptrWordIn += BLOCK_LEN_INT64;
ptrWordOut += BLOCK_LEN_INT64;
}
}
*/
/**
* Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", writing the output "rand"
* on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit
* rotation to the left.
*
* @param state The current state of the sponge
* @param rowIn Row used only as input
* @param rowInOut Row used as input and to receive output after rotation
* @param rowOut Row receiving the output
*
*/
/*
inline void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordOut = rowOut;
int i;
for (i = 0; i < N_COLS / 2; i++) {
//Absorbing "M[rowInOut] XOR M[rowIn]"
state[0] ^= ptrWordInOut[0] + ptrWordIn[0];
state[1] ^= ptrWordInOut[1] + ptrWordIn[1];
state[2] ^= ptrWordInOut[2] + ptrWordIn[2];
state[3] ^= ptrWordInOut[3] + ptrWordIn[3];
state[4] ^= ptrWordInOut[4] + ptrWordIn[4];
state[5] ^= ptrWordInOut[5] + ptrWordIn[5];
state[6] ^= ptrWordInOut[6] + ptrWordIn[6];
state[7] ^= ptrWordInOut[7] + ptrWordIn[7];
state[8] ^= ptrWordInOut[8] + ptrWordIn[8];
state[9] ^= ptrWordInOut[9] + ptrWordIn[9];
state[10] ^= ptrWordInOut[10] + ptrWordIn[10];
state[11] ^= ptrWordInOut[11] + ptrWordIn[11];
//Applies the reduced-round transformation f to the sponge's state
reducedBlake2bLyra(state);
//M[row*][col] = M[row*][col] XOR rotW(rand)
ptrWordInOut[0] ^= state[10];
ptrWordInOut[1] ^= state[11];
ptrWordInOut[2] ^= state[0];
ptrWordInOut[3] ^= state[1];
ptrWordInOut[4] ^= state[2];
ptrWordInOut[5] ^= state[3];
ptrWordInOut[6] ^= state[4];
ptrWordInOut[7] ^= state[5];
ptrWordInOut[8] ^= state[6];
ptrWordInOut[9] ^= state[7];
ptrWordInOut[10] ^= state[8];
ptrWordInOut[11] ^= state[9];
//M[row][col] = rand
ptrWordOut[0] = state[0] ^ ptrWordIn[0];
ptrWordOut[1] = state[1] ^ ptrWordIn[1];
ptrWordOut[2] = state[2] ^ ptrWordIn[2];
ptrWordOut[3] = state[3] ^ ptrWordIn[3];
ptrWordOut[4] = state[4] ^ ptrWordIn[4];
ptrWordOut[5] = state[5] ^ ptrWordIn[5];
ptrWordOut[6] = state[6] ^ ptrWordIn[6];
ptrWordOut[7] = state[7] ^ ptrWordIn[7];
ptrWordOut[8] = state[8] ^ ptrWordIn[8];
ptrWordOut[9] = state[9] ^ ptrWordIn[9];
ptrWordOut[10] = state[10] ^ ptrWordIn[10];
ptrWordOut[11] = state[11] ^ ptrWordIn[11];
//Goes to next column (i.e., next block in sequence)
ptrWordInOut += BLOCK_LEN_INT64;
ptrWordIn += BLOCK_LEN_INT64;
ptrWordOut += 2 * BLOCK_LEN_INT64;
}
ptrWordOut = rowOut + BLOCK_LEN_INT64;
for (i = 0; i < N_COLS / 2; i++) {
//Absorbing "M[rowInOut] XOR M[rowIn]"
state[0] ^= ptrWordInOut[0] + ptrWordIn[0];
state[1] ^= ptrWordInOut[1] + ptrWordIn[1];
state[2] ^= ptrWordInOut[2] + ptrWordIn[2];
state[3] ^= ptrWordInOut[3] + ptrWordIn[3];
state[4] ^= ptrWordInOut[4] + ptrWordIn[4];
state[5] ^= ptrWordInOut[5] + ptrWordIn[5];
state[6] ^= ptrWordInOut[6] + ptrWordIn[6];
state[7] ^= ptrWordInOut[7] + ptrWordIn[7];
state[8] ^= ptrWordInOut[8] + ptrWordIn[8];
state[9] ^= ptrWordInOut[9] + ptrWordIn[9];
state[10] ^= ptrWordInOut[10] + ptrWordIn[10];
state[11] ^= ptrWordInOut[11] + ptrWordIn[11];
//Applies the reduced-round transformation f to the sponge's state
reducedBlake2bLyra(state);
//M[row*][col] = M[row*][col] XOR rotW(rand)
ptrWordInOut[0] ^= state[10];
ptrWordInOut[1] ^= state[11];
ptrWordInOut[2] ^= state[0];
ptrWordInOut[3] ^= state[1];
ptrWordInOut[4] ^= state[2];
ptrWordInOut[5] ^= state[3];
ptrWordInOut[6] ^= state[4];
ptrWordInOut[7] ^= state[5];
ptrWordInOut[8] ^= state[6];
ptrWordInOut[9] ^= state[7];
ptrWordInOut[10] ^= state[8];
ptrWordInOut[11] ^= state[9];
//M[row][col] = rand
ptrWordOut[0] = state[0] ^ ptrWordIn[0];
ptrWordOut[1] = state[1] ^ ptrWordIn[1];
ptrWordOut[2] = state[2] ^ ptrWordIn[2];
ptrWordOut[3] = state[3] ^ ptrWordIn[3];
ptrWordOut[4] = state[4] ^ ptrWordIn[4];
ptrWordOut[5] = state[5] ^ ptrWordIn[5];
ptrWordOut[6] = state[6] ^ ptrWordIn[6];
ptrWordOut[7] = state[7] ^ ptrWordIn[7];
ptrWordOut[8] = state[8] ^ ptrWordIn[8];
ptrWordOut[9] = state[9] ^ ptrWordIn[9];
ptrWordOut[10] = state[10] ^ ptrWordIn[10];
ptrWordOut[11] = state[11] ^ ptrWordIn[11];
//Goes to next column (i.e., next block in sequence)
ptrWordInOut += BLOCK_LEN_INT64;
ptrWordIn += BLOCK_LEN_INT64;
ptrWordOut += 2 * BLOCK_LEN_INT64;
}
}
*/
/**
* Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", using the output "rand"
* to make "M[rowOut][col] = M[rowOut][col] XOR rand" and "M[rowInOut] = M[rowInOut] XOR rotW(rand)",
* where rotW is a 64-bit rotation to the left.
*
* @param state The current state of the sponge
* @param rowIn Row used only as input
* @param rowInOut Row used as input and to receive output after rotation
* @param rowOut Row receiving the output
*
*/
/*
inline void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
int i;
for (i = 0; i < N_COLS; i++) {
//Absorbing "M[rowInOut] XOR M[rowIn]"
state[0] ^= ptrWordInOut[0] + ptrWordIn[0];
state[1] ^= ptrWordInOut[1] + ptrWordIn[1];
state[2] ^= ptrWordInOut[2] + ptrWordIn[2];
state[3] ^= ptrWordInOut[3] + ptrWordIn[3];
state[4] ^= ptrWordInOut[4] + ptrWordIn[4];
state[5] ^= ptrWordInOut[5] + ptrWordIn[5];
state[6] ^= ptrWordInOut[6] + ptrWordIn[6];
state[7] ^= ptrWordInOut[7] + ptrWordIn[7];
state[8] ^= ptrWordInOut[8] + ptrWordIn[8];
state[9] ^= ptrWordInOut[9] + ptrWordIn[9];
state[10] ^= ptrWordInOut[10] + ptrWordIn[10];
state[11] ^= ptrWordInOut[11] + ptrWordIn[11];
//Applies the reduced-round transformation f to the sponge's state
reducedBlake2bLyra(state);
//M[rowOut][col] = M[rowOut][col] XOR rand
ptrWordOut[0] ^= state[0];
ptrWordOut[1] ^= state[1];
ptrWordOut[2] ^= state[2];
ptrWordOut[3] ^= state[3];
ptrWordOut[4] ^= state[4];
ptrWordOut[5] ^= state[5];
ptrWordOut[6] ^= state[6];
ptrWordOut[7] ^= state[7];
ptrWordOut[8] ^= state[8];
ptrWordOut[9] ^= state[9];
ptrWordOut[10] ^= state[10];
ptrWordOut[11] ^= state[11];
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
//Goes to next block
ptrWordOut += BLOCK_LEN_INT64;
ptrWordInOut += BLOCK_LEN_INT64;
ptrWordIn += BLOCK_LEN_INT64;
}
}
*/
/**
 * Prints an array of unsigned chars to stdout as "<name>: " followed by
 * each byte in hexadecimal (format "%2x", each followed by '|') and a
 * final newline.
 *
 * @param array Bytes to print
 * @param size  Number of bytes of "array" to print
 * @param name  Label printed before the bytes
 */
void printArray(unsigned char *array, unsigned int size, char *name) {
    /* The counter has the same type as "size": avoids the signed/unsigned
     * comparison and signed overflow for sizes above INT_MAX. */
    unsigned int i;
    printf("%s: ", name);
    for (i = 0; i < size; i++) {
        printf("%2x|", array[i]);
    }
    printf("\n");
}
////////////////////////////////////////////////////////////////////////////////////////////////

108
lyra2/Sponge.h

@ -0,0 +1,108 @@
/**
* Header file for Blake2b's internal permutation in the form of a sponge.
* This code is based on the original Blake2b's implementation provided by
* Samuel Neves (https://blake2.net/)
*
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
*
* This software is hereby placed in the public domain.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef SPONGE_H_
#define SPONGE_H_
#include <stdint.h>
/* ALIGN: 32-byte alignment specifier. Uses the attribute syntax when
 * __GNUC__ is defined, __declspec(align) when _MSC_VER is defined, and
 * expands to nothing on any other compiler. */
#if defined(__GNUC__)
#define ALIGN __attribute__ ((aligned(32)))
#elif defined(_MSC_VER)
#define ALIGN __declspec(align(32))
#else
#define ALIGN
#endif
/* Blake2b IV array: eight 64-bit initialization constants.
 * Declared static in this header, so every translation unit that includes
 * it gets its own private copy. */
static const uint64_t blake2b_IV[8] =
{
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
};
/* Blake2b's rotation: rotates the 64-bit word w right by c bits.
 * The count is reduced modulo 64 and both shift amounts are masked, so
 * c == 0 (and any c >= 64) is well defined instead of shifting a 64-bit
 * value by its full width, which is undefined behaviour in C.
 * Results for c in 1..63 are unchanged. */
static __inline uint64_t rotr64( const uint64_t w, const unsigned c ){
    const unsigned r = c & 63u;
    return ( w >> r ) | ( w << ( ( 64u - r ) & 63u ) );
}
/* Blake2b's G function: mixes the four 64-bit lvalues a, b, c, d in place
 * using additions, XORs and right rotations by 32, 24, 16 and 63 bits.
 * No message words are added. The parameters r and i are accepted but not
 * used in the body. Each of a, b, c, d is evaluated several times, so pass
 * side-effect-free lvalues. Wrapped in do { } while(0) so it behaves as a
 * single statement. */
#define G(r,i,a,b,c,d) \
do { \
a = a + b; \
d = rotr64(d ^ a, 32); \
c = c + d; \
b = rotr64(b ^ c, 24); \
a = a + b; \
d = rotr64(d ^ a, 16); \
c = c + d; \
b = rotr64(b ^ c, 63); \
} while(0)
/* One round of the Blake2b's compression function, applied to a 16-word
 * array that must be named "v" at the expansion site: four G calls over the
 * columns (v[k], v[k+4], v[k+8], v[k+12]) followed by four G calls over the
 * diagonals. The argument r is only forwarded to G, which ignores it.
 * Expands to eight separate statements (not a single do/while block), so
 * enclose it in braces when used as the body of an if/for/while. */
#define ROUND_LYRA(r) \
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
G(r,2,v[ 2],v[ 6],v[10],v[14]); \
G(r,3,v[ 3],v[ 7],v[11],v[15]); \
G(r,4,v[ 0],v[ 5],v[10],v[15]); \
G(r,5,v[ 1],v[ 6],v[11],v[12]); \
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
//---- Housekeeping
// Sets up a sponge state of 16 uint64_t words (size per the parameter comment).
void initState(uint64_t state[/*16*/]);
//---- Squeezes
// Writes "len" bytes of sponge output to "out".
void squeeze(uint64_t *state, unsigned char *out, unsigned int len);
// Fills one row with reduced-round squeeze output, last column first.
void reducedSqueezeRow0(uint64_t* state, uint64_t* row);
//---- Absorbs
// XORs a 12-word block into the state, then applies the permutation.
void absorbBlock(uint64_t *state, const uint64_t *in);
// XORs an 8-word block into the state, then applies the permutation.
void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in);
//---- Duplexes
// Absorbs rowIn column by column and writes "rowIn XOR rand" into rowOut in reverse column order.
void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut);
// Absorbs rowIn [+] rowInOut; writes rowOut in reverse column order and updates rowInOut in place.
void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
// Absorbs rowIn [+] rowInOut; XORs the output into rowOut and updates rowInOut in place.
void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
//---- Misc
// Prints "size" bytes of "array" in hexadecimal, prefixed by "name".
void printArray(unsigned char *array, unsigned int size, char *name);
////////////////////////////////////////////////////////////////////////////////////////////////
////TESTS////
//void reducedDuplexRowc(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
//void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
//void reducedDuplexRowSetupv4(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn2, uint64_t *rowOut1, uint64_t *rowOut2);
//void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
//void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
//void reducedDuplexRowSetupv5d(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
/////////////
#endif /* SPONGE_H_ */

536
lyra2/cuda_lyra2.cu

@ -0,0 +1,536 @@
#include <memory.h>
#include "cuda_helper.h"
// Blake2b initialization vector in device constant memory.
// Each 64-bit constant is stored as a uint2 with the low 32 bits first and
// the high 32 bits second (e.g. 0x6a09e667f3bcc908 -> { 0xf3bcc908, 0x6a09e667 }).
static __constant__ uint2 blake2b_IV[8] = {
{ 0xf3bcc908, 0x6a09e667 },
{ 0x84caa73b, 0xbb67ae85 },
{ 0xfe94f82b, 0x3c6ef372 },
{ 0x5f1d36f1, 0xa54ff53a },
{ 0xade682d1, 0x510e527f },
{ 0x2b3e6c1f, 0x9b05688c },
{ 0xfb41bd6b, 0x1f83d9ab },
{ 0x137e2179, 0x5be0cd19 }
};
// data: 0-4 outputhash 4-8 outputhash 8-16 basil
// reduceDuplexRowSetup(rowIn, rowInOut, rowOut)
// Macro; uses the variables "state" and "Matrix" that must exist at the
// expansion site. Matrix is indexed as Matrix[word][row], where word =
// 12 * column + j for 8 columns of 12 words each.
// For each column i (0..7):
//   - state[j] ^= Matrix[rowIn] + Matrix[rowInOut] for the 12 words of column i;
//   - one round_lyra_v35(state);
//   - column (7 - i) of rowOut (word offset 84 - 12*i) = column i of rowIn ^ state;
//   - column i of rowInOut ^= state rotated by one word
//     (word 0 takes state[11], word k takes state[k-1]).
#define reduceDuplexRowSetup(rowIn, rowInOut, rowOut) { \
for (int i = 0; i < 8; i++) { \
for (int j = 0; j < 12; j++) \
state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \
round_lyra_v35(state); \
for (int j = 0; j < 12; j++) \
Matrix[j + 84 - 12 * i][rowOut] = Matrix[12 * i + j][rowIn] ^ state[j]; \
Matrix[0 + 12 * i][rowInOut] ^= state[11]; \
Matrix[1 + 12 * i][rowInOut] ^= state[0]; \
Matrix[2 + 12 * i][rowInOut] ^= state[1]; \
Matrix[3 + 12 * i][rowInOut] ^= state[2]; \
Matrix[4 + 12 * i][rowInOut] ^= state[3]; \
Matrix[5 + 12 * i][rowInOut] ^= state[4]; \
Matrix[6 + 12 * i][rowInOut] ^= state[5]; \
Matrix[7 + 12 * i][rowInOut] ^= state[6]; \
Matrix[8 + 12 * i][rowInOut] ^= state[7]; \
Matrix[9 + 12 * i][rowInOut] ^= state[8]; \
Matrix[10+ 12 * i][rowInOut] ^= state[9]; \
Matrix[11+ 12 * i][rowInOut] ^= state[10]; \
} \
}
// reduceDuplexRow(rowIn, rowInOut, rowOut)
// Macro; uses the variables "state" and "Matrix" that must exist at the
// expansion site. Matrix is indexed as Matrix[word][row], where word =
// 12 * column + j for 8 columns of 12 words each.
// For each column i (0..7):
//   - state[j] ^= Matrix[rowIn] + Matrix[rowInOut] for the 12 words of column i;
//   - one round_lyra_v35(state);
//   - column i of rowOut ^= state (same column, not reversed);
//   - column i of rowInOut ^= state rotated by one word
//     (word 0 takes state[11], word k takes state[k-1]).
#define reduceDuplexRow(rowIn, rowInOut, rowOut) { \
for (int i = 0; i < 8; i++) { \
for (int j = 0; j < 12; j++) \
state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \
round_lyra_v35(state); \
for (int j = 0; j < 12; j++) \
Matrix[j + 12 * i][rowOut] ^= state[j]; \
Matrix[0 + 12 * i][rowInOut] ^= state[11]; \
Matrix[1 + 12 * i][rowInOut] ^= state[0]; \
Matrix[2 + 12 * i][rowInOut] ^= state[1]; \
Matrix[3 + 12 * i][rowInOut] ^= state[2]; \
Matrix[4 + 12 * i][rowInOut] ^= state[3]; \
Matrix[5 + 12 * i][rowInOut] ^= state[4]; \
Matrix[6 + 12 * i][rowInOut] ^= state[5]; \
Matrix[7 + 12 * i][rowInOut] ^= state[6]; \
Matrix[8 + 12 * i][rowInOut] ^= state[7]; \
Matrix[9 + 12 * i][rowInOut] ^= state[8]; \
Matrix[10+ 12 * i][rowInOut] ^= state[9]; \
Matrix[11+ 12 * i][rowInOut] ^= state[10]; \
} \
}
// absorbblock(in)
// Macro; uses the variables "state" and "Matrix" from the expansion site.
// XORs words 0..11 of row "in" (Matrix[0..11][in], i.e. the first column of
// that row) into state[0..11], then applies round_lyra_v35 twelve times.
#define absorbblock(in) { \
state[0] ^= Matrix[0][in]; \
state[1] ^= Matrix[1][in]; \
state[2] ^= Matrix[2][in]; \
state[3] ^= Matrix[3][in]; \
state[4] ^= Matrix[4][in]; \
state[5] ^= Matrix[5][in]; \
state[6] ^= Matrix[6][in]; \
state[7] ^= Matrix[7][in]; \
state[8] ^= Matrix[8][in]; \
state[9] ^= Matrix[9][in]; \
state[10] ^= Matrix[10][in]; \
state[11] ^= Matrix[11][in]; \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
}
//// test version
// reduceDuplexRowSetup_test(rowIn, rowInOut, rowOut)
// Same steps as reduceDuplexRowSetup, but for a Matrix addressed with three
// indices, Matrix[word 0..11][column 0..7][row]. Uses "state" and "Matrix"
// from the expansion site.
// For each column i: absorb rowIn + rowInOut into state, one
// round_lyra_v35(state), write rowIn ^ state into column (7 - i) of rowOut,
// then XOR the state rotated by one word into column i of rowInOut.
#define reduceDuplexRowSetup_test(rowIn, rowInOut, rowOut) { \
for (int i = 0; i < 8; i++) { \
for (int j = 0; j < 12; j++) \
state[j] ^= Matrix[j][i][rowIn] + Matrix[j][i][rowInOut]; \
round_lyra_v35(state); \
for (int j = 0; j < 12; j++) \
Matrix[j][7-i][rowOut] = Matrix[j][i][rowIn] ^ state[j]; \
Matrix[0][i][rowInOut] ^= state[11]; \
Matrix[1][i][rowInOut] ^= state[0]; \
Matrix[2][i][rowInOut] ^= state[1]; \
Matrix[3][i][rowInOut] ^= state[2]; \
Matrix[4][i][rowInOut] ^= state[3]; \
Matrix[5][i][rowInOut] ^= state[4]; \
Matrix[6][i][rowInOut] ^= state[5]; \
Matrix[7][i][rowInOut] ^= state[6]; \
Matrix[8][i][rowInOut] ^= state[7]; \
Matrix[9][i][rowInOut] ^= state[8]; \
Matrix[10][i][rowInOut] ^= state[9]; \
Matrix[11][i][rowInOut] ^= state[10]; \
} \
}
// reduceDuplexRow_test(rowIn, rowInOut, rowOut)
// Same steps as reduceDuplexRow, but for a Matrix addressed with three
// indices, Matrix[word 0..11][column 0..7][row]. Uses "state" and "Matrix"
// from the expansion site.
// For each column i: absorb rowIn + rowInOut into state, one
// round_lyra_v35(state), XOR state into column i of rowOut, then XOR the
// state rotated by one word into column i of rowInOut.
#define reduceDuplexRow_test(rowIn, rowInOut, rowOut) { \
for (int i = 0; i < 8; i++) { \
for (int j = 0; j < 12; j++) \
state[j] ^= Matrix[j][i][rowIn] + Matrix[j][i][rowInOut]; \
round_lyra_v35(state); \
for (int j = 0; j < 12; j++) \
Matrix[j][i][rowOut] ^= state[j]; \
Matrix[0][i][rowInOut] ^= state[11]; \
Matrix[1][i][rowInOut] ^= state[0]; \
Matrix[2][i][rowInOut] ^= state[1]; \
Matrix[3][i][rowInOut] ^= state[2]; \
Matrix[4][i][rowInOut] ^= state[3]; \
Matrix[5][i][rowInOut] ^= state[4]; \
Matrix[6][i][rowInOut] ^= state[5]; \
Matrix[7][i][rowInOut] ^= state[6]; \
Matrix[8][i][rowInOut] ^= state[7]; \
Matrix[9][i][rowInOut] ^= state[8]; \
Matrix[10][i][rowInOut] ^= state[9]; \
Matrix[11][i][rowInOut] ^= state[10]; \
} \
}
// absorbblock_test(in)
// Same steps as absorbblock, but for a Matrix addressed with three indices:
// XORs Matrix[0..11][0][in] (column 0 of row "in") into state[0..11], then
// applies round_lyra_v35 twelve times. Uses "state" and "Matrix" from the
// expansion site.
#define absorbblock_test(in) { \
state[0] ^= Matrix[0][0][ in]; \
state[1] ^= Matrix[1][0][in]; \
state[2] ^= Matrix[2][0][in]; \
state[3] ^= Matrix[3][0][in]; \
state[4] ^= Matrix[4][0][in]; \
state[5] ^= Matrix[5][0][in]; \
state[6] ^= Matrix[6][0][in]; \
state[7] ^= Matrix[7][0][in]; \
state[8] ^= Matrix[8][0][in]; \
state[9] ^= Matrix[9][0][in]; \
state[10] ^= Matrix[10][0][in]; \
state[11] ^= Matrix[11][0][in]; \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
}
//// compute 30 version
// reduceDuplexRowSetup_v30(rowIn, rowInOut, rowOut)
// Variant of reduceDuplexRowSetup that calls round_lyra_v30 (uint64_t state)
// instead of round_lyra_v35. Uses "state" and "Matrix" from the expansion
// site; Matrix is indexed as Matrix[word][row], word = 12 * column + j.
// For each column i (0..7):
//   - state[j] ^= Matrix[rowIn] + Matrix[rowInOut] for the 12 words of column i;
//   - one round_lyra_v30(state);
//   - column (7 - i) of rowOut (word offset 84 - 12*i) = column i of rowIn ^ state;
//   - column i of rowInOut ^= state rotated by one word
//     (word 0 takes state[11], word k takes state[k-1]).
#define reduceDuplexRowSetup_v30(rowIn, rowInOut, rowOut) { \
for (int i = 0; i < 8; i++) { \
for (int j = 0; j < 12; j++) \
state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \
round_lyra_v30(state); \
for (int j = 0; j < 12; j++) \
Matrix[j + 84 - 12 * i][rowOut] = Matrix[12 * i + j][rowIn] ^ state[j]; \
Matrix[0 + 12 * i][rowInOut] ^= state[11]; \
Matrix[1 + 12 * i][rowInOut] ^= state[0]; \
Matrix[2 + 12 * i][rowInOut] ^= state[1]; \
Matrix[3 + 12 * i][rowInOut] ^= state[2]; \
Matrix[4 + 12 * i][rowInOut] ^= state[3]; \
Matrix[5 + 12 * i][rowInOut] ^= state[4]; \
Matrix[6 + 12 * i][rowInOut] ^= state[5]; \
Matrix[7 + 12 * i][rowInOut] ^= state[6]; \
Matrix[8 + 12 * i][rowInOut] ^= state[7]; \
Matrix[9 + 12 * i][rowInOut] ^= state[8]; \
Matrix[10 + 12 * i][rowInOut] ^= state[9]; \
Matrix[11 + 12 * i][rowInOut] ^= state[10]; \
} \
}
// reduceDuplexRow_v30(rowIn, rowInOut, rowOut)
// Variant of reduceDuplexRow that calls round_lyra_v30 (uint64_t state)
// instead of round_lyra_v35. Uses "state" and "Matrix" from the expansion
// site; Matrix is indexed as Matrix[word][row], word = 12 * column + j.
// For each column i (0..7):
//   - state[j] ^= Matrix[rowIn] + Matrix[rowInOut] for the 12 words of column i;
//   - one round_lyra_v30(state);
//   - column i of rowOut ^= state (same column, not reversed);
//   - column i of rowInOut ^= state rotated by one word
//     (word 0 takes state[11], word k takes state[k-1]).
#define reduceDuplexRow_v30(rowIn, rowInOut, rowOut) { \
for (int i = 0; i < 8; i++) { \
for (int j = 0; j < 12; j++) \
state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \
round_lyra_v30(state); \
for (int j = 0; j < 12; j++) \
Matrix[j + 12 * i][rowOut] ^= state[j]; \
Matrix[0 + 12 * i][rowInOut] ^= state[11]; \
Matrix[1 + 12 * i][rowInOut] ^= state[0]; \
Matrix[2 + 12 * i][rowInOut] ^= state[1]; \
Matrix[3 + 12 * i][rowInOut] ^= state[2]; \
Matrix[4 + 12 * i][rowInOut] ^= state[3]; \
Matrix[5 + 12 * i][rowInOut] ^= state[4]; \
Matrix[6 + 12 * i][rowInOut] ^= state[5]; \
Matrix[7 + 12 * i][rowInOut] ^= state[6]; \
Matrix[8 + 12 * i][rowInOut] ^= state[7]; \
Matrix[9 + 12 * i][rowInOut] ^= state[8]; \
Matrix[10 + 12 * i][rowInOut] ^= state[9]; \
Matrix[11 + 12 * i][rowInOut] ^= state[10]; \
} \
}
// absorbblock_v30(in)
// Variant of absorbblock that calls round_lyra_v30 (uint64_t state).
// XORs Matrix[0..11][in] (the first column of row "in") into state[0..11],
// then applies round_lyra_v30 twelve times. Uses "state" and "Matrix" from
// the expansion site.
#define absorbblock_v30(in) { \
state[0] ^= Matrix[0][in]; \
state[1] ^= Matrix[1][in]; \
state[2] ^= Matrix[2][in]; \
state[3] ^= Matrix[3][in]; \
state[4] ^= Matrix[4][in]; \
state[5] ^= Matrix[5][in]; \
state[6] ^= Matrix[6][in]; \
state[7] ^= Matrix[7][in]; \
state[8] ^= Matrix[8][in]; \
state[9] ^= Matrix[9][in]; \
state[10] ^= Matrix[10][in]; \
state[11] ^= Matrix[11][in]; \
round_lyra_v30(state); \
round_lyra_v30(state); \
round_lyra_v30(state); \
round_lyra_v30(state); \
round_lyra_v30(state); \
round_lyra_v30(state); \
round_lyra_v30(state); \
round_lyra_v30(state); \
round_lyra_v30(state); \
round_lyra_v30(state); \
round_lyra_v30(state); \
round_lyra_v30(state); \
}
/**
 * G mixing step on four uint2 operands, updated in place.
 * Sequence: add, XOR, rotate by 32; add, XOR, rotate by 24; add, XOR,
 * rotate by 16; add, XOR, rotate by 63.
 * NOTE(review): ROR2 and the uint2 operators come from cuda_helper.h;
 * presumably ROR2 is a 64-bit rotate-right on a uint2 pair - confirm there.
 */
static __device__ __forceinline__
void Gfunc_v35(uint2 & a, uint2 &b, uint2 &c, uint2 &d)
{
	// first half: a/d then c/b
	a += b;
	d ^= a;
	d = ROR2(d, 32);

	c += d;
	b ^= c;
	b = ROR2(b, 24);

	// second half: same pattern with rotations 16 and 63
	a += b;
	d ^= a;
	d = ROR2(d, 16);

	c += d;
	b ^= c;
	b = ROR2(b, 63);
}
/**
 * G mixing step on four uint64_t operands, updated in place.
 * Sequence: add, XOR, rotate by 32; add, XOR, rotate by 24; add, XOR,
 * rotate by 16; add, XOR, rotate by 63 (rotations via ROTR64).
 */
static __device__ __forceinline__
void Gfunc_v30(uint64_t & a, uint64_t &b, uint64_t &c, uint64_t &d)
{
	// first half: a/d then c/b
	a += b;
	d ^= a;
	d = ROTR64(d, 32);

	c += d;
	b ^= c;
	b = ROTR64(b, 24);

	// second half: same pattern with rotations 16 and 63
	a += b;
	d ^= a;
	d = ROTR64(d, 16);

	c += d;
	b ^= c;
	b = ROTR64(b, 63);
}
// round_lyra_v35_new(state)
// Macro form of one round: the same eight Gfunc_v35 calls, in the same
// order and with the same indices, as the round_lyra_v35 function - four
// over the columns of the 16-word state, then four over its diagonals.
#define round_lyra_v35_new(state) { \
Gfunc_v35(state[0], state[4], state[8], state[12]); \
Gfunc_v35(state[1], state[5], state[9], state[13]); \
Gfunc_v35(state[2], state[6], state[10], state[14]); \
Gfunc_v35(state[3], state[7], state[11], state[15]); \
Gfunc_v35(state[0], state[5], state[10], state[15]); \
Gfunc_v35(state[1], state[6], state[11], state[12]); \
Gfunc_v35(state[2], state[7], state[8], state[13]); \
Gfunc_v35(state[3], state[4], state[9], state[14]); \
}
/**
 * One round over a 16-element uint2 state, in place: Gfunc_v35 on the four
 * columns (s[c], s[c+4], s[c+8], s[c+12]), then on the four diagonals
 * (s[c], s[4+(c+1)%4], s[8+(c+2)%4], s[12+(c+3)%4]).
 *
 * @param s state array; elements 0..15 are read and written
 */
static __device__ __forceinline__ void round_lyra_v35(uint2 *s)
{
	// column step
	#pragma unroll
	for (int c = 0; c < 4; c++) {
		Gfunc_v35(s[c], s[c + 4], s[c + 8], s[c + 12]);
	}
	// diagonal step
	#pragma unroll
	for (int c = 0; c < 4; c++) {
		Gfunc_v35(s[c], s[4 + ((c + 1) & 3)], s[8 + ((c + 2) & 3)], s[12 + ((c + 3) & 3)]);
	}
}
/**
 * One round over the 16-word state (64-bit words), updating s[0..15] in place.
 *
 * With the words laid out as a 4x4 grid stored row by row, Gfunc_v30 is
 * applied to the four columns and then to the four diagonals.
 */
static __device__ __forceinline__ void round_lyra_v30(uint64_t *s)
{
	// columns: (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15)
	#pragma unroll
	for (int col = 0; col < 4; col++) {
		Gfunc_v30(s[col], s[4 + col], s[8 + col], s[12 + col]);
	}

	// diagonals: (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14)
	#pragma unroll
	for (int col = 0; col < 4; col++) {
		Gfunc_v30(s[col],
		          s[4 + ((col + 1) & 3)],
		          s[8 + ((col + 2) & 3)],
		          s[12 + ((col + 3) & 3)]);
	}
}
/**
 * Lyra2 kernel working on plain 64-bit words. lyra2_cpu_hash_32() launches
 * this variant when the device_sm value of the device is below 350.
 *
 * Every thread whose index is below `threads` loads four 64-bit words from
 * outputHash[threads*i + thread] (i = 0..3), runs the sequence below and
 * writes state[0..3] back to the same four slots, so the buffer is updated
 * in place.
 *
 * threads      number of hashes to process; threads past it do nothing
 * startNounce  not read by this kernel
 * outputHash   input/output buffer, word i of hash t lives at [threads*i + t]
 *
 * Built with __launch_bounds__(256, 1); the host wrapper uses 256-thread blocks.
 */
__global__ __launch_bounds__(256, 1)
void lyra2_gpu_hash_32_v30(int threads, uint32_t startNounce, uint64_t *outputHash)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
// 16-word state: words 0-3 = input hash, words 4-7 = copy of words 0-3,
// words 8-15 = blake2b_IV converted with devectorize()
uint64_t state[16];
#pragma unroll
for (int i = 0; i<4; i++) { state[i] = outputHash[threads*i + thread]; } //password
#pragma unroll
for (int i = 0; i<4; i++) { state[i + 4] = state[i]; } //salt
#pragma unroll
for (int i = 0; i<8; i++) { state[i + 8] = devectorize(blake2b_IV[i]); }
// blake2blyra x2
#pragma unroll 24
for (int i = 0; i<24; i++) { round_lyra_v30(state); } //because 12 is not enough
// Per-thread scratch matrix. As used below, the first index is
// (12 * column + word) and the second index is the row number.
// NOTE(review): the reduceDuplexRow*_v30 macros are defined outside this
// view - confirm they rely on the same layout.
uint64_t Matrix[96][8]; // not cool
// reducedSqueezeRow0
// Stores state[0..11] into row 0, starting at offset 84 and stepping down
// by 12, with one round between consecutive stores.
#pragma unroll 8
for (int i = 0; i < 8; i++) {
int idx = 84-12*i;
#pragma unroll 12
for (int j = 0; j<12; j++) { Matrix[j + idx][0] = state[j]; }
round_lyra_v30(state);
}
// reducedSqueezeRow1
// For each 12-word group of row 0 (ascending offset idx0): XOR it into the
// state, run one round, then store (row 0 word ^ state word) into row 1 at
// the mirrored offset idx1 = 84 - idx0.
#pragma unroll 8
for (int i = 0; i < 8; i++)
{
int idx0= 12*i;
int idx1= 84-idx0;
#pragma unroll 12
for (int j = 0; j<12; j++) { state[j] ^= Matrix[j + idx0][0]; }
round_lyra_v30(state);
#pragma unroll 12
for (int j = 0; j<12; j++) { Matrix[j + idx1][1] = Matrix[j + idx0][0] ^ state[j]; }
}
// Setup phase for rows 2..7.
// NOTE(review): the macro is defined earlier in the file; the arguments look
// like three row indices with the last one being the row produced - confirm.
reduceDuplexRowSetup_v30(1, 0, 2);
reduceDuplexRowSetup_v30(2, 1, 3);
reduceDuplexRowSetup_v30(3, 0, 4);
reduceDuplexRowSetup_v30(4, 3, 5);
reduceDuplexRowSetup_v30(5, 2, 6);
reduceDuplexRowSetup_v30(6, 1, 7);
// Data-dependent phase: before every call the low three bits of state[0]
// are re-read and passed as the middle argument (a value in 0..7).
uint64_t rowa;
rowa = state[0] & 7;
reduceDuplexRow_v30(7, rowa, 0);
rowa = state[0] & 7;
reduceDuplexRow_v30(0, rowa, 3);
rowa = state[0] & 7;
reduceDuplexRow_v30(3, rowa, 6);
rowa = state[0] & 7;
reduceDuplexRow_v30(6, rowa, 1);
rowa = state[0] & 7;
reduceDuplexRow_v30(1, rowa, 4);
rowa = state[0] & 7;
reduceDuplexRow_v30(4, rowa, 7);
rowa = state[0] & 7;
reduceDuplexRow_v30(7, rowa, 2);
rowa = state[0] & 7;
reduceDuplexRow_v30(2, rowa, 5);
// Final absorb: XORs Matrix[0..11][rowa] into the state and runs 12 rounds
// (see the absorbblock_v30 macro); rowa is the value used in the last call above.
absorbblock_v30(rowa);
// write the first four state words back over the input hash
#pragma unroll
for (int i = 0; i<4; i++) {
outputHash[threads*i + thread] = state[i];
} //password
} //thread
}
/**
 * Lyra2 kernel working on uint2 words. lyra2_cpu_hash_32() launches this
 * variant when the device_sm value of the device is 350 or higher.
 *
 * Every thread whose index is below `threads` loads four 64-bit words from
 * outputHash[threads*i + thread] (i = 0..3), runs the sequence below and
 * writes devectorize(state[0..3]) back to the same four slots, so the
 * buffer is updated in place.
 *
 * threads      number of hashes to process; threads past it do nothing
 * startNounce  not read by this kernel
 * outputHash   input/output buffer, word i of hash t lives at [threads*i + t]
 *
 * Built with __launch_bounds__(256, 1); the host wrapper uses 256-thread blocks.
 */
__global__ __launch_bounds__(256, 1)
void lyra2_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
// 16-word state: words 0-3 = input hash, words 4-7 = copy of words 0-3,
// words 8-15 = blake2b_IV.
// NOTE(review): LOHI presumably splits a 64-bit value into low (.x) and
// high (.y) halves - confirm against its definition.
uint2 state[16];
#pragma unroll
for (int i = 0; i<4; i++) { LOHI(state[i].x, state[i].y, outputHash[threads*i + thread]); } //password
#pragma unroll
for (int i = 0; i<4; i++) { state[i + 4] = state[i]; } //salt
#pragma unroll
for (int i = 0; i<8; i++) { state[i + 8] = blake2b_IV[i]; }
// blake2blyra x2
#pragma unroll 24
for (int i = 0; i<24; i++) { round_lyra_v35(state); } //because 12 is not enough
// Per-thread scratch matrix. As used below, the first index is
// (12 * column + word) and the second index is the row number.
// NOTE(review): the reduceDuplexRow* macros are defined outside this view -
// confirm they rely on the same layout.
uint2 Matrix[96][8]; // not cool
// reducedSqueezeRow0
// Stores state[0..11] into row 0, starting at offset 84 and stepping down
// by 12, with one round between consecutive stores.
#pragma unroll 8
for (int i = 0; i < 8; i++)
{
#pragma unroll 12
for (int j = 0; j<12; j++) { Matrix[j + 84 - 12 * i][0] = state[j]; }
round_lyra_v35(state);
}
// reducedSqueezeRow1
// For each 12-word group of row 0 (ascending offset 12*i): XOR it into the
// state, run one round, then store (row 0 word ^ state word) into row 1 at
// the mirrored offset 84 - 12*i.
#pragma unroll 8
for (int i = 0; i < 8; i++)
{
#pragma unroll 12
for (int j = 0; j<12; j++) { state[j] ^= Matrix[j + 12 * i][0]; }
round_lyra_v35(state);
#pragma unroll 12
for (int j = 0; j<12; j++) { Matrix[j + 84 - 12 * i][1] = Matrix[j + 12 * i][0] ^ state[j]; }
}
// Setup phase for rows 2..7.
// NOTE(review): the macro is defined earlier in the file; the arguments look
// like three row indices with the last one being the row produced - confirm.
reduceDuplexRowSetup(1, 0, 2);
reduceDuplexRowSetup(2, 1, 3);
reduceDuplexRowSetup(3, 0, 4);
reduceDuplexRowSetup(4, 3, 5);
reduceDuplexRowSetup(5, 2, 6);
reduceDuplexRowSetup(6, 1, 7);
// Data-dependent phase: before every call the low three bits of state[0].x
// are re-read and passed as the middle argument (a value in 0..7).
uint32_t rowa;
rowa = state[0].x & 7;
reduceDuplexRow(7, rowa, 0);
rowa = state[0].x & 7;
reduceDuplexRow(0, rowa, 3);
rowa = state[0].x & 7;
reduceDuplexRow(3, rowa, 6);
rowa = state[0].x & 7;
reduceDuplexRow(6, rowa, 1);
rowa = state[0].x & 7;
reduceDuplexRow(1, rowa, 4);
rowa = state[0].x & 7;
reduceDuplexRow(4, rowa, 7);
rowa = state[0].x & 7;
reduceDuplexRow(7, rowa, 2);
rowa = state[0].x & 7;
reduceDuplexRow(2, rowa, 5);
// NOTE(review): absorbblock is defined outside this view; presumably the
// uint2 counterpart of absorbblock_v30 - confirm.
absorbblock(rowa);
// write the first four state words back over the input hash
#pragma unroll
for (int i = 0; i<4; i++) {
outputHash[threads*i + thread] = devectorize(state[i]);
} //password
} //thread
}
/**
 * Alternative uint2 kernel that differs from lyra2_gpu_hash_32 in the shape
 * of the scratch matrix (Matrix[12][8][8] instead of Matrix[96][8]) and in
 * using the *_test macro variants. lyra2_cpu_hash_32() does not launch it.
 *
 * Every thread whose index is below `threads` loads four 64-bit words from
 * outputHash[threads*i + thread] (i = 0..3) and writes
 * devectorize(state[0..3]) back to the same four slots.
 *
 * threads      number of hashes to process; threads past it do nothing
 * startNounce  not read by this kernel
 * outputHash   input/output buffer, word i of hash t lives at [threads*i + t]
 */
__global__
void __launch_bounds__(256, 1) lyra2_gpu_hash_32_test(int threads, uint32_t startNounce, uint64_t *outputHash)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
// 16-word state: words 0-3 = input hash, words 4-7 = copy of words 0-3,
// words 8-15 = blake2b_IV.
uint2 state[16];
#pragma unroll
for (int i = 0; i<4; i++) { LOHI(state[i].x, state[i].y, outputHash[threads*i + thread]); } //password
#pragma unroll
for (int i = 0; i<4; i++) { state[i + 4] = state[i]; } //salt
#pragma unroll
for (int i = 0; i<8; i++) { state[i + 8] = blake2b_IV[i]; }
// blake2blyra x2
#pragma unroll 24
for (int i = 0; i<24; i++) { round_lyra_v35(state); } //because 12 is not enough
// Per-thread scratch matrix indexed, as used below, by [word][column][row].
// NOTE(review): the *_test macros are defined outside this view - confirm
// they rely on the same layout.
uint2 Matrix[12][8][8]; // not cool
// reducedSqueezeRow0
// Stores state[0..11] into row 0, filling columns 7 down to 0, with one
// round between consecutive stores.
#pragma unroll 8
for (int i = 0; i < 8; i++) {
#pragma unroll 12
for (int j = 0; j<12; j++) { Matrix[j][7-i][0] = state[j]; }
round_lyra_v35(state);
}
// reducedSqueezeRow1
// For each column i of row 0 (ascending): XOR it into the state, run one
// round, then store (row 0 word ^ state word) into row 1 at column 7-i.
#pragma unroll 8
for (int i = 0; i < 8; i++)
{
#pragma unroll 12
for (int j = 0; j<12; j++) { state[j] ^= Matrix[j][i][0]; }
round_lyra_v35(state);
#pragma unroll 12
for (int j = 0; j<12; j++) { Matrix[j][7-i][1] = Matrix[j][i][0] ^ state[j]; }
}
// Setup phase for rows 2..7.
// NOTE(review): the macro is defined earlier in the file; the arguments look
// like three row indices with the last one being the row produced - confirm.
reduceDuplexRowSetup_test(1, 0, 2);
reduceDuplexRowSetup_test(2, 1, 3);
reduceDuplexRowSetup_test(3, 0, 4);
reduceDuplexRowSetup_test(4, 3, 5);
reduceDuplexRowSetup_test(5, 2, 6);
reduceDuplexRowSetup_test(6, 1, 7);
// Data-dependent phase: before every call the low three bits of
// devectorize(state[0]) are re-read and passed as the middle argument.
uint64_t rowa;
rowa = devectorize(state[0]) & 7;
reduceDuplexRow_test(7, rowa, 0);
rowa = devectorize(state[0]) & 7;
reduceDuplexRow_test(0, rowa, 3);
rowa = devectorize(state[0]) & 7;
reduceDuplexRow_test(3, rowa, 6);
rowa = devectorize(state[0]) & 7;
reduceDuplexRow_test(6, rowa, 1);
rowa = devectorize(state[0]) & 7;
reduceDuplexRow_test(1, rowa, 4);
rowa = devectorize(state[0]) & 7;
reduceDuplexRow_test(4, rowa, 7);
rowa = devectorize(state[0]) & 7;
reduceDuplexRow_test(7, rowa, 2);
rowa = devectorize(state[0]) & 7;
reduceDuplexRow_test(2, rowa, 5);
// NOTE(review): absorbblock_test is defined outside this view; presumably
// the Matrix[12][8][8] counterpart of absorbblock_v30 - confirm.
absorbblock_test(rowa);
// write the first four state words back over the input hash
#pragma unroll
for (int i = 0; i<4; i++) {
outputHash[threads*i + thread] = devectorize(state[i]);
} //password
} //thread
}
/**
 * Per-thread initialisation hook for the Lyra2 stage.
 *
 * Present so that this stage exposes the same init entry point as the other
 * hash stages; it currently has nothing to set up and ignores both arguments.
 */
__host__
void lyra2_cpu_init(int thr_id, int threads)
{
	// not used
	(void)thr_id;
	(void)threads;
}
/**
 * Host wrapper: runs the Lyra2 kernel over `threads` hashes stored in
 * d_outputHash (updated in place) and waits for the device to finish.
 *
 * thr_id        miner thread index, used to look up the device in device_map
 * threads       number of hashes to process; nothing is launched when <= 0
 * startNounce   forwarded to the kernel (the kernels do not read it)
 * d_outputHash  device buffer, word i of hash t at [threads*i + t]
 * order         unused here (only referenced by the commented-out sync call)
 *
 * Launch layout: 1D grid of ceil(threads / 256) blocks, 256 threads each.
 */
__host__
void lyra2_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{
	// An empty batch would produce a zero-sized grid, which is an invalid
	// launch configuration and leaves an error pending for later CUDA calls.
	if (threads <= 0)
		return;

	const int threadsperblock = 256;

	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);

	if (device_sm[device_map[thr_id]] >= 350) {
		lyra2_gpu_hash_32 <<<grid, block>>> (threads, startNounce, d_outputHash);
	} else {
		// kernel for compute30 card
		lyra2_gpu_hash_32_v30 <<<grid, block >>> (threads, startNounce, d_outputHash);
	}

	cudaDeviceSynchronize();
	//MyStreamSynchronize(NULL, order, thr_id);
}

133
lyra2/lyra2RE.cu

@ -0,0 +1,133 @@
extern "C" {
#include "sph/sph_blake.h"
#include "sph/sph_groestl.h"
#include "sph/sph_skein.h"
#include "sph/sph_keccak.h"
#include "lyra2/Lyra2.h"
}
#include "miner.h"
#include "cuda_helper.h"
// Device hash buffers, one pointer per miner thread (indexed by thr_id);
// allocated with cudaMalloc inside scanhash_lyra on first use.
static _ALIGN(64) uint64_t *d_hash[8];
// quark_check_* helpers: declared here but not called by the code in this file section.
extern void quark_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern uint32_t quark_check_cpu_hash_64_2(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint64_t *d_inputHash, int order);
// GPU hash stages implemented outside this file. scanhash_lyra calls them in
// the order blake256 -> keccak256 -> lyra2 -> skein256 -> groestl256.
extern void blake256_cpu_init(int thr_id, int threads);
extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
extern void blake256_cpu_setBlock_80(uint32_t *pdata);
extern void keccak256_cpu_hash_32(int thr_id, int threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void keccak256_cpu_init(int thr_id, int threads);
extern void skein256_cpu_hash_32(int thr_id, int threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void skein256_cpu_init(int thr_id, int threads);
extern void lyra2_cpu_hash_32(int thr_id, int threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void lyra2_cpu_init(int thr_id, int threads);
// The groestl stage is the last one: it receives the target and its hash
// call returns a nonce (scanhash_lyra treats 0xffffffff as "nothing found").
extern void groestl256_setTarget(const void *ptarget);
extern uint32_t groestl256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order);
extern void groestl256_cpu_init(int thr_id, int threads);
/**
 * CPU implementation of the hash chain that scanhash_lyra runs on the GPU:
 * blake256 (80-byte input) -> keccak256 -> LYRA2 -> skein256 -> groestl256.
 *
 * state  output buffer, receives the final 32-byte digest
 * input  block header; exactly 80 bytes are read
 */
extern "C" void lyra_hash(void *state, const void *input)
{
	sph_blake256_context ctx_blake;
	sph_keccak256_context ctx_keccak;
	sph_skein256_context ctx_skein;
	sph_groestl256_context ctx_groestl;

	uint32_t hashA[8], hashB[8], hash[8];

	sph_blake256_init(&ctx_blake);
	sph_blake256(&ctx_blake, input, 80);
	sph_blake256_close(&ctx_blake, hashA);

	sph_keccak256_init(&ctx_keccak);
	sph_keccak256(&ctx_keccak, hashA, 32);
	sph_keccak256_close(&ctx_keccak, hashB);

	// hashA receives the Lyra2 output; hashB (the keccak digest) is passed
	// twice, matching the GPU kernels which copy the input words as salt.
	LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);

	sph_skein256_init(&ctx_skein);
	sph_skein256(&ctx_skein, hashA, 32);
	sph_skein256_close(&ctx_skein, hashB);

	sph_groestl256_init(&ctx_groestl);
	sph_groestl256(&ctx_groestl, hashB, 32);
	sph_groestl256_close(&ctx_groestl, hash);

	// The result is the groestl digest. Copying hashB here (as before) returned
	// the skein digest and silently dropped the last stage of the chain.
	memcpy(state, hash, 32);
}
// One-time GPU initialisation flag per miner thread (indexed by thr_id)
static bool init[8] = { 0 };

/**
 * Scan a nonce range on the GPU with the blake256 -> keccak256 -> lyra2 ->
 * skein256 -> groestl256 chain.
 *
 * thr_id       miner thread index (selects the device and the d_hash buffer)
 * pdata        20-word block header; pdata[19] is the start nonce on entry and
 *              the found nonce (return 1) or next nonce (return 0) on exit
 * ptarget      8-word target; overwritten at index 7 in benchmark mode
 * max_nonce    end of the range to scan
 * hashes_done  receives the number of hashes accounted for by this call
 *
 * Returns 1 when the GPU reported a candidate nonce, 0 otherwise.
 */
extern "C" int scanhash_lyra(int thr_id, uint32_t *pdata,
	const uint32_t *ptarget, uint32_t max_nonce,
	unsigned long *hashes_done)
{
	const uint32_t first_nonce = pdata[19];
	const int intensity = (device_sm[device_map[thr_id]] >= 500) ? 19 : 18;
	// Full batch size. Buffers are always sized for this value, never for a
	// batch that was shortened because the first call had a small range.
	const int max_throughput = opt_work_size ? opt_work_size : (1 << intensity); // 18=256*256*4;

	if (opt_benchmark)
		((uint32_t*)ptarget)[7] = 0x0000ff;

	if (!init[thr_id])
	{
		cudaSetDevice(device_map[thr_id]);

		blake256_cpu_init(thr_id, max_throughput);
		keccak256_cpu_init(thr_id, max_throughput);
		skein256_cpu_init(thr_id, max_throughput);
		groestl256_cpu_init(thr_id, max_throughput);
		lyra2_cpu_init(thr_id, max_throughput);

		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * (size_t)max_throughput));

		init[thr_id] = true;
	}

	// Empty range: do not launch zero-thread kernels
	if (max_nonce <= first_nonce) {
		*hashes_done = 0;
		return 0;
	}

	// Clamp the batch to the remaining range using unsigned arithmetic; a cast
	// of the difference to int turns negative for ranges of 2^31 or more.
	int throughput = max_throughput;
	const uint32_t remaining = max_nonce - first_nonce;
	if (remaining < (uint32_t)throughput)
		throughput = (int)remaining;

	uint32_t endiandata[20];
	for (int k=0; k < 20; k++)
		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);

	blake256_cpu_setBlock_80(pdata);
	groestl256_setTarget(ptarget);

	do {
		int order = 0;
		uint32_t foundNonce;

		blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
		keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
		lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
		skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);

		foundNonce = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
		if (foundNonce != 0xffffffff)
		{
			// CPU re-hash of the candidate. The comparison against the target is
			// still disabled below, so vhash64 is computed but not checked yet.
			// const uint32_t Htarg = ptarget[6];
			uint32_t vhash64[8];
			be32enc(&endiandata[19], foundNonce);
			lyra_hash(vhash64, endiandata);

			// if (vhash64[7]<=Htarg) { // && fulltest(vhash64, ptarget)) {
				*hashes_done = pdata[19] - first_nonce + throughput;
				pdata[19] = foundNonce;
				return 1;
			// } else {
			//	applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce);
			// }
		}

		// A plain 32-bit add can wrap past 0xffffffff, which would make the
		// loop condition true again and restart scanning from low nonces.
		if ((uint64_t)pdata[19] + (uint32_t)throughput > 0xffffffffULL) {
			pdata[19] = max_nonce;
			break;
		}
		pdata[19] += throughput;

	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);

	*hashes_done = pdata[19] - first_nonce + 1;
	return 0;
}

5
miner.h

@ -328,6 +328,10 @@ extern int scanhash_fresh(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done); unsigned long *hashes_done);
extern int scanhash_lyra(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
extern int scanhash_nist5(int thr_id, uint32_t *pdata, extern int scanhash_nist5(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done); unsigned long *hashes_done);
@ -645,6 +649,7 @@ void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
void keccak256_hash(void *state, const void *input); void keccak256_hash(void *state, const void *input);
unsigned int jackpothash(void *state, const void *input); unsigned int jackpothash(void *state, const void *input);
void groestlhash(void *state, const void *input); void groestlhash(void *state, const void *input);
void lyra_hash(void *state, const void *input);
void myriadhash(void *state, const void *input); void myriadhash(void *state, const void *input);
void nist5hash(void *state, const void *input); void nist5hash(void *state, const void *input);
void pentablakehash(void *output, const void *input); void pentablakehash(void *output, const void *input);

14
util.cpp

@ -1633,18 +1633,22 @@ void print_hash_tests(void)
heavycoin_hash(&hash[0], &buf[0], 32); heavycoin_hash(&hash[0], &buf[0], 32);
printpfx("heavy", hash); printpfx("heavy", hash);
memset(hash, 0, sizeof hash);
keccak256_hash(&hash[0], &buf[0]);
printpfx("keccak", hash);
memset(hash, 0, sizeof hash); memset(hash, 0, sizeof hash);
jackpothash(&hash[0], &buf[0]); jackpothash(&hash[0], &buf[0]);
printpfx("jackpot", hash); printpfx("jackpot", hash);
memset(hash, 0, sizeof hash);
keccak256_hash(&hash[0], &buf[0]);
printpfx("keccak", hash);
memset(hash, 0, sizeof hash); memset(hash, 0, sizeof hash);
doomhash(&hash[0], &buf[0]); doomhash(&hash[0], &buf[0]);
printpfx("luffa", hash); printpfx("luffa", hash);
/* to double check with a lyra2 cpu miner
memset(hash, 0, sizeof hash);
lyra_hash(&hash[0], &buf[0]);
printpfx("lyra2", hash);
*/
memset(hash, 0, sizeof hash); memset(hash, 0, sizeof hash);
myriadhash(&hash[0], &buf[0]); myriadhash(&hash[0], &buf[0]);
printpfx("myriad", hash); printpfx("myriad", hash);

Loading…
Cancel
Save