Browse Source
Seems to be djm34 work, i recognize the code style ;) Code was cleaned/indented and adapted to my fork... Only usable on the test pool until 16 december 2014!master
Tanguy Pruvot
10 years ago
21 changed files with 2900 additions and 59 deletions
@ -0,0 +1,250 @@
@@ -0,0 +1,250 @@
|
||||
/** |
||||
* Blake-256 Cuda Kernel (Tested on SM 5.0) |
||||
* |
||||
* Tanguy Pruvot - Nov. 2014 |
||||
*/ |
||||
extern "C" { |
||||
#include "sph/sph_blake.h" |
||||
} |
||||
|
||||
#include "cuda_helper.h" |
||||
|
||||
#include <memory.h> |
||||
|
||||
static __device__ uint64_t cuda_swab32ll(uint64_t x) { |
||||
return MAKE_ULONGLONG(cuda_swab32(_LOWORD(x)), cuda_swab32(_HIWORD(x))); |
||||
} |
||||
|
||||
__constant__ static uint32_t c_data[20]; |
||||
|
||||
__constant__ static uint32_t sigma[16][16]; |
||||
static uint32_t c_sigma[16][16] = { |
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, |
||||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, |
||||
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, |
||||
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, |
||||
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, |
||||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, |
||||
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, |
||||
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, |
||||
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, |
||||
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, |
||||
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, |
||||
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, |
||||
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, |
||||
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, |
||||
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, |
||||
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } |
||||
}; |
||||
|
||||
static const uint32_t c_IV256[8] = { |
||||
0x6A09E667, 0xBB67AE85, |
||||
0x3C6EF372, 0xA54FF53A, |
||||
0x510E527F, 0x9B05688C, |
||||
0x1F83D9AB, 0x5BE0CD19 |
||||
}; |
||||
|
||||
__device__ __constant__ static uint32_t cpu_h[8]; |
||||
|
||||
__device__ __constant__ static uint32_t u256[16]; |
||||
static const uint32_t c_u256[16] = { |
||||
0x243F6A88, 0x85A308D3, |
||||
0x13198A2E, 0x03707344, |
||||
0xA4093822, 0x299F31D0, |
||||
0x082EFA98, 0xEC4E6C89, |
||||
0x452821E6, 0x38D01377, |
||||
0xBE5466CF, 0x34E90C6C, |
||||
0xC0AC29B7, 0xC97C50DD, |
||||
0x3F84D5B5, 0xB5470917 |
||||
}; |
||||
|
||||
#define GS2(a,b,c,d,x) { \ |
||||
const uint32_t idx1 = sigma[r][x]; \ |
||||
const uint32_t idx2 = sigma[r][x+1]; \ |
||||
v[a] += (m[idx1] ^ u256[idx2]) + v[b]; \ |
||||
v[d] = SPH_ROTL32(v[d] ^ v[a], 16); \ |
||||
v[c] += v[d]; \ |
||||
v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \ |
||||
\ |
||||
v[a] += (m[idx2] ^ u256[idx1]) + v[b]; \ |
||||
v[d] = SPH_ROTR32(v[d] ^ v[a], 8); \ |
||||
v[c] += v[d]; \ |
||||
v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \ |
||||
} |
||||
|
||||
//#define ROTL32(x, n) ((x) << (n)) | ((x) >> (32 - (n))) |
||||
#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) |
||||
#define hostGS(a,b,c,d,x) { \ |
||||
const uint32_t idx1 = c_sigma[r][x]; \ |
||||
const uint32_t idx2 = c_sigma[r][x+1]; \ |
||||
v[a] += (m[idx1] ^ c_u256[idx2]) + v[b]; \ |
||||
v[d] = ROTR32(v[d] ^ v[a], 16); \ |
||||
v[c] += v[d]; \ |
||||
v[b] = ROTR32(v[b] ^ v[c], 12); \ |
||||
\ |
||||
v[a] += (m[idx2] ^ c_u256[idx1]) + v[b]; \ |
||||
v[d] = ROTR32(v[d] ^ v[a], 8); \ |
||||
v[c] += v[d]; \ |
||||
v[b] = ROTR32(v[b] ^ v[c], 7); \ |
||||
} |
||||
|
||||
/* Second part (64-80) msg never change, store it */ |
||||
__device__ __constant__ static const uint32_t c_Padding[16] = { |
||||
0, 0, 0, 0, |
||||
0x80000000, 0, 0, 0, |
||||
0, 0, 0, 0, |
||||
0, 1, 0, 640, |
||||
}; |
||||
|
||||
__host__ __forceinline__ |
||||
static void blake256_compress1st(uint32_t *h, const uint32_t *block, const uint32_t T0) |
||||
{ |
||||
uint32_t m[16]; |
||||
uint32_t v[16]; |
||||
|
||||
for (int i = 0; i < 16; i++) { |
||||
m[i] = block[i]; |
||||
} |
||||
|
||||
for (int i = 0; i < 8; i++) |
||||
v[i] = h[i]; |
||||
|
||||
v[8] = c_u256[0]; |
||||
v[9] = c_u256[1]; |
||||
v[10] = c_u256[2]; |
||||
v[11] = c_u256[3]; |
||||
|
||||
v[12] = c_u256[4] ^ T0; |
||||
v[13] = c_u256[5] ^ T0; |
||||
v[14] = c_u256[6]; |
||||
v[15] = c_u256[7]; |
||||
|
||||
for (int r = 0; r < 14; r++) { |
||||
/* column step */ |
||||
hostGS(0, 4, 0x8, 0xC, 0x0); |
||||
hostGS(1, 5, 0x9, 0xD, 0x2); |
||||
hostGS(2, 6, 0xA, 0xE, 0x4); |
||||
hostGS(3, 7, 0xB, 0xF, 0x6); |
||||
/* diagonal step */ |
||||
hostGS(0, 5, 0xA, 0xF, 0x8); |
||||
hostGS(1, 6, 0xB, 0xC, 0xA); |
||||
hostGS(2, 7, 0x8, 0xD, 0xC); |
||||
hostGS(3, 4, 0x9, 0xE, 0xE); |
||||
} |
||||
|
||||
for (int i = 0; i < 16; i++) { |
||||
int j = i & 7; |
||||
h[j] ^= v[i]; |
||||
} |
||||
} |
||||
|
||||
__device__ __forceinline__ |
||||
static void blake256_compress2nd(uint32_t *h, const uint32_t *block, const uint32_t T0) |
||||
{ |
||||
uint32_t m[16]; |
||||
uint32_t v[16]; |
||||
|
||||
m[0] = block[0]; |
||||
m[1] = block[1]; |
||||
m[2] = block[2]; |
||||
m[3] = block[3]; |
||||
|
||||
#pragma unroll |
||||
for (int i = 4; i < 16; i++) { |
||||
m[i] = c_Padding[i]; |
||||
} |
||||
|
||||
#pragma unroll 8 |
||||
for (int i = 0; i < 8; i++) |
||||
v[i] = h[i]; |
||||
|
||||
v[8] = u256[0]; |
||||
v[9] = u256[1]; |
||||
v[10] = u256[2]; |
||||
v[11] = u256[3]; |
||||
|
||||
v[12] = u256[4] ^ T0; |
||||
v[13] = u256[5] ^ T0; |
||||
v[14] = u256[6]; |
||||
v[15] = u256[7]; |
||||
|
||||
#pragma unroll 14 |
||||
for (int r = 0; r < 14; r++) { |
||||
/* column step */ |
||||
GS2(0, 4, 0x8, 0xC, 0x0); |
||||
GS2(1, 5, 0x9, 0xD, 0x2); |
||||
GS2(2, 6, 0xA, 0xE, 0x4); |
||||
GS2(3, 7, 0xB, 0xF, 0x6); |
||||
/* diagonal step */ |
||||
GS2(0, 5, 0xA, 0xF, 0x8); |
||||
GS2(1, 6, 0xB, 0xC, 0xA); |
||||
GS2(2, 7, 0x8, 0xD, 0xC); |
||||
GS2(3, 4, 0x9, 0xE, 0xE); |
||||
} |
||||
|
||||
#pragma unroll 16 |
||||
for (int i = 0; i < 16; i++) { |
||||
int j = i & 7; |
||||
h[j] ^= v[i]; |
||||
} |
||||
} |
||||
|
||||
__global__ __launch_bounds__(256,3) |
||||
void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t * Hash) |
||||
{ |
||||
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); |
||||
if (thread < threads) |
||||
{ |
||||
const uint32_t nonce = startNonce + thread; |
||||
uint32_t h[8]; |
||||
uint32_t input[4]; |
||||
|
||||
#pragma unroll 8 |
||||
for (int i = 0; i<8; i++) { h[i] = cpu_h[i];} |
||||
|
||||
#pragma unroll 3 |
||||
for (int i = 0; i < 3; ++i) input[i] = c_data[16 + i]; |
||||
|
||||
input[3] = nonce; |
||||
blake256_compress2nd(h, input, 640); |
||||
|
||||
#pragma unroll |
||||
for (int i = 0; i<4; i++) { |
||||
Hash[i*threads + thread] = cuda_swab32ll(MAKE_ULONGLONG(h[2 * i], h[2*i+1])); |
||||
} |
||||
} |
||||
} |
||||
|
||||
__host__ |
||||
void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order) |
||||
{ |
||||
const int threadsperblock = 256; |
||||
|
||||
dim3 grid((threads + threadsperblock - 1) / threadsperblock); |
||||
dim3 block(threadsperblock); |
||||
|
||||
blake256_gpu_hash_80 <<<grid, block>>> (threads, startNonce, Hash); |
||||
MyStreamSynchronize(NULL, order, thr_id); |
||||
} |
||||
|
||||
__host__ |
||||
void blake256_cpu_setBlock_80(uint32_t *pdata) |
||||
{ |
||||
uint32_t h[8]; |
||||
uint32_t data[20]; |
||||
memcpy(data, pdata, 80); |
||||
for (int i = 0; i<8; i++) { |
||||
h[i] = c_IV256[i]; |
||||
} |
||||
blake256_compress1st(h, pdata, 512); |
||||
|
||||
cudaMemcpyToSymbol(cpu_h, h, sizeof(h), 0, cudaMemcpyHostToDevice); |
||||
cudaMemcpyToSymbol(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice); |
||||
} |
||||
|
||||
__host__ |
||||
void blake256_cpu_init(int thr_id, int threads) |
||||
{ |
||||
cudaMemcpyToSymbol(u256, c_u256, sizeof(c_u256), 0, cudaMemcpyHostToDevice); |
||||
cudaMemcpyToSymbol(sigma, c_sigma, sizeof(c_sigma), 0, cudaMemcpyHostToDevice); |
||||
} |
@ -0,0 +1,309 @@
@@ -0,0 +1,309 @@
|
||||
#include <memory.h> |
||||
|
||||
#include "cuda_helper.h" |
||||
|
||||
uint32_t *d_gnounce[8]; |
||||
uint32_t *d_GNonce[8]; |
||||
|
||||
__constant__ uint32_t pTarget[8]; |
||||
|
||||
#define SPH_C32(x) ((uint32_t)(x ## U)) |
||||
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) |
||||
|
||||
#define C32e(x) \ |
||||
((SPH_C32(x) >> 24) \ |
||||
| ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ |
||||
| ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ |
||||
| ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) |
||||
|
||||
#define PC32up(j, r) ((uint32_t)((j) + (r))) |
||||
#define PC32dn(j, r) 0 |
||||
#define QC32up(j, r) 0xFFFFFFFF |
||||
#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24))) |
||||
|
||||
#define B32_0(x) __byte_perm(x, 0, 0x4440) |
||||
//((x) & 0xFF) |
||||
#define B32_1(x) __byte_perm(x, 0, 0x4441) |
||||
//(((x) >> 8) & 0xFF) |
||||
#define B32_2(x) __byte_perm(x, 0, 0x4442) |
||||
//(((x) >> 16) & 0xFF) |
||||
#define B32_3(x) __byte_perm(x, 0, 0x4443) |
||||
//((x) >> 24) |
||||
|
||||
#define MAXWELL_OR_FERMI 1 |
||||
#if MAXWELL_OR_FERMI |
||||
#define USE_SHARED 1 |
||||
// Maxwell and Fermi cards get the best speed with SHARED access it seems. |
||||
#if USE_SHARED |
||||
#define T0up(x) (*((uint32_t*)mixtabs + ( (x)))) |
||||
#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x)))) |
||||
#define T1up(x) (*((uint32_t*)mixtabs + (512+(x)))) |
||||
#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x)))) |
||||
#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x)))) |
||||
#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x)))) |
||||
#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x)))) |
||||
#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x)))) |
||||
#else |
||||
#define T0up(x) tex1Dfetch(t0up2, x) |
||||
#define T0dn(x) tex1Dfetch(t0dn2, x) |
||||
#define T1up(x) tex1Dfetch(t1up2, x) |
||||
#define T1dn(x) tex1Dfetch(t1dn2, x) |
||||
#define T2up(x) tex1Dfetch(t2up2, x) |
||||
#define T2dn(x) tex1Dfetch(t2dn2, x) |
||||
#define T3up(x) tex1Dfetch(t3up2, x) |
||||
#define T3dn(x) tex1Dfetch(t3dn2, x) |
||||
#endif |
||||
#else |
||||
#define USE_SHARED 1 |
||||
// a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5! |
||||
#define T0up(x) (*((uint32_t*)mixtabs + ( (x)))) |
||||
#define T0dn(x) tex1Dfetch(t0dn2, x) |
||||
#define T1up(x) tex1Dfetch(t1up2, x) |
||||
#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x)))) |
||||
#define T2up(x) tex1Dfetch(t2up2, x) |
||||
#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x)))) |
||||
#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x)))) |
||||
#define T3dn(x) tex1Dfetch(t3dn2, x) |
||||
#endif |
||||
|
||||
texture<unsigned int, 1, cudaReadModeElementType> t0up2; |
||||
texture<unsigned int, 1, cudaReadModeElementType> t0dn2; |
||||
texture<unsigned int, 1, cudaReadModeElementType> t1up2; |
||||
texture<unsigned int, 1, cudaReadModeElementType> t1dn2; |
||||
texture<unsigned int, 1, cudaReadModeElementType> t2up2; |
||||
texture<unsigned int, 1, cudaReadModeElementType> t2dn2; |
||||
texture<unsigned int, 1, cudaReadModeElementType> t3up2; |
||||
texture<unsigned int, 1, cudaReadModeElementType> t3dn2; |
||||
|
||||
#define RSTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ |
||||
t[d0] = T0up(B32_0(a[b0])) \ |
||||
^ T1up(B32_1(a[b1])) \ |
||||
^ T2up(B32_2(a[b2])) \ |
||||
^ T3up(B32_3(a[b3])) \ |
||||
^ T0dn(B32_0(a[b4])) \ |
||||
^ T1dn(B32_1(a[b5])) \ |
||||
^ T2dn(B32_2(a[b6])) \ |
||||
^ T3dn(B32_3(a[b7])); \ |
||||
t[d1] = T0dn(B32_0(a[b0])) \ |
||||
^ T1dn(B32_1(a[b1])) \ |
||||
^ T2dn(B32_2(a[b2])) \ |
||||
^ T3dn(B32_3(a[b3])) \ |
||||
^ T0up(B32_0(a[b4])) \ |
||||
^ T1up(B32_1(a[b5])) \ |
||||
^ T2up(B32_2(a[b6])) \ |
||||
^ T3up(B32_3(a[b7])); \ |
||||
} while (0) |
||||
|
||||
|
||||
extern uint32_t T0up_cpu[]; |
||||
extern uint32_t T0dn_cpu[]; |
||||
extern uint32_t T1up_cpu[]; |
||||
extern uint32_t T1dn_cpu[]; |
||||
extern uint32_t T2up_cpu[]; |
||||
extern uint32_t T2dn_cpu[]; |
||||
extern uint32_t T3up_cpu[]; |
||||
extern uint32_t T3dn_cpu[]; |
||||
|
||||
__device__ __forceinline__ |
||||
void groestl256_perm_P(int thread,uint32_t *a, char *mixtabs) |
||||
{ |
||||
#pragma unroll 10 |
||||
for (int r = 0; r<10; r++) |
||||
{ |
||||
uint32_t t[16]; |
||||
|
||||
a[0x0] ^= PC32up(0x00, r); |
||||
a[0x2] ^= PC32up(0x10, r); |
||||
a[0x4] ^= PC32up(0x20, r); |
||||
a[0x6] ^= PC32up(0x30, r); |
||||
a[0x8] ^= PC32up(0x40, r); |
||||
a[0xA] ^= PC32up(0x50, r); |
||||
a[0xC] ^= PC32up(0x60, r); |
||||
a[0xE] ^= PC32up(0x70, r); |
||||
RSTT(0x0, 0x1, a, 0x0, 0x2, 0x4, 0x6, 0x9, 0xB, 0xD, 0xF); |
||||
RSTT(0x2, 0x3, a, 0x2, 0x4, 0x6, 0x8, 0xB, 0xD, 0xF, 0x1); |
||||
RSTT(0x4, 0x5, a, 0x4, 0x6, 0x8, 0xA, 0xD, 0xF, 0x1, 0x3); |
||||
RSTT(0x6, 0x7, a, 0x6, 0x8, 0xA, 0xC, 0xF, 0x1, 0x3, 0x5); |
||||
RSTT(0x8, 0x9, a, 0x8, 0xA, 0xC, 0xE, 0x1, 0x3, 0x5, 0x7); |
||||
RSTT(0xA, 0xB, a, 0xA, 0xC, 0xE, 0x0, 0x3, 0x5, 0x7, 0x9); |
||||
RSTT(0xC, 0xD, a, 0xC, 0xE, 0x0, 0x2, 0x5, 0x7, 0x9, 0xB); |
||||
RSTT(0xE, 0xF, a, 0xE, 0x0, 0x2, 0x4, 0x7, 0x9, 0xB, 0xD); |
||||
|
||||
#pragma unroll 16 |
||||
for (int k = 0; k<16; k++) |
||||
a[k] = t[k]; |
||||
} |
||||
} |
||||
|
||||
__device__ __forceinline__ |
||||
void groestl256_perm_Q(int thread, uint32_t *a, char *mixtabs) |
||||
{ |
||||
#pragma unroll |
||||
for (int r = 0; r<10; r++) |
||||
{ |
||||
uint32_t t[16]; |
||||
|
||||
a[0x0] ^= QC32up(0x00, r); |
||||
a[0x1] ^= QC32dn(0x00, r); |
||||
a[0x2] ^= QC32up(0x10, r); |
||||
a[0x3] ^= QC32dn(0x10, r); |
||||
a[0x4] ^= QC32up(0x20, r); |
||||
a[0x5] ^= QC32dn(0x20, r); |
||||
a[0x6] ^= QC32up(0x30, r); |
||||
a[0x7] ^= QC32dn(0x30, r); |
||||
a[0x8] ^= QC32up(0x40, r); |
||||
a[0x9] ^= QC32dn(0x40, r); |
||||
a[0xA] ^= QC32up(0x50, r); |
||||
a[0xB] ^= QC32dn(0x50, r); |
||||
a[0xC] ^= QC32up(0x60, r); |
||||
a[0xD] ^= QC32dn(0x60, r); |
||||
a[0xE] ^= QC32up(0x70, r); |
||||
a[0xF] ^= QC32dn(0x70, r); |
||||
RSTT(0x0, 0x1, a, 0x2, 0x6, 0xA, 0xE, 0x1, 0x5, 0x9, 0xD); |
||||
RSTT(0x2, 0x3, a, 0x4, 0x8, 0xC, 0x0, 0x3, 0x7, 0xB, 0xF); |
||||
RSTT(0x4, 0x5, a, 0x6, 0xA, 0xE, 0x2, 0x5, 0x9, 0xD, 0x1); |
||||
RSTT(0x6, 0x7, a, 0x8, 0xC, 0x0, 0x4, 0x7, 0xB, 0xF, 0x3); |
||||
RSTT(0x8, 0x9, a, 0xA, 0xE, 0x2, 0x6, 0x9, 0xD, 0x1, 0x5); |
||||
RSTT(0xA, 0xB, a, 0xC, 0x0, 0x4, 0x8, 0xB, 0xF, 0x3, 0x7); |
||||
RSTT(0xC, 0xD, a, 0xE, 0x2, 0x6, 0xA, 0xD, 0x1, 0x5, 0x9); |
||||
RSTT(0xE, 0xF, a, 0x0, 0x4, 0x8, 0xC, 0xF, 0x3, 0x7, 0xB); |
||||
|
||||
#pragma unroll |
||||
for (int k = 0; k<16; k++) |
||||
a[k] = t[k]; |
||||
} |
||||
} |
||||
|
||||
__global__ __launch_bounds__(256,1) |
||||
void groestl256_gpu_hash32(int threads, uint32_t startNounce, uint64_t *outputHash, uint32_t *nonceVector) |
||||
{ |
||||
#if USE_SHARED |
||||
extern __shared__ char mixtabs[]; |
||||
|
||||
if (threadIdx.x < 256) { |
||||
*((uint32_t*)mixtabs + (threadIdx.x)) = tex1Dfetch(t0up2, threadIdx.x); |
||||
*((uint32_t*)mixtabs + (256 + threadIdx.x)) = tex1Dfetch(t0dn2, threadIdx.x); |
||||
*((uint32_t*)mixtabs + (512 + threadIdx.x)) = tex1Dfetch(t1up2, threadIdx.x); |
||||
*((uint32_t*)mixtabs + (768 + threadIdx.x)) = tex1Dfetch(t1dn2, threadIdx.x); |
||||
*((uint32_t*)mixtabs + (1024 + threadIdx.x)) = tex1Dfetch(t2up2, threadIdx.x); |
||||
*((uint32_t*)mixtabs + (1280 + threadIdx.x)) = tex1Dfetch(t2dn2, threadIdx.x); |
||||
*((uint32_t*)mixtabs + (1536 + threadIdx.x)) = tex1Dfetch(t3up2, threadIdx.x); |
||||
*((uint32_t*)mixtabs + (1792 + threadIdx.x)) = tex1Dfetch(t3dn2, threadIdx.x); |
||||
} |
||||
|
||||
__syncthreads(); |
||||
#endif |
||||
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x); |
||||
if (thread < threads) |
||||
{ |
||||
// GROESTL |
||||
uint32_t message[16]; |
||||
uint32_t state[16]; |
||||
|
||||
#pragma unroll |
||||
for (int k = 0; k<4; k++) |
||||
LOHI(message[2*k], message[2*k+1], outputHash[k*threads+thread]); |
||||
|
||||
#pragma unroll |
||||
for (int k = 9; k<15; k++) |
||||
message[k] = 0; |
||||
|
||||
message[8] = 0x80; |
||||
message[15] = 0x01000000; |
||||
|
||||
#pragma unroll 16 |
||||
for (int u = 0; u<16; u++) |
||||
state[u] = message[u]; |
||||
|
||||
state[15] ^= 0x10000; |
||||
|
||||
// Perm |
||||
|
||||
#if USE_SHARED |
||||
groestl256_perm_P(thread, state, mixtabs); |
||||
state[15] ^= 0x10000; |
||||
groestl256_perm_Q(thread, message, mixtabs); |
||||
#else |
||||
groestl256_perm_P(thread, state, NULL); |
||||
state[15] ^= 0x10000; |
||||
groestl256_perm_P(thread, message, NULL); |
||||
#endif |
||||
#pragma unroll 16 |
||||
for (int u = 0; u<16; u++) state[u] ^= message[u]; |
||||
#pragma unroll 16 |
||||
for (int u = 0; u<16; u++) message[u] = state[u]; |
||||
#if USE_SHARED |
||||
groestl256_perm_P(thread, message, mixtabs); |
||||
#else |
||||
groestl256_perm_P(thread, message, NULL); |
||||
#endif |
||||
state[14] ^= message[14]; |
||||
state[15] ^= message[15]; |
||||
|
||||
uint32_t nonce = startNounce + thread; |
||||
if (state[15] <= pTarget[7]) { |
||||
nonceVector[0] = nonce; |
||||
} |
||||
} |
||||
} |
||||
|
||||
#define texDef(texname, texmem, texsource, texsize) \ |
||||
unsigned int *texmem; \ |
||||
cudaMalloc(&texmem, texsize); \ |
||||
cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ |
||||
texname.normalized = 0; \ |
||||
texname.filterMode = cudaFilterModePoint; \ |
||||
texname.addressMode[0] = cudaAddressModeClamp; \ |
||||
{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \ |
||||
cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \ |
||||
|
||||
__host__ |
||||
void groestl256_cpu_init(int thr_id, int threads) |
||||
{ |
||||
|
||||
// Texturen mit obigem Makro initialisieren |
||||
texDef(t0up2, d_T0up, T0up_cpu, sizeof(uint32_t) * 256); |
||||
texDef(t0dn2, d_T0dn, T0dn_cpu, sizeof(uint32_t) * 256); |
||||
texDef(t1up2, d_T1up, T1up_cpu, sizeof(uint32_t) * 256); |
||||
texDef(t1dn2, d_T1dn, T1dn_cpu, sizeof(uint32_t) * 256); |
||||
texDef(t2up2, d_T2up, T2up_cpu, sizeof(uint32_t) * 256); |
||||
texDef(t2dn2, d_T2dn, T2dn_cpu, sizeof(uint32_t) * 256); |
||||
texDef(t3up2, d_T3up, T3up_cpu, sizeof(uint32_t) * 256); |
||||
texDef(t3dn2, d_T3dn, T3dn_cpu, sizeof(uint32_t) * 256); |
||||
|
||||
cudaMalloc(&d_GNonce[thr_id], sizeof(uint32_t)); |
||||
cudaMallocHost(&d_gnounce[thr_id], 1*sizeof(uint32_t)); |
||||
} |
||||
|
||||
__host__ |
||||
uint32_t groestl256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order) |
||||
{ |
||||
uint32_t result = 0xffffffff; |
||||
cudaMemset(d_GNonce[thr_id], 0xff, sizeof(uint32_t)); |
||||
const int threadsperblock = 256; |
||||
|
||||
// berechne wie viele Thread Blocks wir brauchen |
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock); |
||||
dim3 block(threadsperblock); |
||||
|
||||
#if USE_SHARED |
||||
size_t shared_size = 8 * 256 * sizeof(uint32_t); |
||||
#else |
||||
size_t shared_size = 0; |
||||
#endif |
||||
groestl256_gpu_hash32<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash, d_GNonce[thr_id]); |
||||
|
||||
MyStreamSynchronize(NULL, order, thr_id); |
||||
cudaMemcpy(d_gnounce[thr_id], d_GNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); |
||||
cudaThreadSynchronize(); |
||||
result = *d_gnounce[thr_id]; |
||||
|
||||
return result; |
||||
} |
||||
|
||||
__host__ |
||||
void groestl256_setTarget(const void *pTargetIn) |
||||
{ |
||||
cudaMemcpyToSymbol(pTarget, pTargetIn, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice); |
||||
} |
@ -0,0 +1,196 @@
@@ -0,0 +1,196 @@
|
||||
#include <memory.h> |
||||
|
||||
#include "cuda_helper.h" |
||||
|
||||
#if 0 |
||||
static __constant__ uint64_t SKEIN_IV512_256[8] = { |
||||
0xCCD044A12FDB3E13, 0xE83590301A79A9EB, |
||||
0x55AEA0614F816E6F, 0x2A2767A4AE9B94DB, |
||||
0xEC06025E74DD7683, 0xE7A436CDC4746251, |
||||
0xC36FBAF9393AD185, 0x3EEDBA1833EDFC13 |
||||
}; |
||||
#endif |
||||
|
||||
static __constant__ uint2 vSKEIN_IV512_256[8] = { |
||||
{ 0x2FDB3E13, 0xCCD044A1 }, |
||||
{ 0x1A79A9EB, 0xE8359030 }, |
||||
{ 0x4F816E6F, 0x55AEA061 }, |
||||
{ 0xAE9B94DB, 0x2A2767A4 }, |
||||
{ 0x74DD7683, 0xEC06025E }, |
||||
{ 0xC4746251, 0xE7A436CD }, |
||||
{ 0x393AD185, 0xC36FBAF9 }, |
||||
{ 0x33EDFC13, 0x3EEDBA18 } |
||||
}; |
||||
|
||||
static __constant__ int ROT256[8][4] = |
||||
{ |
||||
46,36, 19, 37, |
||||
33,27, 14, 42, |
||||
17,49, 36, 39, |
||||
44, 9, 54, 56, |
||||
39,30, 34, 24, |
||||
13,50, 10, 17, |
||||
25,29, 39, 43, |
||||
8, 35, 56, 22, |
||||
}; |
||||
|
||||
static __constant__ uint2 skein_ks_parity = { 0xA9FC1A22,0x1BD11BDA}; |
||||
static __constant__ uint2 t12[6] = { |
||||
{ 0x20, 0 }, |
||||
{ 0, 0xf0000000 }, |
||||
{ 0x20, 0xf0000000 }, |
||||
{ 0x08, 0 }, |
||||
{ 0, 0xff000000 }, |
||||
{ 0x08, 0xff000000 } |
||||
}; |
||||
|
||||
#if 0 |
||||
static __constant__ uint64_t t12_30[6] = { |
||||
0x20, |
||||
0xf000000000000000, |
||||
0xf000000000000020, |
||||
0x08, |
||||
0xff00000000000000, |
||||
0xff00000000000008 |
||||
}; |
||||
#endif |
||||
|
||||
static __forceinline__ __device__ |
||||
void Round512v35(uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, int ROT) |
||||
{ |
||||
p0 += p1; p1 = ROL2(p1, ROT256[ROT][0]); p1 ^= p0; |
||||
p2 += p3; p3 = ROL2(p3, ROT256[ROT][1]); p3 ^= p2; |
||||
p4 += p5; p5 = ROL2(p5, ROT256[ROT][2]); p5 ^= p4; |
||||
p6 += p7; p7 = ROL2(p7, ROT256[ROT][3]); p7 ^= p6; |
||||
} |
||||
|
||||
|
||||
static __forceinline__ __device__ |
||||
void Round_8_512v35(uint2 *ks, uint2 *ts, |
||||
uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, |
||||
uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, int R) |
||||
{ |
||||
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 0); |
||||
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 1); |
||||
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 2); |
||||
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 3); |
||||
p0 += ks[((R)+0) % 9]; /* inject the key schedule value */ |
||||
p1 += ks[((R)+1) % 9]; |
||||
p2 += ks[((R)+2) % 9]; |
||||
p3 += ks[((R)+3) % 9]; |
||||
p4 += ks[((R)+4) % 9]; |
||||
p5 += ks[((R)+5) % 9] + ts[((R)+0) % 3]; |
||||
p6 += ks[((R)+6) % 9] + ts[((R)+1) % 3]; |
||||
p7 += ks[((R)+7) % 9] + make_uint2((R),0); |
||||
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 4); |
||||
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 5); |
||||
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 6); |
||||
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 7); |
||||
p0 += ks[((R)+1) % 9]; /* inject the key schedule value */ |
||||
p1 += ks[((R)+2) % 9]; |
||||
p2 += ks[((R)+3) % 9]; |
||||
p3 += ks[((R)+4) % 9]; |
||||
p4 += ks[((R)+5) % 9]; |
||||
p5 += ks[((R)+6) % 9] + ts[((R)+1) % 3]; |
||||
p6 += ks[((R)+7) % 9] + ts[((R)+2) % 3]; |
||||
p7 += ks[((R)+8) % 9] + make_uint2((R)+1, 0); |
||||
} |
||||
|
||||
|
||||
__global__ __launch_bounds__(256,3) |
||||
void skein256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash) |
||||
{ |
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x); |
||||
if (thread < threads) |
||||
{ |
||||
uint2 h[9]; |
||||
uint2 t[3]; |
||||
uint2 dt0,dt1,dt2,dt3; |
||||
uint2 p0, p1, p2, p3, p4, p5, p6, p7; |
||||
|
||||
h[8] = skein_ks_parity; |
||||
for (int i = 0; i<8; i++) { |
||||
h[i] = vSKEIN_IV512_256[i]; |
||||
h[8] ^= h[i]; |
||||
} |
||||
|
||||
t[0]=t12[0]; |
||||
t[1]=t12[1]; |
||||
t[2]=t12[2]; |
||||
|
||||
LOHI(dt0.x,dt0.y,outputHash[thread]); |
||||
LOHI(dt1.x,dt1.y,outputHash[threads+thread]); |
||||
LOHI(dt2.x,dt2.y,outputHash[2*threads+thread]); |
||||
LOHI(dt3.x,dt3.y,outputHash[3*threads+thread]); |
||||
|
||||
p0 = h[0] + dt0; |
||||
p1 = h[1] + dt1; |
||||
p2 = h[2] + dt2; |
||||
p3 = h[3] + dt3; |
||||
p4 = h[4]; |
||||
p5 = h[5] + t[0]; |
||||
p6 = h[6] + t[1]; |
||||
p7 = h[7]; |
||||
|
||||
#pragma unroll |
||||
for (int i = 1; i<19; i+=2) { |
||||
Round_8_512v35(h,t,p0,p1,p2,p3,p4,p5,p6,p7,i); |
||||
} |
||||
|
||||
p0 ^= dt0; |
||||
p1 ^= dt1; |
||||
p2 ^= dt2; |
||||
p3 ^= dt3; |
||||
|
||||
h[0] = p0; |
||||
h[1] = p1; |
||||
h[2] = p2; |
||||
h[3] = p3; |
||||
h[4] = p4; |
||||
h[5] = p5; |
||||
h[6] = p6; |
||||
h[7] = p7; |
||||
h[8] = skein_ks_parity; |
||||
|
||||
#pragma unroll 8 |
||||
for (int i = 0; i<8; i++) { |
||||
h[8] ^= h[i]; |
||||
} |
||||
|
||||
t[0] = t12[3]; |
||||
t[1] = t12[4]; |
||||
t[2] = t12[5]; |
||||
p5 += t[0]; //p5 already equal h[5] |
||||
p6 += t[1]; |
||||
|
||||
#pragma unroll |
||||
for (int i = 1; i<19; i+=2) { |
||||
Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, i); |
||||
} |
||||
|
||||
outputHash[thread] = devectorize(p0); |
||||
outputHash[threads+thread] = devectorize(p1); |
||||
outputHash[2*threads+thread] = devectorize(p2); |
||||
outputHash[3*threads+thread] = devectorize(p3); |
||||
} |
||||
} |
||||
|
||||
__host__ |
||||
void skein256_cpu_init(int thr_id, int threads) |
||||
{ |
||||
//empty |
||||
} |
||||
|
||||
__host__ |
||||
void skein256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order) |
||||
{ |
||||
const int threadsperblock = 256; |
||||
|
||||
dim3 grid((threads + threadsperblock - 1) / threadsperblock); |
||||
dim3 block(threadsperblock); |
||||
|
||||
skein256_gpu_hash_32<<<grid, block>>>(threads, startNounce, d_outputHash); |
||||
|
||||
MyStreamSynchronize(NULL, order, thr_id); |
||||
} |
||||
|
@ -0,0 +1,211 @@
@@ -0,0 +1,211 @@
|
||||
/**
|
||||
* Implementation of the Lyra2 Password Hashing Scheme (PHS). |
||||
* |
||||
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
|
||||
* |
||||
* This software is hereby placed in the public domain. |
||||
* |
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS |
||||
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE |
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
||||
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
||||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE |
||||
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, |
||||
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
||||
*/ |
||||
#include <stdio.h> |
||||
#include <stdlib.h> |
||||
#include <string.h> |
||||
#include <time.h> |
||||
|
||||
#include "Lyra2.h" |
||||
#include "Sponge.h" |
||||
|
||||
/**
|
||||
* Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords |
||||
* whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits, |
||||
* where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all |
||||
* integer parameters (treated as type "unsigned int") in the order they are provided, plus the value |
||||
* of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols). |
||||
* |
||||
* @param K The derived key to be output by the algorithm |
||||
* @param kLen Desired key length |
||||
* @param pwd User password |
||||
* @param pwdlen Password length |
||||
* @param salt Salt |
||||
* @param saltlen Salt length |
||||
* @param timeCost Parameter to determine the processing time (T) |
||||
* @param nRows Number or rows of the memory matrix (R) |
||||
* @param nCols Number of columns of the memory matrix (C) |
||||
* |
||||
* @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation) |
||||
*/ |
||||
int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols) |
||||
{ |
||||
//============================= Basic variables ============================//
|
||||
int64_t row = 2; //index of row to be processed
|
||||
int64_t prev = 1; //index of prev (last row ever computed/modified)
|
||||
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
|
||||
int64_t tau; //Time Loop iterator
|
||||
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
|
||||
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
|
||||
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
|
||||
int64_t i; //auxiliary iteration counter
|
||||
//==========================================================================/
|
||||
|
||||
//========== Initializing the Memory Matrix and pointers to it =============//
|
||||
//Tries to allocate enough space for the whole memory matrix
|
||||
i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES); |
||||
uint64_t *wholeMatrix = (uint64_t*) malloc((size_t) i); |
||||
if (wholeMatrix == NULL) { |
||||
return -1; |
||||
} |
||||
memset(wholeMatrix, 0, (size_t) i); |
||||
|
||||
//Allocates pointers to each row of the matrix
|
||||
uint64_t **memMatrix = malloc((size_t) nRows * sizeof(uint64_t*)); |
||||
if (memMatrix == NULL) { |
||||
return -1; |
||||
} |
||||
//Places the pointers in the correct positions
|
||||
uint64_t *ptrWord = wholeMatrix; |
||||
for (i = 0; i < (int64_t) nRows; i++) { |
||||
memMatrix[i] = ptrWord; |
||||
ptrWord += ROW_LEN_INT64; |
||||
} |
||||
//==========================================================================/
|
||||
|
||||
//============= Getting the password + salt + basil padded with 10*1 ===============//
|
||||
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
|
||||
//but this ensures that the password copied locally will be overwritten as soon as possible
|
||||
|
||||
//First, we clean enough blocks for the password, salt, basil and padding
|
||||
uint64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof (uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1; |
||||
|
||||
byte *ptrByte = (byte*) wholeMatrix; |
||||
memset(ptrByte, 0, (size_t) nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES); |
||||
|
||||
//Prepends the password
|
||||
memcpy(ptrByte, pwd, (size_t) pwdlen); |
||||
ptrByte += pwdlen; |
||||
|
||||
//Concatenates the salt
|
||||
memcpy(ptrByte, salt, (size_t) saltlen); |
||||
ptrByte += saltlen; |
||||
|
||||
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
|
||||
memcpy(ptrByte, &kLen, sizeof (uint64_t)); |
||||
ptrByte += sizeof (uint64_t); |
||||
memcpy(ptrByte, &pwdlen, sizeof (uint64_t)); |
||||
ptrByte += sizeof (uint64_t); |
||||
memcpy(ptrByte, &saltlen, sizeof (uint64_t)); |
||||
ptrByte += sizeof (uint64_t); |
||||
memcpy(ptrByte, &timeCost, sizeof (uint64_t)); |
||||
ptrByte += sizeof (uint64_t); |
||||
memcpy(ptrByte, &nRows, sizeof (uint64_t)); |
||||
ptrByte += sizeof (uint64_t); |
||||
memcpy(ptrByte, &nCols, sizeof (uint64_t)); |
||||
ptrByte += sizeof (uint64_t); |
||||
|
||||
//Now comes the padding
|
||||
*ptrByte = 0x80; //first byte of padding: right after the password
|
||||
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
|
||||
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
|
||||
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
|
||||
//==========================================================================/
|
||||
|
||||
//======================= Initializing the Sponge State ====================//
|
||||
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
|
||||
uint64_t *state = malloc(16 * sizeof (uint64_t)); |
||||
if (state == NULL) { |
||||
return -1; |
||||
} |
||||
initState(state); |
||||
//==========================================================================/
|
||||
|
||||
//================================ Setup Phase =============================//
|
||||
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
|
||||
ptrWord = wholeMatrix; |
||||
for (i = 0; i < (int64_t) nBlocksInput; i++) { |
||||
absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
|
||||
ptrWord += BLOCK_LEN_BLAKE2_SAFE_BYTES; //goes to next block of pad(pwd || salt || basil)
|
||||
} |
||||
|
||||
//Initializes M[0] and M[1]
|
||||
reducedSqueezeRow0(state, memMatrix[0]); //The locally copied password is most likely overwritten here
|
||||
|
||||
reducedDuplexRow1(state, memMatrix[0], memMatrix[1]); |
||||
|
||||
do { |
||||
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
|
||||
|
||||
reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]); |
||||
|
||||
//updates the value of row* (deterministically picked during Setup))
|
||||
rowa = (rowa + step) & (window - 1); |
||||
//update prev: it now points to the last row ever computed
|
||||
prev = row; |
||||
//updates row: goes to the next row to be computed
|
||||
row++; |
||||
|
||||
//Checks if all rows in the window where visited.
|
||||
if (rowa == 0) { |
||||
step = window + gap; //changes the step: approximately doubles its value
|
||||
window *= 2; //doubles the size of the re-visitation window
|
||||
gap = -gap; //inverts the modifier to the step
|
||||
} |
||||
|
||||
} while (row < (int64_t) nRows); |
||||
//==========================================================================/
|
||||
|
||||
//============================ Wandering Phase =============================//
|
||||
row = 0; //Resets the visitation to the first row of the memory matrix
|
||||
for (tau = 1; tau <= (int64_t) timeCost; tau++) { |
||||
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
|
||||
step = (tau % 2 == 0) ? -1 : nRows / 2 - 1; |
||||
do { |
||||
//Selects a pseudorandom index row*
|
||||
//------------------------------------------------------------------------------------------
|
||||
//rowa = ((unsigned int)state[0]) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
|
||||
rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
|
||||
//------------------------------------------------------------------------------------------
|
||||
|
||||
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
|
||||
reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]); |
||||
|
||||
//update prev: it now points to the last row ever computed
|
||||
prev = row; |
||||
|
||||
//updates row: goes to the next row to be computed
|
||||
//------------------------------------------------------------------------------------------
|
||||
//row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
|
||||
row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
|
||||
//------------------------------------------------------------------------------------------
|
||||
|
||||
} while (row != 0); |
||||
} |
||||
//==========================================================================/
|
||||
|
||||
//============================ Wrap-up Phase ===============================//
|
||||
//Absorbs the last block of the memory matrix
|
||||
absorbBlock(state, memMatrix[rowa]); |
||||
|
||||
//Squeezes the key
|
||||
squeeze(state, K, (size_t) kLen); |
||||
//==========================================================================/
|
||||
|
||||
//========================= Freeing the memory =============================//
|
||||
free(memMatrix); |
||||
free(wholeMatrix); |
||||
|
||||
//Wiping out the sponge's internal state before freeing it
|
||||
memset(state, 0, 16 * sizeof (uint64_t)); |
||||
free(state); |
||||
//==========================================================================/
|
||||
|
||||
return 0; |
||||
} |
@ -0,0 +1,50 @@
@@ -0,0 +1,50 @@
|
||||
/**
|
||||
* Header file for the Lyra2 Password Hashing Scheme (PHS). |
||||
* |
||||
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
|
||||
* |
||||
* This software is hereby placed in the public domain. |
||||
* |
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS |
||||
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE |
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
||||
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
||||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE |
||||
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, |
||||
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
||||
*/ |
||||
#ifndef LYRA2_H_ |
||||
#define LYRA2_H_ |
||||
|
||||
#include <stdint.h> |
||||
|
||||
typedef unsigned char byte; |
||||
|
||||
//Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED)
|
||||
#define BLOCK_LEN_BLAKE2_SAFE_INT64 8 //512 bits (=64 bytes, =8 uint64_t)
|
||||
#define BLOCK_LEN_BLAKE2_SAFE_BYTES (BLOCK_LEN_BLAKE2_SAFE_INT64 * 8) //same as above, in bytes
|
||||
|
||||
|
||||
#ifdef BLOCK_LEN_BITS |
||||
#define BLOCK_LEN_INT64 (BLOCK_LEN_BITS/64) //Block length: 768 bits (=96 bytes, =12 uint64_t)
|
||||
#define BLOCK_LEN_BYTES (BLOCK_LEN_BITS/8) //Block length, in bytes
|
||||
#else //default block lenght: 768 bits
|
||||
#define BLOCK_LEN_INT64 12 //Block length: 768 bits (=96 bytes, =12 uint64_t)
|
||||
#define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes
|
||||
#endif |
||||
|
||||
#ifndef N_COLS |
||||
#define N_COLS 8 //Number of columns in the memory matrix: fixed to 64 by default
|
||||
#endif |
||||
|
||||
#define ROW_LEN_INT64 (BLOCK_LEN_INT64 * N_COLS) //Total length of a row: N_COLS blocks
|
||||
#define ROW_LEN_BYTES (ROW_LEN_INT64 * 8) //Number of bytes per row
|
||||
|
||||
|
||||
int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols); |
||||
|
||||
#endif /* LYRA2_H_ */ |
@ -0,0 +1,755 @@
@@ -0,0 +1,755 @@
|
||||
/**
|
||||
* A simple implementation of Blake2b's internal permutation |
||||
* in the form of a sponge. |
||||
* |
||||
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
|
||||
* |
||||
* This software is hereby placed in the public domain. |
||||
* |
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS |
||||
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE |
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
||||
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
||||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE |
||||
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, |
||||
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
||||
*/ |
||||
#include <string.h> |
||||
#include <stdio.h> |
||||
#include <time.h> |
||||
#include "Sponge.h" |
||||
#include "Lyra2.h" |
||||
|
||||
|
||||
|
||||
/**
|
||||
* Initializes the Sponge State. The first 512 bits are set to zeros and the remainder |
||||
* receive Blake2b's IV as per Blake2b's specification. <b>Note:</b> Even though sponges |
||||
* typically have their internal state initialized with zeros, Blake2b's G function |
||||
* has a fixed point: if the internal state and message are both filled with zeros. the |
||||
* resulting permutation will always be a block filled with zeros; this happens because |
||||
* Blake2b does not use the constants originally employed in Blake2 inside its G function, |
||||
* relying on the IV for avoiding possible fixed points. |
||||
* |
||||
* @param state The 1024-bit array to be initialized |
||||
*/ |
||||
void initState(uint64_t state[/*16*/]) { |
||||
//First 512 bis are zeros
|
||||
memset(state, 0, 64); |
||||
//Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV
|
||||
|
||||
state[8] = blake2b_IV[0]; |
||||
state[9] = blake2b_IV[1]; |
||||
state[10] = blake2b_IV[2]; |
||||
state[11] = blake2b_IV[3]; |
||||
state[12] = blake2b_IV[4]; |
||||
state[13] = blake2b_IV[5]; |
||||
state[14] = blake2b_IV[6]; |
||||
state[15] = blake2b_IV[7]; |
||||
|
||||
} |
||||
|
||||
/**
|
||||
* Execute Blake2b's G function, with all 12 rounds. |
||||
* |
||||
* @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function |
||||
*/ |
||||
__inline static void blake2bLyra(uint64_t *v) { |
||||
ROUND_LYRA(0); |
||||
ROUND_LYRA(1); |
||||
ROUND_LYRA(2); |
||||
ROUND_LYRA(3); |
||||
ROUND_LYRA(4); |
||||
ROUND_LYRA(5); |
||||
ROUND_LYRA(6); |
||||
ROUND_LYRA(7); |
||||
ROUND_LYRA(8); |
||||
ROUND_LYRA(9); |
||||
ROUND_LYRA(10); |
||||
ROUND_LYRA(11); |
||||
} |
||||
|
||||
/**
|
||||
* Executes a reduced version of Blake2b's G function with only one round |
||||
* @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function |
||||
*/ |
||||
__inline static void reducedBlake2bLyra(uint64_t *v) { |
||||
ROUND_LYRA(0); |
||||
} |
||||
|
||||
/**
|
||||
* Performs a squeeze operation, using Blake2b's G function as the |
||||
* internal permutation |
||||
* |
||||
* @param state The current state of the sponge |
||||
* @param out Array that will receive the data squeezed |
||||
* @param len The number of bytes to be squeezed into the "out" array |
||||
*/ |
||||
void squeeze(uint64_t *state, byte *out, unsigned int len) { |
||||
int fullBlocks = len / BLOCK_LEN_BYTES; |
||||
byte *ptr = out; |
||||
int i; |
||||
//Squeezes full blocks
|
||||
for (i = 0; i < fullBlocks; i++) { |
||||
memcpy(ptr, state, BLOCK_LEN_BYTES); |
||||
blake2bLyra(state); |
||||
ptr += BLOCK_LEN_BYTES; |
||||
} |
||||
|
||||
//Squeezes remaining bytes
|
||||
memcpy(ptr, state, (len % BLOCK_LEN_BYTES)); |
||||
} |
||||
|
||||
/**
|
||||
* Performs an absorb operation for a single block (BLOCK_LEN_INT64 words |
||||
* of type uint64_t), using Blake2b's G function as the internal permutation |
||||
* |
||||
* @param state The current state of the sponge |
||||
* @param in The block to be absorbed (BLOCK_LEN_INT64 words) |
||||
*/ |
||||
void absorbBlock(uint64_t *state, const uint64_t *in) { |
||||
//XORs the first BLOCK_LEN_INT64 words of "in" with the current state
|
||||
state[0] ^= in[0]; |
||||
state[1] ^= in[1]; |
||||
state[2] ^= in[2]; |
||||
state[3] ^= in[3]; |
||||
state[4] ^= in[4]; |
||||
state[5] ^= in[5]; |
||||
state[6] ^= in[6]; |
||||
state[7] ^= in[7]; |
||||
state[8] ^= in[8]; |
||||
state[9] ^= in[9]; |
||||
state[10] ^= in[10]; |
||||
state[11] ^= in[11]; |
||||
|
||||
//Applies the transformation f to the sponge's state
|
||||
blake2bLyra(state); |
||||
} |
||||
|
||||
/**
|
||||
* Performs an absorb operation for a single block (BLOCK_LEN_BLAKE2_SAFE_INT64 |
||||
* words of type uint64_t), using Blake2b's G function as the internal permutation |
||||
* |
||||
* @param state The current state of the sponge |
||||
* @param in The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words) |
||||
*/ |
||||
void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) { |
||||
//XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state
|
||||
state[0] ^= in[0]; |
||||
state[1] ^= in[1]; |
||||
state[2] ^= in[2]; |
||||
state[3] ^= in[3]; |
||||
state[4] ^= in[4]; |
||||
state[5] ^= in[5]; |
||||
state[6] ^= in[6]; |
||||
state[7] ^= in[7]; |
||||
|
||||
//Applies the transformation f to the sponge's state
|
||||
blake2bLyra(state); |
||||
/*
|
||||
for(int i = 0; i<16; i++) { |
||||
printf(" final state %d %08x %08x in %08x %08x\n", i, (uint32_t)(state[i] & 0xFFFFFFFFULL), (uint32_t)(state[i] >> 32), |
||||
(uint32_t)(in[i] & 0xFFFFFFFFULL), (uint32_t)(in[i] >> 32)); |
||||
} |
||||
*/ |
||||
} |
||||
|
||||
/**
|
||||
* Performs a reduced squeeze operation for a single row, from the highest to |
||||
* the lowest index, using the reduced-round Blake2b's G function as the |
||||
* internal permutation |
||||
* |
||||
* @param state The current state of the sponge |
||||
* @param rowOut Row to receive the data squeezed |
||||
*/ |
||||
void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut) { |
||||
uint64_t* ptrWord = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1]
|
||||
int i; |
||||
//M[row][C-1-col] = H.reduced_squeeze()
|
||||
for (i = 0; i < N_COLS; i++) { |
||||
|
||||
ptrWord[0] = state[0]; |
||||
ptrWord[1] = state[1]; |
||||
ptrWord[2] = state[2]; |
||||
ptrWord[3] = state[3]; |
||||
ptrWord[4] = state[4]; |
||||
ptrWord[5] = state[5]; |
||||
ptrWord[6] = state[6]; |
||||
ptrWord[7] = state[7]; |
||||
ptrWord[8] = state[8]; |
||||
ptrWord[9] = state[9]; |
||||
ptrWord[10] = state[10]; |
||||
ptrWord[11] = state[11]; |
||||
/*
|
||||
for (int i = 0; i<12; i++) { |
||||
printf(" after reducedSqueezeRow0 %d %08x %08x in %08x %08x\n", i, (uint32_t)(ptrWord[i] & 0xFFFFFFFFULL), (uint32_t)(ptrWord[i] >> 32), |
||||
(uint32_t)(state[i] & 0xFFFFFFFFULL), (uint32_t)(state[i] >> 32)); |
||||
} |
||||
*/ |
||||
//Goes to next block (column) that will receive the squeezed data
|
||||
ptrWord -= BLOCK_LEN_INT64; |
||||
|
||||
//Applies the reduced-round transformation f to the sponge's state
|
||||
reducedBlake2bLyra(state); |
||||
} |
||||
} |
||||
|
||||
/**
|
||||
* Performs a reduced duplex operation for a single row, from the highest to |
||||
* the lowest index, using the reduced-round Blake2b's G function as the |
||||
* internal permutation |
||||
* |
||||
* @param state The current state of the sponge |
||||
* @param rowIn Row to feed the sponge |
||||
* @param rowOut Row to receive the sponge's output |
||||
*/ |
||||
void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut) { |
||||
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
|
||||
uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
|
||||
int i; |
||||
|
||||
for (i = 0; i < N_COLS; i++) { |
||||
|
||||
//Absorbing "M[prev][col]"
|
||||
state[0] ^= (ptrWordIn[0]); |
||||
state[1] ^= (ptrWordIn[1]); |
||||
state[2] ^= (ptrWordIn[2]); |
||||
state[3] ^= (ptrWordIn[3]); |
||||
state[4] ^= (ptrWordIn[4]); |
||||
state[5] ^= (ptrWordIn[5]); |
||||
state[6] ^= (ptrWordIn[6]); |
||||
state[7] ^= (ptrWordIn[7]); |
||||
state[8] ^= (ptrWordIn[8]); |
||||
state[9] ^= (ptrWordIn[9]); |
||||
state[10] ^= (ptrWordIn[10]); |
||||
state[11] ^= (ptrWordIn[11]); |
||||
|
||||
//Applies the reduced-round transformation f to the sponge's state
|
||||
reducedBlake2bLyra(state); |
||||
|
||||
//M[row][C-1-col] = M[prev][col] XOR rand
|
||||
ptrWordOut[0] = ptrWordIn[0] ^ state[0]; |
||||
ptrWordOut[1] = ptrWordIn[1] ^ state[1]; |
||||
ptrWordOut[2] = ptrWordIn[2] ^ state[2]; |
||||
ptrWordOut[3] = ptrWordIn[3] ^ state[3]; |
||||
ptrWordOut[4] = ptrWordIn[4] ^ state[4]; |
||||
ptrWordOut[5] = ptrWordIn[5] ^ state[5]; |
||||
ptrWordOut[6] = ptrWordIn[6] ^ state[6]; |
||||
ptrWordOut[7] = ptrWordIn[7] ^ state[7]; |
||||
ptrWordOut[8] = ptrWordIn[8] ^ state[8]; |
||||
ptrWordOut[9] = ptrWordIn[9] ^ state[9]; |
||||
ptrWordOut[10] = ptrWordIn[10] ^ state[10]; |
||||
ptrWordOut[11] = ptrWordIn[11] ^ state[11]; |
||||
|
||||
|
||||
//Input: next column (i.e., next block in sequence)
|
||||
ptrWordIn += BLOCK_LEN_INT64; |
||||
//Output: goes to previous column
|
||||
ptrWordOut -= BLOCK_LEN_INT64; |
||||
} |
||||
} |
||||
|
||||
/**
|
||||
* Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e., |
||||
* the wordwise addition of two columns, ignoring carries between words). The |
||||
* output of this operation, "rand", is then used to make |
||||
* "M[rowOut][(N_COLS-1)-col] = M[rowIn][col] XOR rand" and |
||||
* "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit |
||||
* rotation to the left and N_COLS is a system parameter. |
||||
* |
||||
* @param state The current state of the sponge |
||||
* @param rowIn Row used only as input |
||||
* @param rowInOut Row used as input and to receive output after rotation |
||||
* @param rowOut Row receiving the output |
||||
* |
||||
*/ |
||||
void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { |
||||
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
|
||||
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
|
||||
uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
|
||||
int i; |
||||
for (i = 0; i < N_COLS; i++) { |
||||
//Absorbing "M[prev] [+] M[row*]"
|
||||
state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); |
||||
state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); |
||||
state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); |
||||
state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); |
||||
state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); |
||||
state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); |
||||
state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); |
||||
state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); |
||||
state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); |
||||
state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); |
||||
state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); |
||||
state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); |
||||
|
||||
//Applies the reduced-round transformation f to the sponge's state
|
||||
reducedBlake2bLyra(state); |
||||
|
||||
//M[row][col] = M[prev][col] XOR rand
|
||||
ptrWordOut[0] = ptrWordIn[0] ^ state[0]; |
||||
ptrWordOut[1] = ptrWordIn[1] ^ state[1]; |
||||
ptrWordOut[2] = ptrWordIn[2] ^ state[2]; |
||||
ptrWordOut[3] = ptrWordIn[3] ^ state[3]; |
||||
ptrWordOut[4] = ptrWordIn[4] ^ state[4]; |
||||
ptrWordOut[5] = ptrWordIn[5] ^ state[5]; |
||||
ptrWordOut[6] = ptrWordIn[6] ^ state[6]; |
||||
ptrWordOut[7] = ptrWordIn[7] ^ state[7]; |
||||
ptrWordOut[8] = ptrWordIn[8] ^ state[8]; |
||||
ptrWordOut[9] = ptrWordIn[9] ^ state[9]; |
||||
ptrWordOut[10] = ptrWordIn[10] ^ state[10]; |
||||
ptrWordOut[11] = ptrWordIn[11] ^ state[11]; |
||||
|
||||
//M[row*][col] = M[row*][col] XOR rotW(rand)
|
||||
ptrWordInOut[0] ^= state[11]; |
||||
ptrWordInOut[1] ^= state[0]; |
||||
ptrWordInOut[2] ^= state[1]; |
||||
ptrWordInOut[3] ^= state[2]; |
||||
ptrWordInOut[4] ^= state[3]; |
||||
ptrWordInOut[5] ^= state[4]; |
||||
ptrWordInOut[6] ^= state[5]; |
||||
ptrWordInOut[7] ^= state[6]; |
||||
ptrWordInOut[8] ^= state[7]; |
||||
ptrWordInOut[9] ^= state[8]; |
||||
ptrWordInOut[10] ^= state[9]; |
||||
ptrWordInOut[11] ^= state[10]; |
||||
|
||||
//Inputs: next column (i.e., next block in sequence)
|
||||
ptrWordInOut += BLOCK_LEN_INT64; |
||||
ptrWordIn += BLOCK_LEN_INT64; |
||||
//Output: goes to previous column
|
||||
ptrWordOut -= BLOCK_LEN_INT64; |
||||
} |
||||
} |
||||
|
||||
/**
|
||||
* Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e., |
||||
* the wordwise addition of two columns, ignoring carries between words). The |
||||
* output of this operation, "rand", is then used to make |
||||
* "M[rowOut][col] = M[rowOut][col] XOR rand" and |
||||
* "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit |
||||
* rotation to the left. |
||||
* |
||||
* @param state The current state of the sponge |
||||
* @param rowIn Row used only as input |
||||
* @param rowInOut Row used as input and to receive output after rotation |
||||
* @param rowOut Row receiving the output |
||||
* |
||||
*/ |
||||
void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { |
||||
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
|
||||
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
|
||||
uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
|
||||
int i; |
||||
|
||||
for (i = 0; i < N_COLS; i++) { |
||||
|
||||
//Absorbing "M[prev] [+] M[row*]"
|
||||
state[0] ^= (ptrWordIn[0] + ptrWordInOut[0]); |
||||
state[1] ^= (ptrWordIn[1] + ptrWordInOut[1]); |
||||
state[2] ^= (ptrWordIn[2] + ptrWordInOut[2]); |
||||
state[3] ^= (ptrWordIn[3] + ptrWordInOut[3]); |
||||
state[4] ^= (ptrWordIn[4] + ptrWordInOut[4]); |
||||
state[5] ^= (ptrWordIn[5] + ptrWordInOut[5]); |
||||
state[6] ^= (ptrWordIn[6] + ptrWordInOut[6]); |
||||
state[7] ^= (ptrWordIn[7] + ptrWordInOut[7]); |
||||
state[8] ^= (ptrWordIn[8] + ptrWordInOut[8]); |
||||
state[9] ^= (ptrWordIn[9] + ptrWordInOut[9]); |
||||
state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]); |
||||
state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]); |
||||
|
||||
//Applies the reduced-round transformation f to the sponge's state
|
||||
reducedBlake2bLyra(state); |
||||
|
||||
//M[rowOut][col] = M[rowOut][col] XOR rand
|
||||
ptrWordOut[0] ^= state[0]; |
||||
ptrWordOut[1] ^= state[1]; |
||||
ptrWordOut[2] ^= state[2]; |
||||
ptrWordOut[3] ^= state[3]; |
||||
ptrWordOut[4] ^= state[4]; |
||||
ptrWordOut[5] ^= state[5]; |
||||
ptrWordOut[6] ^= state[6]; |
||||
ptrWordOut[7] ^= state[7]; |
||||
ptrWordOut[8] ^= state[8]; |
||||
ptrWordOut[9] ^= state[9]; |
||||
ptrWordOut[10] ^= state[10]; |
||||
ptrWordOut[11] ^= state[11]; |
||||
|
||||
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
|
||||
ptrWordInOut[0] ^= state[11]; |
||||
ptrWordInOut[1] ^= state[0]; |
||||
ptrWordInOut[2] ^= state[1]; |
||||
ptrWordInOut[3] ^= state[2]; |
||||
ptrWordInOut[4] ^= state[3]; |
||||
ptrWordInOut[5] ^= state[4]; |
||||
ptrWordInOut[6] ^= state[5]; |
||||
ptrWordInOut[7] ^= state[6]; |
||||
ptrWordInOut[8] ^= state[7]; |
||||
ptrWordInOut[9] ^= state[8]; |
||||
ptrWordInOut[10] ^= state[9]; |
||||
ptrWordInOut[11] ^= state[10]; |
||||
|
||||
//Goes to next block
|
||||
ptrWordOut += BLOCK_LEN_INT64; |
||||
ptrWordInOut += BLOCK_LEN_INT64; |
||||
ptrWordIn += BLOCK_LEN_INT64; |
||||
} |
||||
} |
||||
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/**
|
||||
* Performs a duplex operation over "M[rowInOut] [+] M[rowIn]", writing the output "rand" |
||||
* on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit |
||||
* rotation to the left. |
||||
* |
||||
* @param state The current state of the sponge |
||||
* @param rowIn Row used only as input |
||||
* @param rowInOut Row used as input and to receive output after rotation |
||||
* @param rowOut Row receiving the output |
||||
* |
||||
*/ |
||||
/*
|
||||
inline void reducedDuplexRowSetupOLD(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { |
||||
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
|
||||
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
|
||||
uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
|
||||
int i; |
||||
for (i = 0; i < N_COLS; i++) { |
||||
|
||||
//Absorbing "M[rowInOut] XOR M[rowIn]"
|
||||
state[0] ^= ptrWordInOut[0] ^ ptrWordIn[0]; |
||||
state[1] ^= ptrWordInOut[1] ^ ptrWordIn[1]; |
||||
state[2] ^= ptrWordInOut[2] ^ ptrWordIn[2]; |
||||
state[3] ^= ptrWordInOut[3] ^ ptrWordIn[3]; |
||||
state[4] ^= ptrWordInOut[4] ^ ptrWordIn[4]; |
||||
state[5] ^= ptrWordInOut[5] ^ ptrWordIn[5]; |
||||
state[6] ^= ptrWordInOut[6] ^ ptrWordIn[6]; |
||||
state[7] ^= ptrWordInOut[7] ^ ptrWordIn[7]; |
||||
state[8] ^= ptrWordInOut[8] ^ ptrWordIn[8]; |
||||
state[9] ^= ptrWordInOut[9] ^ ptrWordIn[9]; |
||||
state[10] ^= ptrWordInOut[10] ^ ptrWordIn[10]; |
||||
state[11] ^= ptrWordInOut[11] ^ ptrWordIn[11]; |
||||
|
||||
//Applies the reduced-round transformation f to the sponge's state
|
||||
reducedBlake2bLyra(state); |
||||
|
||||
//M[row][col] = rand
|
||||
ptrWordOut[0] = state[0]; |
||||
ptrWordOut[1] = state[1]; |
||||
ptrWordOut[2] = state[2]; |
||||
ptrWordOut[3] = state[3]; |
||||
ptrWordOut[4] = state[4]; |
||||
ptrWordOut[5] = state[5]; |
||||
ptrWordOut[6] = state[6]; |
||||
ptrWordOut[7] = state[7]; |
||||
ptrWordOut[8] = state[8]; |
||||
ptrWordOut[9] = state[9]; |
||||
ptrWordOut[10] = state[10]; |
||||
ptrWordOut[11] = state[11]; |
||||
|
||||
|
||||
//M[row*][col] = M[row*][col] XOR rotW(rand)
|
||||
ptrWordInOut[0] ^= state[10]; |
||||
ptrWordInOut[1] ^= state[11]; |
||||
ptrWordInOut[2] ^= state[0]; |
||||
ptrWordInOut[3] ^= state[1]; |
||||
ptrWordInOut[4] ^= state[2]; |
||||
ptrWordInOut[5] ^= state[3]; |
||||
ptrWordInOut[6] ^= state[4]; |
||||
ptrWordInOut[7] ^= state[5]; |
||||
ptrWordInOut[8] ^= state[6]; |
||||
ptrWordInOut[9] ^= state[7]; |
||||
ptrWordInOut[10] ^= state[8]; |
||||
ptrWordInOut[11] ^= state[9]; |
||||
|
||||
//Goes to next column (i.e., next block in sequence)
|
||||
ptrWordInOut += BLOCK_LEN_INT64; |
||||
ptrWordIn += BLOCK_LEN_INT64; |
||||
ptrWordOut += BLOCK_LEN_INT64; |
||||
} |
||||
} |
||||
*/ |
||||
|
||||
/**
|
||||
* Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", writing the output "rand" |
||||
* on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit |
||||
* rotation to the left. |
||||
* |
||||
* @param state The current state of the sponge |
||||
* @param rowIn Row used only as input |
||||
* @param rowInOut Row used as input and to receive output after rotation |
||||
* @param rowOut Row receiving the output |
||||
* |
||||
*/ |
||||
/*
|
||||
inline void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { |
||||
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
|
||||
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
|
||||
uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
|
||||
int i; |
||||
for (i = 0; i < N_COLS; i++) { |
||||
|
||||
//Absorbing "M[rowInOut] XOR M[rowIn]"
|
||||
state[0] ^= ptrWordInOut[0] + ptrWordIn[0]; |
||||
state[1] ^= ptrWordInOut[1] + ptrWordIn[1]; |
||||
state[2] ^= ptrWordInOut[2] + ptrWordIn[2]; |
||||
state[3] ^= ptrWordInOut[3] + ptrWordIn[3]; |
||||
state[4] ^= ptrWordInOut[4] + ptrWordIn[4]; |
||||
state[5] ^= ptrWordInOut[5] + ptrWordIn[5]; |
||||
state[6] ^= ptrWordInOut[6] + ptrWordIn[6]; |
||||
state[7] ^= ptrWordInOut[7] + ptrWordIn[7]; |
||||
state[8] ^= ptrWordInOut[8] + ptrWordIn[8]; |
||||
state[9] ^= ptrWordInOut[9] + ptrWordIn[9]; |
||||
state[10] ^= ptrWordInOut[10] + ptrWordIn[10]; |
||||
state[11] ^= ptrWordInOut[11] + ptrWordIn[11]; |
||||
|
||||
//Applies the reduced-round transformation f to the sponge's state
|
||||
reducedBlake2bLyra(state); |
||||
|
||||
|
||||
//M[row*][col] = M[row*][col] XOR rotW(rand)
|
||||
ptrWordInOut[0] ^= state[10]; |
||||
ptrWordInOut[1] ^= state[11]; |
||||
ptrWordInOut[2] ^= state[0]; |
||||
ptrWordInOut[3] ^= state[1]; |
||||
ptrWordInOut[4] ^= state[2]; |
||||
ptrWordInOut[5] ^= state[3]; |
||||
ptrWordInOut[6] ^= state[4]; |
||||
ptrWordInOut[7] ^= state[5]; |
||||
ptrWordInOut[8] ^= state[6]; |
||||
ptrWordInOut[9] ^= state[7]; |
||||
ptrWordInOut[10] ^= state[8]; |
||||
ptrWordInOut[11] ^= state[9]; |
||||
|
||||
|
||||
//M[row][col] = rand
|
||||
ptrWordOut[0] = state[0] ^ ptrWordIn[0]; |
||||
ptrWordOut[1] = state[1] ^ ptrWordIn[1]; |
||||
ptrWordOut[2] = state[2] ^ ptrWordIn[2]; |
||||
ptrWordOut[3] = state[3] ^ ptrWordIn[3]; |
||||
ptrWordOut[4] = state[4] ^ ptrWordIn[4]; |
||||
ptrWordOut[5] = state[5] ^ ptrWordIn[5]; |
||||
ptrWordOut[6] = state[6] ^ ptrWordIn[6]; |
||||
ptrWordOut[7] = state[7] ^ ptrWordIn[7]; |
||||
ptrWordOut[8] = state[8] ^ ptrWordIn[8]; |
||||
ptrWordOut[9] = state[9] ^ ptrWordIn[9]; |
||||
ptrWordOut[10] = state[10] ^ ptrWordIn[10]; |
||||
ptrWordOut[11] = state[11] ^ ptrWordIn[11]; |
||||
|
||||
//Goes to next column (i.e., next block in sequence)
|
||||
ptrWordInOut += BLOCK_LEN_INT64; |
||||
ptrWordIn += BLOCK_LEN_INT64; |
||||
ptrWordOut += BLOCK_LEN_INT64; |
||||
} |
||||
} |
||||
*/ |
||||
|
||||
/**
|
||||
* Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", writing the output "rand" |
||||
* on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit |
||||
* rotation to the left. |
||||
* |
||||
* @param state The current state of the sponge |
||||
* @param rowIn Row used only as input |
||||
* @param rowInOut Row used as input and to receive output after rotation |
||||
* @param rowOut Row receiving the output |
||||
* |
||||
*/ |
||||
/*
|
||||
inline void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { |
||||
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
|
||||
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
|
||||
uint64_t* ptrWordOut = rowOut; |
||||
int i; |
||||
|
||||
for (i = 0; i < N_COLS / 2; i++) { |
||||
//Absorbing "M[rowInOut] XOR M[rowIn]"
|
||||
state[0] ^= ptrWordInOut[0] + ptrWordIn[0]; |
||||
state[1] ^= ptrWordInOut[1] + ptrWordIn[1]; |
||||
state[2] ^= ptrWordInOut[2] + ptrWordIn[2]; |
||||
state[3] ^= ptrWordInOut[3] + ptrWordIn[3]; |
||||
state[4] ^= ptrWordInOut[4] + ptrWordIn[4]; |
||||
state[5] ^= ptrWordInOut[5] + ptrWordIn[5]; |
||||
state[6] ^= ptrWordInOut[6] + ptrWordIn[6]; |
||||
state[7] ^= ptrWordInOut[7] + ptrWordIn[7]; |
||||
state[8] ^= ptrWordInOut[8] + ptrWordIn[8]; |
||||
state[9] ^= ptrWordInOut[9] + ptrWordIn[9]; |
||||
state[10] ^= ptrWordInOut[10] + ptrWordIn[10]; |
||||
state[11] ^= ptrWordInOut[11] + ptrWordIn[11]; |
||||
|
||||
//Applies the reduced-round transformation f to the sponge's state
|
||||
reducedBlake2bLyra(state); |
||||
|
||||
|
||||
//M[row*][col] = M[row*][col] XOR rotW(rand)
|
||||
ptrWordInOut[0] ^= state[10]; |
||||
ptrWordInOut[1] ^= state[11]; |
||||
ptrWordInOut[2] ^= state[0]; |
||||
ptrWordInOut[3] ^= state[1]; |
||||
ptrWordInOut[4] ^= state[2]; |
||||
ptrWordInOut[5] ^= state[3]; |
||||
ptrWordInOut[6] ^= state[4]; |
||||
ptrWordInOut[7] ^= state[5]; |
||||
ptrWordInOut[8] ^= state[6]; |
||||
ptrWordInOut[9] ^= state[7]; |
||||
ptrWordInOut[10] ^= state[8]; |
||||
ptrWordInOut[11] ^= state[9]; |
||||
|
||||
|
||||
//M[row][col] = rand
|
||||
ptrWordOut[0] = state[0] ^ ptrWordIn[0]; |
||||
ptrWordOut[1] = state[1] ^ ptrWordIn[1]; |
||||
ptrWordOut[2] = state[2] ^ ptrWordIn[2]; |
||||
ptrWordOut[3] = state[3] ^ ptrWordIn[3]; |
||||
ptrWordOut[4] = state[4] ^ ptrWordIn[4]; |
||||
ptrWordOut[5] = state[5] ^ ptrWordIn[5]; |
||||
ptrWordOut[6] = state[6] ^ ptrWordIn[6]; |
||||
ptrWordOut[7] = state[7] ^ ptrWordIn[7]; |
||||
ptrWordOut[8] = state[8] ^ ptrWordIn[8]; |
||||
ptrWordOut[9] = state[9] ^ ptrWordIn[9]; |
||||
ptrWordOut[10] = state[10] ^ ptrWordIn[10]; |
||||
ptrWordOut[11] = state[11] ^ ptrWordIn[11]; |
||||
|
||||
//Goes to next column (i.e., next block in sequence)
|
||||
ptrWordInOut += BLOCK_LEN_INT64; |
||||
ptrWordIn += BLOCK_LEN_INT64; |
||||
ptrWordOut += 2 * BLOCK_LEN_INT64; |
||||
} |
||||
|
||||
ptrWordOut = rowOut + BLOCK_LEN_INT64; |
||||
for (i = 0; i < N_COLS / 2; i++) { |
||||
//Absorbing "M[rowInOut] XOR M[rowIn]"
|
||||
state[0] ^= ptrWordInOut[0] + ptrWordIn[0]; |
||||
state[1] ^= ptrWordInOut[1] + ptrWordIn[1]; |
||||
state[2] ^= ptrWordInOut[2] + ptrWordIn[2]; |
||||
state[3] ^= ptrWordInOut[3] + ptrWordIn[3]; |
||||
state[4] ^= ptrWordInOut[4] + ptrWordIn[4]; |
||||
state[5] ^= ptrWordInOut[5] + ptrWordIn[5]; |
||||
state[6] ^= ptrWordInOut[6] + ptrWordIn[6]; |
||||
state[7] ^= ptrWordInOut[7] + ptrWordIn[7]; |
||||
state[8] ^= ptrWordInOut[8] + ptrWordIn[8]; |
||||
state[9] ^= ptrWordInOut[9] + ptrWordIn[9]; |
||||
state[10] ^= ptrWordInOut[10] + ptrWordIn[10]; |
||||
state[11] ^= ptrWordInOut[11] + ptrWordIn[11]; |
||||
|
||||
//Applies the reduced-round transformation f to the sponge's state
|
||||
reducedBlake2bLyra(state); |
||||
|
||||
|
||||
//M[row*][col] = M[row*][col] XOR rotW(rand)
|
||||
ptrWordInOut[0] ^= state[10]; |
||||
ptrWordInOut[1] ^= state[11]; |
||||
ptrWordInOut[2] ^= state[0]; |
||||
ptrWordInOut[3] ^= state[1]; |
||||
ptrWordInOut[4] ^= state[2]; |
||||
ptrWordInOut[5] ^= state[3]; |
||||
ptrWordInOut[6] ^= state[4]; |
||||
ptrWordInOut[7] ^= state[5]; |
||||
ptrWordInOut[8] ^= state[6]; |
||||
ptrWordInOut[9] ^= state[7]; |
||||
ptrWordInOut[10] ^= state[8]; |
||||
ptrWordInOut[11] ^= state[9]; |
||||
|
||||
|
||||
//M[row][col] = rand
|
||||
ptrWordOut[0] = state[0] ^ ptrWordIn[0]; |
||||
ptrWordOut[1] = state[1] ^ ptrWordIn[1]; |
||||
ptrWordOut[2] = state[2] ^ ptrWordIn[2]; |
||||
ptrWordOut[3] = state[3] ^ ptrWordIn[3]; |
||||
ptrWordOut[4] = state[4] ^ ptrWordIn[4]; |
||||
ptrWordOut[5] = state[5] ^ ptrWordIn[5]; |
||||
ptrWordOut[6] = state[6] ^ ptrWordIn[6]; |
||||
ptrWordOut[7] = state[7] ^ ptrWordIn[7]; |
||||
ptrWordOut[8] = state[8] ^ ptrWordIn[8]; |
||||
ptrWordOut[9] = state[9] ^ ptrWordIn[9]; |
||||
ptrWordOut[10] = state[10] ^ ptrWordIn[10]; |
||||
ptrWordOut[11] = state[11] ^ ptrWordIn[11]; |
||||
|
||||
//Goes to next column (i.e., next block in sequence)
|
||||
ptrWordInOut += BLOCK_LEN_INT64; |
||||
ptrWordIn += BLOCK_LEN_INT64; |
||||
ptrWordOut += 2 * BLOCK_LEN_INT64; |
||||
} |
||||
} |
||||
*/ |
||||
|
||||
/**
|
||||
* Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", using the output "rand" |
||||
* to make "M[rowOut][col] = M[rowOut][col] XOR rand" and "M[rowInOut] = M[rowInOut] XOR rotW(rand)", |
||||
* where rotW is a 64-bit rotation to the left. |
||||
* |
||||
* @param state The current state of the sponge |
||||
* @param rowIn Row used only as input |
||||
* @param rowInOut Row used as input and to receive output after rotation |
||||
* @param rowOut Row receiving the output |
||||
* |
||||
*/ |
||||
/*
|
||||
inline void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) { |
||||
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
|
||||
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
|
||||
uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
|
||||
int i; |
||||
for (i = 0; i < N_COLS; i++) { |
||||
|
||||
//Absorbing "M[rowInOut] XOR M[rowIn]"
|
||||
state[0] ^= ptrWordInOut[0] + ptrWordIn[0]; |
||||
state[1] ^= ptrWordInOut[1] + ptrWordIn[1]; |
||||
state[2] ^= ptrWordInOut[2] + ptrWordIn[2]; |
||||
state[3] ^= ptrWordInOut[3] + ptrWordIn[3]; |
||||
state[4] ^= ptrWordInOut[4] + ptrWordIn[4]; |
||||
state[5] ^= ptrWordInOut[5] + ptrWordIn[5]; |
||||
state[6] ^= ptrWordInOut[6] + ptrWordIn[6]; |
||||
state[7] ^= ptrWordInOut[7] + ptrWordIn[7]; |
||||
state[8] ^= ptrWordInOut[8] + ptrWordIn[8]; |
||||
state[9] ^= ptrWordInOut[9] + ptrWordIn[9]; |
||||
state[10] ^= ptrWordInOut[10] + ptrWordIn[10]; |
||||
state[11] ^= ptrWordInOut[11] + ptrWordIn[11]; |
||||
|
||||
//Applies the reduced-round transformation f to the sponge's state
|
||||
reducedBlake2bLyra(state); |
||||
|
||||
//M[rowOut][col] = M[rowOut][col] XOR rand
|
||||
ptrWordOut[0] ^= state[0]; |
||||
ptrWordOut[1] ^= state[1]; |
||||
ptrWordOut[2] ^= state[2]; |
||||
ptrWordOut[3] ^= state[3]; |
||||
ptrWordOut[4] ^= state[4]; |
||||
ptrWordOut[5] ^= state[5]; |
||||
ptrWordOut[6] ^= state[6]; |
||||
ptrWordOut[7] ^= state[7]; |
||||
ptrWordOut[8] ^= state[8]; |
||||
ptrWordOut[9] ^= state[9]; |
||||
ptrWordOut[10] ^= state[10]; |
||||
ptrWordOut[11] ^= state[11]; |
||||
|
||||
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
|
||||
|
||||
|
||||
//Goes to next block
|
||||
ptrWordOut += BLOCK_LEN_INT64; |
||||
ptrWordInOut += BLOCK_LEN_INT64; |
||||
ptrWordIn += BLOCK_LEN_INT64; |
||||
} |
||||
} |
||||
*/ |
||||
|
||||
/**
|
||||
Prints an array of unsigned chars |
||||
*/ |
||||
void printArray(unsigned char *array, unsigned int size, char *name) { |
||||
int i; |
||||
printf("%s: ", name); |
||||
for (i = 0; i < size; i++) { |
||||
printf("%2x|", array[i]); |
||||
} |
||||
printf("\n"); |
||||
} |
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
@ -0,0 +1,108 @@
@@ -0,0 +1,108 @@
|
||||
/**
|
||||
* Header file for Blake2b's internal permutation in the form of a sponge. |
||||
* This code is based on the original Blake2b's implementation provided by |
||||
* Samuel Neves (https://blake2.net/)
|
||||
* |
||||
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
|
||||
* |
||||
* This software is hereby placed in the public domain. |
||||
* |
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS |
||||
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE |
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
||||
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
||||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE |
||||
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, |
||||
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
||||
*/ |
||||
#ifndef SPONGE_H_ |
||||
#define SPONGE_H_ |
||||
|
||||
#include <stdint.h> |
||||
|
||||
#if defined(__GNUC__) |
||||
#define ALIGN __attribute__ ((aligned(32))) |
||||
#elif defined(_MSC_VER) |
||||
#define ALIGN __declspec(align(32)) |
||||
#else |
||||
#define ALIGN |
||||
#endif |
||||
|
||||
|
||||
/*Blake2b IV Array*/ |
||||
static const uint64_t blake2b_IV[8] = |
||||
{ |
||||
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, |
||||
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, |
||||
0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, |
||||
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL |
||||
}; |
||||
|
||||
/*Blake2b's rotation*/ |
||||
static __inline uint64_t rotr64( const uint64_t w, const unsigned c ){ |
||||
return ( w >> c ) | ( w << ( 64 - c ) ); |
||||
} |
||||
|
||||
/*Blake2b's G function*/ |
||||
#define G(r,i,a,b,c,d) \ |
||||
do { \ |
||||
a = a + b; \ |
||||
d = rotr64(d ^ a, 32); \ |
||||
c = c + d; \ |
||||
b = rotr64(b ^ c, 24); \ |
||||
a = a + b; \ |
||||
d = rotr64(d ^ a, 16); \ |
||||
c = c + d; \ |
||||
b = rotr64(b ^ c, 63); \ |
||||
} while(0) |
||||
|
||||
|
||||
/*One Round of the Blake2b's compression function*/ |
||||
#define ROUND_LYRA(r) \ |
||||
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ |
||||
G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ |
||||
G(r,2,v[ 2],v[ 6],v[10],v[14]); \ |
||||
G(r,3,v[ 3],v[ 7],v[11],v[15]); \ |
||||
G(r,4,v[ 0],v[ 5],v[10],v[15]); \ |
||||
G(r,5,v[ 1],v[ 6],v[11],v[12]); \ |
||||
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ |
||||
G(r,7,v[ 3],v[ 4],v[ 9],v[14]); |
||||
|
||||
|
||||
//---- Housekeeping
|
||||
void initState(uint64_t state[/*16*/]); |
||||
|
||||
//---- Squeezes
|
||||
void squeeze(uint64_t *state, unsigned char *out, unsigned int len); |
||||
void reducedSqueezeRow0(uint64_t* state, uint64_t* row); |
||||
|
||||
//---- Absorbs
|
||||
void absorbBlock(uint64_t *state, const uint64_t *in); |
||||
void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in); |
||||
|
||||
//---- Duplexes
|
||||
void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut); |
||||
void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); |
||||
void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut); |
||||
|
||||
//---- Misc
|
||||
void printArray(unsigned char *array, unsigned int size, char *name); |
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
////TESTS////
|
||||
//void reducedDuplexRowc(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
|
||||
//void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
|
||||
//void reducedDuplexRowSetupv4(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn2, uint64_t *rowOut1, uint64_t *rowOut2);
|
||||
//void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
|
||||
//void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
|
||||
//void reducedDuplexRowSetupv5d(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
|
||||
/////////////
|
||||
|
||||
|
||||
#endif /* SPONGE_H_ */ |
@ -0,0 +1,536 @@
@@ -0,0 +1,536 @@
|
||||
#include <memory.h> |
||||
|
||||
#include "cuda_helper.h" |
||||
|
||||
static __constant__ uint2 blake2b_IV[8] = { |
||||
{ 0xf3bcc908, 0x6a09e667 }, |
||||
{ 0x84caa73b, 0xbb67ae85 }, |
||||
{ 0xfe94f82b, 0x3c6ef372 }, |
||||
{ 0x5f1d36f1, 0xa54ff53a }, |
||||
{ 0xade682d1, 0x510e527f }, |
||||
{ 0x2b3e6c1f, 0x9b05688c }, |
||||
{ 0xfb41bd6b, 0x1f83d9ab }, |
||||
{ 0x137e2179, 0x5be0cd19 } |
||||
}; |
||||
// data: 0-4 outputhash 4-8 outputhash 8-16 basil |
||||
|
||||
#define reduceDuplexRowSetup(rowIn, rowInOut, rowOut) { \ |
||||
for (int i = 0; i < 8; i++) { \ |
||||
for (int j = 0; j < 12; j++) \ |
||||
state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \ |
||||
round_lyra_v35(state); \ |
||||
for (int j = 0; j < 12; j++) \ |
||||
Matrix[j + 84 - 12 * i][rowOut] = Matrix[12 * i + j][rowIn] ^ state[j]; \ |
||||
Matrix[0 + 12 * i][rowInOut] ^= state[11]; \ |
||||
Matrix[1 + 12 * i][rowInOut] ^= state[0]; \ |
||||
Matrix[2 + 12 * i][rowInOut] ^= state[1]; \ |
||||
Matrix[3 + 12 * i][rowInOut] ^= state[2]; \ |
||||
Matrix[4 + 12 * i][rowInOut] ^= state[3]; \ |
||||
Matrix[5 + 12 * i][rowInOut] ^= state[4]; \ |
||||
Matrix[6 + 12 * i][rowInOut] ^= state[5]; \ |
||||
Matrix[7 + 12 * i][rowInOut] ^= state[6]; \ |
||||
Matrix[8 + 12 * i][rowInOut] ^= state[7]; \ |
||||
Matrix[9 + 12 * i][rowInOut] ^= state[8]; \ |
||||
Matrix[10+ 12 * i][rowInOut] ^= state[9]; \ |
||||
Matrix[11+ 12 * i][rowInOut] ^= state[10]; \ |
||||
} \ |
||||
} |
||||
|
||||
#define reduceDuplexRow(rowIn, rowInOut, rowOut) { \ |
||||
for (int i = 0; i < 8; i++) { \ |
||||
for (int j = 0; j < 12; j++) \ |
||||
state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \ |
||||
round_lyra_v35(state); \ |
||||
for (int j = 0; j < 12; j++) \ |
||||
Matrix[j + 12 * i][rowOut] ^= state[j]; \ |
||||
Matrix[0 + 12 * i][rowInOut] ^= state[11]; \ |
||||
Matrix[1 + 12 * i][rowInOut] ^= state[0]; \ |
||||
Matrix[2 + 12 * i][rowInOut] ^= state[1]; \ |
||||
Matrix[3 + 12 * i][rowInOut] ^= state[2]; \ |
||||
Matrix[4 + 12 * i][rowInOut] ^= state[3]; \ |
||||
Matrix[5 + 12 * i][rowInOut] ^= state[4]; \ |
||||
Matrix[6 + 12 * i][rowInOut] ^= state[5]; \ |
||||
Matrix[7 + 12 * i][rowInOut] ^= state[6]; \ |
||||
Matrix[8 + 12 * i][rowInOut] ^= state[7]; \ |
||||
Matrix[9 + 12 * i][rowInOut] ^= state[8]; \ |
||||
Matrix[10+ 12 * i][rowInOut] ^= state[9]; \ |
||||
Matrix[11+ 12 * i][rowInOut] ^= state[10]; \ |
||||
} \ |
||||
} |
||||
|
||||
#define absorbblock(in) { \ |
||||
state[0] ^= Matrix[0][in]; \ |
||||
state[1] ^= Matrix[1][in]; \ |
||||
state[2] ^= Matrix[2][in]; \ |
||||
state[3] ^= Matrix[3][in]; \ |
||||
state[4] ^= Matrix[4][in]; \ |
||||
state[5] ^= Matrix[5][in]; \ |
||||
state[6] ^= Matrix[6][in]; \ |
||||
state[7] ^= Matrix[7][in]; \ |
||||
state[8] ^= Matrix[8][in]; \ |
||||
state[9] ^= Matrix[9][in]; \ |
||||
state[10] ^= Matrix[10][in]; \ |
||||
state[11] ^= Matrix[11][in]; \ |
||||
round_lyra_v35(state); \ |
||||
round_lyra_v35(state); \ |
||||
round_lyra_v35(state); \ |
||||
round_lyra_v35(state); \ |
||||
round_lyra_v35(state); \ |
||||
round_lyra_v35(state); \ |
||||
round_lyra_v35(state); \ |
||||
round_lyra_v35(state); \ |
||||
round_lyra_v35(state); \ |
||||
round_lyra_v35(state); \ |
||||
round_lyra_v35(state); \ |
||||
round_lyra_v35(state); \ |
||||
} |
||||
|
||||
//// test version |
||||
#define reduceDuplexRowSetup_test(rowIn, rowInOut, rowOut) { \ |
||||
for (int i = 0; i < 8; i++) { \ |
||||
for (int j = 0; j < 12; j++) \ |
||||
state[j] ^= Matrix[j][i][rowIn] + Matrix[j][i][rowInOut]; \ |
||||
round_lyra_v35(state); \ |
||||
for (int j = 0; j < 12; j++) \ |
||||
Matrix[j][7-i][rowOut] = Matrix[j][i][rowIn] ^ state[j]; \ |
||||
Matrix[0][i][rowInOut] ^= state[11]; \ |
||||
Matrix[1][i][rowInOut] ^= state[0]; \ |
||||
Matrix[2][i][rowInOut] ^= state[1]; \ |
||||
Matrix[3][i][rowInOut] ^= state[2]; \ |
||||
Matrix[4][i][rowInOut] ^= state[3]; \ |
||||
Matrix[5][i][rowInOut] ^= state[4]; \ |
||||
Matrix[6][i][rowInOut] ^= state[5]; \ |
||||
Matrix[7][i][rowInOut] ^= state[6]; \ |
||||
Matrix[8][i][rowInOut] ^= state[7]; \ |
||||
Matrix[9][i][rowInOut] ^= state[8]; \ |
||||
Matrix[10][i][rowInOut] ^= state[9]; \ |
||||
Matrix[11][i][rowInOut] ^= state[10]; \ |
||||
} \ |
||||
} |
||||
|
||||
#define reduceDuplexRow_test(rowIn, rowInOut, rowOut) { \ |
||||
for (int i = 0; i < 8; i++) { \ |
||||
for (int j = 0; j < 12; j++) \ |
||||
state[j] ^= Matrix[j][i][rowIn] + Matrix[j][i][rowInOut]; \ |
||||
round_lyra_v35(state); \ |
||||
for (int j = 0; j < 12; j++) \ |
||||
Matrix[j][i][rowOut] ^= state[j]; \ |
||||
Matrix[0][i][rowInOut] ^= state[11]; \ |
||||
Matrix[1][i][rowInOut] ^= state[0]; \ |
||||
Matrix[2][i][rowInOut] ^= state[1]; \ |
||||
Matrix[3][i][rowInOut] ^= state[2]; \ |
||||
Matrix[4][i][rowInOut] ^= state[3]; \ |
||||
Matrix[5][i][rowInOut] ^= state[4]; \ |
||||
Matrix[6][i][rowInOut] ^= state[5]; \ |
||||
Matrix[7][i][rowInOut] ^= state[6]; \ |
||||
Matrix[8][i][rowInOut] ^= state[7]; \ |
||||
Matrix[9][i][rowInOut] ^= state[8]; \ |
||||
Matrix[10][i][rowInOut] ^= state[9]; \ |
||||
Matrix[11][i][rowInOut] ^= state[10]; \ |
||||
} \ |
||||
} |
||||
|
||||
#define absorbblock_test(in) { \ |
||||
state[0] ^= Matrix[0][0][ in]; \ |
||||
state[1] ^= Matrix[1][0][in]; \ |
||||
state[2] ^= Matrix[2][0][in]; \ |
||||
state[3] ^= Matrix[3][0][in]; \ |
||||
state[4] ^= Matrix[4][0][in]; \ |
||||
state[5] ^= Matrix[5][0][in]; \ |
||||
state[6] ^= Matrix[6][0][in]; \ |
||||
state[7] ^= Matrix[7][0][in]; \ |
||||
state[8] ^= Matrix[8][0][in]; \ |
||||
state[9] ^= Matrix[9][0][in]; \ |
||||
state[10] ^= Matrix[10][0][in]; \ |
||||
state[11] ^= Matrix[11][0][in]; \ |
||||
round_lyra_v35(state); \ |
||||
round_lyra_v35(state); \ |
||||
round_lyra_v35(state); \ |
||||
round_lyra_v35(state); \ |
||||
round_lyra_v35(state); \ |
||||
round_lyra_v35(state); \ |
||||
round_lyra_v35(state); \ |
||||
round_lyra_v35(state); \ |
||||
round_lyra_v35(state); \ |
||||
round_lyra_v35(state); \ |
||||
round_lyra_v35(state); \ |
||||
round_lyra_v35(state); \ |
||||
} |
||||
|
||||
//// compute 30 version |
||||
#define reduceDuplexRowSetup_v30(rowIn, rowInOut, rowOut) { \ |
||||
for (int i = 0; i < 8; i++) { \ |
||||
for (int j = 0; j < 12; j++) \ |
||||
state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \ |
||||
round_lyra_v30(state); \ |
||||
for (int j = 0; j < 12; j++) \ |
||||
Matrix[j + 84 - 12 * i][rowOut] = Matrix[12 * i + j][rowIn] ^ state[j]; \ |
||||
Matrix[0 + 12 * i][rowInOut] ^= state[11]; \ |
||||
Matrix[1 + 12 * i][rowInOut] ^= state[0]; \ |
||||
Matrix[2 + 12 * i][rowInOut] ^= state[1]; \ |
||||
Matrix[3 + 12 * i][rowInOut] ^= state[2]; \ |
||||
Matrix[4 + 12 * i][rowInOut] ^= state[3]; \ |
||||
Matrix[5 + 12 * i][rowInOut] ^= state[4]; \ |
||||
Matrix[6 + 12 * i][rowInOut] ^= state[5]; \ |
||||
Matrix[7 + 12 * i][rowInOut] ^= state[6]; \ |
||||
Matrix[8 + 12 * i][rowInOut] ^= state[7]; \ |
||||
Matrix[9 + 12 * i][rowInOut] ^= state[8]; \ |
||||
Matrix[10 + 12 * i][rowInOut] ^= state[9]; \ |
||||
Matrix[11 + 12 * i][rowInOut] ^= state[10]; \ |
||||
} \ |
||||
} |
||||
|
||||
#define reduceDuplexRow_v30(rowIn, rowInOut, rowOut) { \ |
||||
for (int i = 0; i < 8; i++) { \ |
||||
for (int j = 0; j < 12; j++) \ |
||||
state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \ |
||||
round_lyra_v30(state); \ |
||||
for (int j = 0; j < 12; j++) \ |
||||
Matrix[j + 12 * i][rowOut] ^= state[j]; \ |
||||
Matrix[0 + 12 * i][rowInOut] ^= state[11]; \ |
||||
Matrix[1 + 12 * i][rowInOut] ^= state[0]; \ |
||||
Matrix[2 + 12 * i][rowInOut] ^= state[1]; \ |
||||
Matrix[3 + 12 * i][rowInOut] ^= state[2]; \ |
||||
Matrix[4 + 12 * i][rowInOut] ^= state[3]; \ |
||||
Matrix[5 + 12 * i][rowInOut] ^= state[4]; \ |
||||
Matrix[6 + 12 * i][rowInOut] ^= state[5]; \ |
||||
Matrix[7 + 12 * i][rowInOut] ^= state[6]; \ |
||||
Matrix[8 + 12 * i][rowInOut] ^= state[7]; \ |
||||
Matrix[9 + 12 * i][rowInOut] ^= state[8]; \ |
||||
Matrix[10 + 12 * i][rowInOut] ^= state[9]; \ |
||||
Matrix[11 + 12 * i][rowInOut] ^= state[10]; \ |
||||
} \ |
||||
} |
||||
|
||||
#define absorbblock_v30(in) { \ |
||||
state[0] ^= Matrix[0][in]; \ |
||||
state[1] ^= Matrix[1][in]; \ |
||||
state[2] ^= Matrix[2][in]; \ |
||||
state[3] ^= Matrix[3][in]; \ |
||||
state[4] ^= Matrix[4][in]; \ |
||||
state[5] ^= Matrix[5][in]; \ |
||||
state[6] ^= Matrix[6][in]; \ |
||||
state[7] ^= Matrix[7][in]; \ |
||||
state[8] ^= Matrix[8][in]; \ |
||||
state[9] ^= Matrix[9][in]; \ |
||||
state[10] ^= Matrix[10][in]; \ |
||||
state[11] ^= Matrix[11][in]; \ |
||||
round_lyra_v30(state); \ |
||||
round_lyra_v30(state); \ |
||||
round_lyra_v30(state); \ |
||||
round_lyra_v30(state); \ |
||||
round_lyra_v30(state); \ |
||||
round_lyra_v30(state); \ |
||||
round_lyra_v30(state); \ |
||||
round_lyra_v30(state); \ |
||||
round_lyra_v30(state); \ |
||||
round_lyra_v30(state); \ |
||||
round_lyra_v30(state); \ |
||||
round_lyra_v30(state); \ |
||||
} |
||||
|
||||
static __device__ __forceinline__ |
||||
void Gfunc_v35(uint2 & a, uint2 &b, uint2 &c, uint2 &d) |
||||
{ |
||||
a += b; d ^= a; d = ROR2(d, 32); |
||||
c += d; b ^= c; b = ROR2(b, 24); |
||||
a += b; d ^= a; d = ROR2(d, 16); |
||||
c += d; b ^= c; b = ROR2(b, 63); |
||||
} |
||||
|
||||
static __device__ __forceinline__ |
||||
void Gfunc_v30(uint64_t & a, uint64_t &b, uint64_t &c, uint64_t &d) |
||||
{ |
||||
a += b; d ^= a; d = ROTR64(d, 32); |
||||
c += d; b ^= c; b = ROTR64(b, 24); |
||||
a += b; d ^= a; d = ROTR64(d, 16); |
||||
c += d; b ^= c; b = ROTR64(b, 63); |
||||
} |
||||
|
||||
#define round_lyra_v35_new(state) { \ |
||||
Gfunc_v35(state[0], state[4], state[8], state[12]); \ |
||||
Gfunc_v35(state[1], state[5], state[9], state[13]); \ |
||||
Gfunc_v35(state[2], state[6], state[10], state[14]); \ |
||||
Gfunc_v35(state[3], state[7], state[11], state[15]); \ |
||||
Gfunc_v35(state[0], state[5], state[10], state[15]); \ |
||||
Gfunc_v35(state[1], state[6], state[11], state[12]); \ |
||||
Gfunc_v35(state[2], state[7], state[8], state[13]); \ |
||||
Gfunc_v35(state[3], state[4], state[9], state[14]); \ |
||||
} |
||||
|
||||
static __device__ __forceinline__ void round_lyra_v35(uint2 *s) |
||||
{ |
||||
Gfunc_v35(s[0], s[4], s[8], s[12]); |
||||
Gfunc_v35(s[1], s[5], s[9], s[13]); |
||||
Gfunc_v35(s[2], s[6], s[10], s[14]); |
||||
Gfunc_v35(s[3], s[7], s[11], s[15]); |
||||
Gfunc_v35(s[0], s[5], s[10], s[15]); |
||||
Gfunc_v35(s[1], s[6], s[11], s[12]); |
||||
Gfunc_v35(s[2], s[7], s[8], s[13]); |
||||
Gfunc_v35(s[3], s[4], s[9], s[14]); |
||||
} |
||||
|
||||
static __device__ __forceinline__ void round_lyra_v30(uint64_t *s) |
||||
{ |
||||
Gfunc_v30(s[0], s[4], s[8], s[12]); |
||||
Gfunc_v30(s[1], s[5], s[9], s[13]); |
||||
Gfunc_v30(s[2], s[6], s[10], s[14]); |
||||
Gfunc_v30(s[3], s[7], s[11], s[15]); |
||||
Gfunc_v30(s[0], s[5], s[10], s[15]); |
||||
Gfunc_v30(s[1], s[6], s[11], s[12]); |
||||
Gfunc_v30(s[2], s[7], s[8], s[13]); |
||||
Gfunc_v30(s[3], s[4], s[9], s[14]); |
||||
} |
||||
|
||||
__global__ __launch_bounds__(256, 1) |
||||
void lyra2_gpu_hash_32_v30(int threads, uint32_t startNounce, uint64_t *outputHash) |
||||
{ |
||||
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x); |
||||
if (thread < threads) |
||||
{ |
||||
uint64_t state[16]; |
||||
#pragma unroll |
||||
for (int i = 0; i<4; i++) { state[i] = outputHash[threads*i + thread]; } //password |
||||
#pragma unroll |
||||
for (int i = 0; i<4; i++) { state[i + 4] = state[i]; } //salt |
||||
#pragma unroll |
||||
for (int i = 0; i<8; i++) { state[i + 8] = devectorize(blake2b_IV[i]); } |
||||
|
||||
// blake2blyra x2 |
||||
#pragma unroll 24 |
||||
for (int i = 0; i<24; i++) { round_lyra_v30(state); } //because 12 is not enough |
||||
|
||||
uint64_t Matrix[96][8]; // not cool |
||||
// reducedSqueezeRow0 |
||||
#pragma unroll 8 |
||||
for (int i = 0; i < 8; i++) { |
||||
int idx = 84-12*i; |
||||
#pragma unroll 12 |
||||
for (int j = 0; j<12; j++) { Matrix[j + idx][0] = state[j]; } |
||||
round_lyra_v30(state); |
||||
} |
||||
|
||||
// reducedSqueezeRow1 |
||||
#pragma unroll 8 |
||||
for (int i = 0; i < 8; i++) |
||||
{ |
||||
int idx0= 12*i; |
||||
int idx1= 84-idx0; |
||||
#pragma unroll 12 |
||||
for (int j = 0; j<12; j++) { state[j] ^= Matrix[j + idx0][0]; } |
||||
round_lyra_v30(state); |
||||
#pragma unroll 12 |
||||
for (int j = 0; j<12; j++) { Matrix[j + idx1][1] = Matrix[j + idx0][0] ^ state[j]; } |
||||
} |
||||
|
||||
reduceDuplexRowSetup_v30(1, 0, 2); |
||||
reduceDuplexRowSetup_v30(2, 1, 3); |
||||
reduceDuplexRowSetup_v30(3, 0, 4); |
||||
reduceDuplexRowSetup_v30(4, 3, 5); |
||||
reduceDuplexRowSetup_v30(5, 2, 6); |
||||
reduceDuplexRowSetup_v30(6, 1, 7); |
||||
|
||||
uint64_t rowa; |
||||
rowa = state[0] & 7; |
||||
reduceDuplexRow_v30(7, rowa, 0); |
||||
rowa = state[0] & 7; |
||||
reduceDuplexRow_v30(0, rowa, 3); |
||||
rowa = state[0] & 7; |
||||
reduceDuplexRow_v30(3, rowa, 6); |
||||
rowa = state[0] & 7; |
||||
reduceDuplexRow_v30(6, rowa, 1); |
||||
rowa = state[0] & 7; |
||||
reduceDuplexRow_v30(1, rowa, 4); |
||||
rowa = state[0] & 7; |
||||
reduceDuplexRow_v30(4, rowa, 7); |
||||
rowa = state[0] & 7; |
||||
reduceDuplexRow_v30(7, rowa, 2); |
||||
rowa = state[0] & 7; |
||||
reduceDuplexRow_v30(2, rowa, 5); |
||||
|
||||
absorbblock_v30(rowa); |
||||
|
||||
#pragma unroll |
||||
for (int i = 0; i<4; i++) { |
||||
outputHash[threads*i + thread] = state[i]; |
||||
} //password |
||||
|
||||
} //thread |
||||
} |
||||
|
||||
__global__ __launch_bounds__(256, 1) |
||||
void lyra2_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash) |
||||
{ |
||||
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x); |
||||
if (thread < threads) |
||||
{ |
||||
uint2 state[16]; |
||||
#pragma unroll |
||||
for (int i = 0; i<4; i++) { LOHI(state[i].x, state[i].y, outputHash[threads*i + thread]); } //password |
||||
#pragma unroll |
||||
for (int i = 0; i<4; i++) { state[i + 4] = state[i]; } //salt |
||||
#pragma unroll |
||||
for (int i = 0; i<8; i++) { state[i + 8] = blake2b_IV[i]; } |
||||
|
||||
// blake2blyra x2 |
||||
#pragma unroll 24 |
||||
for (int i = 0; i<24; i++) { round_lyra_v35(state); } //because 12 is not enough |
||||
|
||||
uint2 Matrix[96][8]; // not cool |
||||
|
||||
// reducedSqueezeRow0 |
||||
#pragma unroll 8 |
||||
for (int i = 0; i < 8; i++) |
||||
{ |
||||
#pragma unroll 12 |
||||
for (int j = 0; j<12; j++) { Matrix[j + 84 - 12 * i][0] = state[j]; } |
||||
round_lyra_v35(state); |
||||
} |
||||
|
||||
// reducedSqueezeRow1 |
||||
#pragma unroll 8 |
||||
for (int i = 0; i < 8; i++) |
||||
{ |
||||
#pragma unroll 12 |
||||
for (int j = 0; j<12; j++) { state[j] ^= Matrix[j + 12 * i][0]; } |
||||
round_lyra_v35(state); |
||||
#pragma unroll 12 |
||||
for (int j = 0; j<12; j++) { Matrix[j + 84 - 12 * i][1] = Matrix[j + 12 * i][0] ^ state[j]; } |
||||
} |
||||
|
||||
reduceDuplexRowSetup(1, 0, 2); |
||||
reduceDuplexRowSetup(2, 1, 3); |
||||
reduceDuplexRowSetup(3, 0, 4); |
||||
reduceDuplexRowSetup(4, 3, 5); |
||||
reduceDuplexRowSetup(5, 2, 6); |
||||
reduceDuplexRowSetup(6, 1, 7); |
||||
|
||||
uint32_t rowa; |
||||
rowa = state[0].x & 7; |
||||
reduceDuplexRow(7, rowa, 0); |
||||
rowa = state[0].x & 7; |
||||
reduceDuplexRow(0, rowa, 3); |
||||
rowa = state[0].x & 7; |
||||
reduceDuplexRow(3, rowa, 6); |
||||
rowa = state[0].x & 7; |
||||
reduceDuplexRow(6, rowa, 1); |
||||
rowa = state[0].x & 7; |
||||
reduceDuplexRow(1, rowa, 4); |
||||
rowa = state[0].x & 7; |
||||
reduceDuplexRow(4, rowa, 7); |
||||
rowa = state[0].x & 7; |
||||
reduceDuplexRow(7, rowa, 2); |
||||
rowa = state[0].x & 7; |
||||
reduceDuplexRow(2, rowa, 5); |
||||
|
||||
absorbblock(rowa); |
||||
|
||||
#pragma unroll |
||||
for (int i = 0; i<4; i++) { |
||||
outputHash[threads*i + thread] = devectorize(state[i]); |
||||
} //password |
||||
|
||||
} //thread |
||||
} |
||||
|
||||
__global__ |
||||
void __launch_bounds__(256, 1) lyra2_gpu_hash_32_test(int threads, uint32_t startNounce, uint64_t *outputHash) |
||||
{ |
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x); |
||||
if (thread < threads) |
||||
{ |
||||
uint2 state[16]; |
||||
#pragma unroll |
||||
for (int i = 0; i<4; i++) { LOHI(state[i].x, state[i].y, outputHash[threads*i + thread]); } //password |
||||
#pragma unroll |
||||
for (int i = 0; i<4; i++) { state[i + 4] = state[i]; } //salt |
||||
#pragma unroll |
||||
for (int i = 0; i<8; i++) { state[i + 8] = blake2b_IV[i]; } |
||||
|
||||
// blake2blyra x2 |
||||
#pragma unroll 24 |
||||
for (int i = 0; i<24; i++) { round_lyra_v35(state); } //because 12 is not enough |
||||
|
||||
uint2 Matrix[12][8][8]; // not cool |
||||
|
||||
// reducedSqueezeRow0 |
||||
#pragma unroll 8 |
||||
for (int i = 0; i < 8; i++) { |
||||
#pragma unroll 12 |
||||
for (int j = 0; j<12; j++) { Matrix[j][7-i][0] = state[j]; } |
||||
round_lyra_v35(state); |
||||
} |
||||
|
||||
// reducedSqueezeRow1 |
||||
#pragma unroll 8 |
||||
for (int i = 0; i < 8; i++) |
||||
{ |
||||
#pragma unroll 12 |
||||
for (int j = 0; j<12; j++) { state[j] ^= Matrix[j][i][0]; } |
||||
round_lyra_v35(state); |
||||
#pragma unroll 12 |
||||
for (int j = 0; j<12; j++) { Matrix[j][7-i][1] = Matrix[j][i][0] ^ state[j]; } |
||||
} |
||||
|
||||
reduceDuplexRowSetup_test(1, 0, 2); |
||||
reduceDuplexRowSetup_test(2, 1, 3); |
||||
reduceDuplexRowSetup_test(3, 0, 4); |
||||
reduceDuplexRowSetup_test(4, 3, 5); |
||||
reduceDuplexRowSetup_test(5, 2, 6); |
||||
reduceDuplexRowSetup_test(6, 1, 7); |
||||
|
||||
uint64_t rowa; |
||||
rowa = devectorize(state[0]) & 7; |
||||
reduceDuplexRow_test(7, rowa, 0); |
||||
rowa = devectorize(state[0]) & 7; |
||||
reduceDuplexRow_test(0, rowa, 3); |
||||
rowa = devectorize(state[0]) & 7; |
||||
reduceDuplexRow_test(3, rowa, 6); |
||||
rowa = devectorize(state[0]) & 7; |
||||
reduceDuplexRow_test(6, rowa, 1); |
||||
rowa = devectorize(state[0]) & 7; |
||||
reduceDuplexRow_test(1, rowa, 4); |
||||
rowa = devectorize(state[0]) & 7; |
||||
reduceDuplexRow_test(4, rowa, 7); |
||||
rowa = devectorize(state[0]) & 7; |
||||
reduceDuplexRow_test(7, rowa, 2); |
||||
rowa = devectorize(state[0]) & 7; |
||||
reduceDuplexRow_test(2, rowa, 5); |
||||
|
||||
absorbblock_test(rowa); |
||||
|
||||
#pragma unroll |
||||
for (int i = 0; i<4; i++) { |
||||
outputHash[threads*i + thread] = devectorize(state[i]); |
||||
} //password |
||||
|
||||
} //thread |
||||
} |
||||
|
||||
__host__ |
||||
void lyra2_cpu_init(int thr_id, int threads) |
||||
{ |
||||
//not used |
||||
} |
||||
|
||||
__host__ |
||||
void lyra2_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order) |
||||
{ |
||||
const int threadsperblock = 256; |
||||
|
||||
dim3 grid((threads + threadsperblock - 1) / threadsperblock); |
||||
dim3 block(threadsperblock); |
||||
|
||||
if (device_sm[device_map[thr_id]] >= 350) { |
||||
lyra2_gpu_hash_32 <<<grid, block>>> (threads, startNounce, d_outputHash); |
||||
} else { |
||||
// kernel for compute30 card |
||||
lyra2_gpu_hash_32_v30 <<<grid, block >>> (threads, startNounce, d_outputHash); |
||||
} |
||||
|
||||
cudaDeviceSynchronize(); |
||||
//MyStreamSynchronize(NULL, order, thr_id); |
||||
} |
||||
|
@ -0,0 +1,133 @@
@@ -0,0 +1,133 @@
|
||||
extern "C" { |
||||
#include "sph/sph_blake.h" |
||||
#include "sph/sph_groestl.h" |
||||
#include "sph/sph_skein.h" |
||||
#include "sph/sph_keccak.h" |
||||
#include "lyra2/Lyra2.h" |
||||
} |
||||
|
||||
#include "miner.h" |
||||
#include "cuda_helper.h" |
||||
|
||||
static _ALIGN(64) uint64_t *d_hash[8]; |
||||
|
||||
extern void quark_check_cpu_init(int thr_id, int threads); |
||||
extern void quark_check_cpu_setTarget(const void *ptarget); |
||||
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); |
||||
extern uint32_t quark_check_cpu_hash_64_2(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint64_t *d_inputHash, int order); |
||||
|
||||
extern void blake256_cpu_init(int thr_id, int threads); |
||||
extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); |
||||
extern void blake256_cpu_setBlock_80(uint32_t *pdata); |
||||
extern void keccak256_cpu_hash_32(int thr_id, int threads, uint32_t startNonce, uint64_t *d_outputHash, int order); |
||||
extern void keccak256_cpu_init(int thr_id, int threads); |
||||
extern void skein256_cpu_hash_32(int thr_id, int threads, uint32_t startNonce, uint64_t *d_outputHash, int order); |
||||
extern void skein256_cpu_init(int thr_id, int threads); |
||||
|
||||
extern void lyra2_cpu_hash_32(int thr_id, int threads, uint32_t startNonce, uint64_t *d_outputHash, int order); |
||||
extern void lyra2_cpu_init(int thr_id, int threads); |
||||
|
||||
extern void groestl256_setTarget(const void *ptarget); |
||||
extern uint32_t groestl256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order); |
||||
extern void groestl256_cpu_init(int thr_id, int threads); |
||||
|
||||
extern "C" void lyra_hash(void *state, const void *input) |
||||
{ |
||||
sph_blake256_context ctx_blake; |
||||
sph_keccak256_context ctx_keccak; |
||||
sph_skein256_context ctx_skein; |
||||
sph_groestl256_context ctx_groestl; |
||||
|
||||
uint32_t hashA[8], hashB[8], hash[8]; |
||||
|
||||
sph_blake256_init(&ctx_blake); |
||||
sph_blake256(&ctx_blake, input, 80); |
||||
sph_blake256_close(&ctx_blake, hashA); |
||||
|
||||
sph_keccak256_init(&ctx_keccak); |
||||
sph_keccak256(&ctx_keccak, hashA, 32); |
||||
sph_keccak256_close(&ctx_keccak, hashB); |
||||
|
||||
LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); |
||||
|
||||
sph_skein256_init(&ctx_skein); |
||||
sph_skein256(&ctx_skein, hashA, 32); |
||||
sph_skein256_close(&ctx_skein, hashB); |
||||
|
||||
sph_groestl256_init(&ctx_groestl); |
||||
sph_groestl256(&ctx_groestl, hashB, 32); |
||||
sph_groestl256_close(&ctx_groestl, hash); |
||||
|
||||
// seems wrong : hash or hashB ? |
||||
memcpy(state, hashB, 32); |
||||
} |
||||
|
||||
static bool init[8] = { 0 }; |
||||
|
||||
extern "C" int scanhash_lyra(int thr_id, uint32_t *pdata, |
||||
const uint32_t *ptarget, uint32_t max_nonce, |
||||
unsigned long *hashes_done) |
||||
{ |
||||
const uint32_t first_nonce = pdata[19]; |
||||
int intensity = (device_sm[device_map[thr_id]] >= 500) ? 19 : 18; |
||||
int throughput = opt_work_size ? opt_work_size : (1 << intensity); // 18=256*256*4; |
||||
throughput = min(throughput, (int)(max_nonce - first_nonce)); |
||||
|
||||
if (opt_benchmark) |
||||
((uint32_t*)ptarget)[7] = 0x0000ff; |
||||
|
||||
if (!init[thr_id]) |
||||
{ |
||||
cudaSetDevice(device_map[thr_id]); |
||||
|
||||
blake256_cpu_init(thr_id, throughput); |
||||
keccak256_cpu_init(thr_id,throughput); |
||||
skein256_cpu_init(thr_id, throughput); |
||||
groestl256_cpu_init(thr_id, throughput); |
||||
lyra2_cpu_init(thr_id, throughput); |
||||
|
||||
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput)); |
||||
|
||||
init[thr_id] = true; |
||||
} |
||||
|
||||
uint32_t endiandata[20]; |
||||
for (int k=0; k < 20; k++) |
||||
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); |
||||
|
||||
blake256_cpu_setBlock_80(pdata); |
||||
groestl256_setTarget(ptarget); |
||||
|
||||
do { |
||||
int order = 0; |
||||
uint32_t foundNonce; |
||||
|
||||
blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); |
||||
keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); |
||||
lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); |
||||
skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); |
||||
|
||||
foundNonce = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); |
||||
if (foundNonce != 0xffffffff) |
||||
{ |
||||
// const uint32_t Htarg = ptarget[6]; |
||||
uint32_t vhash64[8]; |
||||
be32enc(&endiandata[19], foundNonce); |
||||
lyra_hash(vhash64, endiandata); |
||||
|
||||
// if (vhash64[7]<=Htarg) { // && fulltest(vhash64, ptarget)) { |
||||
*hashes_done = pdata[19] - first_nonce + throughput; |
||||
pdata[19] = foundNonce; |
||||
return 1; |
||||
// } else { |
||||
// applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce); |
||||
// } |
||||
} |
||||
|
||||
pdata[19] += throughput; |
||||
|
||||
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart); |
||||
|
||||
*hashes_done = pdata[19] - first_nonce + 1; |
||||
return 0; |
||||
} |
Loading…
Reference in new issue