GOSTCoin CUDA miner project, compatible with most nvidia cards, containing only gostd algo
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

834 lines
17 KiB

/*
* Merged LUFFA512 64 + CUBE512 64 - from sp
*/
#include "cuda_helper.h"
#define MULT0(a) {\
tmp = a[7]; \
a[7] = a[6]; \
a[6] = a[5]; \
a[5] = a[4]; \
a[4] = a[3] ^ tmp; \
a[3] = a[2] ^ tmp; \
a[2] = a[1]; \
a[1] = a[0] ^ tmp; \
a[0] = tmp; \
}
#define MULT2(a,j) { \
tmp = a[(j<<3)+7]; \
a[(j*8)+7] = a[(j*8)+6]; \
a[(j*8)+6] = a[(j*8)+5]; \
a[(j*8)+5] = a[(j*8)+4]; \
a[(j*8)+4] = a[(j*8)+3] ^ tmp; \
a[(j*8)+3] = a[(j*8)+2] ^ tmp; \
a[(j*8)+2] = a[(j*8)+1]; \
a[(j*8)+1] = a[(j*8)+0] ^ tmp; \
a[j*8] = tmp; \
}
#define TWEAK(a0,a1,a2,a3,j) { \
a0 = ROTL32(a0,j); \
a1 = ROTL32(a1,j); \
a2 = ROTL32(a2,j); \
a3 = ROTL32(a3,j); \
}
#define STEP(c0,c1) { \
SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp); \
SUBCRUMB(chainv[5],chainv[6],chainv[7],chainv[4],tmp); \
MIXWORD(chainv[0],chainv[4]); \
MIXWORD(chainv[1],chainv[5]); \
MIXWORD(chainv[2],chainv[6]); \
MIXWORD(chainv[3],chainv[7]); \
ADD_CONSTANT(chainv[0],chainv[4],c0,c1); \
}
#define SUBCRUMB(a0,a1,a2,a3,a4) { \
a4 = a0; \
a0 |= a1; \
a2 ^= a3; \
a1 = ~a1;\
a0 ^= a3; \
a3 &= a4; \
a1 ^= a3; \
a3 ^= a2; \
a2 &= a0; \
a0 = ~a0;\
a2 ^= a1; \
a1 |= a3; \
a4 ^= a1; \
a3 ^= a2; \
a2 &= a1; \
a1 ^= a0; \
a0 = a4; \
}
#define MIXWORD(a0,a4) { \
a4 ^= a0; \
a0 = ROTL32(a0,2); \
a0 ^= a4; \
a4 = ROTL32(a4,14); \
a4 ^= a0; \
a0 = ROTL32(a0,10); \
a0 ^= a4; \
a4 = ROTL32(a4,1); \
}
#define ADD_CONSTANT(a0,b0,c0,c1) { \
a0 ^= c0; \
b0 ^= c1; \
}
__device__ __constant__ uint32_t c_CNS[80] = {
0x303994a6,0xe0337818,0xc0e65299,0x441ba90d,
0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f,
0x1e00108f,0xe5a8bce6,0x7800423d,0x5274baf4,
0x8f5b7882,0x26889ba7,0x96e1db12,0x9a226e9d,
0xb6de10ed,0x01685f3d,0x70f47aae,0x05a17cf4,
0x0707a3d4,0xbd09caca,0x1c1e8f51,0xf4272b28,
0x707a3d45,0x144ae5cc,0xaeb28562,0xfaa7ae2b,
0xbaca1589,0x2e48f1c1,0x40a46f3e,0xb923c704,
0xfc20d9d2,0xe25e72c1,0x34552e25,0xe623bb72,
0x7ad8818f,0x5c58a4a4,0x8438764a,0x1e38e2e7,
0xbb6de032,0x78e38b9d,0xedb780c8,0x27586719,
0xd9847356,0x36eda57f,0xa2c78434,0x703aace7,
0xb213afa5,0xe028c9bf,0xc84ebe95,0x44756f91,
0x4e608a22,0x7e8fce32,0x56d858fe,0x956548be,
0x343b138f,0xfe191be2,0xd0ec4e3d,0x3cb226e5,
0x2ceb4882,0x5944a28e,0xb3ad2208,0xa1c4c355,
0xf0d2e9e3,0x5090d577,0xac11d7fa,0x2d1925ab,
0x1bcb66f2,0xb46496ac,0x6f2d9bc9,0xd1925ab0,
0x78602649,0x29131ab6,0x8edae952,0x0fc053c3,
0x3b6ba548,0x3f014f0c,0xedae9520,0xfc053c31
};
// Precalculated chaining values
__device__ __constant__ uint32_t c_IV[40] = {
0x8bb0a761, 0xc2e4aa8b, 0x2d539bc9, 0x381408f8,
0x478f6633, 0x255a46ff, 0x581c37f7, 0x601c2e8e,
0x266c5f9d, 0xc34715d8, 0x8900670e, 0x51a540be,
0xe4ce69fb, 0x5089f4d4, 0x3cc0a506, 0x609bcb02,
0xa4e3cd82, 0xd24fd6ca, 0xc0f196dc, 0xcf41eafe,
0x0ff2e673, 0x303804f2, 0xa7b3cd48, 0x677addd4,
0x66e66a8a, 0x2303208f, 0x486dafb4, 0xc0d37dc6,
0x634d15af, 0xe5af6747, 0x10af7e38, 0xee7e6428,
0x01262e5d, 0xc92c2e64, 0x82fee966, 0xcea738d3,
0x867de2b0, 0xe0714818, 0xda6e831f, 0xa7062529
};
/***************************************************/
__device__ __forceinline__
static void rnd512(uint32_t *statebuffer, uint32_t *statechainv)
{
uint32_t t[40];
uint32_t chainv[8];
uint32_t tmp;
int i,j;
#pragma unroll
for(i=0;i<8;i++) {
t[i] = 0;
#pragma unroll 5
for(j=0;j<5;j++)
t[i] ^= statechainv[i+8*j];
}
MULT0(t);
#pragma unroll
for(j=0;j<5;j++) {
#pragma unroll
for(i=0;i<8;i++)
statechainv[i+8*j] ^= t[i];
}
#pragma unroll
for(j=0;j<5;j++) {
#pragma unroll
for(i=0;i<8;i++)
t[i+8*j] = statechainv[i+8*j];
}
MULT0(statechainv);
#pragma unroll 4
for(j=1;j<5;j++) {
MULT2(statechainv, j);
}
#pragma unroll
for(j=0;j<5;j++) {
#pragma unroll
for(i=0;i<8;i++)
statechainv[8*j+i] ^= t[8*((j+1)%5)+i];
}
#pragma unroll
for(j=0;j<5;j++) {
#pragma unroll
for(i=0;i<8;i++)
t[i+8*j] = statechainv[i+8*j];
}
MULT0(statechainv);
#pragma unroll 4
for(j=1;j<5;j++) {
MULT2(statechainv, j);
}
#pragma unroll
for(j=0;j<5;j++) {
#pragma unroll
for(i=0;i<8;i++)
statechainv[8*j+i] ^= t[8*((j+4)%5)+i];
}
#pragma unroll
for(j=0;j<5;j++) {
#pragma unroll 8
for(i=0;i<8;i++)
statechainv[i+8*j] ^= statebuffer[i];
MULT0(statebuffer);
}
#pragma unroll
for(i=0;i<8;i++) {
chainv[i] = statechainv[i];
}
#pragma unroll 1
for(i=0;i<8;i++) {
STEP(c_CNS[(2*i)],c_CNS[(2*i)+1]);
}
#pragma unroll
for(i=0;i<8;i++) {
statechainv[i] = chainv[i];
chainv[i] = statechainv[i+8];
}
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],1);
#pragma unroll 1
for(i=0;i<8;i++) {
STEP(c_CNS[(2*i)+16],c_CNS[(2*i)+16+1]);
}
#pragma unroll
for(i=0;i<8;i++) {
statechainv[i+8] = chainv[i];
chainv[i] = statechainv[i+16];
}
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],2);
#pragma unroll 1
for(i=0;i<8;i++) {
STEP(c_CNS[(2*i)+32],c_CNS[(2*i)+32+1]);
}
#pragma unroll
for(i=0;i<8;i++) {
statechainv[i+16] = chainv[i];
chainv[i] = statechainv[i+24];
}
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],3);
#pragma unroll 1
for(i=0;i<8;i++) {
STEP(c_CNS[(2*i)+48],c_CNS[(2*i)+48+1]);
}
#pragma unroll
for(i=0;i<8;i++) {
statechainv[i+24] = chainv[i];
chainv[i] = statechainv[i+32];
}
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],4);
#pragma unroll 1
for(i=0;i<8;i++) {
STEP(c_CNS[(2*i)+64],c_CNS[(2*i)+64+1]);
}
#pragma unroll
for(i=0;i<8;i++) {
statechainv[i+32] = chainv[i];
}
}
__device__ __forceinline__
static void rnd512_first(uint32_t state[40], uint32_t buffer[8])
{
uint32_t chainv[8];
uint32_t tmp;
int i, j;
for (j = 0; j<5; j++) {
state[8 * j] ^= buffer[0];
#pragma unroll 7
for (i = 1; i<8; i++)
state[i + 8 * j] ^= buffer[i];
MULT0(buffer);
}
#pragma unroll
for (i = 0; i<8; i++)
chainv[i] = state[i];
#pragma unroll 1
for (i = 0; i<8; i++) {
STEP(c_CNS[(2 * i)], c_CNS[(2 * i) + 1]);
}
#pragma unroll
for (i = 0; i<8; i++) {
state[i] = chainv[i];
chainv[i] = state[i + 8];
}
TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1);
#pragma unroll 1
for (i = 0; i<8; i++) {
STEP(c_CNS[(2 * i) + 16], c_CNS[(2 * i) + 16 + 1]);
}
#pragma unroll
for (i = 0; i<8; i++) {
state[i + 8] = chainv[i];
chainv[i] = state[i + 16];
}
TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2);
#pragma unroll 1
for (i = 0; i<8; i++) {
STEP(c_CNS[(2 * i) + 32], c_CNS[(2 * i) + 32 + 1]);
}
#pragma unroll
for (i = 0; i<8; i++) {
state[i + 16] = chainv[i];
chainv[i] = state[i + 24];
}
TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3);
#pragma unroll 1
for (i = 0; i<8; i++) {
STEP(c_CNS[(2 * i) + 48], c_CNS[(2 * i) + 48 + 1]);
}
#pragma unroll
for (i = 0; i<8; i++) {
state[i + 24] = chainv[i];
chainv[i] = state[i + 32];
}
TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4);
#pragma unroll 1
for (i = 0; i<8; i++) {
STEP(c_CNS[(2 * i) + 64], c_CNS[(2 * i) + 64 + 1]);
}
#pragma unroll
for (i = 0; i<8; i++)
state[i + 32] = chainv[i];
}
/***************************************************/
__device__ __forceinline__
static void rnd512_nullhash(uint32_t *state)
{
uint32_t t[40];
uint32_t chainv[8];
uint32_t tmp;
int i, j;
#pragma unroll
for (i = 0; i<8; i++) {
t[i] = state[i + 8 * 0];
#pragma unroll 4
for (j = 1; j<5; j++)
t[i] ^= state[i + 8 * j];
}
MULT0(t);
#pragma unroll
for (j = 0; j<5; j++) {
#pragma unroll
for (i = 0; i<8; i++)
state[i + 8 * j] ^= t[i];
}
#pragma unroll
for (j = 0; j<5; j++) {
#pragma unroll
for (i = 0; i<8; i++)
t[i + 8 * j] = state[i + 8 * j];
}
MULT0(state);
#pragma unroll 4
for(j=1; j<5; j++) {
MULT2(state, j);
}
#pragma unroll
for (j = 0; j<5; j++) {
#pragma unroll
for (i = 0; i<8; i++)
state[8 * j + i] ^= t[8 * ((j + 1) % 5) + i];
}
#pragma unroll
for (j = 0; j<5; j++) {
#pragma unroll 8
for (i = 0; i<8; i++)
t[i + 8 * j] = state[i + 8 * j];
}
MULT0(state);
#pragma unroll 4
for(j=1; j<5; j++) {
MULT2(state, j);
}
#pragma unroll
for (j = 0; j<5; j++) {
#pragma unroll
for (i = 0; i<8; i++)
state[8 * j + i] ^= t[8 * ((j + 4) % 5) + i];
}
#pragma unroll
for (i = 0; i<8; i++)
chainv[i] = state[i];
#pragma unroll 1
for (i = 0; i<8; i++) {
STEP(c_CNS[(2 * i)], c_CNS[(2 * i) + 1]);
}
#pragma unroll
for (i = 0; i<8; i++) {
state[i] = chainv[i];
chainv[i] = state[i + 8];
}
TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1);
#pragma unroll 1
for (i = 0; i<8; i++) {
STEP(c_CNS[(2 * i) + 16], c_CNS[(2 * i) + 16 + 1]);
}
#pragma unroll
for (i = 0; i<8; i++) {
state[i + 8] = chainv[i];
chainv[i] = state[i + 16];
}
TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2);
#pragma unroll 1
for (i = 0; i<8; i++) {
STEP(c_CNS[(2 * i) + 32], c_CNS[(2 * i) + 32 + 1]);
}
#pragma unroll
for (i = 0; i<8; i++) {
state[i + 16] = chainv[i];
chainv[i] = state[i + 24];
}
TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3);
#pragma unroll 1
for (i = 0; i<8; i++) {
STEP(c_CNS[(2 * i) + 48], c_CNS[(2 * i) + 48 + 1]);
}
#pragma unroll
for (i = 0; i<8; i++) {
state[i + 24] = chainv[i];
chainv[i] = state[i + 32];
}
TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4);
#pragma unroll 1
for (i = 0; i<8; i++) {
STEP(c_CNS[(2 * i) + 64], c_CNS[(2 * i) + 64 + 1]);
}
#pragma unroll
for (i = 0; i<8; i++) {
state[i + 32] = chainv[i];
}
}
__device__ __forceinline__
static void Update512(uint32_t *statebuffer, uint32_t *statechainv, const uint32_t *data)
{
#pragma unroll
for (int i = 0; i < 8; i++) statebuffer[i] = cuda_swab32(data[i]);
rnd512_first(statechainv, statebuffer);
#pragma unroll
for (int i = 0; i < 8; i++) statebuffer[i] = cuda_swab32(data[i + 8]);
rnd512(statebuffer, statechainv);
}
/***************************************************/
__device__ __forceinline__
static void finalization512(uint32_t *statebuffer, uint32_t *statechainv, uint32_t *b)
{
int i,j;
statebuffer[0] = 0x80000000;
#pragma unroll 7
for(int i=1;i<8;i++) statebuffer[i] = 0;
rnd512(statebuffer, statechainv);
/*---- blank round with m=0 ----*/
rnd512_nullhash(statechainv);
#pragma unroll
for(i=0;i<8;i++) {
b[i] = statechainv[i];
#pragma unroll 4
for(j=1;j<5;j++) {
b[i] ^= statechainv[i+8*j];
}
b[i] = cuda_swab32((b[i]));
}
rnd512_nullhash(statechainv);
#pragma unroll
for(i=0;i<8;i++) {
b[8 + i] = statechainv[i];
#pragma unroll 4
for(j=1;j<5;j++) {
b[8+i] ^= statechainv[i+8*j];
}
b[8 + i] = cuda_swab32((b[8 + i]));
}
}
#define ROUND_EVEN { \
xg = (x0 + xg); \
x0 = ROTL32(x0, 7); \
xh = (x1 + xh); \
x1 = ROTL32(x1, 7); \
xi = (x2 + xi); \
x2 = ROTL32(x2, 7); \
xj = (x3 + xj); \
x3 = ROTL32(x3, 7); \
xk = (x4 + xk); \
x4 = ROTL32(x4, 7); \
xl = (x5 + xl); \
x5 = ROTL32(x5, 7); \
xm = (x6 + xm); \
x6 = ROTL32(x6, 7); \
xn = (x7 + xn); \
x7 = ROTL32(x7, 7); \
xo = (x8 + xo); \
x8 = ROTL32(x8, 7); \
xp = (x9 + xp); \
x9 = ROTL32(x9, 7); \
xq = (xa + xq); \
xa = ROTL32(xa, 7); \
xr = (xb + xr); \
xb = ROTL32(xb, 7); \
xs = (xc + xs); \
xc = ROTL32(xc, 7); \
xt = (xd + xt); \
xd = ROTL32(xd, 7); \
xu = (xe + xu); \
xe = ROTL32(xe, 7); \
xv = (xf + xv); \
xf = ROTL32(xf, 7); \
x8 ^= xg; \
x9 ^= xh; \
xa ^= xi; \
xb ^= xj; \
xc ^= xk; \
xd ^= xl; \
xe ^= xm; \
xf ^= xn; \
x0 ^= xo; \
x1 ^= xp; \
x2 ^= xq; \
x3 ^= xr; \
x4 ^= xs; \
x5 ^= xt; \
x6 ^= xu; \
x7 ^= xv; \
xi = (x8 + xi); \
x8 = ROTL32(x8, 11); \
xj = (x9 + xj); \
x9 = ROTL32(x9, 11); \
xg = (xa + xg); \
xa = ROTL32(xa, 11); \
xh = (xb + xh); \
xb = ROTL32(xb, 11); \
xm = (xc + xm); \
xc = ROTL32(xc, 11); \
xn = (xd + xn); \
xd = ROTL32(xd, 11); \
xk = (xe + xk); \
xe = ROTL32(xe, 11); \
xl = (xf + xl); \
xf = ROTL32(xf, 11); \
xq = (x0 + xq); \
x0 = ROTL32(x0, 11); \
xr = (x1 + xr); \
x1 = ROTL32(x1, 11); \
xo = (x2 + xo); \
x2 = ROTL32(x2, 11); \
xp = (x3 + xp); \
x3 = ROTL32(x3, 11); \
xu = (x4 + xu); \
x4 = ROTL32(x4, 11); \
xv = (x5 + xv); \
x5 = ROTL32(x5, 11); \
xs = (x6 + xs); \
x6 = ROTL32(x6, 11); \
xt = (x7 + xt); \
x7 = ROTL32(x7, 11); \
xc ^= xi; \
xd ^= xj; \
xe ^= xg; \
xf ^= xh; \
x8 ^= xm; \
x9 ^= xn; \
xa ^= xk; \
xb ^= xl; \
x4 ^= xq; \
x5 ^= xr; \
x6 ^= xo; \
x7 ^= xp; \
x0 ^= xu; \
x1 ^= xv; \
x2 ^= xs; \
x3 ^= xt; \
}
#define ROUND_ODD { \
xj = (xc + xj); \
xc = ROTL32(xc, 7); \
xi = (xd + xi); \
xd = ROTL32(xd, 7); \
xh = (xe + xh); \
xe = ROTL32(xe, 7); \
xg = (xf + xg); \
xf = ROTL32(xf, 7); \
xn = (x8 + xn); \
x8 = ROTL32(x8, 7); \
xm = (x9 + xm); \
x9 = ROTL32(x9, 7); \
xl = (xa + xl); \
xa = ROTL32(xa, 7); \
xk = (xb + xk); \
xb = ROTL32(xb, 7); \
xr = (x4 + xr); \
x4 = ROTL32(x4, 7); \
xq = (x5 + xq); \
x5 = ROTL32(x5, 7); \
xp = (x6 + xp); \
x6 = ROTL32(x6, 7); \
xo = (x7 + xo); \
x7 = ROTL32(x7, 7); \
xv = (x0 + xv); \
x0 = ROTL32(x0, 7); \
xu = (x1 + xu); \
x1 = ROTL32(x1, 7); \
xt = (x2 + xt); \
x2 = ROTL32(x2, 7); \
xs = (x3 + xs); \
x3 = ROTL32(x3, 7); \
x4 ^= xj; \
x5 ^= xi; \
x6 ^= xh; \
x7 ^= xg; \
x0 ^= xn; \
x1 ^= xm; \
x2 ^= xl; \
x3 ^= xk; \
xc ^= xr; \
xd ^= xq; \
xe ^= xp; \
xf ^= xo; \
x8 ^= xv; \
x9 ^= xu; \
xa ^= xt; \
xb ^= xs; \
xh = (x4 + xh); \
x4 = ROTL32(x4, 11); \
xg = (x5 + xg); \
x5 = ROTL32(x5, 11); \
xj = (x6 + xj); \
x6 = ROTL32(x6, 11); \
xi = (x7 + xi); \
x7 = ROTL32(x7, 11); \
xl = (x0 + xl); \
x0 = ROTL32(x0, 11); \
xk = (x1 + xk); \
x1 = ROTL32(x1, 11); \
xn = (x2 + xn); \
x2 = ROTL32(x2, 11); \
xm = (x3 + xm); \
x3 = ROTL32(x3, 11); \
xp = (xc + xp); \
xc = ROTL32(xc, 11); \
xo = (xd + xo); \
xd = ROTL32(xd, 11); \
xr = (xe + xr); \
xe = ROTL32(xe, 11); \
xq = (xf + xq); \
xf = ROTL32(xf, 11); \
xt = (x8 + xt); \
x8 = ROTL32(x8, 11); \
xs = (x9 + xs); \
x9 = ROTL32(x9, 11); \
xv = (xa + xv); \
xa = ROTL32(xa, 11); \
xu = (xb + xu); \
xb = ROTL32(xb, 11); \
x0 ^= xh; \
x1 ^= xg; \
x2 ^= xj; \
x3 ^= xi; \
x4 ^= xl; \
x5 ^= xk; \
x6 ^= xn; \
x7 ^= xm; \
x8 ^= xp; \
x9 ^= xo; \
xa ^= xr; \
xb ^= xq; \
xc ^= xt; \
xd ^= xs; \
xe ^= xv; \
xf ^= xu; \
}
#define SIXTEEN_ROUNDS \
for (int j = 0; j < 8; j ++) { \
ROUND_EVEN; \
ROUND_ODD; \
}
__global__
#if __CUDA_ARCH__ > 500
__launch_bounds__(256, 4)
#endif
void x11_luffaCubehash512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t statechainv[40] = {
0x8bb0a761, 0xc2e4aa8b, 0x2d539bc9, 0x381408f8,
0x478f6633, 0x255a46ff, 0x581c37f7, 0x601c2e8e,
0x266c5f9d, 0xc34715d8, 0x8900670e, 0x51a540be,
0xe4ce69fb, 0x5089f4d4, 0x3cc0a506, 0x609bcb02,
0xa4e3cd82, 0xd24fd6ca, 0xc0f196dc, 0xcf41eafe,
0x0ff2e673, 0x303804f2, 0xa7b3cd48, 0x677addd4,
0x66e66a8a, 0x2303208f, 0x486dafb4, 0xc0d37dc6,
0x634d15af, 0xe5af6747, 0x10af7e38, 0xee7e6428,
0x01262e5d, 0xc92c2e64, 0x82fee966, 0xcea738d3,
0x867de2b0, 0xe0714818, 0xda6e831f, 0xa7062529
};
uint32_t statebuffer[8];
uint32_t *const Hash = &g_hash[thread * 16U];
Update512(statebuffer, statechainv, Hash);
finalization512(statebuffer, statechainv, Hash);
//Cubehash
uint32_t x0 = 0x2AEA2A61, x1 = 0x50F494D4, x2 = 0x2D538B8B, x3 = 0x4167D83E;
uint32_t x4 = 0x3FEE2313, x5 = 0xC701CF8C, x6 = 0xCC39968E, x7 = 0x50AC5695;
uint32_t x8 = 0x4D42C787, x9 = 0xA647A8B3, xa = 0x97CF0BEF, xb = 0x825B4537;
uint32_t xc = 0xEEF864D2, xd = 0xF22090C4, xe = 0xD0E5CD33, xf = 0xA23911AE;
uint32_t xg = 0xFCD398D9, xh = 0x148FE485, xi = 0x1B017BEF, xj = 0xB6444532;
uint32_t xk = 0x6A536159, xl = 0x2FF5781C, xm = 0x91FA7934, xn = 0x0DBADEA9;
uint32_t xo = 0xD65C8A2B, xp = 0xA5A70E75, xq = 0xB1C62456, xr = 0xBC796576;
uint32_t xs = 0x1921C8F7, xt = 0xE7989AF1, xu = 0x7795D246, xv = 0xD43E3B44;
x0 ^= Hash[0];
x1 ^= Hash[1];
x2 ^= Hash[2];
x3 ^= Hash[3];
x4 ^= Hash[4];
x5 ^= Hash[5];
x6 ^= Hash[6];
x7 ^= Hash[7];
SIXTEEN_ROUNDS;
x0 ^= Hash[8];
x1 ^= Hash[9];
x2 ^= Hash[10];
x3 ^= Hash[11];
x4 ^= Hash[12];
x5 ^= Hash[13];
x6 ^= Hash[14];
x7 ^= Hash[15];
SIXTEEN_ROUNDS;
x0 ^= 0x80;
SIXTEEN_ROUNDS;
xv ^= 1;
for (int i = 3; i < 13; i++) {
SIXTEEN_ROUNDS;
}
Hash[0] = x0;
Hash[1] = x1;
Hash[2] = x2;
Hash[3] = x3;
Hash[4] = x4;
Hash[5] = x5;
Hash[6] = x6;
Hash[7] = x7;
Hash[8] = x8;
Hash[9] = x9;
Hash[10] = xa;
Hash[11] = xb;
Hash[12] = xc;
Hash[13] = xd;
Hash[14] = xe;
Hash[15] = xf;
}
}
__host__
void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash, int order)
{
const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
x11_luffaCubehash512_gpu_hash_64 <<<grid, block>>> (threads, d_hash);
MyStreamSynchronize(NULL, order, thr_id);
}
// Setup
__host__
void x11_luffaCubehash512_cpu_init(int thr_id, uint32_t threads) {}