/*
 * Merged LUFFA512 64 + CUBE512 64 - from sp
 */

#include "cuda_helper.h"

/* Multiply one 256-bit lane (eight 32-bit words) by x in Luffa's message ring */
#define MULT0(a) { \
    tmp = a[7]; \
    a[7] = a[6]; \
    a[6] = a[5]; \
    a[5] = a[4]; \
    a[4] = a[3] ^ tmp; \
    a[3] = a[2] ^ tmp; \
    a[2] = a[1]; \
    a[1] = a[0] ^ tmp; \
    a[0] = tmp; \
}

/* Same multiplication, applied in place to lane j of the 5-lane state */
#define MULT2(a,j) { \
    tmp = a[(j*8)+7]; \
    a[(j*8)+7] = a[(j*8)+6]; \
    a[(j*8)+6] = a[(j*8)+5]; \
    a[(j*8)+5] = a[(j*8)+4]; \
    a[(j*8)+4] = a[(j*8)+3] ^ tmp; \
    a[(j*8)+3] = a[(j*8)+2] ^ tmp; \
    a[(j*8)+2] = a[(j*8)+1]; \
    a[(j*8)+1] = a[(j*8)+0] ^ tmp; \
    a[(j*8)+0] = tmp; \
}

/* Per-lane tweak: rotate the four high words of lane j left by j bits */
#define TWEAK(a0,a1,a2,a3,j) { \
    a0 = ROTL32(a0,j); \
    a1 = ROTL32(a1,j); \
    a2 = ROTL32(a2,j); \
    a3 = ROTL32(a3,j); \
}

/* One step of a Luffa permutation: S-box layer, linear mixing, round constants */
#define STEP(c0,c1) { \
    SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp); \
    SUBCRUMB(chainv[5],chainv[6],chainv[7],chainv[4],tmp); \
    MIXWORD(chainv[0],chainv[4]); \
    MIXWORD(chainv[1],chainv[5]); \
    MIXWORD(chainv[2],chainv[6]); \
    MIXWORD(chainv[3],chainv[7]); \
    ADD_CONSTANT(chainv[0],chainv[4],c0,c1); \
}

/* Luffa's 4-bit S-box, applied bitsliced across 32 bit positions; a4 is scratch */
#define SUBCRUMB(a0,a1,a2,a3,a4) { \
    a4 = a0; \
    a0 |= a1; \
    a2 ^= a3; \
    a1 = ~a1; \
    a0 ^= a3; \
    a3 &= a4; \
    a1 ^= a3; \
    a3 ^= a2; \
    a2 &= a0; \
    a0 = ~a0; \
    a2 ^= a1; \
    a1 |= a3; \
    a4 ^= a1; \
    a3 ^= a2; \
    a2 &= a1; \
    a1 ^= a0; \
    a0 = a4; \
}

/* Linear mixing of one word pair */
#define MIXWORD(a0,a4) { \
    a4 ^= a0; \
    a0 = ROTL32(a0,2); \
    a0 ^= a4; \
    a4 = ROTL32(a4,14); \
    a4 ^= a0; \
    a0 = ROTL32(a0,10); \
    a0 ^= a4; \
    a4 = ROTL32(a4,1); \
}

#define ADD_CONSTANT(a0,b0,c0,c1) { \
    a0 ^= c0; \
    b0 ^= c1; \
}

/* Round constants: 8 (c0,c1) pairs per permutation, 5 permutations */
__device__ __constant__ uint32_t c_CNS[80] = {
    0x303994a6,0xe0337818,0xc0e65299,0x441ba90d,
    0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f,
    0x1e00108f,0xe5a8bce6,0x7800423d,0x5274baf4,
    0x8f5b7882,0x26889ba7,0x96e1db12,0x9a226e9d,
    0xb6de10ed,0x01685f3d,0x70f47aae,0x05a17cf4,
    0x0707a3d4,0xbd09caca,0x1c1e8f51,0xf4272b28,
    0x707a3d45,0x144ae5cc,0xaeb28562,0xfaa7ae2b,
    0xbaca1589,0x2e48f1c1,0x40a46f3e,0xb923c704,
    0xfc20d9d2,0xe25e72c1,0x34552e25,0xe623bb72,
    0x7ad8818f,0x5c58a4a4,0x8438764a,0x1e38e2e7,
    0xbb6de032,0x78e38b9d,0xedb780c8,0x27586719,
    0xd9847356,0x36eda57f,0xa2c78434,0x703aace7,
    0xb213afa5,0xe028c9bf,0xc84ebe95,0x44756f91,
    0x4e608a22,0x7e8fce32,0x56d858fe,0x956548be,
    0x343b138f,0xfe191be2,0xd0ec4e3d,0x3cb226e5,
    0x2ceb4882,0x5944a28e,0xb3ad2208,0xa1c4c355,
    0xf0d2e9e3,0x5090d577,0xac11d7fa,0x2d1925ab,
    0x1bcb66f2,0xb46496ac,0x6f2d9bc9,0xd1925ab0,
    0x78602649,0x29131ab6,0x8edae952,0x0fc053c3,
    0x3b6ba548,0x3f014f0c,0xedae9520,0xfc053c31
};

// Precalculated chaining values (the kernel below inlines the same words)
__device__ __constant__ uint32_t c_IV[40] = {
    0x8bb0a761, 0xc2e4aa8b, 0x2d539bc9, 0x381408f8,
    0x478f6633, 0x255a46ff, 0x581c37f7, 0x601c2e8e,
    0x266c5f9d, 0xc34715d8, 0x8900670e, 0x51a540be,
    0xe4ce69fb, 0x5089f4d4, 0x3cc0a506, 0x609bcb02,
    0xa4e3cd82, 0xd24fd6ca, 0xc0f196dc, 0xcf41eafe,
    0x0ff2e673, 0x303804f2, 0xa7b3cd48, 0x677addd4,
    0x66e66a8a, 0x2303208f, 0x486dafb4, 0xc0d37dc6,
    0x634d15af, 0xe5af6747, 0x10af7e38, 0xee7e6428,
    0x01262e5d, 0xc92c2e64, 0x82fee966, 0xcea738d3,
    0x867de2b0, 0xe0714818, 0xda6e831f, 0xa7062529
};
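/*
 * Illustrative sketch, not used by the kernels below. SUBCRUMB contains only
 * bitwise operations, so each of the 32 bit positions is an independent copy
 * of Luffa's 4-bit S-box, with a0..a3 carrying one bit of the nibble each
 * (bit-order convention chosen here for illustration). This hypothetical
 * helper feeds a single nibble through the macro and masks bit 0 of each
 * output, recovering one S-box table entry; handy when cross-checking the
 * bitsliced form against a reference implementation.
 */
static uint32_t subcrumb_sbox_ref(uint32_t nibble)
{
    uint32_t a0 = (nibble >> 0) & 1;
    uint32_t a1 = (nibble >> 1) & 1;
    uint32_t a2 = (nibble >> 2) & 1;
    uint32_t a3 = (nibble >> 3) & 1;
    uint32_t a4; // scratch; SUBCRUMB writes it before reading it
    SUBCRUMB(a0, a1, a2, a3, a4);
    // only bit 0 of each lane is meaningful for a single-nibble input
    return (a0 & 1) | ((a1 & 1) << 1) | ((a2 & 1) << 2) | ((a3 & 1) << 3);
}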
/***************************************************/
__device__ __forceinline__
static void rnd512(uint32_t *statebuffer, uint32_t *statechainv)
{
    uint32_t t[40];
    uint32_t chainv[8];
    uint32_t tmp;
    int i, j;

    #pragma unroll
    for (i = 0; i < 8; i++) {
        t[i] = 0;
        #pragma unroll 5
        for (j = 0; j < 5; j++)
            t[i] ^= statechainv[i + 8 * j];
    }

    MULT0(t);

    #pragma unroll
    for (j = 0; j < 5; j++) {
        #pragma unroll
        for (i = 0; i < 8; i++)
            statechainv[i + 8 * j] ^= t[i];
    }

    #pragma unroll
    for (j = 0; j < 5; j++) {
        #pragma unroll
        for (i = 0; i < 8; i++)
            t[i + 8 * j] = statechainv[i + 8 * j];
    }

    MULT0(statechainv);
    #pragma unroll 4
    for (j = 1; j < 5; j++) {
        MULT2(statechainv, j);
    }

    #pragma unroll
    for (j = 0; j < 5; j++) {
        #pragma unroll
        for (i = 0; i < 8; i++)
            statechainv[8 * j + i] ^= t[8 * ((j + 1) % 5) + i];
    }

    #pragma unroll
    for (j = 0; j < 5; j++) {
        #pragma unroll
        for (i = 0; i < 8; i++)
            t[i + 8 * j] = statechainv[i + 8 * j];
    }

    MULT0(statechainv);
    #pragma unroll 4
    for (j = 1; j < 5; j++) {
        MULT2(statechainv, j);
    }

    #pragma unroll
    for (j = 0; j < 5; j++) {
        #pragma unroll
        for (i = 0; i < 8; i++)
            statechainv[8 * j + i] ^= t[8 * ((j + 4) % 5) + i];
    }

    #pragma unroll
    for (j = 0; j < 5; j++) {
        #pragma unroll 8
        for (i = 0; i < 8; i++)
            statechainv[i + 8 * j] ^= statebuffer[i];
        MULT0(statebuffer);
    }

    #pragma unroll
    for (i = 0; i < 8; i++) {
        chainv[i] = statechainv[i];
    }

    #pragma unroll 1
    for (i = 0; i < 8; i++) {
        STEP(c_CNS[(2 * i)], c_CNS[(2 * i) + 1]);
    }

    #pragma unroll
    for (i = 0; i < 8; i++) {
        statechainv[i] = chainv[i];
        chainv[i] = statechainv[i + 8];
    }

    TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1);

    #pragma unroll 1
    for (i = 0; i < 8; i++) {
        STEP(c_CNS[(2 * i) + 16], c_CNS[(2 * i) + 16 + 1]);
    }

    #pragma unroll
    for (i = 0; i < 8; i++) {
        statechainv[i + 8] = chainv[i];
        chainv[i] = statechainv[i + 16];
    }

    TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2);

    #pragma unroll 1
    for (i = 0; i < 8; i++) {
        STEP(c_CNS[(2 * i) + 32], c_CNS[(2 * i) + 32 + 1]);
    }

    #pragma unroll
    for (i = 0; i < 8; i++) {
        statechainv[i + 16] = chainv[i];
        chainv[i] = statechainv[i + 24];
    }

    TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3);

    #pragma unroll 1
    for (i = 0; i < 8; i++) {
        STEP(c_CNS[(2 * i) + 48], c_CNS[(2 * i) + 48 + 1]);
    }

    #pragma unroll
    for (i = 0; i < 8; i++) {
        statechainv[i + 24] = chainv[i];
        chainv[i] = statechainv[i + 32];
    }

    TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4);

    #pragma unroll 1
    for (i = 0; i < 8; i++) {
        STEP(c_CNS[(2 * i) + 64], c_CNS[(2 * i) + 64 + 1]);
    }

    #pragma unroll
    for (i = 0; i < 8; i++) {
        statechainv[i + 32] = chainv[i];
    }
}

__device__ __forceinline__
static void rnd512_first(uint32_t state[40], uint32_t buffer[8])
{
    uint32_t chainv[8];
    uint32_t tmp;
    int i, j;

    for (j = 0; j < 5; j++) {
        state[8 * j] ^= buffer[0];
        #pragma unroll 7
        for (i = 1; i < 8; i++)
            state[i + 8 * j] ^= buffer[i];
        MULT0(buffer);
    }

    #pragma unroll
    for (i = 0; i < 8; i++)
        chainv[i] = state[i];

    #pragma unroll 1
    for (i = 0; i < 8; i++) {
        STEP(c_CNS[(2 * i)], c_CNS[(2 * i) + 1]);
    }

    #pragma unroll
    for (i = 0; i < 8; i++) {
        state[i] = chainv[i];
        chainv[i] = state[i + 8];
    }

    TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1);

    #pragma unroll 1
    for (i = 0; i < 8; i++) {
        STEP(c_CNS[(2 * i) + 16], c_CNS[(2 * i) + 16 + 1]);
    }

    #pragma unroll
    for (i = 0; i < 8; i++) {
        state[i + 8] = chainv[i];
        chainv[i] = state[i + 16];
    }

    TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2);

    #pragma unroll 1
    for (i = 0; i < 8; i++) {
        STEP(c_CNS[(2 * i) + 32], c_CNS[(2 * i) + 32 + 1]);
    }

    #pragma unroll
    for (i = 0; i < 8; i++) {
        state[i + 16] = chainv[i];
        chainv[i] = state[i + 24];
    }

    TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3);

    #pragma unroll 1
    for (i = 0; i < 8; i++) {
        STEP(c_CNS[(2 * i) + 48], c_CNS[(2 * i) + 48 + 1]);
    }

    #pragma unroll
    for (i = 0; i < 8; i++) {
        state[i + 24] = chainv[i];
        chainv[i] = state[i + 32];
    }

    TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4);

    #pragma unroll 1
    for (i = 0; i < 8; i++) {
        STEP(c_CNS[(2 * i) + 64], c_CNS[(2 * i) + 64 + 1]);
    }

    #pragma unroll
    for (i = 0; i < 8; i++)
        state[i + 32] = chainv[i];
}
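/*
 * Illustrative restatement, not used by the kernels: the message-injection
 * arithmetic that rnd512 performs before the five STEP permutations, written
 * in loop form over five 8-word lanes v[0..4] and a message block m (names
 * hypothetical). The in-place code above is this with the lane copies
 * flattened into t[40]. Note that m is consumed (multiplied in place).
 */
static void luffa_mi_ref(uint32_t v[5][8], uint32_t m[8])
{
    uint32_t t[5][8], a[8], tmp;
    int i, j;

    // feed-forward: XOR of all lanes, times x, injected into every lane
    for (i = 0; i < 8; i++)
        a[i] = v[0][i] ^ v[1][i] ^ v[2][i] ^ v[3][i] ^ v[4][i];
    MULT0(a);
    for (j = 0; j < 5; j++)
        for (i = 0; i < 8; i++) v[j][i] ^= a[i];

    // v[j] = v[j]*x ^ old v[(j+1) % 5]
    for (j = 0; j < 5; j++)
        for (i = 0; i < 8; i++) t[j][i] = v[j][i];
    for (j = 0; j < 5; j++)
        MULT0(v[j]);
    for (j = 0; j < 5; j++)
        for (i = 0; i < 8; i++) v[j][i] ^= t[(j + 1) % 5][i];

    // v[j] = v[j]*x ^ old v[(j+4) % 5]
    for (j = 0; j < 5; j++)
        for (i = 0; i < 8; i++) t[j][i] = v[j][i];
    for (j = 0; j < 5; j++)
        MULT0(v[j]);
    for (j = 0; j < 5; j++)
        for (i = 0; i < 8; i++) v[j][i] ^= t[(j + 4) % 5][i];

    // message injection: v[j] ^= m * x^j
    for (j = 0; j < 5; j++) {
        for (i = 0; i < 8; i++) v[j][i] ^= m[i];
        MULT0(m);
    }
}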
/***************************************************/
__device__ __forceinline__
static void rnd512_nullhash(uint32_t *state)
{
    uint32_t t[40];
    uint32_t chainv[8];
    uint32_t tmp;
    int i, j;

    #pragma unroll
    for (i = 0; i < 8; i++) {
        t[i] = state[i + 8 * 0];
        #pragma unroll 4
        for (j = 1; j < 5; j++)
            t[i] ^= state[i + 8 * j];
    }

    MULT0(t);

    #pragma unroll
    for (j = 0; j < 5; j++) {
        #pragma unroll
        for (i = 0; i < 8; i++)
            state[i + 8 * j] ^= t[i];
    }

    #pragma unroll
    for (j = 0; j < 5; j++) {
        #pragma unroll
        for (i = 0; i < 8; i++)
            t[i + 8 * j] = state[i + 8 * j];
    }

    MULT0(state);
    #pragma unroll 4
    for (j = 1; j < 5; j++) {
        MULT2(state, j);
    }

    #pragma unroll
    for (j = 0; j < 5; j++) {
        #pragma unroll
        for (i = 0; i < 8; i++)
            state[8 * j + i] ^= t[8 * ((j + 1) % 5) + i];
    }

    #pragma unroll
    for (j = 0; j < 5; j++) {
        #pragma unroll 8
        for (i = 0; i < 8; i++)
            t[i + 8 * j] = state[i + 8 * j];
    }

    MULT0(state);
    #pragma unroll 4
    for (j = 1; j < 5; j++) {
        MULT2(state, j);
    }

    #pragma unroll
    for (j = 0; j < 5; j++) {
        #pragma unroll
        for (i = 0; i < 8; i++)
            state[8 * j + i] ^= t[8 * ((j + 4) % 5) + i];
    }

    #pragma unroll
    for (i = 0; i < 8; i++)
        chainv[i] = state[i];

    #pragma unroll 1
    for (i = 0; i < 8; i++) {
        STEP(c_CNS[(2 * i)], c_CNS[(2 * i) + 1]);
    }

    #pragma unroll
    for (i = 0; i < 8; i++) {
        state[i] = chainv[i];
        chainv[i] = state[i + 8];
    }

    TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1);

    #pragma unroll 1
    for (i = 0; i < 8; i++) {
        STEP(c_CNS[(2 * i) + 16], c_CNS[(2 * i) + 16 + 1]);
    }

    #pragma unroll
    for (i = 0; i < 8; i++) {
        state[i + 8] = chainv[i];
        chainv[i] = state[i + 16];
    }

    TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2);

    #pragma unroll 1
    for (i = 0; i < 8; i++) {
        STEP(c_CNS[(2 * i) + 32], c_CNS[(2 * i) + 32 + 1]);
    }

    #pragma unroll
    for (i = 0; i < 8; i++) {
        state[i + 16] = chainv[i];
        chainv[i] = state[i + 24];
    }

    TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3);

    #pragma unroll 1
    for (i = 0; i < 8; i++) {
        STEP(c_CNS[(2 * i) + 48], c_CNS[(2 * i) + 48 + 1]);
    }

    #pragma unroll
    for (i = 0; i < 8; i++) {
        state[i + 24] = chainv[i];
        chainv[i] = state[i + 32];
    }

    TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4);

    #pragma unroll 1
    for (i = 0; i < 8; i++) {
        STEP(c_CNS[(2 * i) + 64], c_CNS[(2 * i) + 64 + 1]);
    }

    #pragma unroll
    for (i = 0; i < 8; i++) {
        state[i + 32] = chainv[i];
    }
}

__device__ __forceinline__
static void Update512(uint32_t *statebuffer, uint32_t *statechainv, const uint32_t *data)
{
    // absorb the 64-byte input as two 32-byte blocks, byte-swapped to big-endian
    #pragma unroll
    for (int i = 0; i < 8; i++)
        statebuffer[i] = cuda_swab32(data[i]);
    rnd512_first(statechainv, statebuffer);

    #pragma unroll
    for (int i = 0; i < 8; i++)
        statebuffer[i] = cuda_swab32(data[i + 8]);
    rnd512(statebuffer, statechainv);
}

/***************************************************/
__device__ __forceinline__
static void finalization512(uint32_t *statebuffer, uint32_t *statechainv, uint32_t *b)
{
    int i, j;

    // padding block: a single 1 bit, big-endian
    statebuffer[0] = 0x80000000;
    #pragma unroll 7
    for (i = 1; i < 8; i++)
        statebuffer[i] = 0;
    rnd512(statebuffer, statechainv);

    /*---- blank round with m=0 ----*/
    rnd512_nullhash(statechainv);

    // first 256 output bits: XOR of the five lanes
    #pragma unroll
    for (i = 0; i < 8; i++) {
        b[i] = statechainv[i];
        #pragma unroll 4
        for (j = 1; j < 5; j++) {
            b[i] ^= statechainv[i + 8 * j];
        }
        b[i] = cuda_swab32(b[i]);
    }

    rnd512_nullhash(statechainv);

    // second 256 output bits after another blank round
    #pragma unroll
    for (i = 0; i < 8; i++) {
        b[8 + i] = statechainv[i];
        #pragma unroll 4
        for (j = 1; j < 5; j++) {
            b[8 + i] ^= statechainv[i + 8 * j];
        }
        b[8 + i] = cuda_swab32(b[8 + i]);
    }
}
#define ROUND_EVEN { \
    xg = (x0 + xg); x0 = ROTL32(x0, 7); \
    xh = (x1 + xh); x1 = ROTL32(x1, 7); \
    xi = (x2 + xi); x2 = ROTL32(x2, 7); \
    xj = (x3 + xj); x3 = ROTL32(x3, 7); \
    xk = (x4 + xk); x4 = ROTL32(x4, 7); \
    xl = (x5 + xl); x5 = ROTL32(x5, 7); \
    xm = (x6 + xm); x6 = ROTL32(x6, 7); \
    xn = (x7 + xn); x7 = ROTL32(x7, 7); \
    xo = (x8 + xo); x8 = ROTL32(x8, 7); \
    xp = (x9 + xp); x9 = ROTL32(x9, 7); \
    xq = (xa + xq); xa = ROTL32(xa, 7); \
    xr = (xb + xr); xb = ROTL32(xb, 7); \
    xs = (xc + xs); xc = ROTL32(xc, 7); \
    xt = (xd + xt); xd = ROTL32(xd, 7); \
    xu = (xe + xu); xe = ROTL32(xe, 7); \
    xv = (xf + xv); xf = ROTL32(xf, 7); \
    x8 ^= xg; x9 ^= xh; xa ^= xi; xb ^= xj; \
    xc ^= xk; xd ^= xl; xe ^= xm; xf ^= xn; \
    x0 ^= xo; x1 ^= xp; x2 ^= xq; x3 ^= xr; \
    x4 ^= xs; x5 ^= xt; x6 ^= xu; x7 ^= xv; \
    xi = (x8 + xi); x8 = ROTL32(x8, 11); \
    xj = (x9 + xj); x9 = ROTL32(x9, 11); \
    xg = (xa + xg); xa = ROTL32(xa, 11); \
    xh = (xb + xh); xb = ROTL32(xb, 11); \
    xm = (xc + xm); xc = ROTL32(xc, 11); \
    xn = (xd + xn); xd = ROTL32(xd, 11); \
    xk = (xe + xk); xe = ROTL32(xe, 11); \
    xl = (xf + xl); xf = ROTL32(xf, 11); \
    xq = (x0 + xq); x0 = ROTL32(x0, 11); \
    xr = (x1 + xr); x1 = ROTL32(x1, 11); \
    xo = (x2 + xo); x2 = ROTL32(x2, 11); \
    xp = (x3 + xp); x3 = ROTL32(x3, 11); \
    xu = (x4 + xu); x4 = ROTL32(x4, 11); \
    xv = (x5 + xv); x5 = ROTL32(x5, 11); \
    xs = (x6 + xs); x6 = ROTL32(x6, 11); \
    xt = (x7 + xt); x7 = ROTL32(x7, 11); \
    xc ^= xi; xd ^= xj; xe ^= xg; xf ^= xh; \
    x8 ^= xm; x9 ^= xn; xa ^= xk; xb ^= xl; \
    x4 ^= xq; x5 ^= xr; x6 ^= xo; x7 ^= xp; \
    x0 ^= xu; x1 ^= xv; x2 ^= xs; x3 ^= xt; \
}

#define ROUND_ODD { \
    xj = (xc + xj); xc = ROTL32(xc, 7); \
    xi = (xd + xi); xd = ROTL32(xd, 7); \
    xh = (xe + xh); xe = ROTL32(xe, 7); \
    xg = (xf + xg); xf = ROTL32(xf, 7); \
    xn = (x8 + xn); x8 = ROTL32(x8, 7); \
    xm = (x9 + xm); x9 = ROTL32(x9, 7); \
    xl = (xa + xl); xa = ROTL32(xa, 7); \
    xk = (xb + xk); xb = ROTL32(xb, 7); \
    xr = (x4 + xr); x4 = ROTL32(x4, 7); \
    xq = (x5 + xq); x5 = ROTL32(x5, 7); \
    xp = (x6 + xp); x6 = ROTL32(x6, 7); \
    xo = (x7 + xo); x7 = ROTL32(x7, 7); \
    xv = (x0 + xv); x0 = ROTL32(x0, 7); \
    xu = (x1 + xu); x1 = ROTL32(x1, 7); \
    xt = (x2 + xt); x2 = ROTL32(x2, 7); \
    xs = (x3 + xs); x3 = ROTL32(x3, 7); \
    x4 ^= xj; x5 ^= xi; x6 ^= xh; x7 ^= xg; \
    x0 ^= xn; x1 ^= xm; x2 ^= xl; x3 ^= xk; \
    xc ^= xr; xd ^= xq; xe ^= xp; xf ^= xo; \
    x8 ^= xv; x9 ^= xu; xa ^= xt; xb ^= xs; \
    xh = (x4 + xh); x4 = ROTL32(x4, 11); \
    xg = (x5 + xg); x5 = ROTL32(x5, 11); \
    xj = (x6 + xj); x6 = ROTL32(x6, 11); \
    xi = (x7 + xi); x7 = ROTL32(x7, 11); \
    xl = (x0 + xl); x0 = ROTL32(x0, 11); \
    xk = (x1 + xk); x1 = ROTL32(x1, 11); \
    xn = (x2 + xn); x2 = ROTL32(x2, 11); \
    xm = (x3 + xm); x3 = ROTL32(x3, 11); \
    xp = (xc + xp); xc = ROTL32(xc, 11); \
    xo = (xd + xo); xd = ROTL32(xd, 11); \
    xr = (xe + xr); xe = ROTL32(xe, 11); \
    xq = (xf + xq); xf = ROTL32(xf, 11); \
    xt = (x8 + xt); x8 = ROTL32(x8, 11); \
    xs = (x9 + xs); x9 = ROTL32(x9, 11); \
    xv = (xa + xv); xa = ROTL32(xa, 11); \
    xu = (xb + xu); xb = ROTL32(xb, 11); \
    x0 ^= xh; x1 ^= xg; x2 ^= xj; x3 ^= xi; \
    x4 ^= xl; x5 ^= xk; x6 ^= xn; x7 ^= xm; \
    x8 ^= xp; x9 ^= xo; xa ^= xr; xb ^= xq; \
    xc ^= xt; xd ^= xs; xe ^= xv; xf ^= xu; \
}

#define SIXTEEN_ROUNDS \
    for (int j = 0; j < 8; j++) { \
        ROUND_EVEN; \
        ROUND_ODD; \
    }
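/*
 * Illustrative reference, not used by the kernel: one CubeHash round in the
 * generic 32-word array form (add, rotate 7, swap, xor, swap; add, rotate 11,
 * swap, xor, swap). To my understanding, ROUND_EVEN/ROUND_ODD above are two
 * such rounds back to back with every swap folded away by renaming the
 * variables, which is why the xg..xv half appears reordered in the odd round.
 * Kept only as documentation of what the unrolled macros compute.
 */
static void cubehash_round_ref(uint32_t x[32])
{
    uint32_t t;
    int i;

    for (i = 0; i < 16; i++) x[i + 16] += x[i];
    for (i = 0; i < 16; i++) x[i] = (x[i] << 7) | (x[i] >> 25);
    for (i = 0; i < 8; i++)  { t = x[i]; x[i] = x[i ^ 8]; x[i ^ 8] = t; }  // swap, stride 8
    for (i = 0; i < 16; i++) x[i] ^= x[i + 16];
    for (i = 16; i < 32; i++)
        if (!(i & 2)) { t = x[i]; x[i] = x[i ^ 2]; x[i ^ 2] = t; }         // swap, stride 2
    for (i = 0; i < 16; i++) x[i + 16] += x[i];
    for (i = 0; i < 16; i++) x[i] = (x[i] << 11) | (x[i] >> 21);
    for (i = 0; i < 16; i++)
        if (!(i & 4)) { t = x[i]; x[i] = x[i ^ 4]; x[i ^ 4] = t; }         // swap, stride 4
    for (i = 0; i < 16; i++) x[i] ^= x[i + 16];
    for (i = 16; i < 32; i++)
        if (!(i & 1)) { t = x[i]; x[i] = x[i ^ 1]; x[i ^ 1] = t; }         // swap, stride 1
}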
__global__
#if __CUDA_ARCH__ > 500
__launch_bounds__(256, 4)
#endif
void x11_luffaCubehash512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
{
    const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
    if (thread < threads)
    {
        uint32_t statechainv[40] = {
            0x8bb0a761, 0xc2e4aa8b, 0x2d539bc9, 0x381408f8,
            0x478f6633, 0x255a46ff, 0x581c37f7, 0x601c2e8e,
            0x266c5f9d, 0xc34715d8, 0x8900670e, 0x51a540be,
            0xe4ce69fb, 0x5089f4d4, 0x3cc0a506, 0x609bcb02,
            0xa4e3cd82, 0xd24fd6ca, 0xc0f196dc, 0xcf41eafe,
            0x0ff2e673, 0x303804f2, 0xa7b3cd48, 0x677addd4,
            0x66e66a8a, 0x2303208f, 0x486dafb4, 0xc0d37dc6,
            0x634d15af, 0xe5af6747, 0x10af7e38, 0xee7e6428,
            0x01262e5d, 0xc92c2e64, 0x82fee966, 0xcea738d3,
            0x867de2b0, 0xe0714818, 0xda6e831f, 0xa7062529
        };
        uint32_t statebuffer[8];
        uint32_t *const Hash = &g_hash[thread * 16U];

        // Luffa-512 over the incoming 64-byte hash, written back in place
        Update512(statebuffer, statechainv, Hash);
        finalization512(statebuffer, statechainv, Hash);

        // CubeHash-512 of the Luffa output (IV precomputed)
        uint32_t x0 = 0x2AEA2A61, x1 = 0x50F494D4, x2 = 0x2D538B8B, x3 = 0x4167D83E;
        uint32_t x4 = 0x3FEE2313, x5 = 0xC701CF8C, x6 = 0xCC39968E, x7 = 0x50AC5695;
        uint32_t x8 = 0x4D42C787, x9 = 0xA647A8B3, xa = 0x97CF0BEF, xb = 0x825B4537;
        uint32_t xc = 0xEEF864D2, xd = 0xF22090C4, xe = 0xD0E5CD33, xf = 0xA23911AE;
        uint32_t xg = 0xFCD398D9, xh = 0x148FE485, xi = 0x1B017BEF, xj = 0xB6444532;
        uint32_t xk = 0x6A536159, xl = 0x2FF5781C, xm = 0x91FA7934, xn = 0x0DBADEA9;
        uint32_t xo = 0xD65C8A2B, xp = 0xA5A70E75, xq = 0xB1C62456, xr = 0xBC796576;
        uint32_t xs = 0x1921C8F7, xt = 0xE7989AF1, xu = 0x7795D246, xv = 0xD43E3B44;

        // absorb first 32-byte block
        x0 ^= Hash[0]; x1 ^= Hash[1]; x2 ^= Hash[2]; x3 ^= Hash[3];
        x4 ^= Hash[4]; x5 ^= Hash[5]; x6 ^= Hash[6]; x7 ^= Hash[7];

        SIXTEEN_ROUNDS;

        // absorb second 32-byte block
        x0 ^= Hash[8];  x1 ^= Hash[9];  x2 ^= Hash[10]; x3 ^= Hash[11];
        x4 ^= Hash[12]; x5 ^= Hash[13]; x6 ^= Hash[14]; x7 ^= Hash[15];

        SIXTEEN_ROUNDS;
        x0 ^= 0x80;  // padding block
        SIXTEEN_ROUNDS;
        xv ^= 1;     // finalization flag

        for (int i = 3; i < 13; i++) {
            SIXTEEN_ROUNDS;
        }

        Hash[0] = x0;  Hash[1] = x1;  Hash[2] = x2;  Hash[3] = x3;
        Hash[4] = x4;  Hash[5] = x5;  Hash[6] = x6;  Hash[7] = x7;
        Hash[8] = x8;  Hash[9] = x9;  Hash[10] = xa; Hash[11] = xb;
        Hash[12] = xc; Hash[13] = xd; Hash[14] = xe; Hash[15] = xf;
    }
}

__host__
void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash, int order)
{
    const uint32_t threadsperblock = 256;

    dim3 grid((threads + threadsperblock - 1) / threadsperblock);
    dim3 block(threadsperblock);

    x11_luffaCubehash512_gpu_hash_64 <<<grid, block>>> (threads, d_hash);
    MyStreamSynchronize(NULL, order, thr_id);
}

// Setup
__host__
void x11_luffaCubehash512_cpu_init(int thr_id, uint32_t threads) {}
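/*
 * Hedged usage sketch, not part of the original file: one plausible way to
 * drive the launcher above, assuming the ccminer convention that d_hash holds
 * one 64-byte hash (16 x uint32_t) per thread, produced in place by the
 * previous stage of the X11 chain. The function name, the host buffer and the
 * order value of 0 are hypothetical; error checking is omitted for brevity.
 */
void example_luffaCubehash_pass(int thr_id, uint32_t threads, uint32_t *h_hash64)
{
    uint32_t *d_hash = NULL;
    const size_t bytes = (size_t)threads * 16U * sizeof(uint32_t); // 64 bytes per thread

    cudaMalloc(&d_hash, bytes);
    cudaMemcpy(d_hash, h_hash64, bytes, cudaMemcpyHostToDevice);

    x11_luffaCubehash512_cpu_init(thr_id, threads);               // no-op, kept for API symmetry
    x11_luffaCubehash512_cpu_hash_64(thr_id, threads, d_hash, 0); // order 0: assumed

    cudaMemcpy(h_hash64, d_hash, bytes, cudaMemcpyDeviceToHost);
    cudaFree(d_hash);
}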