From 42bcb91ca0b2ee853a4bad91a6fedffefaa32773 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Wed, 17 Jun 2015 01:50:30 +0200 Subject: [PATCH] x11: update sp luffa/cube to get closer x11 speeds.. i had to clean it... lot of unused defines... --- x11/cuda_x11_luffa512_Cubehash.cu | 1140 +++++++++++++++++------------ x11/x11.cu | 4 +- x13/x13.cu | 4 +- x15/x14.cu | 4 +- x15/x15.cu | 4 +- x17/x17.cu | 4 +- 6 files changed, 676 insertions(+), 484 deletions(-) diff --git a/x11/cuda_x11_luffa512_Cubehash.cu b/x11/cuda_x11_luffa512_Cubehash.cu index d9b9e80..2fc868c 100644 --- a/x11/cuda_x11_luffa512_Cubehash.cu +++ b/x11/cuda_x11_luffa512_Cubehash.cu @@ -1,21 +1,5 @@ /* - * luffa_for_32.c - * Version 2.0 (Sep 15th 2009) - * - * Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved. - * - * Hitachi, Ltd. is the owner of this software and hereby grant - * the U.S. Government and any interested party the right to use - * this software for the purposes of the SHA-3 evaluation process, - * notwithstanding that this software is copyrighted. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ * Merged LUFFA512 64 + CUBE512 64 */ #include "cuda_helper.h" @@ -27,87 +11,84 @@ typedef struct { uint32_t chainv[40]; /* Chaining values */ } hashState; -#define MULT2(a,j)\ - tmp = a[7+(8*j)];\ - a[7+(8*j)] = a[6+(8*j)];\ - a[6+(8*j)] = a[5+(8*j)];\ - a[5+(8*j)] = a[4+(8*j)];\ - a[4+(8*j)] = a[3+(8*j)] ^ tmp;\ - a[3+(8*j)] = a[2+(8*j)] ^ tmp;\ - a[2+(8*j)] = a[1+(8*j)];\ - a[1+(8*j)] = a[0+(8*j)] ^ tmp;\ - a[0+(8*j)] = tmp; - -#if __CUDA_ARCH__ < 350 -#define LROT(x,bits) ((x << bits) | (x >> (32 - bits))) -#else -#define LROT(x, bits) __funnelshift_l(x, x, bits) -#endif - -#define TWEAK(a0,a1,a2,a3,j)\ - a0 = LROT(a0,j);\ - a1 = LROT(a1,j);\ - a2 = LROT(a2,j);\ - a3 = LROT(a3,j); - -#define STEP(c0,c1)\ - SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp);\ - SUBCRUMB(chainv[5],chainv[6],chainv[7],chainv[4],tmp);\ - MIXWORD(chainv[0],chainv[4]);\ - MIXWORD(chainv[1],chainv[5]);\ - MIXWORD(chainv[2],chainv[6]);\ - MIXWORD(chainv[3],chainv[7]);\ - ADD_CONSTANT(chainv[0],chainv[4],c0,c1); - -#define SUBCRUMB(a0,a1,a2,a3,a4)\ - a4 = a0;\ - a0 |= a1;\ - a2 ^= a3;\ +#define MULT0(a) {\ + tmp = a[7]; \ + a[7] = a[6]; \ + a[6] = a[5]; \ + a[5] = a[4]; \ + a[4] = a[3] ^ tmp; \ + a[3] = a[2] ^ tmp; \ + a[2] = a[1]; \ + a[1] = a[0] ^ tmp; \ + a[0] = tmp; \ +} + +#define MULT2(a,j) { \ + tmp = a[(j<<3)+7]; \ + a[(j*8)+7] = a[(j*8)+6]; \ + a[(j*8)+6] = a[(j*8)+5]; \ + a[(j*8)+5] = a[(j*8)+4]; \ + a[(j*8)+4] = a[(j*8)+3] ^ tmp; \ + a[(j*8)+3] = a[(j*8)+2] ^ tmp; \ + a[(j*8)+2] = a[(j*8)+1]; \ + a[(j*8)+1] = a[(j*8)+0] ^ tmp; \ + a[j*8] = tmp; \ +} + +#define TWEAK(a0,a1,a2,a3,j) { \ + a0 = ROTL32(a0,j); \ + a1 = ROTL32(a1,j); \ + a2 = ROTL32(a2,j); \ + a3 = ROTL32(a3,j); \ +} + +#define STEP(c0,c1) { \ + SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp); \ + SUBCRUMB(chainv[5],chainv[6],chainv[7],chainv[4],tmp); \ + MIXWORD(chainv[0],chainv[4]); \ + MIXWORD(chainv[1],chainv[5]); \ + MIXWORD(chainv[2],chainv[6]); \ + MIXWORD(chainv[3],chainv[7]); \ + 
ADD_CONSTANT(chainv[0],chainv[4],c0,c1); \ +} + +#define SUBCRUMB(a0,a1,a2,a3,a4) { \ + a4 = a0; \ + a0 |= a1; \ + a2 ^= a3; \ a1 = ~a1;\ - a0 ^= a3;\ - a3 &= a4;\ - a1 ^= a3;\ - a3 ^= a2;\ - a2 &= a0;\ + a0 ^= a3; \ + a3 &= a4; \ + a1 ^= a3; \ + a3 ^= a2; \ + a2 &= a0; \ a0 = ~a0;\ - a2 ^= a1;\ - a1 |= a3;\ - a4 ^= a1;\ - a3 ^= a2;\ - a2 &= a1;\ - a1 ^= a0;\ - a0 = a4; - -#define MIXWORD(a0,a4)\ - a4 ^= a0;\ - a0 = LROT(a0,2);\ - a0 ^= a4;\ - a4 = LROT(a4,14);\ - a4 ^= a0;\ - a0 = LROT(a0,10);\ - a0 ^= a4;\ - a4 = LROT(a4,1); - -#define ADD_CONSTANT(a0,b0,c0,c1)\ - a0 ^= c0;\ - b0 ^= c1; - -/* initial values of chaining variables */ -__device__ static __constant__ uint32_t c_IV[40]; -static const uint32_t h_IV[40] = { - 0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465, - 0x6e292011,0x90152df4,0xee058139,0xdef610bb, - 0xc3b44b95,0xd9d2f256,0x70eee9a0,0xde099fa3, - 0x5d9b0557,0x8fc944b3,0xcf1ccf0e,0x746cd581, - 0xf7efc89d,0x5dba5781,0x04016ce5,0xad659c05, - 0x0306194f,0x666d1836,0x24aa230a,0x8b264ae7, - 0x858075d5,0x36d79cce,0xe571f7d7,0x204b1f67, - 0x35870c6a,0x57e9e923,0x14bcb808,0x7cde72ce, - 0x6c68e9be,0x5ec41e22,0xc825b7c7,0xaffb4363, - 0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea}; - -__device__ static __constant__ uint32_t c_CNS[80]; -static const uint32_t h_CNS[80] = { + a2 ^= a1; \ + a1 |= a3; \ + a4 ^= a1; \ + a3 ^= a2; \ + a2 &= a1; \ + a1 ^= a0; \ + a0 = a4; \ +} + +#define MIXWORD(a0,a4) { \ + a4 ^= a0; \ + a0 = ROTL32(a0,2); \ + a0 ^= a4; \ + a4 = ROTL32(a4,14); \ + a4 ^= a0; \ + a0 = ROTL32(a0,10); \ + a0 ^= a4; \ + a4 = ROTL32(a4,1); \ +} + +#define ADD_CONSTANT(a0,b0,c0,c1) { \ + a0 ^= c0; \ + b0 ^= c1; \ +} + +__device__ __constant__ uint32_t c_CNS[80] = { 0x303994a6,0xe0337818,0xc0e65299,0x441ba90d, 0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f, 0x1e00108f,0xe5a8bce6,0x7800423d,0x5274baf4, @@ -127,519 +108,730 @@ static const uint32_t h_CNS[80] = { 0xf0d2e9e3,0x5090d577,0xac11d7fa,0x2d1925ab, 0x1bcb66f2,0xb46496ac,0x6f2d9bc9,0xd1925ab0, 
0x78602649,0x29131ab6,0x8edae952,0x0fc053c3, - 0x3b6ba548,0x3f014f0c,0xedae9520,0xfc053c31}; + 0x3b6ba548,0x3f014f0c,0xedae9520,0xfc053c31 +}; +// Precalculated chaining values +__device__ __constant__ uint32_t c_IV[40] = { + 0x8bb0a761, 0xc2e4aa8b, 0x2d539bc9, 0x381408f8, + 0x478f6633, 0x255a46ff, 0x581c37f7, 0x601c2e8e, + 0x266c5f9d, 0xc34715d8, 0x8900670e, 0x51a540be, + 0xe4ce69fb, 0x5089f4d4, 0x3cc0a506, 0x609bcb02, + 0xa4e3cd82, 0xd24fd6ca, 0xc0f196dc, 0xcf41eafe, + 0x0ff2e673, 0x303804f2, 0xa7b3cd48, 0x677addd4, + 0x66e66a8a, 0x2303208f, 0x486dafb4, 0xc0d37dc6, + 0x634d15af, 0xe5af6747, 0x10af7e38, 0xee7e6428, + 0x01262e5d, 0xc92c2e64, 0x82fee966, 0xcea738d3, + 0x867de2b0, 0xe0714818, 0xda6e831f, 0xa7062529 +}; /***************************************************/ __device__ __forceinline__ -static void rnd512(hashState *state) +static void rnd512(uint32_t *statebuffer, uint32_t *statechainv) { - int i,j; uint32_t t[40]; uint32_t chainv[8]; uint32_t tmp; + int i,j; -#pragma unroll 8 + #pragma unroll for(i=0;i<8;i++) { - t[i]=0; -#pragma unroll 5 - for(j=0;j<5;j++) { - t[i] ^= state->chainv[i+8*j]; - } + t[i] = 0; + #pragma unroll 5 + for(j=0;j<5;j++) + t[i] ^= statechainv[i+8*j]; } - MULT2(t, 0); + MULT0(t); -#pragma unroll 5 + #pragma unroll for(j=0;j<5;j++) { -#pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+8*j] ^= t[i]; - } + #pragma unroll + for(i=0;i<8;i++) + statechainv[i+8*j] ^= t[i]; } -#pragma unroll 5 + #pragma unroll for(j=0;j<5;j++) { -#pragma unroll 8 - for(i=0;i<8;i++) { - t[i+8*j] = state->chainv[i+8*j]; - } + #pragma unroll + for(i=0;i<8;i++) + t[i+8*j] = statechainv[i+8*j]; } -#pragma unroll 5 - for(j=0;j<5;j++) { - MULT2(state->chainv, j); + MULT0(statechainv); + #pragma unroll 4 + for(j=1;j<5;j++) { + MULT2(statechainv, j); } -#pragma unroll 5 + #pragma unroll for(j=0;j<5;j++) { -#pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[8*j+i] ^= t[8*((j+1)%5)+i]; - } + #pragma unroll + for(i=0;i<8;i++) + statechainv[8*j+i] ^= 
t[8*((j+1)%5)+i]; } -#pragma unroll 5 + #pragma unroll for(j=0;j<5;j++) { -#pragma unroll 8 - for(i=0;i<8;i++) { - t[i+8*j] = state->chainv[i+8*j]; - } + #pragma unroll + for(i=0;i<8;i++) + t[i+8*j] = statechainv[i+8*j]; } -#pragma unroll 5 - for(j=0;j<5;j++) { - MULT2(state->chainv, j); + MULT0(statechainv); + #pragma unroll 4 + for(j=1;j<5;j++) { + MULT2(statechainv, j); } -#pragma unroll 5 + #pragma unroll for(j=0;j<5;j++) { -#pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[8*j+i] ^= t[8*((j+4)%5)+i]; - } + #pragma unroll + for(i=0;i<8;i++) + statechainv[8*j+i] ^= t[8*((j+4)%5)+i]; } -#pragma unroll 5 + #pragma unroll for(j=0;j<5;j++) { -#pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+8*j] ^= state->buffer[i]; - } - MULT2(state->buffer, 0); + #pragma unroll 8 + for(i=0;i<8;i++) + statechainv[i+8*j] ^= statebuffer[i]; + MULT0(statebuffer); } -#pragma unroll 8 + #pragma unroll for(i=0;i<8;i++) { - chainv[i] = state->chainv[i]; + chainv[i] = statechainv[i]; } -#pragma unroll 8 + #pragma unroll for(i=0;i<8;i++) { STEP(c_CNS[(2*i)],c_CNS[(2*i)+1]); } -#pragma unroll 8 + #pragma unroll for(i=0;i<8;i++) { - state->chainv[i] = chainv[i]; - chainv[i] = state->chainv[i+8]; + statechainv[i] = chainv[i]; + chainv[i] = statechainv[i+8]; } TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],1); -#pragma unroll 8 + #pragma unroll for(i=0;i<8;i++) { STEP(c_CNS[(2*i)+16],c_CNS[(2*i)+16+1]); } -#pragma unroll 8 + #pragma unroll for(i=0;i<8;i++) { - state->chainv[i+8] = chainv[i]; - chainv[i] = state->chainv[i+16]; + statechainv[i+8] = chainv[i]; + chainv[i] = statechainv[i+16]; } TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],2); -#pragma unroll 8 + #pragma unroll for(i=0;i<8;i++) { STEP(c_CNS[(2*i)+32],c_CNS[(2*i)+32+1]); } -#pragma unroll 8 + #pragma unroll for(i=0;i<8;i++) { - state->chainv[i+16] = chainv[i]; - chainv[i] = state->chainv[i+24]; + statechainv[i+16] = chainv[i]; + chainv[i] = statechainv[i+24]; } TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],3); -#pragma 
unroll 8 + #pragma unroll for(i=0;i<8;i++) { STEP(c_CNS[(2*i)+48],c_CNS[(2*i)+48+1]); } -#pragma unroll 8 + #pragma unroll for(i=0;i<8;i++) { - state->chainv[i+24] = chainv[i]; - chainv[i] = state->chainv[i+32]; + statechainv[i+24] = chainv[i]; + chainv[i] = statechainv[i+32]; } TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],4); -#pragma unroll 8 + #pragma unroll for(i=0;i<8;i++) { STEP(c_CNS[(2*i)+64],c_CNS[(2*i)+64+1]); } -#pragma unroll 8 + #pragma unroll for(i=0;i<8;i++) { - state->chainv[i+32] = chainv[i]; + statechainv[i+32] = chainv[i]; } } - __device__ __forceinline__ -static void Update512(hashState *state, const BitSequence *data) +static void rnd512_first(uint32_t state[40], uint32_t buffer[8]) { -#pragma unroll 8 - for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)data)[i]); - rnd512(state); + uint32_t chainv[8]; + uint32_t tmp; + int i, j; -#pragma unroll 8 - for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)(data+32))[i]); - rnd512(state); -} + for (j = 0; j<5; j++) { + state[8 * j] ^= buffer[0]; + #pragma unroll 7 + for (i = 1; i<8; i++) + state[i + 8 * j] ^= buffer[i]; + MULT0(buffer); + } -/***************************************************/ -__device__ __forceinline__ -static void finalization512(hashState *state, uint32_t *b) -{ - int i,j; + #pragma unroll + for (i = 0; i<8; i++) + chainv[i] = state[i]; - state->buffer[0] = 0x80000000; -#pragma unroll 7 - for(int i=1;i<8;i++) state->buffer[i] = 0; - rnd512(state); + #pragma unroll + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i)], c_CNS[(2 * i) + 1]); + } - /*---- blank round with m=0 ----*/ -#pragma unroll 8 - for(i=0;i<8;i++) state->buffer[i] =0; - rnd512(state); + #pragma unroll + for (i = 0; i<8; i++) { + state[i] = chainv[i]; + chainv[i] = state[i + 8]; + } -#pragma unroll 8 - for(i=0;i<8;i++) { - b[i] = 0; -#pragma unroll 5 - for(j=0;j<5;j++) { - b[i] ^= state->chainv[i+8*j]; - } - b[i] = cuda_swab32((b[i])); + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1); 
+ + #pragma unroll + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 16], c_CNS[(2 * i) + 16 + 1]); } -#pragma unroll 8 - for(i=0;i<8;i++) state->buffer[i]=0; - rnd512(state); + #pragma unroll + for (i = 0; i<8; i++) { + state[i + 8] = chainv[i]; + chainv[i] = state[i + 16]; + } -#pragma unroll 8 - for(i=0;i<8;i++) { - b[8+i] = 0; -#pragma unroll 5 - for(j=0;j<5;j++) { - b[8+i] ^= state->chainv[i+8*j]; - } - b[8 + i] = cuda_swab32((b[8 + i])); + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2); + + #pragma unroll + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 32], c_CNS[(2 * i) + 32 + 1]); } -} -//typedef unsigned char BitSequence; - -#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ -#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ - -#if __CUDA_ARCH__ < 350 -#define LROT(x,bits) ((x << bits) | (x >> (32 - bits))) -#else -#define LROT(x, bits) __funnelshift_l(x, x, bits) -#endif - -#define ROTATEUPWARDS7(a) LROT(a,7) -#define ROTATEUPWARDS11(a) LROT(a,11) - -#define SWAP(a,b) { uint32_t u = a; a = b; b = u; } - -__device__ __constant__ -static const uint32_t c_IV_512[32] = { - 0x2AEA2A61, 0x50F494D4, 0x2D538B8B, - 0x4167D83E, 0x3FEE2313, 0xC701CF8C, - 0xCC39968E, 0x50AC5695, 0x4D42C787, - 0xA647A8B3, 0x97CF0BEF, 0x825B4537, - 0xEEF864D2, 0xF22090C4, 0xD0E5CD33, - 0xA23911AE, 0xFCD398D9, 0x148FE485, - 0x1B017BEF, 0xB6444532, 0x6A536159, - 0x2FF5781C, 0x91FA7934, 0x0DBADEA9, - 0xD65C8A2B, 0xA5A70E75, 0xB1C62456, - 0xBC796576, 0x1921C8F7, 0xE7989AF1, - 0x7795D246, 0xD43E3B44 -}; + #pragma unroll + for (i = 0; i<8; i++) { + state[i + 16] = chainv[i]; + chainv[i] = state[i + 24]; + } -static __device__ __forceinline__ void rrounds(uint32_t x[2][2][2][2][2]) -{ - int r; - int j; - int k; - int l; - int m; - - //#pragma unroll 16 - for (r = 0; r < CUBEHASH_ROUNDS; ++r) { - - /* "add x_0jklm into x_1jklmn modulo 2^32" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 
2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[1][j][k][l][m] += x[0][j][k][l][m]; - - /* "rotate x_0jklm upwards by 7 bits" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]); - - /* "swap x_00klm with x_01klm" */ -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - SWAP(x[0][0][k][l][m], x[0][1][k][l][m]) - - /* "xor x_1jklm into x_0jklm" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[0][j][k][l][m] ^= x[1][j][k][l][m]; - - /* "swap x_1jk0m with x_1jk1m" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - SWAP(x[1][j][k][0][m], x[1][j][k][1][m]) - - /* "add x_0jklm into x_1jklm modulo 2^32" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[1][j][k][l][m] += x[0][j][k][l][m]; - - /* "rotate x_0jklm upwards by 11 bits" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]); - - /* "swap x_0j0lm with x_0j1lm" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - SWAP(x[0][j][0][l][m], x[0][j][1][l][m]) - - /* "xor x_1jklm into x_0jklm" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m 
= 0; m < 2; ++m) - x[0][j][k][l][m] ^= x[1][j][k][l][m]; - - /* "swap x_1jkl0 with x_1jkl1" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) - SWAP(x[1][j][k][l][0], x[1][j][k][l][1]) + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3); + #pragma unroll + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 48], c_CNS[(2 * i) + 48 + 1]); } -} + #pragma unroll + for (i = 0; i<8; i++) { + state[i + 24] = chainv[i]; + chainv[i] = state[i + 32]; + } -__device__ __forceinline__ -static void block_tox(uint32_t *in, uint32_t x[2][2][2][2][2]) -{ - int k; - int l; - int m; -// uint32_t *in = block; - -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[0][0][k][l][m] ^= *in++; + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4); + + #pragma unroll + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 64], c_CNS[(2 * i) + 64 + 1]); + } + + #pragma unroll + for (i = 0; i<8; i++) + state[i + 32] = chainv[i]; } +/***************************************************/ __device__ __forceinline__ -static void hash_fromx(uint32_t *out, uint32_t x[2][2][2][2][2]) +static void rnd512_nullhash(uint32_t *state) { - int j; - int k; - int l; - int m; -// uint32_t *out = hash; - -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - *out++ = x[0][j][k][l][m]; -} + uint32_t t[40]; + uint32_t chainv[8]; + uint32_t tmp; + int i, j; + + #pragma unroll + for (i = 0; i<8; i++) { + t[i] = state[i + 8 * 0]; + #pragma unroll 4 + for (j = 1; j<5; j++) + t[i] ^= state[i + 8 * j]; + } -void __device__ __forceinline__ Init(uint32_t x[2][2][2][2][2]) -{ - int i, j, k, l, m; -#if 0 - /* "the first three state words x_00000, x_00001, x_00010" */ - /* "are set to the integers h/8, b, r respectively." 
*/ - /* "the remaining state words are set to 0." */ -#pragma unroll 2 - for (i = 0; i < 2; ++i) -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[i][j][k][l][m] = 0; - x[0][0][0][0][0] = 512 / 8; - x[0][0][0][0][1] = CUBEHASH_BLOCKBYTES; - x[0][0][0][1][0] = CUBEHASH_ROUNDS; - - /* "the state is then transformed invertibly through 10r identical rounds */ - for (i = 0; i < 10; ++i) rrounds(x); -#else - const uint32_t *iv = c_IV_512; - -#pragma unroll 2 - for (i = 0; i < 2; ++i) -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[i][j][k][l][m] = *iv++; -#endif + MULT0(t); + + #pragma unroll + for (j = 0; j<5; j++) { + #pragma unroll + for (i = 0; i<8; i++) + state[i + 8 * j] ^= t[i]; + } + + #pragma unroll + for (j = 0; j<5; j++) { + #pragma unroll + for (i = 0; i<8; i++) + t[i + 8 * j] = state[i + 8 * j]; + } + + MULT0(state); + #pragma unroll 4 + for(j=1; j<5; j++) { + MULT2(state, j); + } + + #pragma unroll + for (j = 0; j<5; j++) { + #pragma unroll + for (i = 0; i<8; i++) + state[8 * j + i] ^= t[8 * ((j + 1) % 5) + i]; + } + + #pragma unroll + for (j = 0; j<5; j++) { + #pragma unroll 8 + for (i = 0; i<8; i++) + t[i + 8 * j] = state[i + 8 * j]; + } + + MULT0(state); + #pragma unroll 4 + for(j=1; j<5; j++) { + MULT2(state, j); + } + + #pragma unroll + for (j = 0; j<5; j++) { + #pragma unroll + for (i = 0; i<8; i++) + state[8 * j + i] ^= t[8 * ((j + 4) % 5) + i]; + } + + #pragma unroll + for (i = 0; i<8; i++) + chainv[i] = state[i]; + + #pragma unroll + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i)], c_CNS[(2 * i) + 1]); + } + + #pragma unroll + for (i = 0; i<8; i++) { + state[i] = chainv[i]; + chainv[i] = state[i + 8]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1); + + #pragma unroll + for (i = 
0; i<8; i++) { + STEP(c_CNS[(2 * i) + 16], c_CNS[(2 * i) + 16 + 1]); + } + + #pragma unroll + for (i = 0; i<8; i++) { + state[i + 8] = chainv[i]; + chainv[i] = state[i + 16]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2); + + #pragma unroll + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 32], c_CNS[(2 * i) + 32 + 1]); + } + + #pragma unroll + for (i = 0; i<8; i++) { + state[i + 16] = chainv[i]; + chainv[i] = state[i + 24]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3); + + #pragma unroll + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 48], c_CNS[(2 * i) + 48 + 1]); + } + + #pragma unroll + for (i = 0; i<8; i++) { + state[i + 24] = chainv[i]; + chainv[i] = state[i + 32]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4); + + #pragma unroll + for (i = 0; i<8; i++) { + STEP(c_CNS[(2 * i) + 64], c_CNS[(2 * i) + 64 + 1]); + } + + #pragma unroll + for (i = 0; i<8; i++) { + state[i + 32] = chainv[i]; + } } __device__ __forceinline__ -static void Update32(uint32_t x[2][2][2][2][2], const BitSequence *data) +static void Update512(uint32_t *statebuffer, uint32_t *statechainv, const uint32_t *data) { - /* "xor the block into the first b bytes of the state" */ - /* "and then transform the state invertibly through r identical rounds" */ - block_tox((uint32_t*)data, x); - rrounds(x); + #pragma unroll + for (int i = 0; i < 8; i++) statebuffer[i] = cuda_swab32(data[i]); + rnd512_first(statechainv, statebuffer); + + #pragma unroll + for (int i = 0; i < 8; i++) statebuffer[i] = cuda_swab32(data[i + 8]); + rnd512(statebuffer, statechainv); } +/***************************************************/ __device__ __forceinline__ -static void Final(uint32_t x[2][2][2][2][2], BitSequence *hashval) +static void finalization512(uint32_t *statebuffer, uint32_t *statechainv, uint32_t *b) { - int i; + int i,j; + + statebuffer[0] = 0x80000000; + #pragma unroll 7 + for(int i=1;i<8;i++) statebuffer[i] = 0; + rnd512(statebuffer, statechainv); + + /*---- blank 
round with m=0 ----*/ + rnd512_nullhash(statechainv); - /* "the integer 1 is xored into the last state word x_11111" */ - x[1][1][1][1][1] ^= 1; + #pragma unroll + for(i=0;i<8;i++) { + b[i] = statechainv[i]; + #pragma unroll 4 + for(j=1;j<5;j++) { + b[i] ^= statechainv[i+8*j]; + } + b[i] = cuda_swab32((b[i])); + } + + rnd512_nullhash(statechainv); + + #pragma unroll + for(i=0;i<8;i++) { + b[8 + i] = statechainv[i]; + #pragma unroll 4 + for(j=1;j<5;j++) { + b[8+i] ^= statechainv[i+8*j]; + } + b[8 + i] = cuda_swab32((b[8 + i])); + } +} - /* "the state is then transformed invertibly through 10r identical rounds" */ - #pragma unroll 10 - for (i = 0; i < 10; ++i) rrounds(x); +#define ROUND_EVEN { \ + xg = (x0 + xg); \ + x0 = ROTL32(x0, 7); \ + xh = (x1 + xh); \ + x1 = ROTL32(x1, 7); \ + xi = (x2 + xi); \ + x2 = ROTL32(x2, 7); \ + xj = (x3 + xj); \ + x3 = ROTL32(x3, 7); \ + xk = (x4 + xk); \ + x4 = ROTL32(x4, 7); \ + xl = (x5 + xl); \ + x5 = ROTL32(x5, 7); \ + xm = (x6 + xm); \ + x6 = ROTL32(x6, 7); \ + xn = (x7 + xn); \ + x7 = ROTL32(x7, 7); \ + xo = (x8 + xo); \ + x8 = ROTL32(x8, 7); \ + xp = (x9 + xp); \ + x9 = ROTL32(x9, 7); \ + xq = (xa + xq); \ + xa = ROTL32(xa, 7); \ + xr = (xb + xr); \ + xb = ROTL32(xb, 7); \ + xs = (xc + xs); \ + xc = ROTL32(xc, 7); \ + xt = (xd + xt); \ + xd = ROTL32(xd, 7); \ + xu = (xe + xu); \ + xe = ROTL32(xe, 7); \ + xv = (xf + xv); \ + xf = ROTL32(xf, 7); \ + x8 ^= xg; \ + x9 ^= xh; \ + xa ^= xi; \ + xb ^= xj; \ + xc ^= xk; \ + xd ^= xl; \ + xe ^= xm; \ + xf ^= xn; \ + x0 ^= xo; \ + x1 ^= xp; \ + x2 ^= xq; \ + x3 ^= xr; \ + x4 ^= xs; \ + x5 ^= xt; \ + x6 ^= xu; \ + x7 ^= xv; \ + xi = (x8 + xi); \ + x8 = ROTL32(x8, 11); \ + xj = (x9 + xj); \ + x9 = ROTL32(x9, 11); \ + xg = (xa + xg); \ + xa = ROTL32(xa, 11); \ + xh = (xb + xh); \ + xb = ROTL32(xb, 11); \ + xm = (xc + xm); \ + xc = ROTL32(xc, 11); \ + xn = (xd + xn); \ + xd = ROTL32(xd, 11); \ + xk = (xe + xk); \ + xe = ROTL32(xe, 11); \ + xl = (xf + xl); \ + xf = ROTL32(xf, 11); \ + xq = 
(x0 + xq); \ + x0 = ROTL32(x0, 11); \ + xr = (x1 + xr); \ + x1 = ROTL32(x1, 11); \ + xo = (x2 + xo); \ + x2 = ROTL32(x2, 11); \ + xp = (x3 + xp); \ + x3 = ROTL32(x3, 11); \ + xu = (x4 + xu); \ + x4 = ROTL32(x4, 11); \ + xv = (x5 + xv); \ + x5 = ROTL32(x5, 11); \ + xs = (x6 + xs); \ + x6 = ROTL32(x6, 11); \ + xt = (x7 + xt); \ + x7 = ROTL32(x7, 11); \ + xc ^= xi; \ + xd ^= xj; \ + xe ^= xg; \ + xf ^= xh; \ + x8 ^= xm; \ + x9 ^= xn; \ + xa ^= xk; \ + xb ^= xl; \ + x4 ^= xq; \ + x5 ^= xr; \ + x6 ^= xo; \ + x7 ^= xp; \ + x0 ^= xu; \ + x1 ^= xv; \ + x2 ^= xs; \ + x3 ^= xt; \ +} - /* "output the first h/8 bytes of the state" */ - hash_fromx((uint32_t*)hashval, x); +#define ROUND_ODD { \ + xj = (xc + xj); \ + xc = ROTL32(xc, 7); \ + xi = (xd + xi); \ + xd = ROTL32(xd, 7); \ + xh = (xe + xh); \ + xe = ROTL32(xe, 7); \ + xg = (xf + xg); \ + xf = ROTL32(xf, 7); \ + xn = (x8 + xn); \ + x8 = ROTL32(x8, 7); \ + xm = (x9 + xm); \ + x9 = ROTL32(x9, 7); \ + xl = (xa + xl); \ + xa = ROTL32(xa, 7); \ + xk = (xb + xk); \ + xb = ROTL32(xb, 7); \ + xr = (x4 + xr); \ + x4 = ROTL32(x4, 7); \ + xq = (x5 + xq); \ + x5 = ROTL32(x5, 7); \ + xp = (x6 + xp); \ + x6 = ROTL32(x6, 7); \ + xo = (x7 + xo); \ + x7 = ROTL32(x7, 7); \ + xv = (x0 + xv); \ + x0 = ROTL32(x0, 7); \ + xu = (x1 + xu); \ + x1 = ROTL32(x1, 7); \ + xt = (x2 + xt); \ + x2 = ROTL32(x2, 7); \ + xs = (x3 + xs); \ + x3 = ROTL32(x3, 7); \ + x4 ^= xj; \ + x5 ^= xi; \ + x6 ^= xh; \ + x7 ^= xg; \ + x0 ^= xn; \ + x1 ^= xm; \ + x2 ^= xl; \ + x3 ^= xk; \ + xc ^= xr; \ + xd ^= xq; \ + xe ^= xp; \ + xf ^= xo; \ + x8 ^= xv; \ + x9 ^= xu; \ + xa ^= xt; \ + xb ^= xs; \ + xh = (x4 + xh); \ + x4 = ROTL32(x4, 11); \ + xg = (x5 + xg); \ + x5 = ROTL32(x5, 11); \ + xj = (x6 + xj); \ + x6 = ROTL32(x6, 11); \ + xi = (x7 + xi); \ + x7 = ROTL32(x7, 11); \ + xl = (x0 + xl); \ + x0 = ROTL32(x0, 11); \ + xk = (x1 + xk); \ + x1 = ROTL32(x1, 11); \ + xn = (x2 + xn); \ + x2 = ROTL32(x2, 11); \ + xm = (x3 + xm); \ + x3 = ROTL32(x3, 11); \ + xp = (xc + xp); \ + 
xc = ROTL32(xc, 11); \ + xo = (xd + xo); \ + xd = ROTL32(xd, 11); \ + xr = (xe + xr); \ + xe = ROTL32(xe, 11); \ + xq = (xf + xq); \ + xf = ROTL32(xf, 11); \ + xt = (x8 + xt); \ + x8 = ROTL32(x8, 11); \ + xs = (x9 + xs); \ + x9 = ROTL32(x9, 11); \ + xv = (xa + xv); \ + xa = ROTL32(xa, 11); \ + xu = (xb + xu); \ + xb = ROTL32(xb, 11); \ + x0 ^= xh; \ + x1 ^= xg; \ + x2 ^= xj; \ + x3 ^= xi; \ + x4 ^= xl; \ + x5 ^= xk; \ + x6 ^= xn; \ + x7 ^= xm; \ + x8 ^= xp; \ + x9 ^= xo; \ + xa ^= xr; \ + xb ^= xq; \ + xc ^= xt; \ + xd ^= xs; \ + xe ^= xv; \ + xf ^= xu; \ } +#define SIXTEEN_ROUNDS \ + for (int j = 0; j < 8; j ++) { \ + ROUND_EVEN; \ + ROUND_ODD; \ + } -/***************************************************/ -// Hash Function __global__ -void x11_luffaCubehash512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +void x11_luffaCubehash512_gpu_hash_64(uint32_t threads, uint32_t *g_hash) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; - - hashState state; -#pragma unroll 40 - for(int i=0;i<40;i++) state.chainv[i] = c_IV[i]; -#pragma unroll 8 - for(int i=0;i<8;i++) state.buffer[i] = 0; - Update512(&state, (BitSequence*)Hash); - finalization512(&state, (uint32_t*)Hash); - //Cubehash + uint32_t statechainv[40] = { + 0x8bb0a761, 0xc2e4aa8b, 0x2d539bc9, 0x381408f8, + 0x478f6633, 0x255a46ff, 0x581c37f7, 0x601c2e8e, + 0x266c5f9d, 0xc34715d8, 0x8900670e, 0x51a540be, + 0xe4ce69fb, 0x5089f4d4, 0x3cc0a506, 0x609bcb02, + 0xa4e3cd82, 0xd24fd6ca, 0xc0f196dc, 0xcf41eafe, + 0x0ff2e673, 0x303804f2, 0xa7b3cd48, 0x677addd4, + 0x66e66a8a, 0x2303208f, 0x486dafb4, 0xc0d37dc6, + 0x634d15af, 0xe5af6747, 0x10af7e38, 0xee7e6428, + 0x01262e5d, 0xc92c2e64, 0x82fee966, 0xcea738d3, + 0x867de2b0, 0xe0714818, 0xda6e831f, 0xa7062529 + }; + + uint32_t statebuffer[8]; + uint32_t *const Hash = &g_hash[thread * 16U]; + + Update512(statebuffer, statechainv, Hash); + finalization512(statebuffer, statechainv, Hash); - uint32_t x[2][2][2][2][2]; - Init(x); - // erste Hälfte des Hashes (32 bytes) - Update32(x, (const BitSequence*)Hash); - // zweite Hälfte des Hashes (32 bytes) - Update32(x, (const BitSequence*)(Hash + 8)); - // Padding Block - uint32_t last[8]; - last[0] = 0x80; -#pragma unroll 7 - for (int i = 1; i < 8; i++) last[i] = 0; - Update32(x, (const BitSequence*)last); - Final(x, (BitSequence*)Hash); - } -} + //Cubehash + uint32_t x0 = 0x2AEA2A61, x1 = 0x50F494D4, x2 = 0x2D538B8B, x3 = 0x4167D83E; + uint32_t x4 = 0x3FEE2313, x5 = 0xC701CF8C, x6 = 0xCC39968E, x7 = 0x50AC5695; + uint32_t x8 = 0x4D42C787, x9 = 0xA647A8B3, xa = 0x97CF0BEF, xb = 0x825B4537; + uint32_t xc = 0xEEF864D2, xd = 0xF22090C4, xe = 0xD0E5CD33, xf = 0xA23911AE; + uint32_t xg = 0xFCD398D9, xh = 0x148FE485, xi = 0x1B017BEF, xj = 0xB6444532; + uint32_t xk = 0x6A536159, xl = 0x2FF5781C, xm = 0x91FA7934, xn = 
0x0DBADEA9; + uint32_t xo = 0xD65C8A2B, xp = 0xA5A70E75, xq = 0xB1C62456, xr = 0xBC796576; + uint32_t xs = 0x1921C8F7, xt = 0xE7989AF1, xu = 0x7795D246, xv = 0xD43E3B44; + + x0 ^= Hash[0]; + x1 ^= Hash[1]; + x2 ^= Hash[2]; + x3 ^= Hash[3]; + x4 ^= Hash[4]; + x5 ^= Hash[5]; + x6 ^= Hash[6]; + x7 ^= Hash[7]; + + SIXTEEN_ROUNDS; + + x0 ^= Hash[8]; + x1 ^= Hash[9]; + x2 ^= Hash[10]; + x3 ^= Hash[11]; + x4 ^= Hash[12]; + x5 ^= Hash[13]; + x6 ^= Hash[14]; + x7 ^= Hash[15]; + + SIXTEEN_ROUNDS; + x0 ^= 0x80; + + SIXTEEN_ROUNDS; + xv ^= 1; + + for (int i = 3; i < 13; i++) { + SIXTEEN_ROUNDS; + } -// Setup -__host__ -void x11_luffaCubehash512_cpu_init(int thr_id, uint32_t threads) -{ - cudaMemcpyToSymbol(c_IV, h_IV, sizeof(h_IV), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(c_CNS, h_CNS, sizeof(h_CNS), 0, cudaMemcpyHostToDevice); + Hash[0] = x0; + Hash[1] = x1; + Hash[2] = x2; + Hash[3] = x3; + Hash[4] = x4; + Hash[5] = x5; + Hash[6] = x6; + Hash[7] = x7; + Hash[8] = x8; + Hash[9] = x9; + Hash[10] = xa; + Hash[11] = xb; + Hash[12] = xc; + Hash[13] = xd; + Hash[14] = xe; + Hash[15] = xf; + } } __host__ -void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash, int order) { const uint32_t threadsperblock = 256; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x11_luffaCubehash512_gpu_hash_64 <<< grid, block >>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + x11_luffaCubehash512_gpu_hash_64 <<<grid, block>>> (threads, d_hash); MyStreamSynchronize(NULL, order, thr_id); } +// Setup +__host__ +void x11_luffaCubehash512_cpu_init(int thr_id, uint32_t threads) {} diff --git a/x11/x11.cu b/x11/x11.cu index cfef513..e24ec70 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -43,7 +43,7 @@ extern void quark_jh512_cpu_init(int thr_id, uint32_t threads); extern void 
quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_luffaCubehash512_cpu_init(int thr_id, uint32_t threads); -extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash, int order); extern void x11_shavite512_cpu_init(int thr_id, uint32_t threads); extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -201,7 +201,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, TRACE("jh512 :"); quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("keccak :"); - x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); TRACE("luffa+c:"); x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("shavite:"); diff --git a/x13/x13.cu b/x13/x13.cu index 40d107f..75e69aa 100644 --- a/x13/x13.cu +++ b/x13/x13.cu @@ -45,7 +45,7 @@ extern void quark_jh512_cpu_init(int thr_id, uint32_t threads); extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_luffaCubehash512_cpu_init(int thr_id, uint32_t threads); -extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash, int order); extern void x11_shavite512_cpu_init(int thr_id, uint32_t threads); extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t 
startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -200,7 +200,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata, quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); diff --git a/x15/x14.cu b/x15/x14.cu index 02a5b2d..5d1a054 100644 --- a/x15/x14.cu +++ b/x15/x14.cu @@ -49,7 +49,7 @@ extern void quark_jh512_cpu_init(int thr_id, uint32_t threads); extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_luffaCubehash512_cpu_init(int thr_id, uint32_t threads); -extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash, int order); extern void x11_shavite512_cpu_init(int thr_id, uint32_t threads); extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -208,7 +208,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata, quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); 
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); diff --git a/x15/x15.cu b/x15/x15.cu index d3a6c78..5fe394f 100644 --- a/x15/x15.cu +++ b/x15/x15.cu @@ -50,7 +50,7 @@ extern void quark_jh512_cpu_init(int thr_id, uint32_t threads); extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_luffaCubehash512_cpu_init(int thr_id, uint32_t threads); -extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash, int order); extern void x11_shavite512_cpu_init(int thr_id, uint32_t threads); extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -219,7 +219,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata, quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, 
d_hash[thr_id], order++); x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); diff --git a/x17/x17.cu b/x17/x17.cu index 6d7e071..12b3999 100644 --- a/x17/x17.cu +++ b/x17/x17.cu @@ -53,7 +53,7 @@ extern void quark_jh512_cpu_init(int thr_id, uint32_t threads); extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_luffaCubehash512_cpu_init(int thr_id, uint32_t threads); -extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x11_luffaCubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash, int order); extern void x11_shavite512_cpu_init(int thr_id, uint32_t threads); extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -243,7 +243,7 @@ extern "C" int scanhash_x17(int thr_id, uint32_t *pdata, quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);