Browse Source

cubeluffa: Fix indent and add some static prefixes

use git "show -w <commithash>" to see changes

Duplicated functions in merged Cube+Luffa could be cross linked without
2upstream
Tanguy Pruvot 10 years ago
parent
commit
94c9945fe6
  1. 14
      x11/cuda_x11_echo.cu
  2. 470
      x11/cuda_x11_luffa512_Cubehash.cu

14
x11/cuda_x11_echo.cu

@ -61,8 +61,8 @@ void cuda_echo_round(
k0 = 512 + 8; k0 = 512 + 8;
#pragma unroll #pragma unroll 4
for (int idx = 0; idx < 16; idx+= 4) for (int idx = 0; idx < 16; idx += 4)
{ {
AES_2ROUND(sharedMemory, AES_2ROUND(sharedMemory,
h[idx + 0], h[idx + 1], h[idx + 2], h[idx + 3], k0); h[idx + 0], h[idx + 1], h[idx + 2], h[idx + 3], k0);
@ -125,7 +125,7 @@ void cuda_echo_round(
c = P[24 + i + 4]; c = P[24 + i + 4];
d = P[24 + i + 8]; d = P[24 + i + 8];
ab = a ^ b; ab = a ^ b;
bc = b ^ c; bc = b ^ c;
cd = c ^ d; cd = c ^ d;
@ -144,7 +144,7 @@ void cuda_echo_round(
W[32 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c; W[32 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c;
a = P[36 + i ]; a = P[36 + i ];
b = P[36 + i +4 ]; b = P[36 + i + 4];
c = P[36 + i + 8]; c = P[36 + i + 8];
d = h[i + 12]; d = h[i + 12];
@ -221,7 +221,7 @@ void cuda_echo_round(
#pragma unroll 4 #pragma unroll 4
for (int i = 0; i < 4; i++) // Schleife über je 2*uint32_t for (int i = 0; i < 4; i++) // Schleife über je 2*uint32_t
{ {
#pragma unroll 64 #pragma unroll 4
for (int idx = 0; idx < 64; idx += 16) // Schleife über die elemnte for (int idx = 0; idx < 64; idx += 16) // Schleife über die elemnte
{ {
uint32_t a = W[idx + i]; uint32_t a = W[idx + i];
@ -251,7 +251,7 @@ void cuda_echo_round(
} }
#pragma unroll #pragma unroll
for (int i = 0; i<16; i += 4) for (int i = 0; i < 16; i += 4)
{ {
W[i] ^= W[32 + i] ^ 512; W[i] ^= W[32 + i] ^ 512;
W[i + 1] ^= W[32 + i + 1]; W[i + 1] ^= W[32 + i + 1];
@ -260,7 +260,7 @@ void cuda_echo_round(
} }
#pragma unroll #pragma unroll
for (int i = 0; i<16; i++) for (int i = 0; i < 16; i++)
hash[i] ^= W[i]; hash[i] ^= W[i];
} }

470
x11/cuda_x11_luffa512_Cubehash.cu

@ -23,20 +23,20 @@
typedef unsigned char BitSequence; typedef unsigned char BitSequence;
typedef struct { typedef struct {
uint32_t buffer[8]; /* Buffer to be hashed */ uint32_t buffer[8]; /* Buffer to be hashed */
uint32_t chainv[40]; /* Chaining values */ uint32_t chainv[40]; /* Chaining values */
} hashState; } hashState;
#define MULT2(a,j)\ #define MULT2(a,j)\
tmp = a[7+(8*j)];\ tmp = a[7+(8*j)];\
a[7+(8*j)] = a[6+(8*j)];\ a[7+(8*j)] = a[6+(8*j)];\
a[6+(8*j)] = a[5+(8*j)];\ a[6+(8*j)] = a[5+(8*j)];\
a[5+(8*j)] = a[4+(8*j)];\ a[5+(8*j)] = a[4+(8*j)];\
a[4+(8*j)] = a[3+(8*j)] ^ tmp;\ a[4+(8*j)] = a[3+(8*j)] ^ tmp;\
a[3+(8*j)] = a[2+(8*j)] ^ tmp;\ a[3+(8*j)] = a[2+(8*j)] ^ tmp;\
a[2+(8*j)] = a[1+(8*j)];\ a[2+(8*j)] = a[1+(8*j)];\
a[1+(8*j)] = a[0+(8*j)] ^ tmp;\ a[1+(8*j)] = a[0+(8*j)] ^ tmp;\
a[0+(8*j)] = tmp; a[0+(8*j)] = tmp;
#if __CUDA_ARCH__ < 350 #if __CUDA_ARCH__ < 350
#define LROT(x,bits) ((x << bits) | (x >> (32 - bits))) #define LROT(x,bits) ((x << bits) | (x >> (32 - bits)))
@ -45,294 +45,294 @@ typedef struct {
#endif #endif
#define TWEAK(a0,a1,a2,a3,j)\ #define TWEAK(a0,a1,a2,a3,j)\
a0 = LROT(a0,j);\ a0 = LROT(a0,j);\
a1 = LROT(a1,j);\ a1 = LROT(a1,j);\
a2 = LROT(a2,j);\ a2 = LROT(a2,j);\
a3 = LROT(a3,j); a3 = LROT(a3,j);
#define STEP(c0,c1)\ #define STEP(c0,c1)\
SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp);\ SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp);\
SUBCRUMB(chainv[5],chainv[6],chainv[7],chainv[4],tmp);\ SUBCRUMB(chainv[5],chainv[6],chainv[7],chainv[4],tmp);\
MIXWORD(chainv[0],chainv[4]);\ MIXWORD(chainv[0],chainv[4]);\
MIXWORD(chainv[1],chainv[5]);\ MIXWORD(chainv[1],chainv[5]);\
MIXWORD(chainv[2],chainv[6]);\ MIXWORD(chainv[2],chainv[6]);\
MIXWORD(chainv[3],chainv[7]);\ MIXWORD(chainv[3],chainv[7]);\
ADD_CONSTANT(chainv[0],chainv[4],c0,c1); ADD_CONSTANT(chainv[0],chainv[4],c0,c1);
#define SUBCRUMB(a0,a1,a2,a3,a4)\ #define SUBCRUMB(a0,a1,a2,a3,a4)\
a4 = a0;\ a4 = a0;\
a0 |= a1;\ a0 |= a1;\
a2 ^= a3;\ a2 ^= a3;\
a1 = ~a1;\ a1 = ~a1;\
a0 ^= a3;\ a0 ^= a3;\
a3 &= a4;\ a3 &= a4;\
a1 ^= a3;\ a1 ^= a3;\
a3 ^= a2;\ a3 ^= a2;\
a2 &= a0;\ a2 &= a0;\
a0 = ~a0;\ a0 = ~a0;\
a2 ^= a1;\ a2 ^= a1;\
a1 |= a3;\ a1 |= a3;\
a4 ^= a1;\ a4 ^= a1;\
a3 ^= a2;\ a3 ^= a2;\
a2 &= a1;\ a2 &= a1;\
a1 ^= a0;\ a1 ^= a0;\
a0 = a4; a0 = a4;
#define MIXWORD(a0,a4)\ #define MIXWORD(a0,a4)\
a4 ^= a0;\ a4 ^= a0;\
a0 = LROT(a0,2);\ a0 = LROT(a0,2);\
a0 ^= a4;\ a0 ^= a4;\
a4 = LROT(a4,14);\ a4 = LROT(a4,14);\
a4 ^= a0;\ a4 ^= a0;\
a0 = LROT(a0,10);\ a0 = LROT(a0,10);\
a0 ^= a4;\ a0 ^= a4;\
a4 = LROT(a4,1); a4 = LROT(a4,1);
#define ADD_CONSTANT(a0,b0,c0,c1)\ #define ADD_CONSTANT(a0,b0,c0,c1)\
a0 ^= c0;\ a0 ^= c0;\
b0 ^= c1; b0 ^= c1;
/* initial values of chaining variables */ /* initial values of chaining variables */
__device__ __constant__ uint32_t c_IV[40]; __device__ static __constant__ uint32_t c_IV[40];
const uint32_t h_IV[40] = { static const uint32_t h_IV[40] = {
0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465, 0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465,
0x6e292011,0x90152df4,0xee058139,0xdef610bb, 0x6e292011,0x90152df4,0xee058139,0xdef610bb,
0xc3b44b95,0xd9d2f256,0x70eee9a0,0xde099fa3, 0xc3b44b95,0xd9d2f256,0x70eee9a0,0xde099fa3,
0x5d9b0557,0x8fc944b3,0xcf1ccf0e,0x746cd581, 0x5d9b0557,0x8fc944b3,0xcf1ccf0e,0x746cd581,
0xf7efc89d,0x5dba5781,0x04016ce5,0xad659c05, 0xf7efc89d,0x5dba5781,0x04016ce5,0xad659c05,
0x0306194f,0x666d1836,0x24aa230a,0x8b264ae7, 0x0306194f,0x666d1836,0x24aa230a,0x8b264ae7,
0x858075d5,0x36d79cce,0xe571f7d7,0x204b1f67, 0x858075d5,0x36d79cce,0xe571f7d7,0x204b1f67,
0x35870c6a,0x57e9e923,0x14bcb808,0x7cde72ce, 0x35870c6a,0x57e9e923,0x14bcb808,0x7cde72ce,
0x6c68e9be,0x5ec41e22,0xc825b7c7,0xaffb4363, 0x6c68e9be,0x5ec41e22,0xc825b7c7,0xaffb4363,
0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea}; 0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea};
__device__ __constant__ uint32_t c_CNS[80]; __device__ static __constant__ uint32_t c_CNS[80];
const uint32_t h_CNS[80] = { static const uint32_t h_CNS[80] = {
0x303994a6,0xe0337818,0xc0e65299,0x441ba90d, 0x303994a6,0xe0337818,0xc0e65299,0x441ba90d,
0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f, 0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f,
0x1e00108f,0xe5a8bce6,0x7800423d,0x5274baf4, 0x1e00108f,0xe5a8bce6,0x7800423d,0x5274baf4,
0x8f5b7882,0x26889ba7,0x96e1db12,0x9a226e9d, 0x8f5b7882,0x26889ba7,0x96e1db12,0x9a226e9d,
0xb6de10ed,0x01685f3d,0x70f47aae,0x05a17cf4, 0xb6de10ed,0x01685f3d,0x70f47aae,0x05a17cf4,
0x0707a3d4,0xbd09caca,0x1c1e8f51,0xf4272b28, 0x0707a3d4,0xbd09caca,0x1c1e8f51,0xf4272b28,
0x707a3d45,0x144ae5cc,0xaeb28562,0xfaa7ae2b, 0x707a3d45,0x144ae5cc,0xaeb28562,0xfaa7ae2b,
0xbaca1589,0x2e48f1c1,0x40a46f3e,0xb923c704, 0xbaca1589,0x2e48f1c1,0x40a46f3e,0xb923c704,
0xfc20d9d2,0xe25e72c1,0x34552e25,0xe623bb72, 0xfc20d9d2,0xe25e72c1,0x34552e25,0xe623bb72,
0x7ad8818f,0x5c58a4a4,0x8438764a,0x1e38e2e7, 0x7ad8818f,0x5c58a4a4,0x8438764a,0x1e38e2e7,
0xbb6de032,0x78e38b9d,0xedb780c8,0x27586719, 0xbb6de032,0x78e38b9d,0xedb780c8,0x27586719,
0xd9847356,0x36eda57f,0xa2c78434,0x703aace7, 0xd9847356,0x36eda57f,0xa2c78434,0x703aace7,
0xb213afa5,0xe028c9bf,0xc84ebe95,0x44756f91, 0xb213afa5,0xe028c9bf,0xc84ebe95,0x44756f91,
0x4e608a22,0x7e8fce32,0x56d858fe,0x956548be, 0x4e608a22,0x7e8fce32,0x56d858fe,0x956548be,
0x343b138f,0xfe191be2,0xd0ec4e3d,0x3cb226e5, 0x343b138f,0xfe191be2,0xd0ec4e3d,0x3cb226e5,
0x2ceb4882,0x5944a28e,0xb3ad2208,0xa1c4c355, 0x2ceb4882,0x5944a28e,0xb3ad2208,0xa1c4c355,
0xf0d2e9e3,0x5090d577,0xac11d7fa,0x2d1925ab, 0xf0d2e9e3,0x5090d577,0xac11d7fa,0x2d1925ab,
0x1bcb66f2,0xb46496ac,0x6f2d9bc9,0xd1925ab0, 0x1bcb66f2,0xb46496ac,0x6f2d9bc9,0xd1925ab0,
0x78602649,0x29131ab6,0x8edae952,0x0fc053c3, 0x78602649,0x29131ab6,0x8edae952,0x0fc053c3,
0x3b6ba548,0x3f014f0c,0xedae9520,0xfc053c31}; 0x3b6ba548,0x3f014f0c,0xedae9520,0xfc053c31};
/***************************************************/ /***************************************************/
__device__ __forceinline__ __device__ __forceinline__
void rnd512(hashState *state) static void rnd512(hashState *state)
{ {
int i,j; int i,j;
uint32_t t[40]; uint32_t t[40];
uint32_t chainv[8]; uint32_t chainv[8];
uint32_t tmp; uint32_t tmp;
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) { for(i=0;i<8;i++) {
t[i]=0; t[i]=0;
#pragma unroll 5 #pragma unroll 5
for(j=0;j<5;j++) { for(j=0;j<5;j++) {
t[i] ^= state->chainv[i+8*j]; t[i] ^= state->chainv[i+8*j];
} }
} }
MULT2(t, 0); MULT2(t, 0);
#pragma unroll 5 #pragma unroll 5
for(j=0;j<5;j++) { for(j=0;j<5;j++) {
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) { for(i=0;i<8;i++) {
state->chainv[i+8*j] ^= t[i]; state->chainv[i+8*j] ^= t[i];
} }
} }
#pragma unroll 5 #pragma unroll 5
for(j=0;j<5;j++) { for(j=0;j<5;j++) {
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) { for(i=0;i<8;i++) {
t[i+8*j] = state->chainv[i+8*j]; t[i+8*j] = state->chainv[i+8*j];
} }
} }
#pragma unroll 5 #pragma unroll 5
for(j=0;j<5;j++) { for(j=0;j<5;j++) {
MULT2(state->chainv, j); MULT2(state->chainv, j);
} }
#pragma unroll 5 #pragma unroll 5
for(j=0;j<5;j++) { for(j=0;j<5;j++) {
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) { for(i=0;i<8;i++) {
state->chainv[8*j+i] ^= t[8*((j+1)%5)+i]; state->chainv[8*j+i] ^= t[8*((j+1)%5)+i];
} }
} }
#pragma unroll 5 #pragma unroll 5
for(j=0;j<5;j++) { for(j=0;j<5;j++) {
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) { for(i=0;i<8;i++) {
t[i+8*j] = state->chainv[i+8*j]; t[i+8*j] = state->chainv[i+8*j];
} }
} }
#pragma unroll 5 #pragma unroll 5
for(j=0;j<5;j++) { for(j=0;j<5;j++) {
MULT2(state->chainv, j); MULT2(state->chainv, j);
} }
#pragma unroll 5 #pragma unroll 5
for(j=0;j<5;j++) { for(j=0;j<5;j++) {
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) { for(i=0;i<8;i++) {
state->chainv[8*j+i] ^= t[8*((j+4)%5)+i]; state->chainv[8*j+i] ^= t[8*((j+4)%5)+i];
} }
} }
#pragma unroll 5 #pragma unroll 5
for(j=0;j<5;j++) { for(j=0;j<5;j++) {
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) { for(i=0;i<8;i++) {
state->chainv[i+8*j] ^= state->buffer[i]; state->chainv[i+8*j] ^= state->buffer[i];
} }
MULT2(state->buffer, 0); MULT2(state->buffer, 0);
} }
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) { for(i=0;i<8;i++) {
chainv[i] = state->chainv[i]; chainv[i] = state->chainv[i];
} }
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) { for(i=0;i<8;i++) {
STEP(c_CNS[(2*i)],c_CNS[(2*i)+1]); STEP(c_CNS[(2*i)],c_CNS[(2*i)+1]);
} }
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) { for(i=0;i<8;i++) {
state->chainv[i] = chainv[i]; state->chainv[i] = chainv[i];
chainv[i] = state->chainv[i+8]; chainv[i] = state->chainv[i+8];
} }
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],1); TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],1);
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) { for(i=0;i<8;i++) {
STEP(c_CNS[(2*i)+16],c_CNS[(2*i)+16+1]); STEP(c_CNS[(2*i)+16],c_CNS[(2*i)+16+1]);
} }
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) { for(i=0;i<8;i++) {
state->chainv[i+8] = chainv[i]; state->chainv[i+8] = chainv[i];
chainv[i] = state->chainv[i+16]; chainv[i] = state->chainv[i+16];
} }
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],2); TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],2);
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) { for(i=0;i<8;i++) {
STEP(c_CNS[(2*i)+32],c_CNS[(2*i)+32+1]); STEP(c_CNS[(2*i)+32],c_CNS[(2*i)+32+1]);
} }
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) { for(i=0;i<8;i++) {
state->chainv[i+16] = chainv[i]; state->chainv[i+16] = chainv[i];
chainv[i] = state->chainv[i+24]; chainv[i] = state->chainv[i+24];
} }
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],3); TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],3);
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) { for(i=0;i<8;i++) {
STEP(c_CNS[(2*i)+48],c_CNS[(2*i)+48+1]); STEP(c_CNS[(2*i)+48],c_CNS[(2*i)+48+1]);
} }
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) { for(i=0;i<8;i++) {
state->chainv[i+24] = chainv[i]; state->chainv[i+24] = chainv[i];
chainv[i] = state->chainv[i+32]; chainv[i] = state->chainv[i+32];
} }
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],4); TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],4);
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) { for(i=0;i<8;i++) {
STEP(c_CNS[(2*i)+64],c_CNS[(2*i)+64+1]); STEP(c_CNS[(2*i)+64],c_CNS[(2*i)+64+1]);
} }
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) { for(i=0;i<8;i++) {
state->chainv[i+32] = chainv[i]; state->chainv[i+32] = chainv[i];
} }
} }
__device__ __forceinline__ __device__ __forceinline__
void Update512(hashState *state, const BitSequence *data) static void Update512(hashState *state, const BitSequence *data)
{ {
#pragma unroll 8 #pragma unroll 8
for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)data)[i]); for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)data)[i]);
rnd512(state); rnd512(state);
#pragma unroll 8 #pragma unroll 8
for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)(data+32))[i]); for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)(data+32))[i]);
rnd512(state); rnd512(state);
} }
/***************************************************/ /***************************************************/
__device__ __forceinline__ __device__ __forceinline__
void finalization512(hashState *state, uint32_t *b) static void finalization512(hashState *state, uint32_t *b)
{ {
int i,j; int i,j;
state->buffer[0] = 0x80000000; state->buffer[0] = 0x80000000;
#pragma unroll 7 #pragma unroll 7
for(int i=1;i<8;i++) state->buffer[i] = 0; for(int i=1;i<8;i++) state->buffer[i] = 0;
rnd512(state); rnd512(state);
/*---- blank round with m=0 ----*/ /*---- blank round with m=0 ----*/
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) state->buffer[i] =0; for(i=0;i<8;i++) state->buffer[i] =0;
rnd512(state); rnd512(state);
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) { for(i=0;i<8;i++) {
b[i] = 0; b[i] = 0;
#pragma unroll 5 #pragma unroll 5
for(j=0;j<5;j++) { for(j=0;j<5;j++) {
b[i] ^= state->chainv[i+8*j]; b[i] ^= state->chainv[i+8*j];
} }
b[i] = cuda_swab32((b[i])); b[i] = cuda_swab32((b[i]));
} }
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) state->buffer[i]=0; for(i=0;i<8;i++) state->buffer[i]=0;
rnd512(state); rnd512(state);
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) { for(i=0;i<8;i++) {
b[8+i] = 0; b[8+i] = 0;
#pragma unroll 5 #pragma unroll 5
for(j=0;j<5;j++) { for(j=0;j<5;j++) {
b[8+i] ^= state->chainv[i+8*j]; b[8+i] ^= state->chainv[i+8*j];
} }
b[8 + i] = cuda_swab32((b[8 + i])); b[8 + i] = cuda_swab32((b[8 + i]));
} }
} }
typedef unsigned char BitSequence; //typedef unsigned char BitSequence;
#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ #define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */
#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ #define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */
@ -480,7 +480,8 @@ static __device__ __forceinline__ void rrounds(uint32_t x[2][2][2][2][2])
} }
static __device__ __forceinline__ void block_tox(uint32_t *in, uint32_t x[2][2][2][2][2]) __device__ __forceinline__
static void block_tox(uint32_t *in, uint32_t x[2][2][2][2][2])
{ {
int k; int k;
int l; int l;
@ -496,7 +497,8 @@ static __device__ __forceinline__ void block_tox(uint32_t *in, uint32_t x[2][2][
x[0][0][k][l][m] ^= *in++; x[0][0][k][l][m] ^= *in++;
} }
static __device__ __forceinline__ void hash_fromx(uint32_t *out, uint32_t x[2][2][2][2][2]) __device__ __forceinline__
static void hash_fromx(uint32_t *out, uint32_t x[2][2][2][2][2])
{ {
int j; int j;
int k; int k;
@ -556,7 +558,8 @@ void __device__ __forceinline__ Init(uint32_t x[2][2][2][2][2])
#endif #endif
} }
void __device__ __forceinline__ Update32(uint32_t x[2][2][2][2][2], const BitSequence *data) __device__ __forceinline__
static void Update32(uint32_t x[2][2][2][2][2], const BitSequence *data)
{ {
/* "xor the block into the first b bytes of the state" */ /* "xor the block into the first b bytes of the state" */
/* "and then transform the state invertibly through r identical rounds" */ /* "and then transform the state invertibly through r identical rounds" */
@ -564,7 +567,8 @@ void __device__ __forceinline__ Update32(uint32_t x[2][2][2][2][2], const BitSeq
rrounds(x); rrounds(x);
} }
void __device__ __forceinline__ Final(uint32_t x[2][2][2][2][2], BitSequence *hashval) __device__ __forceinline__
static void Final(uint32_t x[2][2][2][2][2], BitSequence *hashval)
{ {
int i; int i;
@ -581,25 +585,25 @@ void __device__ __forceinline__ Final(uint32_t x[2][2][2][2][2], BitSequence *ha
/***************************************************/ /***************************************************/
// Die Hash-Funktion // Hash Function
__global__ __global__
void x11_luffaCubehash512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) void x11_luffaCubehash512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce; int hashPosition = nounce - startNounce;
uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
hashState state; hashState state;
#pragma unroll 40 #pragma unroll 40
for(int i=0;i<40;i++) state.chainv[i] = c_IV[i]; for(int i=0;i<40;i++) state.chainv[i] = c_IV[i];
#pragma unroll 8 #pragma unroll 8
for(int i=0;i<8;i++) state.buffer[i] = 0; for(int i=0;i<8;i++) state.buffer[i] = 0;
Update512(&state, (BitSequence*)Hash); Update512(&state, (BitSequence*)Hash);
finalization512(&state, (uint32_t*)Hash); finalization512(&state, (uint32_t*)Hash);
//Cubehash //Cubehash
uint32_t x[2][2][2][2][2]; uint32_t x[2][2][2][2][2];
@ -619,25 +623,23 @@ void x11_luffaCubehash512_gpu_hash_64(int threads, uint32_t startNounce, uint64_
} }
// Setup-Funktionen // Setup
__host__ void x11_luffaCubehash512_cpu_init(int thr_id, int threads) __host__
void x11_luffaCubehash512_cpu_init(int thr_id, int threads)
{ {
cudaMemcpyToSymbol(c_IV, h_IV, sizeof(h_IV), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(c_IV, h_IV, sizeof(h_IV), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_CNS, h_CNS, sizeof(h_CNS), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(c_CNS, h_CNS, sizeof(h_CNS), 0, cudaMemcpyHostToDevice);
} }
__host__ void x11_luffaCubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) __host__
void x11_luffaCubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
const int threadsperblock = 256; const int threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs dim3 grid((threads + threadsperblock-1)/threadsperblock);
size_t shared_size = 0; dim3 block(threadsperblock);
x11_luffaCubehash512_gpu_hash_64 << <grid, block, shared_size >> >(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); x11_luffaCubehash512_gpu_hash_64 <<< grid, block >>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
} }

Loading…
Cancel
Save