From 94c9945fe678246af1509d26f3eff9517bb7f95b Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 22 Nov 2014 21:38:16 +0100 Subject: [PATCH] cubeluffa: Fix indent and add some static prefixes use git "show -w " to see changes Duplicated functions in merged Cube+Luffa could be cross linked without --- x11/cuda_x11_echo.cu | 14 +- x11/cuda_x11_luffa512_Cubehash.cu | 470 +++++++++++++++--------------- 2 files changed, 243 insertions(+), 241 deletions(-) diff --git a/x11/cuda_x11_echo.cu b/x11/cuda_x11_echo.cu index c82e4da..cd8a83b 100644 --- a/x11/cuda_x11_echo.cu +++ b/x11/cuda_x11_echo.cu @@ -61,8 +61,8 @@ void cuda_echo_round( k0 = 512 + 8; - #pragma unroll - for (int idx = 0; idx < 16; idx+= 4) + #pragma unroll 4 + for (int idx = 0; idx < 16; idx += 4) { AES_2ROUND(sharedMemory, h[idx + 0], h[idx + 1], h[idx + 2], h[idx + 3], k0); @@ -125,7 +125,7 @@ void cuda_echo_round( c = P[24 + i + 4]; d = P[24 + i + 8]; - ab = a ^ b; + ab = a ^ b; bc = b ^ c; cd = c ^ d; @@ -144,7 +144,7 @@ void cuda_echo_round( W[32 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c; a = P[36 + i ]; - b = P[36 + i +4 ]; + b = P[36 + i + 4]; c = P[36 + i + 8]; d = h[i + 12]; @@ -221,7 +221,7 @@ void cuda_echo_round( #pragma unroll 4 for (int i = 0; i < 4; i++) // Schleife über je 2*uint32_t { - #pragma unroll 64 + #pragma unroll 4 for (int idx = 0; idx < 64; idx += 16) // Schleife über die elemnte { uint32_t a = W[idx + i]; @@ -251,7 +251,7 @@ void cuda_echo_round( } #pragma unroll - for (int i = 0; i<16; i += 4) + for (int i = 0; i < 16; i += 4) { W[i] ^= W[32 + i] ^ 512; W[i + 1] ^= W[32 + i + 1]; @@ -260,7 +260,7 @@ void cuda_echo_round( } #pragma unroll - for (int i = 0; i<16; i++) + for (int i = 0; i < 16; i++) hash[i] ^= W[i]; } diff --git a/x11/cuda_x11_luffa512_Cubehash.cu b/x11/cuda_x11_luffa512_Cubehash.cu index 9756cb7..09e6767 100644 --- a/x11/cuda_x11_luffa512_Cubehash.cu +++ b/x11/cuda_x11_luffa512_Cubehash.cu @@ -23,20 +23,20 @@ typedef unsigned char BitSequence; typedef struct { - uint32_t buffer[8]; /* Buffer to be hashed */ - uint32_t chainv[40]; /* Chaining values */ + uint32_t buffer[8]; /* Buffer to be hashed */ + uint32_t chainv[40]; /* Chaining values */ } hashState; #define MULT2(a,j)\ - tmp = a[7+(8*j)];\ - a[7+(8*j)] = a[6+(8*j)];\ - a[6+(8*j)] = a[5+(8*j)];\ - a[5+(8*j)] = a[4+(8*j)];\ - a[4+(8*j)] = a[3+(8*j)] ^ tmp;\ - a[3+(8*j)] = a[2+(8*j)] ^ tmp;\ - a[2+(8*j)] = a[1+(8*j)];\ - a[1+(8*j)] = a[0+(8*j)] ^ tmp;\ - a[0+(8*j)] = tmp; + tmp = a[7+(8*j)];\ + a[7+(8*j)] = a[6+(8*j)];\ + a[6+(8*j)] = a[5+(8*j)];\ + a[5+(8*j)] = a[4+(8*j)];\ + a[4+(8*j)] = a[3+(8*j)] ^ tmp;\ + a[3+(8*j)] = a[2+(8*j)] ^ tmp;\ + a[2+(8*j)] = a[1+(8*j)];\ + a[1+(8*j)] = a[0+(8*j)] ^ tmp;\ + a[0+(8*j)] = tmp; #if __CUDA_ARCH__ < 350 #define LROT(x,bits) ((x << bits) | (x >> (32 - bits))) @@ -45,294 +45,294 @@ typedef struct { #endif #define TWEAK(a0,a1,a2,a3,j)\ - a0 = LROT(a0,j);\ - a1 = LROT(a1,j);\ - a2 = LROT(a2,j);\ - a3 = LROT(a3,j); + a0 = LROT(a0,j);\ + a1 = LROT(a1,j);\ + a2 = LROT(a2,j);\ + a3 = LROT(a3,j); #define STEP(c0,c1)\ - SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp);\ - SUBCRUMB(chainv[5],chainv[6],chainv[7],chainv[4],tmp);\ - MIXWORD(chainv[0],chainv[4]);\ - MIXWORD(chainv[1],chainv[5]);\ - MIXWORD(chainv[2],chainv[6]);\ - MIXWORD(chainv[3],chainv[7]);\ - ADD_CONSTANT(chainv[0],chainv[4],c0,c1); + SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp);\ + SUBCRUMB(chainv[5],chainv[6],chainv[7],chainv[4],tmp);\ + MIXWORD(chainv[0],chainv[4]);\ + MIXWORD(chainv[1],chainv[5]);\ + MIXWORD(chainv[2],chainv[6]);\ + MIXWORD(chainv[3],chainv[7]);\ + ADD_CONSTANT(chainv[0],chainv[4],c0,c1); #define SUBCRUMB(a0,a1,a2,a3,a4)\ - a4 = a0;\ - a0 |= a1;\ - a2 ^= a3;\ - a1 = ~a1;\ - a0 ^= a3;\ - a3 &= a4;\ - a1 ^= a3;\ - a3 ^= a2;\ - a2 &= a0;\ - a0 = ~a0;\ - a2 ^= a1;\ - a1 |= a3;\ - a4 ^= a1;\ - a3 ^= a2;\ - a2 &= a1;\ - a1 ^= a0;\ - a0 = a4; + a4 = a0;\ + a0 |= a1;\ + a2 ^= a3;\ + a1 = ~a1;\ + a0 ^= a3;\ + a3 &= a4;\ + a1 ^= a3;\ + a3 ^= a2;\ + a2 &= a0;\ + a0 = ~a0;\ + a2 ^= a1;\ + a1 |= a3;\ + a4 ^= a1;\ + a3 ^= a2;\ + a2 &= a1;\ + a1 ^= a0;\ + a0 = a4; #define MIXWORD(a0,a4)\ - a4 ^= a0;\ - a0 = LROT(a0,2);\ - a0 ^= a4;\ - a4 = LROT(a4,14);\ - a4 ^= a0;\ - a0 = LROT(a0,10);\ - a0 ^= a4;\ - a4 = LROT(a4,1); + a4 ^= a0;\ + a0 = LROT(a0,2);\ + a0 ^= a4;\ + a4 = LROT(a4,14);\ + a4 ^= a0;\ + a0 = LROT(a0,10);\ + a0 ^= a4;\ + a4 = LROT(a4,1); #define ADD_CONSTANT(a0,b0,c0,c1)\ - a0 ^= c0;\ - b0 ^= c1; + a0 ^= c0;\ + b0 ^= c1; /* initial values of chaining variables */ -__device__ __constant__ uint32_t c_IV[40]; -const uint32_t h_IV[40] = { - 0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465, - 0x6e292011,0x90152df4,0xee058139,0xdef610bb, - 0xc3b44b95,0xd9d2f256,0x70eee9a0,0xde099fa3, - 0x5d9b0557,0x8fc944b3,0xcf1ccf0e,0x746cd581, - 0xf7efc89d,0x5dba5781,0x04016ce5,0xad659c05, - 0x0306194f,0x666d1836,0x24aa230a,0x8b264ae7, - 0x858075d5,0x36d79cce,0xe571f7d7,0x204b1f67, - 0x35870c6a,0x57e9e923,0x14bcb808,0x7cde72ce, - 0x6c68e9be,0x5ec41e22,0xc825b7c7,0xaffb4363, - 0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea}; - -__device__ __constant__ uint32_t c_CNS[80]; -const uint32_t h_CNS[80] = { - 0x303994a6,0xe0337818,0xc0e65299,0x441ba90d, - 0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f, - 0x1e00108f,0xe5a8bce6,0x7800423d,0x5274baf4, - 0x8f5b7882,0x26889ba7,0x96e1db12,0x9a226e9d, - 0xb6de10ed,0x01685f3d,0x70f47aae,0x05a17cf4, - 0x0707a3d4,0xbd09caca,0x1c1e8f51,0xf4272b28, - 0x707a3d45,0x144ae5cc,0xaeb28562,0xfaa7ae2b, - 0xbaca1589,0x2e48f1c1,0x40a46f3e,0xb923c704, - 0xfc20d9d2,0xe25e72c1,0x34552e25,0xe623bb72, - 0x7ad8818f,0x5c58a4a4,0x8438764a,0x1e38e2e7, - 0xbb6de032,0x78e38b9d,0xedb780c8,0x27586719, - 0xd9847356,0x36eda57f,0xa2c78434,0x703aace7, - 0xb213afa5,0xe028c9bf,0xc84ebe95,0x44756f91, - 0x4e608a22,0x7e8fce32,0x56d858fe,0x956548be, - 0x343b138f,0xfe191be2,0xd0ec4e3d,0x3cb226e5, - 0x2ceb4882,0x5944a28e,0xb3ad2208,0xa1c4c355, - 0xf0d2e9e3,0x5090d577,0xac11d7fa,0x2d1925ab, - 0x1bcb66f2,0xb46496ac,0x6f2d9bc9,0xd1925ab0, - 0x78602649,0x29131ab6,0x8edae952,0x0fc053c3, - 0x3b6ba548,0x3f014f0c,0xedae9520,0xfc053c31}; +__device__ static __constant__ uint32_t c_IV[40]; +static const uint32_t h_IV[40] = { + 0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465, + 0x6e292011,0x90152df4,0xee058139,0xdef610bb, + 0xc3b44b95,0xd9d2f256,0x70eee9a0,0xde099fa3, + 0x5d9b0557,0x8fc944b3,0xcf1ccf0e,0x746cd581, + 0xf7efc89d,0x5dba5781,0x04016ce5,0xad659c05, + 0x0306194f,0x666d1836,0x24aa230a,0x8b264ae7, + 0x858075d5,0x36d79cce,0xe571f7d7,0x204b1f67, + 0x35870c6a,0x57e9e923,0x14bcb808,0x7cde72ce, + 0x6c68e9be,0x5ec41e22,0xc825b7c7,0xaffb4363, + 0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea}; + +__device__ static __constant__ uint32_t c_CNS[80]; +static const uint32_t h_CNS[80] = { + 0x303994a6,0xe0337818,0xc0e65299,0x441ba90d, + 0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f, + 0x1e00108f,0xe5a8bce6,0x7800423d,0x5274baf4, + 0x8f5b7882,0x26889ba7,0x96e1db12,0x9a226e9d, + 0xb6de10ed,0x01685f3d,0x70f47aae,0x05a17cf4, + 0x0707a3d4,0xbd09caca,0x1c1e8f51,0xf4272b28, + 0x707a3d45,0x144ae5cc,0xaeb28562,0xfaa7ae2b, + 0xbaca1589,0x2e48f1c1,0x40a46f3e,0xb923c704, + 0xfc20d9d2,0xe25e72c1,0x34552e25,0xe623bb72, + 0x7ad8818f,0x5c58a4a4,0x8438764a,0x1e38e2e7, + 0xbb6de032,0x78e38b9d,0xedb780c8,0x27586719, + 0xd9847356,0x36eda57f,0xa2c78434,0x703aace7, + 0xb213afa5,0xe028c9bf,0xc84ebe95,0x44756f91, + 0x4e608a22,0x7e8fce32,0x56d858fe,0x956548be, + 0x343b138f,0xfe191be2,0xd0ec4e3d,0x3cb226e5, + 0x2ceb4882,0x5944a28e,0xb3ad2208,0xa1c4c355, + 0xf0d2e9e3,0x5090d577,0xac11d7fa,0x2d1925ab, + 0x1bcb66f2,0xb46496ac,0x6f2d9bc9,0xd1925ab0, + 0x78602649,0x29131ab6,0x8edae952,0x0fc053c3, + 0x3b6ba548,0x3f014f0c,0xedae9520,0xfc053c31}; /***************************************************/ __device__ __forceinline__ -void rnd512(hashState *state) +static void rnd512(hashState *state) { - int i,j; - uint32_t t[40]; - uint32_t chainv[8]; - uint32_t tmp; + int i,j; + uint32_t t[40]; + uint32_t chainv[8]; + uint32_t tmp; #pragma unroll 8 - for(i=0;i<8;i++) { - t[i]=0; + for(i=0;i<8;i++) { + t[i]=0; #pragma unroll 5 - for(j=0;j<5;j++) { - t[i] ^= state->chainv[i+8*j]; - } - } + for(j=0;j<5;j++) { + t[i] ^= state->chainv[i+8*j]; + } + } - MULT2(t, 0); + MULT2(t, 0); #pragma unroll 5 - for(j=0;j<5;j++) { + for(j=0;j<5;j++) { #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+8*j] ^= t[i]; - } - } + for(i=0;i<8;i++) { + state->chainv[i+8*j] ^= t[i]; + } + } #pragma unroll 5 - for(j=0;j<5;j++) { + for(j=0;j<5;j++) { #pragma unroll 8 - for(i=0;i<8;i++) { - t[i+8*j] = state->chainv[i+8*j]; - } - } + for(i=0;i<8;i++) { + t[i+8*j] = state->chainv[i+8*j]; + } + } #pragma unroll 5 - for(j=0;j<5;j++) { - MULT2(state->chainv, j); - } + for(j=0;j<5;j++) { + MULT2(state->chainv, j); + } #pragma unroll 5 - for(j=0;j<5;j++) { + for(j=0;j<5;j++) { #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[8*j+i] ^= t[8*((j+1)%5)+i]; - } - } + for(i=0;i<8;i++) { + state->chainv[8*j+i] ^= t[8*((j+1)%5)+i]; + } + } #pragma unroll 5 - for(j=0;j<5;j++) { + for(j=0;j<5;j++) { #pragma unroll 8 - for(i=0;i<8;i++) { - t[i+8*j] = state->chainv[i+8*j]; - } - } + for(i=0;i<8;i++) { + t[i+8*j] = state->chainv[i+8*j]; + } + } #pragma unroll 5 - for(j=0;j<5;j++) { - MULT2(state->chainv, j); - } + for(j=0;j<5;j++) { + MULT2(state->chainv, j); + } #pragma unroll 5 - for(j=0;j<5;j++) { + for(j=0;j<5;j++) { #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[8*j+i] ^= t[8*((j+4)%5)+i]; - } - } + for(i=0;i<8;i++) { + state->chainv[8*j+i] ^= t[8*((j+4)%5)+i]; + } + } #pragma unroll 5 - for(j=0;j<5;j++) { + for(j=0;j<5;j++) { #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+8*j] ^= state->buffer[i]; - } - MULT2(state->buffer, 0); - } + for(i=0;i<8;i++) { + state->chainv[i+8*j] ^= state->buffer[i]; + } + MULT2(state->buffer, 0); + } #pragma unroll 8 - for(i=0;i<8;i++) { - chainv[i] = state->chainv[i]; - } + for(i=0;i<8;i++) { + chainv[i] = state->chainv[i]; + } #pragma unroll 8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)],c_CNS[(2*i)+1]); - } + for(i=0;i<8;i++) { + STEP(c_CNS[(2*i)],c_CNS[(2*i)+1]); + } #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i] = chainv[i]; - chainv[i] = state->chainv[i+8]; - } + for(i=0;i<8;i++) { + state->chainv[i] = chainv[i]; + chainv[i] = state->chainv[i+8]; + } - TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],1); + TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],1); #pragma unroll 8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)+16],c_CNS[(2*i)+16+1]); - } + for(i=0;i<8;i++) { + STEP(c_CNS[(2*i)+16],c_CNS[(2*i)+16+1]); + } #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+8] = chainv[i]; - chainv[i] = state->chainv[i+16]; - } + for(i=0;i<8;i++) { + state->chainv[i+8] = chainv[i]; + chainv[i] = state->chainv[i+16]; + } - TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],2); + TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],2); #pragma unroll 8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)+32],c_CNS[(2*i)+32+1]); - } + for(i=0;i<8;i++) { + STEP(c_CNS[(2*i)+32],c_CNS[(2*i)+32+1]); + } #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+16] = chainv[i]; - chainv[i] = state->chainv[i+24]; - } + for(i=0;i<8;i++) { + state->chainv[i+16] = chainv[i]; + chainv[i] = state->chainv[i+24]; + } - TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],3); + TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],3); #pragma unroll 8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)+48],c_CNS[(2*i)+48+1]); - } + for(i=0;i<8;i++) { + STEP(c_CNS[(2*i)+48],c_CNS[(2*i)+48+1]); + } #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+24] = chainv[i]; - chainv[i] = state->chainv[i+32]; - } + for(i=0;i<8;i++) { + state->chainv[i+24] = chainv[i]; + chainv[i] = state->chainv[i+32]; + } - TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],4); + TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],4); #pragma unroll 8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)+64],c_CNS[(2*i)+64+1]); - } + for(i=0;i<8;i++) { + STEP(c_CNS[(2*i)+64],c_CNS[(2*i)+64+1]); + } #pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+32] = chainv[i]; - } + for(i=0;i<8;i++) { + state->chainv[i+32] = chainv[i]; + } } __device__ __forceinline__ -void Update512(hashState *state, const BitSequence *data) +static void Update512(hashState *state, const BitSequence *data) { #pragma unroll 8 - for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)data)[i]); - rnd512(state); + for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)data)[i]); + rnd512(state); #pragma unroll 8 - for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)(data+32))[i]); - rnd512(state); + for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)(data+32))[i]); + rnd512(state); } /***************************************************/ __device__ __forceinline__ -void finalization512(hashState *state, uint32_t *b) +static void finalization512(hashState *state, uint32_t *b) { - int i,j; + int i,j; - state->buffer[0] = 0x80000000; + state->buffer[0] = 0x80000000; #pragma unroll 7 - for(int i=1;i<8;i++) state->buffer[i] = 0; - rnd512(state); + for(int i=1;i<8;i++) state->buffer[i] = 0; + rnd512(state); - /*---- blank round with m=0 ----*/ + /*---- blank round with m=0 ----*/ #pragma unroll 8 - for(i=0;i<8;i++) state->buffer[i] =0; - rnd512(state); + for(i=0;i<8;i++) state->buffer[i] =0; + rnd512(state); #pragma unroll 8 - for(i=0;i<8;i++) { - b[i] = 0; + for(i=0;i<8;i++) { + b[i] = 0; #pragma unroll 5 - for(j=0;j<5;j++) { - b[i] ^= state->chainv[i+8*j]; - } - b[i] = cuda_swab32((b[i])); - } + for(j=0;j<5;j++) { + b[i] ^= state->chainv[i+8*j]; + } + b[i] = cuda_swab32((b[i])); + } #pragma unroll 8 - for(i=0;i<8;i++) state->buffer[i]=0; - rnd512(state); + for(i=0;i<8;i++) state->buffer[i]=0; + rnd512(state); #pragma unroll 8 - for(i=0;i<8;i++) { - b[8+i] = 0; + for(i=0;i<8;i++) { + b[8+i] = 0; #pragma unroll 5 - for(j=0;j<5;j++) { - b[8+i] ^= state->chainv[i+8*j]; - } - b[8 + i] = cuda_swab32((b[8 + i])); - } + for(j=0;j<5;j++) { + b[8+i] ^= state->chainv[i+8*j]; + } + b[8 + i] = cuda_swab32((b[8 + i])); + } } -typedef unsigned char BitSequence; +//typedef unsigned char BitSequence; #define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ #define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ @@ -480,7 +480,8 @@ static __device__ __forceinline__ void rrounds(uint32_t x[2][2][2][2][2]) } -static __device__ __forceinline__ void block_tox(uint32_t *in, uint32_t x[2][2][2][2][2]) +__device__ __forceinline__ +static void block_tox(uint32_t *in, uint32_t x[2][2][2][2][2]) { int k; int l; @@ -496,7 +497,8 @@ static __device__ __forceinline__ void block_tox(uint32_t *in, uint32_t x[2][2][ x[0][0][k][l][m] ^= *in++; } -static __device__ __forceinline__ void hash_fromx(uint32_t *out, uint32_t x[2][2][2][2][2]) +__device__ __forceinline__ +static void hash_fromx(uint32_t *out, uint32_t x[2][2][2][2][2]) { int j; int k; @@ -556,7 +558,8 @@ void __device__ __forceinline__ Init(uint32_t x[2][2][2][2][2]) #endif } -void __device__ __forceinline__ Update32(uint32_t x[2][2][2][2][2], const BitSequence *data) +__device__ __forceinline__ +static void Update32(uint32_t x[2][2][2][2][2], const BitSequence *data) { /* "xor the block into the first b bytes of the state" */ /* "and then transform the state invertibly through r identical rounds" */ @@ -564,7 +567,8 @@ void __device__ __forceinline__ Update32(uint32_t x[2][2][2][2][2], const BitSeq rrounds(x); } -void __device__ __forceinline__ Final(uint32_t x[2][2][2][2][2], BitSequence *hashval) +__device__ __forceinline__ +static void Final(uint32_t x[2][2][2][2][2], BitSequence *hashval) { int i; @@ -581,25 +585,25 @@ void __device__ __forceinline__ Final(uint32_t x[2][2][2][2][2], BitSequence *ha /***************************************************/ -// Die Hash-Funktion +// Hash Function __global__ void x11_luffaCubehash512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) { - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; + int hashPosition = nounce - startNounce; + uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition]; - hashState state; + hashState state; #pragma unroll 40 - for(int i=0;i<40;i++) state.chainv[i] = c_IV[i]; + for(int i=0;i<40;i++) state.chainv[i] = c_IV[i]; #pragma unroll 8 - for(int i=0;i<8;i++) state.buffer[i] = 0; - Update512(&state, (BitSequence*)Hash); - finalization512(&state, (uint32_t*)Hash); + for(int i=0;i<8;i++) state.buffer[i] = 0; + Update512(&state, (BitSequence*)Hash); + finalization512(&state, (uint32_t*)Hash); //Cubehash uint32_t x[2][2][2][2][2]; @@ -619,25 +623,23 @@ void x11_luffaCubehash512_gpu_hash_64(int threads, uint32_t startNounce, uint64_ } -// Setup-Funktionen -__host__ void x11_luffaCubehash512_cpu_init(int thr_id, int threads) +// Setup +__host__ +void x11_luffaCubehash512_cpu_init(int thr_id, int threads) { - cudaMemcpyToSymbol(c_IV, h_IV, sizeof(h_IV), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(c_CNS, h_CNS, sizeof(h_CNS), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_IV, h_IV, sizeof(h_IV), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_CNS, h_CNS, sizeof(h_CNS), 0, cudaMemcpyHostToDevice); } -__host__ void x11_luffaCubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ +void x11_luffaCubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { - const int threadsperblock = 256; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); + const int threadsperblock = 256; - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); - x11_luffaCubehash512_gpu_hash_64 << > >(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); - MyStreamSynchronize(NULL, order, thr_id); + x11_luffaCubehash512_gpu_hash_64 <<< grid, block >>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + MyStreamSynchronize(NULL, order, thr_id); }