From 3e419abf84e24935d049b781b55072d2f4f548be Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Wed, 10 Jun 2015 18:59:22 +0200 Subject: [PATCH] qubit: implement cpu precalc (klaust) improve qubit (+5%) deep and doom (+10%) hashrate based on klausT code, simplified... --- qubit/deep.cu | 29 +- qubit/doom.cu | 40 +-- qubit/qubit_luffa512.cu | 571 ++++++++++++++++++++-------------------- 3 files changed, 318 insertions(+), 322 deletions(-) diff --git a/qubit/deep.cu b/qubit/deep.cu index ce6720d..4041f40 100644 --- a/qubit/deep.cu +++ b/qubit/deep.cu @@ -19,8 +19,6 @@ static uint32_t *d_hash[MAX_GPUS]; extern void qubit_luffa512_cpu_init(int thr_id, uint32_t threads); extern void qubit_luffa512_cpu_setBlock_80(void *pdata); extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); -extern void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget); -extern uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); extern void x11_cubehash512_cpu_init(int thr_id, uint32_t threads); extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -30,13 +28,13 @@ extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t start extern "C" void deephash(void *state, const void *input) { - // luffa1-cubehash2-shavite3-simd4-echo5 + uint8_t _ALIGN(64) hash[64]; + + // luffa-80 cubehash-64 echo-64 sph_luffa512_context ctx_luffa; sph_cubehash512_context ctx_cubehash; sph_echo512_context ctx_echo; - uint8_t hash[64]; - sph_luffa512_init(&ctx_luffa); sph_luffa512 (&ctx_luffa, input, 80); sph_luffa512_close(&ctx_luffa, (void*) hash); @@ -54,14 +52,13 @@ extern "C" void deephash(void *state, const void *input) static bool init[MAX_GPUS] = { 0 }; -extern "C" int scanhash_deep(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern "C" int scanhash_deep(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) { + uint32_t _ALIGN(64) endiandata[20]; const uint32_t first_nonce = pdata[19]; - uint32_t endiandata[20]; uint32_t throughput = device_intensity(thr_id, __func__, 1U << 19); // 256*256*8 - throughput = min(throughput, (max_nonce - first_nonce)); + throughput = min(throughput, (max_nonce - first_nonce)); if (opt_benchmark) ((uint32_t*)ptarget)[7] = 0x0000f; @@ -69,7 +66,8 @@ extern "C" int scanhash_deep(int thr_id, uint32_t *pdata, if (!init[thr_id]) { cudaSetDevice(device_map[thr_id]); - cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], throughput * 64)); qubit_luffa512_cpu_init(thr_id, throughput); x11_cubehash512_cpu_init(thr_id, throughput); @@ -80,10 +78,10 @@ extern "C" int scanhash_deep(int thr_id, uint32_t *pdata, init[thr_id] = true; } - for (int k=0; k < 20; k++) + for (int k=0; k < 19; k++) be32enc(&endiandata[k], pdata[k]); - qubit_luffa512_cpufinal_setBlock_80((void*)endiandata,ptarget); + qubit_luffa512_cpu_setBlock_80((void*)endiandata); cuda_check_cpu_setTarget(ptarget); do { @@ -96,12 +94,11 @@ extern "C" int scanhash_deep(int thr_id, uint32_t *pdata, uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); if (foundNonce != UINT32_MAX) { - const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; + uint32_t _ALIGN(64) vhash64[8]; be32enc(&endiandata[19], foundNonce); deephash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { + if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) { int res = 1; uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); *hashes_done = pdata[19] - first_nonce + throughput; diff --git a/qubit/doom.cu b/qubit/doom.cu index 1a8aa4e..73c8622 100644 --- a/qubit/doom.cu +++ b/qubit/doom.cu @@ -15,15 +15,12 @@ static uint32_t *d_hash[MAX_GPUS]; extern void qubit_luffa512_cpu_init(int thr_id, uint32_t threads); extern void qubit_luffa512_cpu_setBlock_80(void *pdata); extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); -extern void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget); -extern uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); -extern void doomhash(void *state, const void *input) +extern "C" void doomhash(void *state, const void *input) { - // luffa512 - sph_luffa512_context ctx_luffa; + uint8_t _ALIGN(64) hash[64]; - uint8_t hash[64]; + sph_luffa512_context ctx_luffa; sph_luffa512_init(&ctx_luffa); sph_luffa512 (&ctx_luffa, input, 80); @@ -34,12 +31,11 @@ extern void doomhash(void *state, const void *input) static bool init[MAX_GPUS] = { 0 }; -extern "C" int scanhash_doom(int thr_id, uint32_t *pdata, - const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done) +extern "C" int scanhash_doom(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) { + uint32_t _ALIGN(64) endiandata[20]; const uint32_t first_nonce = pdata[19]; - uint32_t endiandata[20]; uint32_t throughput = device_intensity(thr_id, __func__, 1U << 22); // 256*256*8*8 throughput = min(throughput, max_nonce - first_nonce); @@ -50,31 +46,35 @@ extern "C" int scanhash_doom(int thr_id, uint32_t *pdata, { cudaSetDevice(device_map[thr_id]); - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput)); + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], throughput * 64)); - qubit_luffa512_cpu_init(thr_id, (int) throughput); + qubit_luffa512_cpu_init(thr_id, throughput); + cuda_check_cpu_init(thr_id, throughput); init[thr_id] = true; } - for (int k=0; k < 20; k++) + for (int k=0; k < 19; k++) be32enc(&endiandata[k], pdata[k]); - qubit_luffa512_cpufinal_setBlock_80((void*)endiandata,ptarget); + qubit_luffa512_cpu_setBlock_80((void*)endiandata); + cuda_check_cpu_setTarget(ptarget); do { int order = 0; + *hashes_done = pdata[19] - first_nonce + throughput; + + qubit_luffa512_cpu_hash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], order++); - uint32_t foundNonce = qubit_luffa512_cpu_finalhash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], order++); + uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); if (foundNonce != UINT32_MAX) { - const uint32_t Htarg = ptarget[7]; - uint32_t vhash64[8]; + uint32_t _ALIGN(64) vhash64[8]; be32enc(&endiandata[19], foundNonce); doomhash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { - *hashes_done = min(max_nonce - first_nonce, (uint64_t) pdata[19] - first_nonce + throughput); + if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) { + //*hashes_done = min(max_nonce - first_nonce, (uint64_t) pdata[19] - first_nonce + throughput); pdata[19] = foundNonce; return 1; } else { @@ -82,7 +82,7 @@ extern "C" int scanhash_doom(int thr_id, uint32_t *pdata, } } - if ((uint64_t) pdata[19] + throughput > max_nonce) { + if ((uint64_t) throughput + pdata[19] > max_nonce) { // pdata[19] = max_nonce; break; } diff --git a/qubit/qubit_luffa512.cu b/qubit/qubit_luffa512.cu index ec40564..d4fea4c 100644 --- a/qubit/qubit_luffa512.cu +++ b/qubit/qubit_luffa512.cu @@ -1,49 +1,18 @@ -/* - * luffa_for_32.c - * Version 2.0 (Sep 15th 2009) - * - * Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved. - * - * Hitachi, Ltd. is the owner of this software and hereby grant - * the U.S. Government and any interested party the right to use - * this software for the purposes of the SHA-3 evaluation process, - * notwithstanding that this software is copyrighted. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +/******************************************************************************* + * luffa512 for 80-bytes input (with midstate precalc by klausT) */ + #include #include #include #include "cuda_helper.h" -typedef unsigned char BitSequence; - -__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) -__constant__ uint32_t c_Target[8]; - -static uint32_t *h_resNounce[MAX_GPUS]; -static uint32_t *d_resNounce[MAX_GPUS]; - -#define NBN 1 /* max results, could be 2, see blake32.cu */ -#if NBN > 1 -static uint32_t extra_results[2] = { UINT32_MAX, UINT32_MAX }; -#endif +static __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) +static __constant__ uint32_t statebufferpre[8]; +static __constant__ uint32_t statechainvpre[40]; -typedef struct { - uint32_t buffer[8]; /* Buffer to be hashed */ - uint32_t chainv[40]; /* Chaining values */ -} hashState; - -#define BYTES_SWAP32(x) cuda_swab32(x) - -#define MULT2(a,j)\ +#define MULT2(a,j) {\ tmp = a[7+(8*j)];\ a[7+(8*j)] = a[6+(8*j)];\ a[6+(8*j)] = a[5+(8*j)];\ @@ -52,22 +21,25 @@ typedef struct { a[3+(8*j)] = a[2+(8*j)] ^ tmp;\ a[2+(8*j)] = a[1+(8*j)];\ a[1+(8*j)] = a[0+(8*j)] ^ tmp;\ - a[0+(8*j)] = tmp; + a[0+(8*j)] = tmp;\ +} -#define TWEAK(a0,a1,a2,a3,j)\ +#define TWEAK(a0,a1,a2,a3,j) { \ a0 = (a0<<(j))|(a0>>(32-j));\ a1 = (a1<<(j))|(a1>>(32-j));\ a2 = (a2<<(j))|(a2>>(32-j));\ - a3 = (a3<<(j))|(a3>>(32-j)); + a3 = (a3<<(j))|(a3>>(32-j));\ +} -#define STEP(c0,c1)\ +#define STEP(c0,c1) { \ SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp);\ SUBCRUMB(chainv[5],chainv[6],chainv[7],chainv[4],tmp);\ MIXWORD(chainv[0],chainv[4]);\ MIXWORD(chainv[1],chainv[5]);\ MIXWORD(chainv[2],chainv[6]);\ MIXWORD(chainv[3],chainv[7]);\ - ADD_CONSTANT(chainv[0],chainv[4],c0,c1); + ADD_CONSTANT(chainv[0],chainv[4],c0,c1);\ +} #define SUBCRUMB(a0,a1,a2,a3,a4)\ a4 = a0;\ @@ -104,7 +76,7 @@ typedef struct { /* initial values of chaining variables */ __constant__ uint32_t c_IV[40]; -const uint32_t h2_IV[40] = { +static const uint32_t h_IV[40] = { 0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465, 0x6e292011,0x90152df4,0xee058139,0xdef610bb, 0xc3b44b95,0xd9d2f256,0x70eee9a0,0xde099fa3, @@ -117,7 +89,7 @@ const uint32_t h2_IV[40] = { 0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea}; __constant__ uint32_t c_CNS[80]; -uint32_t h2_CNS[80] = { +static const uint32_t h_CNS[80] = { 0x303994a6,0xe0337818,0xc0e65299,0x441ba90d, 0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f, 0x1e00108f,0xe5a8bce6,0x7800423d,0x5274baf4, @@ -142,213 +114,305 @@ uint32_t h2_CNS[80] = { /***************************************************/ __device__ __forceinline__ -void rnd512(hashState *state) +void rnd512(uint32_t *statebuffer, uint32_t *statechainv) { int i,j; uint32_t t[40]; uint32_t chainv[8]; uint32_t tmp; -#pragma unroll 8 - for(i=0;i<8;i++) { + #pragma unroll 8 + for(i=0; i<8; i++) { t[i]=0; -#pragma unroll 5 - for(j=0;j<5;j++) { - t[i] ^= state->chainv[i+8*j]; - } + #pragma unroll 5 + for(j=0; j<5; j++) + t[i] ^= statechainv[i+8*j]; } MULT2(t, 0); -#pragma unroll 5 - for(j=0;j<5;j++) { -#pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+8*j] ^= t[i]; - } + #pragma unroll 5 + for(j=0; j<5; j++) { + #pragma unroll 8 + for(i=0; i<8; i++) + statechainv[i+8*j] ^= t[i]; } -#pragma unroll 5 - for(j=0;j<5;j++) { -#pragma unroll 8 - for(i=0;i<8;i++) { - t[i+8*j] = state->chainv[i+8*j]; - } + #pragma unroll 5 + for(j=0; j<5; j++) { + #pragma unroll 8 + for(i=0; i<8; i++) + t[i+8*j] = statechainv[i+8*j]; } -#pragma unroll 5 - for(j=0;j<5;j++) { - MULT2(state->chainv, j); - } + #pragma unroll + for(j=0; j<5; j++) + MULT2(statechainv, j); -#pragma unroll 5 - for(j=0;j<5;j++) { -#pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[8*j+i] ^= t[8*((j+1)%5)+i]; - } + #pragma unroll 5 + for(j=0; j<5; j++) { + #pragma unroll 8 + for(i=0; i<8; i++) + statechainv[8*j+i] ^= t[8*((j+1)%5)+i]; } -#pragma unroll 5 - for(j=0;j<5;j++) { -#pragma unroll 8 - for(i=0;i<8;i++) { - t[i+8*j] = state->chainv[i+8*j]; - } + #pragma unroll 5 + for(j=0; j<5; j++) { + #pragma unroll 8 + for(i=0; i<8; i++) + t[i+8*j] = statechainv[i+8*j]; } -#pragma unroll 5 - for(j=0;j<5;j++) { - MULT2(state->chainv, j); - } + #pragma unroll + for(j=0; j<5; j++) + MULT2(statechainv, j); -#pragma unroll 5 - for(j=0;j<5;j++) { -#pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[8*j+i] ^= t[8*((j+4)%5)+i]; - } + #pragma unroll 5 + for(j=0; j<5; j++) { + #pragma unroll 8 + for(i=0; i<8; i++) + statechainv[8*j+i] ^= t[8*((j+4)%5)+i]; } -#pragma unroll 5 - for(j=0;j<5;j++) { -#pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+8*j] ^= state->buffer[i]; - } - MULT2(state->buffer, 0); + #pragma unroll 5 + for(j=0; j<5; j++) { + #pragma unroll 8 + for(i=0; i<8; i++) + statechainv[i+8*j] ^= statebuffer[i]; + MULT2(statebuffer, 0); } -#pragma unroll 8 - for(i=0;i<8;i++) { - chainv[i] = state->chainv[i]; - } + #pragma unroll + for(i=0; i<8; i++) + chainv[i] = statechainv[i]; -#pragma unroll 8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)],c_CNS[(2*i)+1]); - } + #pragma unroll + for(i=0; i<8; i++) + STEP(c_CNS[(2*i)], c_CNS[(2*i)+1]); -#pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i] = chainv[i]; - chainv[i] = state->chainv[i+8]; + #pragma unroll + for(i=0; i<8; i++) { + statechainv[i] = chainv[i]; + chainv[i] = statechainv[i+8]; } TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],1); -#pragma unroll 8 - for(i=0;i<8;i++) { - STEP(c_CNS[(2*i)+16],c_CNS[(2*i)+16+1]); - } + #pragma unroll + for(i=0; i<8; i++) + STEP(c_CNS[(2*i)+16], c_CNS[(2*i)+16+1]); -#pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+8] = chainv[i]; - chainv[i] = state->chainv[i+16]; + #pragma unroll + for(i=0; i<8; i++) { + statechainv[i+8] = chainv[i]; + chainv[i] = statechainv[i+16]; } TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],2); -#pragma unroll 8 - for(i=0;i<8;i++) { + #pragma unroll + for(i=0; i<8; i++) STEP(c_CNS[(2*i)+32],c_CNS[(2*i)+32+1]); - } -#pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+16] = chainv[i]; - chainv[i] = state->chainv[i+24]; + #pragma unroll + for(i=0; i<8; i++) { + statechainv[i+16] = chainv[i]; + chainv[i] = statechainv[i+24]; } TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],3); -#pragma unroll 8 - for(i=0;i<8;i++) { + #pragma unroll + for(i=0; i<8; i++) STEP(c_CNS[(2*i)+48],c_CNS[(2*i)+48+1]); - } -#pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+24] = chainv[i]; - chainv[i] = state->chainv[i+32]; + #pragma unroll + for(i=0; i<8; i++) { + statechainv[i+24] = chainv[i]; + chainv[i] = statechainv[i+32]; } TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],4); -#pragma unroll 8 - for(i=0;i<8;i++) { + #pragma unroll + for(i=0; i<8; i++) STEP(c_CNS[(2*i)+64],c_CNS[(2*i)+64+1]); + + #pragma unroll 8 + for(i=0; i<8; i++) + statechainv[i+32] = chainv[i]; +} + +static void rnd512_cpu(uint32_t *statebuffer, uint32_t *statechainv) +{ + int i, j; + uint32_t t[40]; + uint32_t chainv[8]; + uint32_t tmp; + + for (i = 0; i<8; i++) { + t[i] = statechainv[i]; + for (j = 1; j<5; j++) + t[i] ^= statechainv[i + 8 * j]; } -#pragma unroll 8 - for(i=0;i<8;i++) { - state->chainv[i+32] = chainv[i]; + MULT2(t, 0); + + for (j = 0; j<5; j++) { + for (i = 0; i<8; i++) + statechainv[i + 8 * j] ^= t[i]; + } + + for (j = 0; j<5; j++) { + for (i = 0; i<8; i++) + t[i + 8 * j] = statechainv[i + 8 * j]; } -} + for (j = 0; j<5; j++) + MULT2(statechainv, j); + + for (j = 0; j<5; j++) { + for (i = 0; i<8; i++) + statechainv[8 * j + i] ^= t[8 * ((j + 1) % 5) + i]; + } + + for (j = 0; j<5; j++) { + for (i = 0; i<8; i++) + t[i + 8 * j] = statechainv[i + 8 * j]; + } + + for (j = 0; j<5; j++) + MULT2(statechainv, j); + + for (j = 0; j<5; j++) { + for (i = 0; i<8; i++) + statechainv[8 * j + i] ^= t[8 * ((j + 4) % 5) + i]; + } + for (j = 0; j<5; j++) { + for (i = 0; i<8; i++) + statechainv[i + 8 * j] ^= statebuffer[i]; + MULT2(statebuffer, 0); + } + + for (i = 0; i<8; i++) + chainv[i] = statechainv[i]; + + for (i = 0; i<8; i++) + STEP(h_CNS[(2 * i)], h_CNS[(2 * i) + 1]); + + for (i = 0; i<8; i++) { + statechainv[i] = chainv[i]; + chainv[i] = statechainv[i + 8]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1); + + for (i = 0; i<8; i++) + STEP(h_CNS[(2 * i) + 16], h_CNS[(2 * i) + 16 + 1]); + + for (i = 0; i<8; i++) { + statechainv[i + 8] = chainv[i]; + chainv[i] = statechainv[i + 16]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2); + + for (i = 0; i<8; i++) + STEP(h_CNS[(2 * i) + 32], h_CNS[(2 * i) + 32 + 1]); + + for (i = 0; i<8; i++) { + statechainv[i + 16] = chainv[i]; + chainv[i] = statechainv[i + 24]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3); + + for (i = 0; i<8; i++) + STEP(h_CNS[(2 * i) + 48], h_CNS[(2 * i) + 48 + 1]); + + for (i = 0; i<8; i++) { + statechainv[i + 24] = chainv[i]; + chainv[i] = statechainv[i + 32]; + } + + TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4); + + for (i = 0; i<8; i++) + STEP(h_CNS[(2 * i) + 64], h_CNS[(2 * i) + 64 + 1]); + + for (i = 0; i<8; i++) + statechainv[i + 32] = chainv[i]; +} + +/***************************************************/ __device__ __forceinline__ -void Update512(hashState *state, const BitSequence *data) +void Update512(uint32_t* statebuffer, uint32_t *statechainv, const uint32_t *const __restrict__ data) { -#pragma unroll 8 - for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)data)[i]); - rnd512(state); - -#pragma unroll 8 - for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+32))[i]); - rnd512(state); -#pragma unroll 4 - for(int i=0;i<4;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+64))[i]); + #pragma unroll + for (int i = 0; i<8; i++) + statebuffer[i] = cuda_swab32((data[i])); + rnd512(statebuffer, statechainv); + + #pragma unroll + for(int i=0; i<8; i++) + statebuffer[i] = cuda_swab32((data[i+8])); + rnd512(statebuffer, statechainv); + + #pragma unroll + for(int i=0; i<4; i++) + statebuffer[i] = cuda_swab32((data[i+16])); } /***************************************************/ __device__ __forceinline__ -void finalization512(hashState *state, uint32_t *b) +void finalization512(uint32_t* statebuffer, uint32_t *statechainv, uint32_t *b) { int i,j; - state->buffer[4] = 0x80000000; -#pragma unroll 3 - for(int i=5;i<8;i++) state->buffer[i] = 0; - rnd512(state); + statebuffer[4] = 0x80000000U; + + #pragma unroll 3 + for(int i=5; i<8; i++) + statebuffer[i] = 0; + rnd512(statebuffer, statechainv); /*---- blank round with m=0 ----*/ -#pragma unroll 8 - for(i=0;i<8;i++) state->buffer[i] =0; - rnd512(state); + #pragma unroll + for(i=0; i<8; i++) + statebuffer[i] =0; + rnd512(statebuffer, statechainv); -#pragma unroll 8 - for(i=0;i<8;i++) { + #pragma unroll + for(i=0; i<8; i++) { b[i] = 0; -#pragma unroll 5 - for(j=0;j<5;j++) { - b[i] ^= state->chainv[i+8*j]; - } - b[i] = BYTES_SWAP32((b[i])); + #pragma unroll 5 + for(j=0; j<5; j++) + b[i] ^= statechainv[i+8*j]; + b[i] = cuda_swab32((b[i])); } -#pragma unroll 8 - for(i=0;i<8;i++) state->buffer[i]=0; - rnd512(state); + #pragma unroll + for(i=0; i<8; i++) + statebuffer[i]=0; + rnd512(statebuffer, statechainv); -#pragma unroll 8 - for(i=0;i<8;i++) { + #pragma unroll + for(i=0; i<8; i++) + { b[8+i] = 0; -#pragma unroll 5 - for(j=0;j<5;j++) { - b[8+i] ^= state->chainv[i+8*j]; - } - b[8+i] = BYTES_SWAP32((b[8+i])); + #pragma unroll 5 + for(j=0; j<5; j++) + b[8+i] ^= statechainv[i+8*j]; + b[8+i] = cuda_swab32((b[8+i])); } } /***************************************************/ -// Die Hash-Funktion __global__ -void qubit_luffa512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash) +void qubit_luffa512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *outputHash) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -359,118 +423,37 @@ void qubit_luffa512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *ou uint32_t buf32[32]; } buff; -#pragma unroll 16 - for (int i=0; i < 16; ++i) buff.buf64[i] = c_PaddedMessage80[i]; + #pragma unroll 8 + for (int i=8; i < 16; i++) + buff.buf64[i] = c_PaddedMessage80[i]; // die Nounce durch die thread-spezifische ersetzen buff.buf64[9] = REPLACE_HIDWORD(buff.buf64[9], cuda_swab32(nounce)); + uint32_t statebuffer[8], statechainv[40]; - hashState state; -#pragma unroll 40 - for(int i=0;i<40;i++) state.chainv[i] = c_IV[i]; -#pragma unroll 8 - for(int i=0;i<8;i++) state.buffer[i] = 0; - Update512(&state, (BitSequence*)buff.buf32); - uint32_t *outHash = (uint32_t *)outputHash + 16 * thread; - finalization512(&state, (uint32_t*)outHash); - } -} - -__global__ -void qubit_luffa512_gpu_finalhash_80(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = startNounce + thread; - union { - uint64_t buf64[16]; - uint32_t buf32[32]; - } buff; - uint32_t Hash[16]; - - #pragma unroll 16 - for (int i=0; i < 16; ++i) buff.buf64[i] = c_PaddedMessage80[i]; - - // Tested nonce - buff.buf64[9] = REPLACE_HIDWORD(buff.buf64[9], cuda_swab32(nounce)); - - hashState state; - #pragma unroll 40 - for(int i=0;i<40;i++) state.chainv[i] = c_IV[i]; + #pragma unroll + for (int i = 0; i<4; i++) + statebuffer[i] = cuda_swab32(buff.buf32[i + 16]); - #pragma unroll 8 - for(int i=0;i<8;i++) state.buffer[i] = 0; + #pragma unroll 4 + for (int i = 4; i<8; i++) + statebuffer[i] = statebufferpre[i]; - Update512(&state, (BitSequence*)buff.buf32); - finalization512(&state, Hash); + #pragma unroll + for (int i = 0; i<40; i++) + statechainv[i] = statechainvpre[i]; - /* dont ask me why not a simple if (Hash[i] > c_Target[i]) return; - * we lose 20% in perfs without the position test */ - int position = -1; - #pragma unroll 8 - for (int i = 7; i >= 0; i--) { - if (Hash[i] > c_Target[i]) { - if (position < i) { - return; - } - } - if (Hash[i] < c_Target[i]) { - if (position < i) { - position = i; - //break; /* impact perfs, unroll ? */ - } - } - } - -#if NBN == 1 - if (resNounce[0] > nounce) { - resNounce[0] = nounce; - } -#else - /* keep the smallest nounce, + extra one if found */ - if (resNounce[0] > nounce) { - resNounce[1] = resNounce[0]; - resNounce[0] = nounce; - } else { - resNounce[1] = nounce; - } -#endif + uint32_t *outHash = &outputHash[thread * 16]; + finalization512(statebuffer, statechainv, outHash); } } __host__ void qubit_luffa512_cpu_init(int thr_id, uint32_t threads) { - CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_IV, h2_IV, sizeof(h2_IV), 0, cudaMemcpyHostToDevice)); - CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_CNS, h2_CNS, sizeof(h2_CNS), 0, cudaMemcpyHostToDevice)); - CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], NBN * sizeof(uint32_t))); - CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], NBN * sizeof(uint32_t))); -} - -__host__ -uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash,int order) -{ - uint32_t result = UINT32_MAX; - cudaMemset(d_resNounce[thr_id], 0xff, NBN * sizeof(uint32_t)); - const uint32_t threadsperblock = 256; - - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - size_t shared_size = 0; - - qubit_luffa512_gpu_finalhash_80 <<>> (threads, startNounce, d_outputHash, d_resNounce[thr_id]); - cudaThreadSynchronize(); - if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], NBN * sizeof(uint32_t), cudaMemcpyDeviceToHost)) { - //cudaThreadSynchronize(); - result = h_resNounce[thr_id][0]; -#if NBN > 1 - extra_results[0] = h_resNounce[thr_id][1]; -#endif - } - return result; + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_IV, h_IV, sizeof(h_IV), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_CNS, h_CNS, sizeof(h_CNS), 0, cudaMemcpyHostToDevice)); } __host__ @@ -486,22 +469,38 @@ void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNoun } __host__ -void qubit_luffa512_cpu_setBlock_80(void *pdata) +void qubit_cpu_precalc(uint32_t* message) { - unsigned char PaddedMessage[128]; - - memcpy(PaddedMessage, pdata, 80); - memset(PaddedMessage+80, 0, 48); - PaddedMessage[80] = 0x80; - PaddedMessage[111] = 1; - PaddedMessage[126] = 0x02; - PaddedMessage[127] = 0x80; - - CUDA_SAFE_CALL(cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); + uint32_t statebuffer[8]; + uint32_t statechainv[40] = + { + 0x6d251e69, 0x44b051e0, 0x4eaa6fb4, 0xdbf78465, + 0x6e292011, 0x90152df4, 0xee058139, 0xdef610bb, + 0xc3b44b95, 0xd9d2f256, 0x70eee9a0, 0xde099fa3, + 0x5d9b0557, 0x8fc944b3, 0xcf1ccf0e, 0x746cd581, + 0xf7efc89d, 0x5dba5781, 0x04016ce5, 0xad659c05, + 0x0306194f, 0x666d1836, 0x24aa230a, 0x8b264ae7, + 0x858075d5, 0x36d79cce, 0xe571f7d7, 0x204b1f67, + 0x35870c6a, 0x57e9e923, 0x14bcb808, 0x7cde72ce, + 0x6c68e9be, 0x5ec41e22, 0xc825b7c7, 0xaffb4363, + 0xf5df3999, 0x0fc688f1, 0xb07224cc, 0x03e86cea + }; + + for (int i = 0; i<8; i++) + statebuffer[i] = cuda_swab32(message[i]); + rnd512_cpu(statebuffer, statechainv); + + for (int i = 0; i<8; i++) + statebuffer[i] = cuda_swab32(message[i+8]); + + rnd512_cpu(statebuffer, statechainv); + + cudaMemcpyToSymbol(statebufferpre, statebuffer, sizeof(statebuffer), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(statechainvpre, statechainv, sizeof(statechainv), 0, cudaMemcpyHostToDevice); } __host__ -void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget) +void qubit_luffa512_cpu_setBlock_80(void *pdata) { unsigned char PaddedMessage[128]; @@ -512,6 +511,6 @@ void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget) PaddedMessage[126] = 0x02; PaddedMessage[127] = 0x80; - CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Target, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); - CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, sizeof(PaddedMessage), 0, cudaMemcpyHostToDevice)); + qubit_cpu_precalc((uint32_t*) PaddedMessage); }