Browse Source

qubit: implement cpu precalc (klausT)

improve hashrate: qubit (+5%), deep and doom (+10%)

based on klausT's code, simplified.
2upstream
Tanguy Pruvot 10 years ago
parent
commit
3e419abf84
  1. 27
      qubit/deep.cu
  2. 40
      qubit/doom.cu
  3. 567
      qubit/qubit_luffa512.cu

27
qubit/deep.cu

@ -19,8 +19,6 @@ static uint32_t *d_hash[MAX_GPUS];
extern void qubit_luffa512_cpu_init(int thr_id, uint32_t threads); extern void qubit_luffa512_cpu_init(int thr_id, uint32_t threads);
extern void qubit_luffa512_cpu_setBlock_80(void *pdata); extern void qubit_luffa512_cpu_setBlock_80(void *pdata);
extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget);
extern uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void x11_cubehash512_cpu_init(int thr_id, uint32_t threads); extern void x11_cubehash512_cpu_init(int thr_id, uint32_t threads);
extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@ -30,13 +28,13 @@ extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t start
extern "C" void deephash(void *state, const void *input) extern "C" void deephash(void *state, const void *input)
{ {
// luffa1-cubehash2-shavite3-simd4-echo5 uint8_t _ALIGN(64) hash[64];
// luffa-80 cubehash-64 echo-64
sph_luffa512_context ctx_luffa; sph_luffa512_context ctx_luffa;
sph_cubehash512_context ctx_cubehash; sph_cubehash512_context ctx_cubehash;
sph_echo512_context ctx_echo; sph_echo512_context ctx_echo;
uint8_t hash[64];
sph_luffa512_init(&ctx_luffa); sph_luffa512_init(&ctx_luffa);
sph_luffa512 (&ctx_luffa, input, 80); sph_luffa512 (&ctx_luffa, input, 80);
sph_luffa512_close(&ctx_luffa, (void*) hash); sph_luffa512_close(&ctx_luffa, (void*) hash);
@ -54,12 +52,11 @@ extern "C" void deephash(void *state, const void *input)
static bool init[MAX_GPUS] = { 0 }; static bool init[MAX_GPUS] = { 0 };
extern "C" int scanhash_deep(int thr_id, uint32_t *pdata, extern "C" int scanhash_deep(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
const uint32_t *ptarget, uint32_t max_nonce, uint32_t max_nonce, unsigned long *hashes_done)
unsigned long *hashes_done)
{ {
uint32_t _ALIGN(64) endiandata[20];
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
uint32_t endiandata[20];
uint32_t throughput = device_intensity(thr_id, __func__, 1U << 19); // 256*256*8 uint32_t throughput = device_intensity(thr_id, __func__, 1U << 19); // 256*256*8
throughput = min(throughput, (max_nonce - first_nonce)); throughput = min(throughput, (max_nonce - first_nonce));
@ -69,7 +66,8 @@ extern "C" int scanhash_deep(int thr_id, uint32_t *pdata,
if (!init[thr_id]) if (!init[thr_id])
{ {
cudaSetDevice(device_map[thr_id]); cudaSetDevice(device_map[thr_id]);
cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], throughput * 64));
qubit_luffa512_cpu_init(thr_id, throughput); qubit_luffa512_cpu_init(thr_id, throughput);
x11_cubehash512_cpu_init(thr_id, throughput); x11_cubehash512_cpu_init(thr_id, throughput);
@ -80,10 +78,10 @@ extern "C" int scanhash_deep(int thr_id, uint32_t *pdata,
init[thr_id] = true; init[thr_id] = true;
} }
for (int k=0; k < 20; k++) for (int k=0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]); be32enc(&endiandata[k], pdata[k]);
qubit_luffa512_cpufinal_setBlock_80((void*)endiandata,ptarget); qubit_luffa512_cpu_setBlock_80((void*)endiandata);
cuda_check_cpu_setTarget(ptarget); cuda_check_cpu_setTarget(ptarget);
do { do {
@ -96,12 +94,11 @@ extern "C" int scanhash_deep(int thr_id, uint32_t *pdata,
uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (foundNonce != UINT32_MAX) if (foundNonce != UINT32_MAX)
{ {
const uint32_t Htarg = ptarget[7]; uint32_t _ALIGN(64) vhash64[8];
uint32_t vhash64[8];
be32enc(&endiandata[19], foundNonce); be32enc(&endiandata[19], foundNonce);
deephash(vhash64, endiandata); deephash(vhash64, endiandata);
if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
int res = 1; int res = 1;
uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
*hashes_done = pdata[19] - first_nonce + throughput; *hashes_done = pdata[19] - first_nonce + throughput;

40
qubit/doom.cu

@ -15,15 +15,12 @@ static uint32_t *d_hash[MAX_GPUS];
extern void qubit_luffa512_cpu_init(int thr_id, uint32_t threads); extern void qubit_luffa512_cpu_init(int thr_id, uint32_t threads);
extern void qubit_luffa512_cpu_setBlock_80(void *pdata); extern void qubit_luffa512_cpu_setBlock_80(void *pdata);
extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget);
extern uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void doomhash(void *state, const void *input) extern "C" void doomhash(void *state, const void *input)
{ {
// luffa512 uint8_t _ALIGN(64) hash[64];
sph_luffa512_context ctx_luffa;
uint8_t hash[64]; sph_luffa512_context ctx_luffa;
sph_luffa512_init(&ctx_luffa); sph_luffa512_init(&ctx_luffa);
sph_luffa512 (&ctx_luffa, input, 80); sph_luffa512 (&ctx_luffa, input, 80);
@ -34,12 +31,11 @@ extern void doomhash(void *state, const void *input)
static bool init[MAX_GPUS] = { 0 }; static bool init[MAX_GPUS] = { 0 };
extern "C" int scanhash_doom(int thr_id, uint32_t *pdata, extern "C" int scanhash_doom(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
const uint32_t *ptarget, uint32_t max_nonce, uint32_t max_nonce, unsigned long *hashes_done)
unsigned long *hashes_done)
{ {
uint32_t _ALIGN(64) endiandata[20];
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
uint32_t endiandata[20];
uint32_t throughput = device_intensity(thr_id, __func__, 1U << 22); // 256*256*8*8 uint32_t throughput = device_intensity(thr_id, __func__, 1U << 22); // 256*256*8*8
throughput = min(throughput, max_nonce - first_nonce); throughput = min(throughput, max_nonce - first_nonce);
@ -50,31 +46,35 @@ extern "C" int scanhash_doom(int thr_id, uint32_t *pdata,
{ {
cudaSetDevice(device_map[thr_id]); cudaSetDevice(device_map[thr_id]);
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput)); CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], throughput * 64));
qubit_luffa512_cpu_init(thr_id, (int) throughput); qubit_luffa512_cpu_init(thr_id, throughput);
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true; init[thr_id] = true;
} }
for (int k=0; k < 20; k++) for (int k=0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]); be32enc(&endiandata[k], pdata[k]);
qubit_luffa512_cpufinal_setBlock_80((void*)endiandata,ptarget); qubit_luffa512_cpu_setBlock_80((void*)endiandata);
cuda_check_cpu_setTarget(ptarget);
do { do {
int order = 0; int order = 0;
*hashes_done = pdata[19] - first_nonce + throughput;
qubit_luffa512_cpu_hash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], order++);
uint32_t foundNonce = qubit_luffa512_cpu_finalhash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], order++); uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (foundNonce != UINT32_MAX) if (foundNonce != UINT32_MAX)
{ {
const uint32_t Htarg = ptarget[7]; uint32_t _ALIGN(64) vhash64[8];
uint32_t vhash64[8];
be32enc(&endiandata[19], foundNonce); be32enc(&endiandata[19], foundNonce);
doomhash(vhash64, endiandata); doomhash(vhash64, endiandata);
if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
*hashes_done = min(max_nonce - first_nonce, (uint64_t) pdata[19] - first_nonce + throughput); //*hashes_done = min(max_nonce - first_nonce, (uint64_t) pdata[19] - first_nonce + throughput);
pdata[19] = foundNonce; pdata[19] = foundNonce;
return 1; return 1;
} else { } else {
@ -82,7 +82,7 @@ extern "C" int scanhash_doom(int thr_id, uint32_t *pdata,
} }
} }
if ((uint64_t) pdata[19] + throughput > max_nonce) { if ((uint64_t) throughput + pdata[19] > max_nonce) {
// pdata[19] = max_nonce; // pdata[19] = max_nonce;
break; break;
} }

567
qubit/qubit_luffa512.cu

@ -1,49 +1,18 @@
/* /*******************************************************************************
* luffa_for_32.c * luffa512 for 80-bytes input (with midstate precalc by klausT)
* Version 2.0 (Sep 15th 2009)
*
* Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
*
* Hitachi, Ltd. is the owner of this software and hereby grant
* the U.S. Government and any interested party the right to use
* this software for the purposes of the SHA-3 evaluation process,
* notwithstanding that this software is copyrighted.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/ */
#include <stdio.h> #include <stdio.h>
#include <stdint.h> #include <stdint.h>
#include <memory.h> #include <memory.h>
#include "cuda_helper.h" #include "cuda_helper.h"
typedef unsigned char BitSequence; static __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
static __constant__ uint32_t statebufferpre[8];
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) static __constant__ uint32_t statechainvpre[40];
__constant__ uint32_t c_Target[8];
static uint32_t *h_resNounce[MAX_GPUS];
static uint32_t *d_resNounce[MAX_GPUS];
#define NBN 1 /* max results, could be 2, see blake32.cu */
#if NBN > 1
static uint32_t extra_results[2] = { UINT32_MAX, UINT32_MAX };
#endif
typedef struct { #define MULT2(a,j) {\
uint32_t buffer[8]; /* Buffer to be hashed */
uint32_t chainv[40]; /* Chaining values */
} hashState;
#define BYTES_SWAP32(x) cuda_swab32(x)
#define MULT2(a,j)\
tmp = a[7+(8*j)];\ tmp = a[7+(8*j)];\
a[7+(8*j)] = a[6+(8*j)];\ a[7+(8*j)] = a[6+(8*j)];\
a[6+(8*j)] = a[5+(8*j)];\ a[6+(8*j)] = a[5+(8*j)];\
@ -52,22 +21,25 @@ typedef struct {
a[3+(8*j)] = a[2+(8*j)] ^ tmp;\ a[3+(8*j)] = a[2+(8*j)] ^ tmp;\
a[2+(8*j)] = a[1+(8*j)];\ a[2+(8*j)] = a[1+(8*j)];\
a[1+(8*j)] = a[0+(8*j)] ^ tmp;\ a[1+(8*j)] = a[0+(8*j)] ^ tmp;\
a[0+(8*j)] = tmp; a[0+(8*j)] = tmp;\
}
#define TWEAK(a0,a1,a2,a3,j)\ #define TWEAK(a0,a1,a2,a3,j) { \
a0 = (a0<<(j))|(a0>>(32-j));\ a0 = (a0<<(j))|(a0>>(32-j));\
a1 = (a1<<(j))|(a1>>(32-j));\ a1 = (a1<<(j))|(a1>>(32-j));\
a2 = (a2<<(j))|(a2>>(32-j));\ a2 = (a2<<(j))|(a2>>(32-j));\
a3 = (a3<<(j))|(a3>>(32-j)); a3 = (a3<<(j))|(a3>>(32-j));\
}
#define STEP(c0,c1)\ #define STEP(c0,c1) { \
SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp);\ SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp);\
SUBCRUMB(chainv[5],chainv[6],chainv[7],chainv[4],tmp);\ SUBCRUMB(chainv[5],chainv[6],chainv[7],chainv[4],tmp);\
MIXWORD(chainv[0],chainv[4]);\ MIXWORD(chainv[0],chainv[4]);\
MIXWORD(chainv[1],chainv[5]);\ MIXWORD(chainv[1],chainv[5]);\
MIXWORD(chainv[2],chainv[6]);\ MIXWORD(chainv[2],chainv[6]);\
MIXWORD(chainv[3],chainv[7]);\ MIXWORD(chainv[3],chainv[7]);\
ADD_CONSTANT(chainv[0],chainv[4],c0,c1); ADD_CONSTANT(chainv[0],chainv[4],c0,c1);\
}
#define SUBCRUMB(a0,a1,a2,a3,a4)\ #define SUBCRUMB(a0,a1,a2,a3,a4)\
a4 = a0;\ a4 = a0;\
@ -104,7 +76,7 @@ typedef struct {
/* initial values of chaining variables */ /* initial values of chaining variables */
__constant__ uint32_t c_IV[40]; __constant__ uint32_t c_IV[40];
const uint32_t h2_IV[40] = { static const uint32_t h_IV[40] = {
0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465, 0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465,
0x6e292011,0x90152df4,0xee058139,0xdef610bb, 0x6e292011,0x90152df4,0xee058139,0xdef610bb,
0xc3b44b95,0xd9d2f256,0x70eee9a0,0xde099fa3, 0xc3b44b95,0xd9d2f256,0x70eee9a0,0xde099fa3,
@ -117,7 +89,7 @@ const uint32_t h2_IV[40] = {
0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea}; 0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea};
__constant__ uint32_t c_CNS[80]; __constant__ uint32_t c_CNS[80];
uint32_t h2_CNS[80] = { static const uint32_t h_CNS[80] = {
0x303994a6,0xe0337818,0xc0e65299,0x441ba90d, 0x303994a6,0xe0337818,0xc0e65299,0x441ba90d,
0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f, 0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f,
0x1e00108f,0xe5a8bce6,0x7800423d,0x5274baf4, 0x1e00108f,0xe5a8bce6,0x7800423d,0x5274baf4,
@ -142,213 +114,305 @@ uint32_t h2_CNS[80] = {
/***************************************************/ /***************************************************/
__device__ __forceinline__ __device__ __forceinline__
void rnd512(hashState *state) void rnd512(uint32_t *statebuffer, uint32_t *statechainv)
{ {
int i,j; int i,j;
uint32_t t[40]; uint32_t t[40];
uint32_t chainv[8]; uint32_t chainv[8];
uint32_t tmp; uint32_t tmp;
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) { for(i=0; i<8; i++) {
t[i]=0; t[i]=0;
#pragma unroll 5 #pragma unroll 5
for(j=0;j<5;j++) { for(j=0; j<5; j++)
t[i] ^= state->chainv[i+8*j]; t[i] ^= statechainv[i+8*j];
}
} }
MULT2(t, 0); MULT2(t, 0);
#pragma unroll 5 #pragma unroll 5
for(j=0;j<5;j++) { for(j=0; j<5; j++) {
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) { for(i=0; i<8; i++)
state->chainv[i+8*j] ^= t[i]; statechainv[i+8*j] ^= t[i];
}
} }
#pragma unroll 5 #pragma unroll 5
for(j=0;j<5;j++) { for(j=0; j<5; j++) {
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) { for(i=0; i<8; i++)
t[i+8*j] = state->chainv[i+8*j]; t[i+8*j] = statechainv[i+8*j];
}
} }
#pragma unroll 5 #pragma unroll
for(j=0;j<5;j++) { for(j=0; j<5; j++)
MULT2(state->chainv, j); MULT2(statechainv, j);
}
#pragma unroll 5 #pragma unroll 5
for(j=0;j<5;j++) { for(j=0; j<5; j++) {
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) { for(i=0; i<8; i++)
state->chainv[8*j+i] ^= t[8*((j+1)%5)+i]; statechainv[8*j+i] ^= t[8*((j+1)%5)+i];
}
} }
#pragma unroll 5 #pragma unroll 5
for(j=0;j<5;j++) { for(j=0; j<5; j++) {
#pragma unroll 8 #pragma unroll 8
for(i=0;i<8;i++) { for(i=0; i<8; i++)
t[i+8*j] = state->chainv[i+8*j]; t[i+8*j] = statechainv[i+8*j];
} }
#pragma unroll
for(j=0; j<5; j++)
MULT2(statechainv, j);
#pragma unroll 5
for(j=0; j<5; j++) {
#pragma unroll 8
for(i=0; i<8; i++)
statechainv[8*j+i] ^= t[8*((j+4)%5)+i];
} }
#pragma unroll 5 #pragma unroll 5
for(j=0;j<5;j++) { for(j=0; j<5; j++) {
MULT2(state->chainv, j); #pragma unroll 8
for(i=0; i<8; i++)
statechainv[i+8*j] ^= statebuffer[i];
MULT2(statebuffer, 0);
} }
#pragma unroll 5 #pragma unroll
for(j=0;j<5;j++) { for(i=0; i<8; i++)
#pragma unroll 8 chainv[i] = statechainv[i];
for(i=0;i<8;i++) {
state->chainv[8*j+i] ^= t[8*((j+4)%5)+i]; #pragma unroll
for(i=0; i<8; i++)
STEP(c_CNS[(2*i)], c_CNS[(2*i)+1]);
#pragma unroll
for(i=0; i<8; i++) {
statechainv[i] = chainv[i];
chainv[i] = statechainv[i+8];
} }
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],1);
#pragma unroll
for(i=0; i<8; i++)
STEP(c_CNS[(2*i)+16], c_CNS[(2*i)+16+1]);
#pragma unroll
for(i=0; i<8; i++) {
statechainv[i+8] = chainv[i];
chainv[i] = statechainv[i+16];
} }
#pragma unroll 5 TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],2);
for(j=0;j<5;j++) {
#pragma unroll 8 #pragma unroll
for(i=0;i<8;i++) { for(i=0; i<8; i++)
state->chainv[i+8*j] ^= state->buffer[i]; STEP(c_CNS[(2*i)+32],c_CNS[(2*i)+32+1]);
#pragma unroll
for(i=0; i<8; i++) {
statechainv[i+16] = chainv[i];
chainv[i] = statechainv[i+24];
} }
MULT2(state->buffer, 0);
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],3);
#pragma unroll
for(i=0; i<8; i++)
STEP(c_CNS[(2*i)+48],c_CNS[(2*i)+48+1]);
#pragma unroll
for(i=0; i<8; i++) {
statechainv[i+24] = chainv[i];
chainv[i] = statechainv[i+32];
} }
#pragma unroll 8 TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],4);
for(i=0;i<8;i++) {
chainv[i] = state->chainv[i]; #pragma unroll
for(i=0; i<8; i++)
STEP(c_CNS[(2*i)+64],c_CNS[(2*i)+64+1]);
#pragma unroll 8
for(i=0; i<8; i++)
statechainv[i+32] = chainv[i];
}
static void rnd512_cpu(uint32_t *statebuffer, uint32_t *statechainv)
{
int i, j;
uint32_t t[40];
uint32_t chainv[8];
uint32_t tmp;
for (i = 0; i<8; i++) {
t[i] = statechainv[i];
for (j = 1; j<5; j++)
t[i] ^= statechainv[i + 8 * j];
} }
#pragma unroll 8 MULT2(t, 0);
for(i=0;i<8;i++) {
STEP(c_CNS[(2*i)],c_CNS[(2*i)+1]); for (j = 0; j<5; j++) {
for (i = 0; i<8; i++)
statechainv[i + 8 * j] ^= t[i];
} }
#pragma unroll 8 for (j = 0; j<5; j++) {
for(i=0;i<8;i++) { for (i = 0; i<8; i++)
state->chainv[i] = chainv[i]; t[i + 8 * j] = statechainv[i + 8 * j];
chainv[i] = state->chainv[i+8];
} }
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],1); for (j = 0; j<5; j++)
MULT2(statechainv, j);
#pragma unroll 8 for (j = 0; j<5; j++) {
for(i=0;i<8;i++) { for (i = 0; i<8; i++)
STEP(c_CNS[(2*i)+16],c_CNS[(2*i)+16+1]); statechainv[8 * j + i] ^= t[8 * ((j + 1) % 5) + i];
} }
#pragma unroll 8 for (j = 0; j<5; j++) {
for(i=0;i<8;i++) { for (i = 0; i<8; i++)
state->chainv[i+8] = chainv[i]; t[i + 8 * j] = statechainv[i + 8 * j];
chainv[i] = state->chainv[i+16];
} }
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],2); for (j = 0; j<5; j++)
MULT2(statechainv, j);
#pragma unroll 8 for (j = 0; j<5; j++) {
for(i=0;i<8;i++) { for (i = 0; i<8; i++)
STEP(c_CNS[(2*i)+32],c_CNS[(2*i)+32+1]); statechainv[8 * j + i] ^= t[8 * ((j + 4) % 5) + i];
} }
#pragma unroll 8 for (j = 0; j<5; j++) {
for(i=0;i<8;i++) { for (i = 0; i<8; i++)
state->chainv[i+16] = chainv[i]; statechainv[i + 8 * j] ^= statebuffer[i];
chainv[i] = state->chainv[i+24]; MULT2(statebuffer, 0);
} }
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],3); for (i = 0; i<8; i++)
chainv[i] = statechainv[i];
#pragma unroll 8 for (i = 0; i<8; i++)
for(i=0;i<8;i++) { STEP(h_CNS[(2 * i)], h_CNS[(2 * i) + 1]);
STEP(c_CNS[(2*i)+48],c_CNS[(2*i)+48+1]);
for (i = 0; i<8; i++) {
statechainv[i] = chainv[i];
chainv[i] = statechainv[i + 8];
} }
#pragma unroll 8 TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1);
for(i=0;i<8;i++) {
state->chainv[i+24] = chainv[i]; for (i = 0; i<8; i++)
chainv[i] = state->chainv[i+32]; STEP(h_CNS[(2 * i) + 16], h_CNS[(2 * i) + 16 + 1]);
for (i = 0; i<8; i++) {
statechainv[i + 8] = chainv[i];
chainv[i] = statechainv[i + 16];
} }
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],4); TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2);
#pragma unroll 8 for (i = 0; i<8; i++)
for(i=0;i<8;i++) { STEP(h_CNS[(2 * i) + 32], h_CNS[(2 * i) + 32 + 1]);
STEP(c_CNS[(2*i)+64],c_CNS[(2*i)+64+1]);
for (i = 0; i<8; i++) {
statechainv[i + 16] = chainv[i];
chainv[i] = statechainv[i + 24];
} }
#pragma unroll 8 TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3);
for(i=0;i<8;i++) {
state->chainv[i+32] = chainv[i]; for (i = 0; i<8; i++)
STEP(h_CNS[(2 * i) + 48], h_CNS[(2 * i) + 48 + 1]);
for (i = 0; i<8; i++) {
statechainv[i + 24] = chainv[i];
chainv[i] = statechainv[i + 32];
} }
}
TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4);
for (i = 0; i<8; i++)
STEP(h_CNS[(2 * i) + 64], h_CNS[(2 * i) + 64 + 1]);
for (i = 0; i<8; i++)
statechainv[i + 32] = chainv[i];
}
/***************************************************/
__device__ __forceinline__ __device__ __forceinline__
void Update512(hashState *state, const BitSequence *data) void Update512(uint32_t* statebuffer, uint32_t *statechainv, const uint32_t *const __restrict__ data)
{ {
#pragma unroll 8 #pragma unroll
for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)data)[i]); for (int i = 0; i<8; i++)
rnd512(state); statebuffer[i] = cuda_swab32((data[i]));
rnd512(statebuffer, statechainv);
#pragma unroll 8
for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+32))[i]); #pragma unroll
rnd512(state); for(int i=0; i<8; i++)
#pragma unroll 4 statebuffer[i] = cuda_swab32((data[i+8]));
for(int i=0;i<4;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+64))[i]); rnd512(statebuffer, statechainv);
#pragma unroll
for(int i=0; i<4; i++)
statebuffer[i] = cuda_swab32((data[i+16]));
} }
/***************************************************/ /***************************************************/
__device__ __forceinline__ __device__ __forceinline__
void finalization512(hashState *state, uint32_t *b) void finalization512(uint32_t* statebuffer, uint32_t *statechainv, uint32_t *b)
{ {
int i,j; int i,j;
state->buffer[4] = 0x80000000; statebuffer[4] = 0x80000000U;
#pragma unroll 3
for(int i=5;i<8;i++) state->buffer[i] = 0; #pragma unroll 3
rnd512(state); for(int i=5; i<8; i++)
statebuffer[i] = 0;
rnd512(statebuffer, statechainv);
/*---- blank round with m=0 ----*/ /*---- blank round with m=0 ----*/
#pragma unroll 8 #pragma unroll
for(i=0;i<8;i++) state->buffer[i] =0; for(i=0; i<8; i++)
rnd512(state); statebuffer[i] =0;
rnd512(statebuffer, statechainv);
#pragma unroll 8 #pragma unroll
for(i=0;i<8;i++) { for(i=0; i<8; i++) {
b[i] = 0; b[i] = 0;
#pragma unroll 5 #pragma unroll 5
for(j=0;j<5;j++) { for(j=0; j<5; j++)
b[i] ^= state->chainv[i+8*j]; b[i] ^= statechainv[i+8*j];
} b[i] = cuda_swab32((b[i]));
b[i] = BYTES_SWAP32((b[i]));
} }
#pragma unroll 8 #pragma unroll
for(i=0;i<8;i++) state->buffer[i]=0; for(i=0; i<8; i++)
rnd512(state); statebuffer[i]=0;
rnd512(statebuffer, statechainv);
#pragma unroll 8 #pragma unroll
for(i=0;i<8;i++) { for(i=0; i<8; i++)
{
b[8+i] = 0; b[8+i] = 0;
#pragma unroll 5 #pragma unroll 5
for(j=0;j<5;j++) { for(j=0; j<5; j++)
b[8+i] ^= state->chainv[i+8*j]; b[8+i] ^= statechainv[i+8*j];
} b[8+i] = cuda_swab32((b[8+i]));
b[8+i] = BYTES_SWAP32((b[8+i]));
} }
} }
/***************************************************/ /***************************************************/
// Die Hash-Funktion
__global__ __global__
void qubit_luffa512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash) void qubit_luffa512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *outputHash)
{ {
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
@ -359,118 +423,37 @@ void qubit_luffa512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *ou
uint32_t buf32[32]; uint32_t buf32[32];
} buff; } buff;
#pragma unroll 16 #pragma unroll 8
for (int i=0; i < 16; ++i) buff.buf64[i] = c_PaddedMessage80[i]; for (int i=8; i < 16; i++)
buff.buf64[i] = c_PaddedMessage80[i];
// die Nounce durch die thread-spezifische ersetzen // die Nounce durch die thread-spezifische ersetzen
buff.buf64[9] = REPLACE_HIDWORD(buff.buf64[9], cuda_swab32(nounce)); buff.buf64[9] = REPLACE_HIDWORD(buff.buf64[9], cuda_swab32(nounce));
uint32_t statebuffer[8], statechainv[40];
hashState state; #pragma unroll
#pragma unroll 40 for (int i = 0; i<4; i++)
for(int i=0;i<40;i++) state.chainv[i] = c_IV[i]; statebuffer[i] = cuda_swab32(buff.buf32[i + 16]);
#pragma unroll 8
for(int i=0;i<8;i++) state.buffer[i] = 0;
Update512(&state, (BitSequence*)buff.buf32);
uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
finalization512(&state, (uint32_t*)outHash);
}
}
__global__
void qubit_luffa512_gpu_finalhash_80(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = startNounce + thread;
union {
uint64_t buf64[16];
uint32_t buf32[32];
} buff;
uint32_t Hash[16];
#pragma unroll 16
for (int i=0; i < 16; ++i) buff.buf64[i] = c_PaddedMessage80[i];
// Tested nonce
buff.buf64[9] = REPLACE_HIDWORD(buff.buf64[9], cuda_swab32(nounce));
hashState state;
#pragma unroll 40
for(int i=0;i<40;i++) state.chainv[i] = c_IV[i];
#pragma unroll 8 #pragma unroll 4
for(int i=0;i<8;i++) state.buffer[i] = 0; for (int i = 4; i<8; i++)
statebuffer[i] = statebufferpre[i];
Update512(&state, (BitSequence*)buff.buf32); #pragma unroll
finalization512(&state, Hash); for (int i = 0; i<40; i++)
statechainv[i] = statechainvpre[i];
/* dont ask me why not a simple if (Hash[i] > c_Target[i]) return; uint32_t *outHash = &outputHash[thread * 16];
* we lose 20% in perfs without the position test */ finalization512(statebuffer, statechainv, outHash);
int position = -1;
#pragma unroll 8
for (int i = 7; i >= 0; i--) {
if (Hash[i] > c_Target[i]) {
if (position < i) {
return;
}
}
if (Hash[i] < c_Target[i]) {
if (position < i) {
position = i;
//break; /* impact perfs, unroll ? */
}
}
}
#if NBN == 1
if (resNounce[0] > nounce) {
resNounce[0] = nounce;
}
#else
/* keep the smallest nounce, + extra one if found */
if (resNounce[0] > nounce) {
resNounce[1] = resNounce[0];
resNounce[0] = nounce;
} else {
resNounce[1] = nounce;
}
#endif
} }
} }
__host__ __host__
void qubit_luffa512_cpu_init(int thr_id, uint32_t threads) void qubit_luffa512_cpu_init(int thr_id, uint32_t threads)
{ {
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_IV, h2_IV, sizeof(h2_IV), 0, cudaMemcpyHostToDevice)); CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_IV, h_IV, sizeof(h_IV), 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_CNS, h2_CNS, sizeof(h2_CNS), 0, cudaMemcpyHostToDevice)); CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_CNS, h_CNS, sizeof(h_CNS), 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], NBN * sizeof(uint32_t)));
CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], NBN * sizeof(uint32_t)));
}
__host__
uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash,int order)
{
uint32_t result = UINT32_MAX;
cudaMemset(d_resNounce[thr_id], 0xff, NBN * sizeof(uint32_t));
const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
size_t shared_size = 0;
qubit_luffa512_gpu_finalhash_80 <<<grid, block, shared_size>>> (threads, startNounce, d_outputHash, d_resNounce[thr_id]);
cudaThreadSynchronize();
if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], NBN * sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
//cudaThreadSynchronize();
result = h_resNounce[thr_id][0];
#if NBN > 1
extra_results[0] = h_resNounce[thr_id][1];
#endif
}
return result;
} }
__host__ __host__
@ -486,22 +469,38 @@ void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNoun
} }
__host__ __host__
void qubit_luffa512_cpu_setBlock_80(void *pdata) void qubit_cpu_precalc(uint32_t* message)
{ {
unsigned char PaddedMessage[128]; uint32_t statebuffer[8];
uint32_t statechainv[40] =
memcpy(PaddedMessage, pdata, 80); {
memset(PaddedMessage+80, 0, 48); 0x6d251e69, 0x44b051e0, 0x4eaa6fb4, 0xdbf78465,
PaddedMessage[80] = 0x80; 0x6e292011, 0x90152df4, 0xee058139, 0xdef610bb,
PaddedMessage[111] = 1; 0xc3b44b95, 0xd9d2f256, 0x70eee9a0, 0xde099fa3,
PaddedMessage[126] = 0x02; 0x5d9b0557, 0x8fc944b3, 0xcf1ccf0e, 0x746cd581,
PaddedMessage[127] = 0x80; 0xf7efc89d, 0x5dba5781, 0x04016ce5, 0xad659c05,
0x0306194f, 0x666d1836, 0x24aa230a, 0x8b264ae7,
CUDA_SAFE_CALL(cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); 0x858075d5, 0x36d79cce, 0xe571f7d7, 0x204b1f67,
0x35870c6a, 0x57e9e923, 0x14bcb808, 0x7cde72ce,
0x6c68e9be, 0x5ec41e22, 0xc825b7c7, 0xaffb4363,
0xf5df3999, 0x0fc688f1, 0xb07224cc, 0x03e86cea
};
for (int i = 0; i<8; i++)
statebuffer[i] = cuda_swab32(message[i]);
rnd512_cpu(statebuffer, statechainv);
for (int i = 0; i<8; i++)
statebuffer[i] = cuda_swab32(message[i+8]);
rnd512_cpu(statebuffer, statechainv);
cudaMemcpyToSymbol(statebufferpre, statebuffer, sizeof(statebuffer), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(statechainvpre, statechainv, sizeof(statechainv), 0, cudaMemcpyHostToDevice);
} }
__host__ __host__
void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget) void qubit_luffa512_cpu_setBlock_80(void *pdata)
{ {
unsigned char PaddedMessage[128]; unsigned char PaddedMessage[128];
@ -512,6 +511,6 @@ void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget)
PaddedMessage[126] = 0x02; PaddedMessage[126] = 0x02;
PaddedMessage[127] = 0x80; PaddedMessage[127] = 0x80;
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Target, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, sizeof(PaddedMessage), 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); qubit_cpu_precalc((uint32_t*) PaddedMessage);
} }

Loading…
Cancel
Save