|
|
|
@ -1,49 +1,18 @@
@@ -1,49 +1,18 @@
|
|
|
|
|
/* |
|
|
|
|
* luffa_for_32.c |
|
|
|
|
* Version 2.0 (Sep 15th 2009) |
|
|
|
|
* |
|
|
|
|
* Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved. |
|
|
|
|
* |
|
|
|
|
* Hitachi, Ltd. is the owner of this software and hereby grant |
|
|
|
|
* the U.S. Government and any interested party the right to use |
|
|
|
|
* this software for the purposes of the SHA-3 evaluation process, |
|
|
|
|
* notwithstanding that this software is copyrighted. |
|
|
|
|
* |
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
|
|
|
|
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
|
|
|
|
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
|
|
|
|
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
|
|
|
|
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
|
|
|
|
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
|
|
|
|
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
|
|
|
|
/******************************************************************************* |
|
|
|
|
* luffa512 for 80-bytes input (with midstate precalc by klausT) |
|
|
|
|
*/ |
|
|
|
|
|
|
|
|
|
#include <stdio.h> |
|
|
|
|
#include <stdint.h> |
|
|
|
|
#include <memory.h> |
|
|
|
|
|
|
|
|
|
#include "cuda_helper.h" |
|
|
|
|
|
|
|
|
|
typedef unsigned char BitSequence; |
|
|
|
|
|
|
|
|
|
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) |
|
|
|
|
__constant__ uint32_t c_Target[8]; |
|
|
|
|
|
|
|
|
|
static uint32_t *h_resNounce[MAX_GPUS]; |
|
|
|
|
static uint32_t *d_resNounce[MAX_GPUS]; |
|
|
|
|
|
|
|
|
|
#define NBN 1 /* max results, could be 2, see blake32.cu */ |
|
|
|
|
#if NBN > 1 |
|
|
|
|
static uint32_t extra_results[2] = { UINT32_MAX, UINT32_MAX }; |
|
|
|
|
#endif |
|
|
|
|
static __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) |
|
|
|
|
static __constant__ uint32_t statebufferpre[8]; |
|
|
|
|
static __constant__ uint32_t statechainvpre[40]; |
|
|
|
|
|
|
|
|
|
typedef struct { |
|
|
|
|
uint32_t buffer[8]; /* Buffer to be hashed */ |
|
|
|
|
uint32_t chainv[40]; /* Chaining values */ |
|
|
|
|
} hashState; |
|
|
|
|
|
|
|
|
|
#define BYTES_SWAP32(x) cuda_swab32(x) |
|
|
|
|
|
|
|
|
|
#define MULT2(a,j)\ |
|
|
|
|
#define MULT2(a,j) {\ |
|
|
|
|
tmp = a[7+(8*j)];\ |
|
|
|
|
a[7+(8*j)] = a[6+(8*j)];\ |
|
|
|
|
a[6+(8*j)] = a[5+(8*j)];\ |
|
|
|
@ -52,22 +21,25 @@ typedef struct {
@@ -52,22 +21,25 @@ typedef struct {
|
|
|
|
|
a[3+(8*j)] = a[2+(8*j)] ^ tmp;\ |
|
|
|
|
a[2+(8*j)] = a[1+(8*j)];\ |
|
|
|
|
a[1+(8*j)] = a[0+(8*j)] ^ tmp;\ |
|
|
|
|
a[0+(8*j)] = tmp; |
|
|
|
|
a[0+(8*j)] = tmp;\ |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#define TWEAK(a0,a1,a2,a3,j)\ |
|
|
|
|
#define TWEAK(a0,a1,a2,a3,j) { \ |
|
|
|
|
a0 = (a0<<(j))|(a0>>(32-j));\ |
|
|
|
|
a1 = (a1<<(j))|(a1>>(32-j));\ |
|
|
|
|
a2 = (a2<<(j))|(a2>>(32-j));\ |
|
|
|
|
a3 = (a3<<(j))|(a3>>(32-j)); |
|
|
|
|
a3 = (a3<<(j))|(a3>>(32-j));\ |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#define STEP(c0,c1)\ |
|
|
|
|
#define STEP(c0,c1) { \ |
|
|
|
|
SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp);\ |
|
|
|
|
SUBCRUMB(chainv[5],chainv[6],chainv[7],chainv[4],tmp);\ |
|
|
|
|
MIXWORD(chainv[0],chainv[4]);\ |
|
|
|
|
MIXWORD(chainv[1],chainv[5]);\ |
|
|
|
|
MIXWORD(chainv[2],chainv[6]);\ |
|
|
|
|
MIXWORD(chainv[3],chainv[7]);\ |
|
|
|
|
ADD_CONSTANT(chainv[0],chainv[4],c0,c1); |
|
|
|
|
ADD_CONSTANT(chainv[0],chainv[4],c0,c1);\ |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#define SUBCRUMB(a0,a1,a2,a3,a4)\ |
|
|
|
|
a4 = a0;\ |
|
|
|
@ -104,7 +76,7 @@ typedef struct {
@@ -104,7 +76,7 @@ typedef struct {
|
|
|
|
|
|
|
|
|
|
/* initial values of chaining variables */ |
|
|
|
|
__constant__ uint32_t c_IV[40]; |
|
|
|
|
const uint32_t h2_IV[40] = { |
|
|
|
|
static const uint32_t h_IV[40] = { |
|
|
|
|
0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465, |
|
|
|
|
0x6e292011,0x90152df4,0xee058139,0xdef610bb, |
|
|
|
|
0xc3b44b95,0xd9d2f256,0x70eee9a0,0xde099fa3, |
|
|
|
@ -117,7 +89,7 @@ const uint32_t h2_IV[40] = {
@@ -117,7 +89,7 @@ const uint32_t h2_IV[40] = {
|
|
|
|
|
0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea}; |
|
|
|
|
|
|
|
|
|
__constant__ uint32_t c_CNS[80]; |
|
|
|
|
uint32_t h2_CNS[80] = { |
|
|
|
|
static const uint32_t h_CNS[80] = { |
|
|
|
|
0x303994a6,0xe0337818,0xc0e65299,0x441ba90d, |
|
|
|
|
0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f, |
|
|
|
|
0x1e00108f,0xe5a8bce6,0x7800423d,0x5274baf4, |
|
|
|
@ -142,213 +114,305 @@ uint32_t h2_CNS[80] = {
@@ -142,213 +114,305 @@ uint32_t h2_CNS[80] = {
|
|
|
|
|
|
|
|
|
|
/***************************************************/ |
|
|
|
|
__device__ __forceinline__ |
|
|
|
|
void rnd512(hashState *state) |
|
|
|
|
void rnd512(uint32_t *statebuffer, uint32_t *statechainv) |
|
|
|
|
{ |
|
|
|
|
int i,j; |
|
|
|
|
uint32_t t[40]; |
|
|
|
|
uint32_t chainv[8]; |
|
|
|
|
uint32_t tmp; |
|
|
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0;i<8;i++) { |
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0; i<8; i++) { |
|
|
|
|
t[i]=0; |
|
|
|
|
#pragma unroll 5 |
|
|
|
|
for(j=0;j<5;j++) { |
|
|
|
|
t[i] ^= state->chainv[i+8*j]; |
|
|
|
|
} |
|
|
|
|
#pragma unroll 5 |
|
|
|
|
for(j=0; j<5; j++) |
|
|
|
|
t[i] ^= statechainv[i+8*j]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
MULT2(t, 0); |
|
|
|
|
|
|
|
|
|
#pragma unroll 5 |
|
|
|
|
for(j=0;j<5;j++) { |
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0;i<8;i++) { |
|
|
|
|
state->chainv[i+8*j] ^= t[i]; |
|
|
|
|
} |
|
|
|
|
#pragma unroll 5 |
|
|
|
|
for(j=0; j<5; j++) { |
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0; i<8; i++) |
|
|
|
|
statechainv[i+8*j] ^= t[i]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#pragma unroll 5 |
|
|
|
|
for(j=0;j<5;j++) { |
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0;i<8;i++) { |
|
|
|
|
t[i+8*j] = state->chainv[i+8*j]; |
|
|
|
|
} |
|
|
|
|
#pragma unroll 5 |
|
|
|
|
for(j=0; j<5; j++) { |
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0; i<8; i++) |
|
|
|
|
t[i+8*j] = statechainv[i+8*j]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#pragma unroll 5 |
|
|
|
|
for(j=0;j<5;j++) { |
|
|
|
|
MULT2(state->chainv, j); |
|
|
|
|
} |
|
|
|
|
#pragma unroll |
|
|
|
|
for(j=0; j<5; j++) |
|
|
|
|
MULT2(statechainv, j); |
|
|
|
|
|
|
|
|
|
#pragma unroll 5 |
|
|
|
|
for(j=0;j<5;j++) { |
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0;i<8;i++) { |
|
|
|
|
state->chainv[8*j+i] ^= t[8*((j+1)%5)+i]; |
|
|
|
|
} |
|
|
|
|
#pragma unroll 5 |
|
|
|
|
for(j=0; j<5; j++) { |
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0; i<8; i++) |
|
|
|
|
statechainv[8*j+i] ^= t[8*((j+1)%5)+i]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#pragma unroll 5 |
|
|
|
|
for(j=0;j<5;j++) { |
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0;i<8;i++) { |
|
|
|
|
t[i+8*j] = state->chainv[i+8*j]; |
|
|
|
|
} |
|
|
|
|
#pragma unroll 5 |
|
|
|
|
for(j=0; j<5; j++) { |
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0; i<8; i++) |
|
|
|
|
t[i+8*j] = statechainv[i+8*j]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#pragma unroll 5 |
|
|
|
|
for(j=0;j<5;j++) { |
|
|
|
|
MULT2(state->chainv, j); |
|
|
|
|
} |
|
|
|
|
#pragma unroll |
|
|
|
|
for(j=0; j<5; j++) |
|
|
|
|
MULT2(statechainv, j); |
|
|
|
|
|
|
|
|
|
#pragma unroll 5 |
|
|
|
|
for(j=0;j<5;j++) { |
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0;i<8;i++) { |
|
|
|
|
state->chainv[8*j+i] ^= t[8*((j+4)%5)+i]; |
|
|
|
|
} |
|
|
|
|
#pragma unroll 5 |
|
|
|
|
for(j=0; j<5; j++) { |
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0; i<8; i++) |
|
|
|
|
statechainv[8*j+i] ^= t[8*((j+4)%5)+i]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#pragma unroll 5 |
|
|
|
|
for(j=0;j<5;j++) { |
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0;i<8;i++) { |
|
|
|
|
state->chainv[i+8*j] ^= state->buffer[i]; |
|
|
|
|
} |
|
|
|
|
MULT2(state->buffer, 0); |
|
|
|
|
#pragma unroll 5 |
|
|
|
|
for(j=0; j<5; j++) { |
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0; i<8; i++) |
|
|
|
|
statechainv[i+8*j] ^= statebuffer[i]; |
|
|
|
|
MULT2(statebuffer, 0); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0;i<8;i++) { |
|
|
|
|
chainv[i] = state->chainv[i]; |
|
|
|
|
} |
|
|
|
|
#pragma unroll |
|
|
|
|
for(i=0; i<8; i++) |
|
|
|
|
chainv[i] = statechainv[i]; |
|
|
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0;i<8;i++) { |
|
|
|
|
STEP(c_CNS[(2*i)],c_CNS[(2*i)+1]); |
|
|
|
|
} |
|
|
|
|
#pragma unroll |
|
|
|
|
for(i=0; i<8; i++) |
|
|
|
|
STEP(c_CNS[(2*i)], c_CNS[(2*i)+1]); |
|
|
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0;i<8;i++) { |
|
|
|
|
state->chainv[i] = chainv[i]; |
|
|
|
|
chainv[i] = state->chainv[i+8]; |
|
|
|
|
#pragma unroll |
|
|
|
|
for(i=0; i<8; i++) { |
|
|
|
|
statechainv[i] = chainv[i]; |
|
|
|
|
chainv[i] = statechainv[i+8]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],1); |
|
|
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0;i<8;i++) { |
|
|
|
|
STEP(c_CNS[(2*i)+16],c_CNS[(2*i)+16+1]); |
|
|
|
|
} |
|
|
|
|
#pragma unroll |
|
|
|
|
for(i=0; i<8; i++) |
|
|
|
|
STEP(c_CNS[(2*i)+16], c_CNS[(2*i)+16+1]); |
|
|
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0;i<8;i++) { |
|
|
|
|
state->chainv[i+8] = chainv[i]; |
|
|
|
|
chainv[i] = state->chainv[i+16]; |
|
|
|
|
#pragma unroll |
|
|
|
|
for(i=0; i<8; i++) { |
|
|
|
|
statechainv[i+8] = chainv[i]; |
|
|
|
|
chainv[i] = statechainv[i+16]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],2); |
|
|
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0;i<8;i++) { |
|
|
|
|
#pragma unroll |
|
|
|
|
for(i=0; i<8; i++) |
|
|
|
|
STEP(c_CNS[(2*i)+32],c_CNS[(2*i)+32+1]); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0;i<8;i++) { |
|
|
|
|
state->chainv[i+16] = chainv[i]; |
|
|
|
|
chainv[i] = state->chainv[i+24]; |
|
|
|
|
#pragma unroll |
|
|
|
|
for(i=0; i<8; i++) { |
|
|
|
|
statechainv[i+16] = chainv[i]; |
|
|
|
|
chainv[i] = statechainv[i+24]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],3); |
|
|
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0;i<8;i++) { |
|
|
|
|
#pragma unroll |
|
|
|
|
for(i=0; i<8; i++) |
|
|
|
|
STEP(c_CNS[(2*i)+48],c_CNS[(2*i)+48+1]); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0;i<8;i++) { |
|
|
|
|
state->chainv[i+24] = chainv[i]; |
|
|
|
|
chainv[i] = state->chainv[i+32]; |
|
|
|
|
#pragma unroll |
|
|
|
|
for(i=0; i<8; i++) { |
|
|
|
|
statechainv[i+24] = chainv[i]; |
|
|
|
|
chainv[i] = statechainv[i+32]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],4); |
|
|
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0;i<8;i++) { |
|
|
|
|
#pragma unroll |
|
|
|
|
for(i=0; i<8; i++) |
|
|
|
|
STEP(c_CNS[(2*i)+64],c_CNS[(2*i)+64+1]); |
|
|
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0; i<8; i++) |
|
|
|
|
statechainv[i+32] = chainv[i]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
static void rnd512_cpu(uint32_t *statebuffer, uint32_t *statechainv) |
|
|
|
|
{ |
|
|
|
|
int i, j; |
|
|
|
|
uint32_t t[40]; |
|
|
|
|
uint32_t chainv[8]; |
|
|
|
|
uint32_t tmp; |
|
|
|
|
|
|
|
|
|
for (i = 0; i<8; i++) { |
|
|
|
|
t[i] = statechainv[i]; |
|
|
|
|
for (j = 1; j<5; j++) |
|
|
|
|
t[i] ^= statechainv[i + 8 * j]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0;i<8;i++) { |
|
|
|
|
state->chainv[i+32] = chainv[i]; |
|
|
|
|
MULT2(t, 0); |
|
|
|
|
|
|
|
|
|
for (j = 0; j<5; j++) { |
|
|
|
|
for (i = 0; i<8; i++) |
|
|
|
|
statechainv[i + 8 * j] ^= t[i]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
for (j = 0; j<5; j++) { |
|
|
|
|
for (i = 0; i<8; i++) |
|
|
|
|
t[i + 8 * j] = statechainv[i + 8 * j]; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
for (j = 0; j<5; j++) |
|
|
|
|
MULT2(statechainv, j); |
|
|
|
|
|
|
|
|
|
for (j = 0; j<5; j++) { |
|
|
|
|
for (i = 0; i<8; i++) |
|
|
|
|
statechainv[8 * j + i] ^= t[8 * ((j + 1) % 5) + i]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
for (j = 0; j<5; j++) { |
|
|
|
|
for (i = 0; i<8; i++) |
|
|
|
|
t[i + 8 * j] = statechainv[i + 8 * j]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
for (j = 0; j<5; j++) |
|
|
|
|
MULT2(statechainv, j); |
|
|
|
|
|
|
|
|
|
for (j = 0; j<5; j++) { |
|
|
|
|
for (i = 0; i<8; i++) |
|
|
|
|
statechainv[8 * j + i] ^= t[8 * ((j + 4) % 5) + i]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
for (j = 0; j<5; j++) { |
|
|
|
|
for (i = 0; i<8; i++) |
|
|
|
|
statechainv[i + 8 * j] ^= statebuffer[i]; |
|
|
|
|
MULT2(statebuffer, 0); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
for (i = 0; i<8; i++) |
|
|
|
|
chainv[i] = statechainv[i]; |
|
|
|
|
|
|
|
|
|
for (i = 0; i<8; i++) |
|
|
|
|
STEP(h_CNS[(2 * i)], h_CNS[(2 * i) + 1]); |
|
|
|
|
|
|
|
|
|
for (i = 0; i<8; i++) { |
|
|
|
|
statechainv[i] = chainv[i]; |
|
|
|
|
chainv[i] = statechainv[i + 8]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1); |
|
|
|
|
|
|
|
|
|
for (i = 0; i<8; i++) |
|
|
|
|
STEP(h_CNS[(2 * i) + 16], h_CNS[(2 * i) + 16 + 1]); |
|
|
|
|
|
|
|
|
|
for (i = 0; i<8; i++) { |
|
|
|
|
statechainv[i + 8] = chainv[i]; |
|
|
|
|
chainv[i] = statechainv[i + 16]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2); |
|
|
|
|
|
|
|
|
|
for (i = 0; i<8; i++) |
|
|
|
|
STEP(h_CNS[(2 * i) + 32], h_CNS[(2 * i) + 32 + 1]); |
|
|
|
|
|
|
|
|
|
for (i = 0; i<8; i++) { |
|
|
|
|
statechainv[i + 16] = chainv[i]; |
|
|
|
|
chainv[i] = statechainv[i + 24]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3); |
|
|
|
|
|
|
|
|
|
for (i = 0; i<8; i++) |
|
|
|
|
STEP(h_CNS[(2 * i) + 48], h_CNS[(2 * i) + 48 + 1]); |
|
|
|
|
|
|
|
|
|
for (i = 0; i<8; i++) { |
|
|
|
|
statechainv[i + 24] = chainv[i]; |
|
|
|
|
chainv[i] = statechainv[i + 32]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4); |
|
|
|
|
|
|
|
|
|
for (i = 0; i<8; i++) |
|
|
|
|
STEP(h_CNS[(2 * i) + 64], h_CNS[(2 * i) + 64 + 1]); |
|
|
|
|
|
|
|
|
|
for (i = 0; i<8; i++) |
|
|
|
|
statechainv[i + 32] = chainv[i]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/***************************************************/ |
|
|
|
|
__device__ __forceinline__ |
|
|
|
|
void Update512(hashState *state, const BitSequence *data) |
|
|
|
|
void Update512(uint32_t* statebuffer, uint32_t *statechainv, const uint32_t *const __restrict__ data) |
|
|
|
|
{ |
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)data)[i]); |
|
|
|
|
rnd512(state); |
|
|
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+32))[i]); |
|
|
|
|
rnd512(state); |
|
|
|
|
#pragma unroll 4 |
|
|
|
|
for(int i=0;i<4;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+64))[i]); |
|
|
|
|
#pragma unroll |
|
|
|
|
for (int i = 0; i<8; i++) |
|
|
|
|
statebuffer[i] = cuda_swab32((data[i])); |
|
|
|
|
rnd512(statebuffer, statechainv); |
|
|
|
|
|
|
|
|
|
#pragma unroll |
|
|
|
|
for(int i=0; i<8; i++) |
|
|
|
|
statebuffer[i] = cuda_swab32((data[i+8])); |
|
|
|
|
rnd512(statebuffer, statechainv); |
|
|
|
|
|
|
|
|
|
#pragma unroll |
|
|
|
|
for(int i=0; i<4; i++) |
|
|
|
|
statebuffer[i] = cuda_swab32((data[i+16])); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/***************************************************/ |
|
|
|
|
__device__ __forceinline__ |
|
|
|
|
void finalization512(hashState *state, uint32_t *b) |
|
|
|
|
void finalization512(uint32_t* statebuffer, uint32_t *statechainv, uint32_t *b) |
|
|
|
|
{ |
|
|
|
|
int i,j; |
|
|
|
|
|
|
|
|
|
state->buffer[4] = 0x80000000; |
|
|
|
|
#pragma unroll 3 |
|
|
|
|
for(int i=5;i<8;i++) state->buffer[i] = 0; |
|
|
|
|
rnd512(state); |
|
|
|
|
statebuffer[4] = 0x80000000U; |
|
|
|
|
|
|
|
|
|
#pragma unroll 3 |
|
|
|
|
for(int i=5; i<8; i++) |
|
|
|
|
statebuffer[i] = 0; |
|
|
|
|
rnd512(statebuffer, statechainv); |
|
|
|
|
|
|
|
|
|
/*---- blank round with m=0 ----*/ |
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0;i<8;i++) state->buffer[i] =0; |
|
|
|
|
rnd512(state); |
|
|
|
|
#pragma unroll |
|
|
|
|
for(i=0; i<8; i++) |
|
|
|
|
statebuffer[i] =0; |
|
|
|
|
rnd512(statebuffer, statechainv); |
|
|
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0;i<8;i++) { |
|
|
|
|
#pragma unroll |
|
|
|
|
for(i=0; i<8; i++) { |
|
|
|
|
b[i] = 0; |
|
|
|
|
#pragma unroll 5 |
|
|
|
|
for(j=0;j<5;j++) { |
|
|
|
|
b[i] ^= state->chainv[i+8*j]; |
|
|
|
|
} |
|
|
|
|
b[i] = BYTES_SWAP32((b[i])); |
|
|
|
|
#pragma unroll 5 |
|
|
|
|
for(j=0; j<5; j++) |
|
|
|
|
b[i] ^= statechainv[i+8*j]; |
|
|
|
|
b[i] = cuda_swab32((b[i])); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0;i<8;i++) state->buffer[i]=0; |
|
|
|
|
rnd512(state); |
|
|
|
|
#pragma unroll |
|
|
|
|
for(i=0; i<8; i++) |
|
|
|
|
statebuffer[i]=0; |
|
|
|
|
rnd512(statebuffer, statechainv); |
|
|
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(i=0;i<8;i++) { |
|
|
|
|
#pragma unroll |
|
|
|
|
for(i=0; i<8; i++) |
|
|
|
|
{ |
|
|
|
|
b[8+i] = 0; |
|
|
|
|
#pragma unroll 5 |
|
|
|
|
for(j=0;j<5;j++) { |
|
|
|
|
b[8+i] ^= state->chainv[i+8*j]; |
|
|
|
|
} |
|
|
|
|
b[8+i] = BYTES_SWAP32((b[8+i])); |
|
|
|
|
#pragma unroll 5 |
|
|
|
|
for(j=0; j<5; j++) |
|
|
|
|
b[8+i] ^= statechainv[i+8*j]; |
|
|
|
|
b[8+i] = cuda_swab32((b[8+i])); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/***************************************************/ |
|
|
|
|
// Die Hash-Funktion |
|
|
|
|
__global__ |
|
|
|
|
void qubit_luffa512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash) |
|
|
|
|
void qubit_luffa512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *outputHash) |
|
|
|
|
{ |
|
|
|
|
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); |
|
|
|
|
if (thread < threads) |
|
|
|
@ -359,118 +423,37 @@ void qubit_luffa512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *ou
@@ -359,118 +423,37 @@ void qubit_luffa512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *ou
|
|
|
|
|
uint32_t buf32[32]; |
|
|
|
|
} buff; |
|
|
|
|
|
|
|
|
|
#pragma unroll 16 |
|
|
|
|
for (int i=0; i < 16; ++i) buff.buf64[i] = c_PaddedMessage80[i]; |
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for (int i=8; i < 16; i++) |
|
|
|
|
buff.buf64[i] = c_PaddedMessage80[i]; |
|
|
|
|
|
|
|
|
|
// die Nounce durch die thread-spezifische ersetzen |
|
|
|
|
buff.buf64[9] = REPLACE_HIDWORD(buff.buf64[9], cuda_swab32(nounce)); |
|
|
|
|
|
|
|
|
|
uint32_t statebuffer[8], statechainv[40]; |
|
|
|
|
|
|
|
|
|
hashState state; |
|
|
|
|
#pragma unroll 40 |
|
|
|
|
for(int i=0;i<40;i++) state.chainv[i] = c_IV[i]; |
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(int i=0;i<8;i++) state.buffer[i] = 0; |
|
|
|
|
Update512(&state, (BitSequence*)buff.buf32); |
|
|
|
|
uint32_t *outHash = (uint32_t *)outputHash + 16 * thread; |
|
|
|
|
finalization512(&state, (uint32_t*)outHash); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
__global__ |
|
|
|
|
void qubit_luffa512_gpu_finalhash_80(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce) |
|
|
|
|
{ |
|
|
|
|
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); |
|
|
|
|
if (thread < threads) |
|
|
|
|
{ |
|
|
|
|
uint32_t nounce = startNounce + thread; |
|
|
|
|
union { |
|
|
|
|
uint64_t buf64[16]; |
|
|
|
|
uint32_t buf32[32]; |
|
|
|
|
} buff; |
|
|
|
|
uint32_t Hash[16]; |
|
|
|
|
|
|
|
|
|
#pragma unroll 16 |
|
|
|
|
for (int i=0; i < 16; ++i) buff.buf64[i] = c_PaddedMessage80[i]; |
|
|
|
|
|
|
|
|
|
// Tested nonce |
|
|
|
|
buff.buf64[9] = REPLACE_HIDWORD(buff.buf64[9], cuda_swab32(nounce)); |
|
|
|
|
|
|
|
|
|
hashState state; |
|
|
|
|
#pragma unroll 40 |
|
|
|
|
for(int i=0;i<40;i++) state.chainv[i] = c_IV[i]; |
|
|
|
|
#pragma unroll |
|
|
|
|
for (int i = 0; i<4; i++) |
|
|
|
|
statebuffer[i] = cuda_swab32(buff.buf32[i + 16]); |
|
|
|
|
|
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for(int i=0;i<8;i++) state.buffer[i] = 0; |
|
|
|
|
#pragma unroll 4 |
|
|
|
|
for (int i = 4; i<8; i++) |
|
|
|
|
statebuffer[i] = statebufferpre[i]; |
|
|
|
|
|
|
|
|
|
Update512(&state, (BitSequence*)buff.buf32); |
|
|
|
|
finalization512(&state, Hash); |
|
|
|
|
#pragma unroll |
|
|
|
|
for (int i = 0; i<40; i++) |
|
|
|
|
statechainv[i] = statechainvpre[i]; |
|
|
|
|
|
|
|
|
|
/* dont ask me why not a simple if (Hash[i] > c_Target[i]) return; |
|
|
|
|
* we lose 20% in perfs without the position test */ |
|
|
|
|
int position = -1; |
|
|
|
|
#pragma unroll 8 |
|
|
|
|
for (int i = 7; i >= 0; i--) { |
|
|
|
|
if (Hash[i] > c_Target[i]) { |
|
|
|
|
if (position < i) { |
|
|
|
|
return; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
if (Hash[i] < c_Target[i]) { |
|
|
|
|
if (position < i) { |
|
|
|
|
position = i; |
|
|
|
|
//break; /* impact perfs, unroll ? */ |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
#if NBN == 1 |
|
|
|
|
if (resNounce[0] > nounce) { |
|
|
|
|
resNounce[0] = nounce; |
|
|
|
|
} |
|
|
|
|
#else |
|
|
|
|
/* keep the smallest nounce, + extra one if found */ |
|
|
|
|
if (resNounce[0] > nounce) { |
|
|
|
|
resNounce[1] = resNounce[0]; |
|
|
|
|
resNounce[0] = nounce; |
|
|
|
|
} else { |
|
|
|
|
resNounce[1] = nounce; |
|
|
|
|
} |
|
|
|
|
#endif |
|
|
|
|
uint32_t *outHash = &outputHash[thread * 16]; |
|
|
|
|
finalization512(statebuffer, statechainv, outHash); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
__host__ |
|
|
|
|
void qubit_luffa512_cpu_init(int thr_id, uint32_t threads) |
|
|
|
|
{ |
|
|
|
|
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_IV, h2_IV, sizeof(h2_IV), 0, cudaMemcpyHostToDevice)); |
|
|
|
|
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_CNS, h2_CNS, sizeof(h2_CNS), 0, cudaMemcpyHostToDevice)); |
|
|
|
|
CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], NBN * sizeof(uint32_t))); |
|
|
|
|
CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], NBN * sizeof(uint32_t))); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
__host__ |
|
|
|
|
uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash,int order) |
|
|
|
|
{ |
|
|
|
|
uint32_t result = UINT32_MAX; |
|
|
|
|
cudaMemset(d_resNounce[thr_id], 0xff, NBN * sizeof(uint32_t)); |
|
|
|
|
const uint32_t threadsperblock = 256; |
|
|
|
|
|
|
|
|
|
dim3 grid((threads + threadsperblock-1)/threadsperblock); |
|
|
|
|
dim3 block(threadsperblock); |
|
|
|
|
|
|
|
|
|
size_t shared_size = 0; |
|
|
|
|
|
|
|
|
|
qubit_luffa512_gpu_finalhash_80 <<<grid, block, shared_size>>> (threads, startNounce, d_outputHash, d_resNounce[thr_id]); |
|
|
|
|
cudaThreadSynchronize(); |
|
|
|
|
if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], NBN * sizeof(uint32_t), cudaMemcpyDeviceToHost)) { |
|
|
|
|
//cudaThreadSynchronize(); |
|
|
|
|
result = h_resNounce[thr_id][0]; |
|
|
|
|
#if NBN > 1 |
|
|
|
|
extra_results[0] = h_resNounce[thr_id][1]; |
|
|
|
|
#endif |
|
|
|
|
} |
|
|
|
|
return result; |
|
|
|
|
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_IV, h_IV, sizeof(h_IV), 0, cudaMemcpyHostToDevice)); |
|
|
|
|
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_CNS, h_CNS, sizeof(h_CNS), 0, cudaMemcpyHostToDevice)); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
__host__ |
|
|
|
@ -486,22 +469,38 @@ void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNoun
@@ -486,22 +469,38 @@ void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNoun
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
__host__ |
|
|
|
|
void qubit_luffa512_cpu_setBlock_80(void *pdata) |
|
|
|
|
void qubit_cpu_precalc(uint32_t* message) |
|
|
|
|
{ |
|
|
|
|
unsigned char PaddedMessage[128]; |
|
|
|
|
|
|
|
|
|
memcpy(PaddedMessage, pdata, 80); |
|
|
|
|
memset(PaddedMessage+80, 0, 48); |
|
|
|
|
PaddedMessage[80] = 0x80; |
|
|
|
|
PaddedMessage[111] = 1; |
|
|
|
|
PaddedMessage[126] = 0x02; |
|
|
|
|
PaddedMessage[127] = 0x80; |
|
|
|
|
|
|
|
|
|
CUDA_SAFE_CALL(cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); |
|
|
|
|
uint32_t statebuffer[8]; |
|
|
|
|
uint32_t statechainv[40] = |
|
|
|
|
{ |
|
|
|
|
0x6d251e69, 0x44b051e0, 0x4eaa6fb4, 0xdbf78465, |
|
|
|
|
0x6e292011, 0x90152df4, 0xee058139, 0xdef610bb, |
|
|
|
|
0xc3b44b95, 0xd9d2f256, 0x70eee9a0, 0xde099fa3, |
|
|
|
|
0x5d9b0557, 0x8fc944b3, 0xcf1ccf0e, 0x746cd581, |
|
|
|
|
0xf7efc89d, 0x5dba5781, 0x04016ce5, 0xad659c05, |
|
|
|
|
0x0306194f, 0x666d1836, 0x24aa230a, 0x8b264ae7, |
|
|
|
|
0x858075d5, 0x36d79cce, 0xe571f7d7, 0x204b1f67, |
|
|
|
|
0x35870c6a, 0x57e9e923, 0x14bcb808, 0x7cde72ce, |
|
|
|
|
0x6c68e9be, 0x5ec41e22, 0xc825b7c7, 0xaffb4363, |
|
|
|
|
0xf5df3999, 0x0fc688f1, 0xb07224cc, 0x03e86cea |
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
for (int i = 0; i<8; i++) |
|
|
|
|
statebuffer[i] = cuda_swab32(message[i]); |
|
|
|
|
rnd512_cpu(statebuffer, statechainv); |
|
|
|
|
|
|
|
|
|
for (int i = 0; i<8; i++) |
|
|
|
|
statebuffer[i] = cuda_swab32(message[i+8]); |
|
|
|
|
|
|
|
|
|
rnd512_cpu(statebuffer, statechainv); |
|
|
|
|
|
|
|
|
|
cudaMemcpyToSymbol(statebufferpre, statebuffer, sizeof(statebuffer), 0, cudaMemcpyHostToDevice); |
|
|
|
|
cudaMemcpyToSymbol(statechainvpre, statechainv, sizeof(statechainv), 0, cudaMemcpyHostToDevice); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
__host__ |
|
|
|
|
void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget) |
|
|
|
|
void qubit_luffa512_cpu_setBlock_80(void *pdata) |
|
|
|
|
{ |
|
|
|
|
unsigned char PaddedMessage[128]; |
|
|
|
|
|
|
|
|
@ -512,6 +511,6 @@ void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget)
@@ -512,6 +511,6 @@ void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget)
|
|
|
|
|
PaddedMessage[126] = 0x02; |
|
|
|
|
PaddedMessage[127] = 0x80; |
|
|
|
|
|
|
|
|
|
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Target, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); |
|
|
|
|
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); |
|
|
|
|
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, sizeof(PaddedMessage), 0, cudaMemcpyHostToDevice)); |
|
|
|
|
qubit_cpu_precalc((uint32_t*) PaddedMessage); |
|
|
|
|
} |
|
|
|
|