Browse Source

fix duplicates on skein2 and blake2s (nonce endian)

2upstream
Tanguy Pruvot 9 years ago
parent
commit
710d9292af
  1. 81
      Algo256/blake2s.cu
  2. 15
      skein2.cpp

81
Algo256/blake2s.cu

@ -28,7 +28,7 @@ static __thread blake2s_state ALIGN(A) s_ctx;
#endif #endif
#ifndef GPU_MIDSTATE #ifndef GPU_MIDSTATE
__constant__ uint32_t d_data[20]; __constant__ uint2 d_data[10];
#else #else
__constant__ blake2s_state ALIGN(8) d_state[1]; __constant__ blake2s_state ALIGN(8) d_state[1];
#endif #endif
@ -42,7 +42,9 @@ static uint32_t *h_resNonce[MAX_GPUS];
/* max count of found nonces in one call */ /* max count of found nonces in one call */
#define NBN 2 #define NBN 2
#if NBN > 1
static uint32_t extra_results[NBN] = { UINT32_MAX }; static uint32_t extra_results[NBN] = { UINT32_MAX };
#endif
extern "C" void blake2s_hash(void *output, const void *input) extern "C" void blake2s_hash(void *output, const void *input)
{ {
@ -196,6 +198,7 @@ void gpu_blake2s_compress(blake2s_state *S, const uint32_t *block)
#undef ROUND #undef ROUND
} }
#if 0
/* unused but kept as reference */ /* unused but kept as reference */
__device__ __forceinline__ __device__ __forceinline__
void gpu_blake2s_update(blake2s_state *S, const uint8_t *in, uint64_t inlen) void gpu_blake2s_update(blake2s_state *S, const uint8_t *in, uint64_t inlen)
@ -225,17 +228,21 @@ void gpu_blake2s_update(blake2s_state *S, const uint8_t *in, uint64_t inlen)
} }
} }
} }
#endif
#ifndef GPU_MIDSTATE
__device__ __forceinline__ __device__ __forceinline__
void gpu_blake2s_update76(blake2s_state *S, const void *input) void gpu_blake2s_fill_data(blake2s_state *S, const uint32_t nonce)
{ {
uint64_t *b64 = (uint64_t*) S->buf; uint2 *b2 = (uint2*) S->buf;
uint64_t *i64 = (uint64_t*) input;
#pragma unroll #pragma unroll
for (int i=0; i < 80/8; i++) for (int i=0; i < 9; i++)
b64[i] = i64[i]; b2[i] = d_data[i];
//S->buflen = 76; b2[9].x = d_data[9].x;
b2[9].y = nonce;
S->buflen = 80;
} }
#endif
__device__ __forceinline__ __device__ __forceinline__
void gpu_blake2s_update_nonce(blake2s_state *S, const uint32_t nonce) void gpu_blake2s_update_nonce(blake2s_state *S, const uint32_t nonce)
@ -245,7 +252,7 @@ void gpu_blake2s_update_nonce(blake2s_state *S, const uint32_t nonce)
} }
__device__ __forceinline__ __device__ __forceinline__
void gpu_blake2s_final(blake2s_state *S, uint32_t *out) uint2 gpu_blake2s_final(blake2s_state *S)
{ {
//if (S->buflen > BLAKE2S_BLOCKBYTES) //if (S->buflen > BLAKE2S_BLOCKBYTES)
{ {
@ -260,9 +267,10 @@ void gpu_blake2s_final(blake2s_state *S, uint32_t *out)
//memset(&S->buf[S->buflen], 0, 2 * BLAKE2S_BLOCKBYTES - S->buflen); /* Padding */ //memset(&S->buf[S->buflen], 0, 2 * BLAKE2S_BLOCKBYTES - S->buflen); /* Padding */
gpu_blake2s_compress(S, (uint32_t*) (S->buf + BLAKE2S_BLOCKBYTES)); gpu_blake2s_compress(S, (uint32_t*) (S->buf + BLAKE2S_BLOCKBYTES));
#pragma unroll //#pragma unroll
for (int i = 0; i < 8; i++) //for (int i = 0; i < 8; i++)
out[i] = S->h[i]; // out[i] = S->h[i];
return make_uint2(S->h[6], S->h[7]);
} }
/* init2 xors IV with input parameter block */ /* init2 xors IV with input parameter block */
@ -286,12 +294,13 @@ void gpu_blake2s_init_param(blake2s_state *S, const blake2s_param *P)
S->buflen = 0; S->buflen = 0;
#pragma unroll #pragma unroll
for (int i = 0; i < sizeof(S->buf)/8; i++) for (int i = 8; i < sizeof(S->buf)/8; i++)
gpu_store64(S->buf + (8*i), 0); gpu_store64(S->buf + (8*i), 0);
uint64_t *p = (uint64_t*) P; uint64_t *p = (uint64_t*) P;
/* IV XOR ParamBlock */ /* IV XOR ParamBlock */
#pragma unroll
for (int i = 0; i < 4; i++) for (int i = 0; i < 4; i++)
S->h[i] ^= gpu_load64(&p[i]); S->h[i] ^= gpu_load64(&p[i]);
} }
@ -333,25 +342,23 @@ void gpu_copystate(blake2s_state *dst, blake2s_state *src)
} }
__global__ __global__
void blake2s_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint2 target2) void blake2s_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint2 target2, const int swap)
{ {
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
const uint32_t nonce = startNonce + thread; const uint32_t nonce = swap ? cuda_swab32(startNonce + thread) : startNonce + thread;
blake2s_state ALIGN(8) blake2_ctx; blake2s_state ALIGN(8) blake2_ctx;
#ifndef GPU_MIDSTATE #ifndef GPU_MIDSTATE
gpu_blake2s_init(&blake2_ctx, BLAKE2S_OUTBYTES); gpu_blake2s_init(&blake2_ctx, BLAKE2S_OUTBYTES);
//gpu_blake2s_update(&blake2_ctx, (uint8_t*) d_data, 76); //gpu_blake2s_update(&blake2_ctx, (uint8_t*) d_data, 76);
gpu_blake2s_update76(&blake2_ctx, (uint64_t*) d_data); gpu_blake2s_fill_data(&blake2_ctx, nonce);
#else #else
gpu_copystate(&blake2_ctx, &d_state[0]); gpu_copystate(&blake2_ctx, &d_state[0]);
#endif
gpu_blake2s_update_nonce(&blake2_ctx, nonce); gpu_blake2s_update_nonce(&blake2_ctx, nonce);
#endif
uint32_t hash[8]; uint2 h2 = gpu_blake2s_final(&blake2_ctx);
gpu_blake2s_final(&blake2_ctx, hash); if (h2.y <= target2.y && h2.x <= target2.x) {
if (hash[7] <= target2.x && hash[6] <= target2.y) {
#if NBN == 2 #if NBN == 2
if (resNonce[0] != UINT32_MAX) if (resNonce[0] != UINT32_MAX)
resNonce[1] = nonce; resNonce[1] = nonce;
@ -363,8 +370,12 @@ void blake2s_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_
} }
} }
static __inline uint32_t swab32_if(uint32_t val, bool iftrue) {
return iftrue ? swab32(val) : val;
}
__host__ __host__
uint32_t blake2s_hash_cuda(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2) uint32_t blake2s_hash_cuda(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2, const int swap)
{ {
uint32_t result = UINT32_MAX; uint32_t result = UINT32_MAX;
@ -375,13 +386,15 @@ uint32_t blake2s_hash_cuda(const int thr_id, const uint32_t threads, const uint3
if (cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess) if (cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess)
return result; return result;
blake2s_gpu_hash <<<grid, block>>> (threads, startNonce, d_resNonce[thr_id], target2); blake2s_gpu_hash <<<grid, block>>> (threads, startNonce, d_resNonce[thr_id], target2, swap);
cudaThreadSynchronize(); cudaThreadSynchronize();
if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { if (cudaSuccess == cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
result = h_resNonce[thr_id][0]; result = swab32_if(h_resNonce[thr_id][0], swap);
#if NBN > 1
for (int n=0; n < (NBN-1); n++) for (int n=0; n < (NBN-1); n++)
extra_results[n] = h_resNonce[thr_id][n+1]; extra_results[n] = swab32_if(h_resNonce[thr_id][n+1], swap);
#endif
} }
return result; return result;
} }
@ -393,6 +406,7 @@ extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonc
uint32_t _ALIGN(64) endiandata[20]; uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
uint32_t *ptarget = work->target; uint32_t *ptarget = work->target;
const int swap = 1; // to toggle nonce endian
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
@ -436,10 +450,10 @@ extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonc
blake2s_setBlock(endiandata, &s_midstate); blake2s_setBlock(endiandata, &s_midstate);
const uint2 target = make_uint2(ptarget[7], ptarget[6]); const uint2 target = make_uint2(ptarget[6], ptarget[7]);
do { do {
uint32_t foundNonce = blake2s_hash_cuda(thr_id, throughput, pdata[19], target); uint32_t foundNonce = blake2s_hash_cuda(thr_id, throughput, pdata[19], target, swap);
*hashes_done = pdata[19] - first_nonce + throughput; *hashes_done = pdata[19] - first_nonce + throughput;
@ -448,18 +462,18 @@ extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonc
uint32_t _ALIGN(A) vhashcpu[8]; uint32_t _ALIGN(A) vhashcpu[8];
//blake2s_hash(vhashcpu, endiandata); //blake2s_hash(vhashcpu, endiandata);
le32enc(&endiandata[19], foundNonce); endiandata[19] = swab32_if(foundNonce, swap);
blake2s_hash_end(vhashcpu, endiandata); blake2s_hash_end(vhashcpu, endiandata);
if (vhashcpu[7] <= target.x && fulltest(vhashcpu, ptarget)) { if (vhashcpu[7] <= target.y && fulltest(vhashcpu, ptarget)) {
work_set_target_ratio(work, vhashcpu); work_set_target_ratio(work, vhashcpu);
pdata[19] = work->nonces[0] = swab32(foundNonce); pdata[19] = work->nonces[0] = swab32_if(foundNonce, !swap);
#if NBN > 1 #if NBN > 1
if (extra_results[0] != UINT32_MAX) { if (extra_results[0] != UINT32_MAX) {
le32enc(&endiandata[19], extra_results[0]); endiandata[19] = swab32_if(extra_results[0], swap);
blake2s_hash_end(vhashcpu, endiandata); blake2s_hash_end(vhashcpu, endiandata);
if (vhashcpu[7] <= target.x && fulltest(vhashcpu, ptarget)) { if (vhashcpu[7] <= target.y && fulltest(vhashcpu, ptarget)) {
work->nonces[1] = swab32(extra_results[0]); work->nonces[1] = swab32_if(extra_results[0], !swap);
if (bn_hash_target_ratio(vhashcpu, ptarget) > work->shareratio) { if (bn_hash_target_ratio(vhashcpu, ptarget) > work->shareratio) {
work_set_target_ratio(work, vhashcpu); work_set_target_ratio(work, vhashcpu);
xchg(work->nonces[1], pdata[19]); xchg(work->nonces[1], pdata[19]);
@ -472,9 +486,6 @@ extern "C" int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonc
return 1; return 1;
} else { } else {
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce); gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
applog_hex(pdata, 80);
applog_hex(ptarget, 32);
applog_hex(vhashcpu, 32);
} }
} }

15
skein2.cpp

@ -37,12 +37,17 @@ void skein2hash(void *output, const void *input)
static bool init[MAX_GPUS] = { 0 }; static bool init[MAX_GPUS] = { 0 };
static __inline uint32_t swab32_if(uint32_t val, bool iftrue) {
return iftrue ? swab32(val) : val;
}
int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{ {
int dev_id = device_map[thr_id]; int dev_id = device_map[thr_id];
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
uint32_t *ptarget = work->target; uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
const int swap = 1; // to toggle nonce endian
uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 256*256*8 uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 256*256*8
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
@ -81,7 +86,7 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned
int order = 0; int order = 0;
// Hash with CUDA // Hash with CUDA
skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 0); skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], swap);
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
*hashes_done = pdata[19] - first_nonce + throughput; *hashes_done = pdata[19] - first_nonce + throughput;
@ -91,7 +96,7 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned
{ {
uint32_t _ALIGN(64) vhash64[8]; uint32_t _ALIGN(64) vhash64[8];
endiandata[19] = foundNonce; endiandata[19] = swab32_if(foundNonce, swap);
skein2hash(vhash64, endiandata); skein2hash(vhash64, endiandata);
if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) { if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
@ -102,14 +107,14 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned
if (!opt_quiet) if (!opt_quiet)
applog(LOG_BLUE, "GPU #%d: found second nonce %08x !", dev_id, swab32(secNonce)); applog(LOG_BLUE, "GPU #%d: found second nonce %08x !", dev_id, swab32(secNonce));
endiandata[19] = secNonce; endiandata[19] = swab32_if(secNonce, swap);
skein2hash(vhash64, endiandata); skein2hash(vhash64, endiandata);
if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio) if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio)
work_set_target_ratio(work, vhash64); work_set_target_ratio(work, vhash64);
pdata[21] = swab32(secNonce); pdata[21] = swab32_if(secNonce, !swap);
res++; res++;
} }
pdata[19] = swab32(foundNonce); pdata[19] = swab32_if(foundNonce, !swap);
return res; return res;
} else { } else {
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce); gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);

Loading…
Cancel
Save