
whirlpool midstate and debug/trace defines

+ new cuda_debug.cuh include to trace gpu data

Happy new year!

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Author: Tanguy Pruvot
Commit: 76a22479b1
 README.txt                |   3
 cuda_debug.cuh            |  47 (new)
 x11/sib.cu                |  24
 x11/x11.cu                |  17
 x15/cuda_x15_whirlpool.cu | 169
 x15/whirlpool.cu          |  19

README.txt

@@ -1,5 +1,5 @@
-ccMiner release 1.7.1 (Dec 2015) "Mining Diff & basic MultiAlgo"
+ccMiner release 1.7.1 (Dec 2015) "Sibcoin & Whirlpool midstate"
 ---------------------------------------------------------------
 ***************************************************************
@@ -235,6 +235,7 @@ features.
 Dec. 31th 2015 v1.7.1
 	Implement sib algo (X11 + Russian Streebog-512/GOST)
+	Whirlpool speed x2 with the midstate precompute
 	Small bug fixes about device ids mapping (and vendor names)
 Nov. 06th 2015 v1.7

cuda_debug.cuh (new file)

@@ -0,0 +1,47 @@
+/**
+ * Helper to trace gpu computed data with --cputest
+ *
+ * Sample usage in an algo scan cuda unit:
+ *
+ *   #define _DEBUG
+ *   #define _DEBUG_PREFIX "x11-"
+ *   #include "cuda_debug.cuh"
+ *
+ *   TRACE64("luffa", d_hash);
+ * or
+ *   TRACE("luffa")
+ *
+ * Don't forget to link the scan function in util.cpp (do_gpu_tests)
+ *
+ */
+
+#include <stdio.h>
+//#include "cuda_helper.h"
+
+#ifndef _DEBUG_PREFIX
+#define _DEBUG_PREFIX ""
+#endif
+
+#ifdef _DEBUG
+#define TRACE64(algo, d_buf) { \
+	if (max_nonce == 1 && pdata[19] <= 1 && !opt_benchmark) { \
+		uint32_t oft = 0; \
+		uint32_t* debugbuf = NULL; \
+		cudaMallocHost(&debugbuf, 16*sizeof(uint32_t)); \
+		cudaMemcpy(debugbuf, d_buf[thr_id] + oft, 16*sizeof(uint32_t), cudaMemcpyDeviceToHost); \
+		printf(_DEBUG_PREFIX "%s %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", \
+			algo, \
+			swab32(debugbuf[0]), swab32(debugbuf[1]), swab32(debugbuf[2]), swab32(debugbuf[3]), \
+			swab32(debugbuf[4]), swab32(debugbuf[5]), swab32(debugbuf[6]), swab32(debugbuf[7]), \
+			swab32(debugbuf[8]), swab32(debugbuf[9]), swab32(debugbuf[10]),swab32(debugbuf[11]), \
+			swab32(debugbuf[12]),swab32(debugbuf[13]),swab32(debugbuf[14]),swab32(debugbuf[15])); \
+		cudaFreeHost(debugbuf); \
+	} \
+}
+#else
+#define TRACE64(algo, d_buf) {}
+#endif
+
+// simplified default
+#define TRACE(algo) TRACE64(algo, d_hash)

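For reference, a minimal sketch of how an algo unit is meant to consume this header, following the sample usage in the comment above (the unit name and kernel call are placeholders; thr_id, pdata, max_nonce, opt_benchmark and d_hash are the usual ccminer scan-loop variables that the macro reads implicitly):

	// hypothetical scan unit, at file scope before scanhash_myalgo()
	#define _DEBUG                  // enable tracing for --cputest runs
	#define _DEBUG_PREFIX "myalgo-" // prefix printed before each trace line
	#include "cuda_debug.cuh"

	// inside the scan loop, after each chained kernel:
	//   myalgo_kernel_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
	//   TRACE("kernel");  // dumps the first 16 words of d_hash[thr_id]

Since TRACE64 expands to an empty block when _DEBUG is not defined, the trace calls can stay in place with no release overhead.
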
x11/sib.cu

@@ -22,8 +22,6 @@ extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNou
 #include <stdio.h>
 #include <memory.h>
 
-//#define _DEBUG
-
 static uint32_t *d_hash[MAX_GPUS];
 
 // Sibcoin CPU Hash
@@ -71,7 +69,6 @@ extern "C" void sibhash(void *output, const void *input)
 	sph_gost512_init(&ctx_gost);
 	sph_gost512(&ctx_gost, (const void*) hash, 64);
 	sph_gost512_close(&ctx_gost, (void*) hash);
-	//applog_hash64(hash);
 
 	sph_luffa512_init(&ctx_luffa);
 	sph_luffa512 (&ctx_luffa, (const void*) hash, 64);
@@ -96,24 +93,9 @@ extern "C" void sibhash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }
 
-#ifdef _DEBUG
-#define TRACE(algo) { \
-	if (max_nonce == 1 && pdata[19] <= 1 && !opt_benchmark) { \
-		uint32_t oft = 0; \
-		uint32_t* debugbuf = NULL; \
-		cudaMallocHost(&debugbuf, 16*sizeof(uint32_t)); \
-		cudaMemcpy(debugbuf, d_hash[thr_id] + oft, 16*sizeof(uint32_t), cudaMemcpyDeviceToHost); \
-		printf("SIB %s %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", algo, \
-			swab32(debugbuf[0]), swab32(debugbuf[1]), swab32(debugbuf[2]), swab32(debugbuf[3]), \
-			swab32(debugbuf[4]), swab32(debugbuf[5]), swab32(debugbuf[6]), swab32(debugbuf[7]), \
-			swab32(debugbuf[8]), swab32(debugbuf[9]), swab32(debugbuf[10]),swab32(debugbuf[11]), \
-			swab32(debugbuf[12]),swab32(debugbuf[13]),swab32(debugbuf[14]),swab32(debugbuf[15])); \
-		cudaFreeHost(debugbuf); \
-	} \
-}
-#else
-#define TRACE(algo) {}
-#endif
+//#define _DEBUG
+#define _DEBUG_PREFIX "sib"
+#include "cuda_debug.cuh"
 
 static bool init[MAX_GPUS] = { 0 };

x11/x11.cu

@@ -87,20 +87,9 @@ extern "C" void x11hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }
 
-#ifdef _DEBUG
-#define TRACE(algo) { \
-	if (max_nonce == 1 && pdata[19] <= 1) { \
-		uint32_t* debugbuf = NULL; \
-		cudaMallocHost(&debugbuf, 8*sizeof(uint32_t)); \
-		cudaMemcpy(debugbuf, d_hash[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost); \
-		printf("X11 %s %08x %08x %08x %08x...\n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \
-			swab32(debugbuf[2]), swab32(debugbuf[3])); \
-		cudaFreeHost(debugbuf); \
-	} \
-}
-#else
-#define TRACE(algo) {}
-#endif
+//#define _DEBUG
+#define _DEBUG_PREFIX "x11"
+#include "cuda_debug.cuh"
 
 static bool init[MAX_GPUS] = { 0 };

x15/cuda_x15_whirlpool.cu

@@ -1,43 +1,9 @@
-/*
- * Built on cbuchner1's implementation, actual hashing code
- * based on sphlib 3.0
- */
-#include <stdio.h>
-#include <memory.h>
-
-#define threadsperblock 256
-
-//#define __DEV_STORAGE__ __constant__
-#define __DEV_STORAGE__ __device__
-
-#include "cuda_helper.h"
-
-extern __device__ __device_builtin__ void __threadfence_block(void);
-
-__DEV_STORAGE__ static uint64_t c_PaddedMessage80[16]; // input end block after midstate
-__DEV_STORAGE__ static uint32_t pTarget[8];
-
-static uint32_t *h_wnounce[MAX_GPUS] = { 0 };
-static uint32_t *d_WNonce[MAX_GPUS] = { 0 };
-
-#define USE_ALL_TABLES 1
-
-__DEV_STORAGE__ static uint64_t mixTob0Tox[256];
-#if USE_ALL_TABLES
-__DEV_STORAGE__ static uint64_t mixTob1Tox[256];
-__DEV_STORAGE__ static uint64_t mixTob2Tox[256];
-__DEV_STORAGE__ static uint64_t mixTob3Tox[256];
-__DEV_STORAGE__ static uint64_t mixTob4Tox[256];
-__DEV_STORAGE__ static uint64_t mixTob5Tox[256];
-__DEV_STORAGE__ static uint64_t mixTob6Tox[256];
-__DEV_STORAGE__ static uint64_t mixTob7Tox[256];
-#endif
-
 /**
- * Whirlpool CUDA kernel implementation.
+ * Whirlpool-512 CUDA implementation.
  *
  * ==========================(LICENSE BEGIN)============================
  *
- * Copyright (c) 2014 djm34 & tpruvot & SP
+ * Copyright (c) 2014-2016 djm34, tpruvot, SP
  *
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
@@ -59,10 +25,40 @@ __DEV_STORAGE__ static uint64_t mixTob7Tox[256];
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  *
  * ===========================(LICENSE END)=============================
- * @author djm34
- * @author tpruvot
- * @author SP
+ * @author djm34 (initial draft)
+ * @author tpruvot (dual old/whirlpool modes, midstate)
+ * @author SP ("final" function opt and tuning)
  */
+
+#include <stdio.h>
+#include <memory.h>
+
+// don't change, used by shared mem fetch!
+#define threadsperblock 256
+
+#include "cuda_helper.h"
+#include "miner.h"
+
+extern __device__ __device_builtin__ void __threadfence_block(void);
+
+__device__ static uint64_t c_PaddedMessage80[16];
+__device__ static uint32_t pTarget[8];
+
+static uint32_t *h_wnounce[MAX_GPUS] = { 0 };
+static uint32_t *d_WNonce[MAX_GPUS] = { 0 };
+
+#define HOST_MIDSTATE 1
+#define USE_ALL_TABLES 1
+
+__constant__ static uint64_t mixTob0Tox[256];
+#if USE_ALL_TABLES
+__constant__ static uint64_t mixTob1Tox[256];
+__constant__ static uint64_t mixTob2Tox[256];
+__constant__ static uint64_t mixTob3Tox[256];
+__constant__ static uint64_t mixTob4Tox[256];
+__constant__ static uint64_t mixTob5Tox[256];
+__constant__ static uint64_t mixTob6Tox[256];
+__constant__ static uint64_t mixTob7Tox[256];
+#endif
 
 static const uint64_t old1_T0[256] = {
 	SPH_C64(0x78D8C07818281818), SPH_C64(0xAF2605AF23652323),
@@ -2181,7 +2177,7 @@ static const uint64_t plain_T7[256] = {
 /**
  * Round constants.
  */
-__DEV_STORAGE__ uint64_t InitVector_RC[10];
+__device__ uint64_t InitVector_RC[10];
 
 static const uint64_t plain_RC[10] = {
 	SPH_C64(0x4F01B887E8C62318),
@@ -2291,7 +2287,7 @@ const int i0, const int i1, const int i2, const int i3, const int i4, const int
 __global__
-void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash)
+void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash, int swab)
 {
 	__shared__ uint64_t sharedMemory[2048];
@@ -2307,29 +2303,30 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
 		sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x];
 #endif
 	}
+	__threadfence_block(); // ensure shared mem is ready
 
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		uint32_t nounce = startNounce + thread;
-		union {
-			uint8_t h1[64];
-			uint32_t h4[16];
-			uint64_t h8[8];
-		} hash;
-
 		uint64_t n[8];
 		uint64_t h[8];
+		uint32_t nonce = startNounce + thread;
+		nonce = swab ? cuda_swab32(nonce) : nonce;
 
+#if HOST_MIDSTATE
+		uint64_t state[8];
+		#pragma unroll 8
+		for (int i=0; i < 8; i++) {
+			state[i] = c_PaddedMessage80[i];
+		}
+#else
 		#pragma unroll 8
 		for (int i=0; i<8; i++) {
 			n[i] = c_PaddedMessage80[i]; // read data
 			h[i] = 0; // read state
 		}
-		__threadfence_block(); // ensure shared mem is ready
 
-		// #pragma unroll 10
+		#pragma unroll 1
 		for (unsigned r=0; r < 10; r++) {
 			uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 			ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]);
@@ -2341,11 +2338,11 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
 		for (int i=0; i < 8; i++) {
 			state[i] = xor1(n[i],c_PaddedMessage80[i]);
 		}
+#endif
 
 		/// round 2 ///////
 		//////////////////////////////////
 		n[0] = c_PaddedMessage80[8]; //read data
-		n[1] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nounce)); //whirlpool
+		n[1] = REPLACE_HIDWORD(c_PaddedMessage80[9], nonce); //whirlpool
 		n[2] = 0x0000000000000080; //whirlpool
 		n[3] = 0;
 		n[4] = 0;
@@ -2359,7 +2356,7 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
 			n[i] = xor1(n[i],h[i]);
 		}
 
-		// #pragma unroll 10
+		// #pragma unroll
 		for (unsigned r=0; r < 10; r++) {
 			uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 			ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]);
@@ -2367,7 +2364,7 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
 		}
 
 		state[0] = xor3(state[0], n[0], c_PaddedMessage80[8]);
-		state[1] = xor3(state[1], n[1], REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nounce)) );
+		state[1] = xor3(state[1], n[1], REPLACE_HIDWORD(c_PaddedMessage80[9], nonce) );
 		state[2] = xor3(state[2], n[2], 0x0000000000000080);
 		state[3] = xor1(state[3], n[3]);
 		state[4] = xor1(state[4], n[4]);
@@ -2375,15 +2372,10 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
 		state[6] = xor1(state[6], n[6]);
 		state[7] = xor3(state[7], n[7], 0x8002000000000000);
 
+		uint64_t* outHash = &(((uint64_t*)outputHash)[(size_t)8 * thread]);
 		#pragma unroll 8
 		for (unsigned i = 0; i < 8; i++)
-			hash.h8[i] = state[i];
-
-		uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
-		#pragma unroll 16
-		for (int i=0; i<16; i++)
-			outHash[i] = hash.h4[i];
+			outHash[i] = state[i];
 
 	} // thread < threads
 }
@@ -2405,6 +2397,7 @@ void x15_whirlpool_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t
 		sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x];
 #endif
 	}
+	__threadfence_block(); // ensure shared mem is ready
 
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -2465,7 +2458,7 @@
 }
 
 __global__
-void oldwhirlpool_gpu_finalhash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint32_t *resNounce)
+void oldwhirlpool_gpu_finalhash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resNounce)
 {
 	__shared__ uint64_t sharedMemory[2048];
@@ -2482,14 +2475,13 @@ void oldwhirlpool_gpu_finalhash_64(uint32_t threads, uint32_t startNounce, uint6
 		sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x];
 #endif
 	}
+	__threadfence_block(); // ensure shared mem is ready
 
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread);
-
-		int hashPosition = nounce - startNounce;
-		uint64_t *inpHash = (uint64_t*) &g_hash[8 * hashPosition];
+		uint32_t nonce = startNounce + thread;
+		uint64_t *inpHash = (uint64_t*) &g_hash[(size_t)8 * thread];
 
 		uint64_t h8[8];
 		#pragma unroll 8
@@ -2529,7 +2521,7 @@ void oldwhirlpool_gpu_finalhash_64(uint32_t threads, uint32_t startNounce, uint6
 			n[i] = xor1(n[i], h[i]);
 		}
 
-		#pragma unroll 10
+		// #pragma unroll 10
 		for (unsigned r=0; r < 10; r++) {
 			uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 			ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]);
@@ -2546,8 +2538,8 @@ void oldwhirlpool_gpu_finalhash_64(uint32_t threads, uint32_t startNounce, uint6
 		state[7] = xor3(state[7], n[7], 0x2000000000000);
 
 		bool rc = (state[3] <= ((uint64_t*)pTarget)[3]);
-		if (rc && resNounce[0] > nounce)
-			resNounce[0] = nounce;
+		if (rc && resNounce[0] > nonce)
+			resNounce[0] = nonce;
 	}
 }
@@ -2581,18 +2573,20 @@ extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int mode)
 		cudaMemcpyToSymbol(mixTob6Tox, old1_T6, (256*8), 0, cudaMemcpyHostToDevice);
 		cudaMemcpyToSymbol(mixTob7Tox, old1_T7, (256*8), 0, cudaMemcpyHostToDevice);
 #endif
-		cudaMalloc(&d_WNonce[thr_id], sizeof(uint32_t));
-		cudaMallocHost(&h_wnounce[thr_id], sizeof(uint32_t));
 		break;
 	}
+
+	cudaMalloc(&d_WNonce[thr_id], sizeof(uint32_t));
+	cudaMallocHost(&h_wnounce[thr_id], sizeof(uint32_t));
 }
 
 __host__
 extern void x15_whirlpool_cpu_free(int thr_id)
 {
-	cudaFree(d_WNonce[thr_id]);
-	cudaFreeHost(h_wnounce[thr_id]);
+	if (h_wnounce[thr_id]) {
+		cudaFree(d_WNonce[thr_id]);
+		cudaFreeHost(h_wnounce[thr_id]);
+		h_wnounce[thr_id] = NULL;
+	}
 }
 
 __host__
@@ -2614,13 +2608,10 @@ extern uint32_t whirlpool512_cpu_finalhash_64(int thr_id, uint32_t threads, uint
 	dim3 grid((threads + threadsperblock-1) / threadsperblock);
 	dim3 block(threadsperblock);
-	size_t shared_size = 0;
 
 	cudaMemset(d_WNonce[thr_id], 0xff, sizeof(uint32_t));
-	oldwhirlpool_gpu_finalhash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_WNonce[thr_id]);
-	MyStreamSynchronize(NULL, order, thr_id);
+	oldwhirlpool_gpu_finalhash_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_WNonce[thr_id]);
 
 	cudaMemcpy(h_wnounce[thr_id], d_WNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	result = *h_wnounce[thr_id];
@@ -2634,18 +2625,30 @@ void whirlpool512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce
 	dim3 grid((threads + threadsperblock-1) / threadsperblock);
 	dim3 block(threadsperblock);
 
-	oldwhirlpool_gpu_hash_80 <<<grid, block>>> (threads, startNounce, d_outputHash);
-	MyStreamSynchronize(NULL, order, thr_id);
+	if (threads < 256)
+		applog(LOG_WARNING, "whirlpool requires a minimum of 256 threads to fetch constant tables!");
+
+	oldwhirlpool_gpu_hash_80<<<grid, block>>>(threads, startNounce, d_outputHash, 1);
 }
 
+extern void whirl_midstate(void *state, const void *input);
+
 __host__
 void whirlpool512_setBlock_80(void *pdata, const void *ptarget)
 {
 	unsigned char PaddedMessage[128];
 
 	memcpy(PaddedMessage, pdata, 80);
 	memset(PaddedMessage+80, 0, 48);
 	PaddedMessage[80] = 0x80; /* ending */
-	cudaMemcpyToSymbol(pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
-	cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
+
+#if HOST_MIDSTATE
+	// compute constant first block
+	unsigned char midstate[64] = { 0 };
+	whirl_midstate(midstate, pdata);
+	memcpy(PaddedMessage, midstate, 64);
+#endif
+
+	cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 128, 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice);
 }

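Why the midstate precompute roughly doubles the 80-byte hash speed: Whirlpool consumes the header as two 64-byte blocks, and only the second block (bytes 64..79, which contain the nonce at offset 76, i.e. pdata[19]) varies per thread. whirlpool512_setBlock_80() above therefore runs the first of the two compressions once on the host and uploads the resulting 64-byte state instead of the raw first block. A sketch of the equivalence being exploited, using the sph_whirlpool1 CPU primitives already used by x15/whirlpool.cu (buffer names are illustrative):

	sph_whirlpool_context ctx;
	unsigned char full[64], resumed[64];

	// reference: one-shot hash of the whole 80-byte header
	sph_whirlpool1_init(&ctx);
	sph_whirlpool1(&ctx, header, 80);
	sph_whirlpool1_close(&ctx, full);

	// identical digest when the update is split at the block boundary;
	// after the first update, ctx.state is what whirl_midstate() captures
	sph_whirlpool1_init(&ctx);
	sph_whirlpool1(&ctx, header, 64);       // block 1: nonce-free, constant per work
	sph_whirlpool1(&ctx, header + 64, 16);  // block 2: contains the nonce
	sph_whirlpool1_close(&ctx, resumed);    // memcmp(full, resumed, 64) == 0

With HOST_MIDSTATE enabled, each GPU thread skips the ten rounds of block 1 entirely and starts from the uploaded state.
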
x15/whirlpool.cu

@@ -19,6 +19,9 @@ extern void whirlpool512_setBlock_80(void *pdata, const void *ptarget);
 extern void whirlpool512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
 extern uint32_t whirlpool512_cpu_finalhash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
+//#define _DEBUG
+#define _DEBUG_PREFIX "whirl"
+#include "cuda_debug.cuh"
+
 // CPU Hash function
 extern "C" void wcoinhash(void *state, const void *input)
@@ -49,6 +52,16 @@ extern "C" void wcoinhash(void *state, const void *input)
 	memcpy(state, hash, 32);
 }
 
+void whirl_midstate(void *state, const void *input)
+{
+	sph_whirlpool_context ctx;
+
+	sph_whirlpool1_init(&ctx);
+	sph_whirlpool1(&ctx, input, 64);
+
+	memcpy(state, ctx.state, 64);
+}
+
 static bool init[MAX_GPUS] = { 0 };
 
 extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
@@ -60,6 +73,7 @@ extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce,
 	uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 19=256*256*8;
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+	if (init[thr_id]) throughput = max(throughput, 256); // shared mem requirement
 
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
@@ -91,8 +105,11 @@ extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce,
 		*hashes_done = pdata[19] - first_nonce + throughput;
 
 		whirlpool512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		TRACE64(" 80 :", d_hash);
 		x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		TRACE64(" 64 :", d_hash);
 		x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		TRACE64(" 64 :", d_hash);
 
 		foundNonce = whirlpool512_cpu_finalhash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		if (foundNonce != UINT32_MAX && bench_algo < 0)
@@ -115,7 +132,7 @@ extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce,
 			pdata[19] = foundNonce;
 			return res;
 		} else {
-			applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce);
+			gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 		}
 	}
 
 	if ((uint64_t) throughput + pdata[19] >= max_nonce) {

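One detail of the simplified final kernel is the result contract: the candidate nonce now comes back through a single device word instead of a nonce vector. A host-side sketch with the names from the code above (error checks omitted):

	uint32_t found;
	cudaMemset(d_WNonce[thr_id], 0xff, sizeof(uint32_t));  // 0xffffffff means "no result"
	oldwhirlpool_gpu_finalhash_64 <<<grid, block>>> (threads, startNounce,
		(uint64_t*) d_hash, d_WNonce[thr_id]);             // threads race to keep the lowest hit
	cudaMemcpy(&found, d_WNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
	if (found != UINT32_MAX) {
		// candidate found: scanhash_whirl() recomputes the hash on the CPU
		// with wcoinhash() and only submits if it really meets the target
	}

The kernel compares only the 64-bit word state[3] against the target, so this CPU revalidation is required, which is exactly what scanhash_whirl() does before setting pdata[19].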