diff --git a/README.txt b/README.txt
index 9e0954a..d3e5ae3 100644
--- a/README.txt
+++ b/README.txt
@@ -1,5 +1,5 @@
-ccMiner release 1.7.1 (Dec 2015) "Mining Diff & basic MultiAlgo"
+ccMiner release 1.7.1 (Dec 2015) "Sibcoin & Whirlpool midstate"
 ---------------------------------------------------------------
 
 ***************************************************************
@@ -235,6 +235,7 @@ features.
 
 Dec. 31th 2015 v1.7.1
 	Implement sib algo (X11 + Russian Streebog-512/GOST)
+	Whirlpool speed x2 with the midstate precompute
 	Small bug fixes about device ids mapping (and vendor names)
 
 Nov. 06th 2015 v1.7
diff --git a/cuda_debug.cuh b/cuda_debug.cuh
new file mode 100644
index 0000000..363dc1a
--- /dev/null
+++ b/cuda_debug.cuh
@@ -0,0 +1,47 @@
+/**
+ * Helper to trace gpu computed data with --cputest
+ *
+ * Sample usage in an algo scan cuda unit :
+ *
+ * #define _DEBUG
+ * #define _DEBUG_PREFIX "x11-"
+ * #include "cuda_debug.cuh"
+ *
+ * TRACE64("luffa", d_hash);
+ * or
+ * TRACE("luffa")
+ *
+ * Dont forget to link the scan function in util.cpp (do_gpu_tests)
+ *
+ */
+
+#include <stdio.h>
+//#include "cuda_helper.h"
+
+#ifndef _DEBUG_PREFIX
+#define _DEBUG_PREFIX ""
+#endif
+
+#ifdef _DEBUG
+#define TRACE64(algo, d_buf) { \
+	if (max_nonce == 1 && pdata[19] <= 1 && !opt_benchmark) { \
+		uint32_t oft = 0; \
+		uint32_t* debugbuf = NULL; \
+		cudaMallocHost(&debugbuf, 16*sizeof(uint32_t)); \
+		cudaMemcpy(debugbuf, d_buf[thr_id] + oft, 16*sizeof(uint32_t), cudaMemcpyDeviceToHost); \
+		printf(_DEBUG_PREFIX "%s %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", \
+			algo, \
+			swab32(debugbuf[0]), swab32(debugbuf[1]), swab32(debugbuf[2]), swab32(debugbuf[3]), \
+			swab32(debugbuf[4]), swab32(debugbuf[5]), swab32(debugbuf[6]), swab32(debugbuf[7]), \
+			swab32(debugbuf[8]), swab32(debugbuf[9]), swab32(debugbuf[10]),swab32(debugbuf[11]), \
+			swab32(debugbuf[12]),swab32(debugbuf[13]),swab32(debugbuf[14]),swab32(debugbuf[15])); \
+		cudaFreeHost(debugbuf);
\
+	} \
+}
+#else
+#define TRACE64(algo, d_buf) {}
+#endif
+
+// simplified default
+#define TRACE(algo) TRACE64(algo, d_hash)
+
diff --git a/x11/sib.cu b/x11/sib.cu
index cc22521..ba61469 100644
--- a/x11/sib.cu
+++ b/x11/sib.cu
@@ -22,8 +22,6 @@ extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNou
 #include <stdio.h>
 #include <memory.h>
 
-//#define _DEBUG
-
 static uint32_t *d_hash[MAX_GPUS];
 
 // Sibcoin CPU Hash
@@ -71,7 +69,6 @@ extern "C" void sibhash(void *output, const void *input)
 	sph_gost512_init(&ctx_gost);
 	sph_gost512(&ctx_gost, (const void*) hash, 64);
 	sph_gost512_close(&ctx_gost, (void*) hash);
-	//applog_hash64(hash);
 
 	sph_luffa512_init(&ctx_luffa);
 	sph_luffa512 (&ctx_luffa, (const void*) hash, 64);
@@ -96,24 +93,9 @@ extern "C" void sibhash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }
 
-#ifdef _DEBUG
-#define TRACE(algo) { \
-	if (max_nonce == 1 && pdata[19] <= 1 && !opt_benchmark) { \
-		uint32_t oft = 0; \
-		uint32_t* debugbuf = NULL; \
-		cudaMallocHost(&debugbuf, 16*sizeof(uint32_t)); \
-		cudaMemcpy(debugbuf, d_hash[thr_id] + oft, 16*sizeof(uint32_t), cudaMemcpyDeviceToHost); \
-		printf("SIB %s %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", algo, \
-			swab32(debugbuf[0]), swab32(debugbuf[1]), swab32(debugbuf[2]), swab32(debugbuf[3]), \
-			swab32(debugbuf[4]), swab32(debugbuf[5]), swab32(debugbuf[6]), swab32(debugbuf[7]), \
-			swab32(debugbuf[8]), swab32(debugbuf[9]), swab32(debugbuf[10]),swab32(debugbuf[11]), \
-			swab32(debugbuf[12]),swab32(debugbuf[13]),swab32(debugbuf[14]),swab32(debugbuf[15])); \
-		cudaFreeHost(debugbuf); \
-	} \
-}
-#else
-#define TRACE(algo) {}
-#endif
+//#define _DEBUG
+#define _DEBUG_PREFIX "sib"
+#include "cuda_debug.cuh"
 
 static bool init[MAX_GPUS] = { 0 };
 
diff --git a/x11/x11.cu b/x11/x11.cu
index 367aa00..2d040d3 100644
--- a/x11/x11.cu
+++ b/x11/x11.cu
@@ -87,20 +87,9 @@ extern "C" void x11hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }
 
-#ifdef _DEBUG
-#define TRACE(algo) { \
-	if (max_nonce == 1 && pdata[19] <= 1) { \
-		uint32_t* debugbuf = NULL; \
-		cudaMallocHost(&debugbuf, 8*sizeof(uint32_t)); \
-		cudaMemcpy(debugbuf, d_hash[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost); \
-		printf("X11 %s %08x %08x %08x %08x...\n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \
-			swab32(debugbuf[2]), swab32(debugbuf[3])); \
-		cudaFreeHost(debugbuf); \
-	} \
-}
-#else
-#define TRACE(algo) {}
-#endif
+//#define _DEBUG
+#define _DEBUG_PREFIX "x11"
+#include "cuda_debug.cuh"
 
 static bool init[MAX_GPUS] = { 0 };
 
diff --git a/x15/cuda_x15_whirlpool.cu b/x15/cuda_x15_whirlpool.cu
index baf3027..2636c41 100644
--- a/x15/cuda_x15_whirlpool.cu
+++ b/x15/cuda_x15_whirlpool.cu
@@ -1,43 +1,9 @@
-/*
- * Built on cbuchner1's implementation, actual hashing code
- * based on sphlib 3.0
- */
-#include <stdio.h>
-#include <memory.h>
-
-#define threadsperblock 256
-
-//#define __DEV_STORAGE__ __constant__
-#define __DEV_STORAGE__ __device__
-
-#include "cuda_helper.h"
-extern __device__ __device_builtin__ void __threadfence_block(void);
-
-__DEV_STORAGE__ static uint64_t c_PaddedMessage80[16]; // input end block after midstate
-__DEV_STORAGE__ static uint32_t pTarget[8];
-
-static uint32_t *h_wnounce[MAX_GPUS] = { 0 };
-static uint32_t *d_WNonce[MAX_GPUS] = { 0 };
-
-#define USE_ALL_TABLES 1
-
-__DEV_STORAGE__ static uint64_t mixTob0Tox[256];
-#if USE_ALL_TABLES
-__DEV_STORAGE__ static uint64_t mixTob1Tox[256];
-__DEV_STORAGE__ static uint64_t mixTob2Tox[256];
-__DEV_STORAGE__ static uint64_t mixTob3Tox[256];
-__DEV_STORAGE__ static uint64_t mixTob4Tox[256];
-__DEV_STORAGE__ static uint64_t mixTob5Tox[256];
-__DEV_STORAGE__ static uint64_t mixTob6Tox[256];
-__DEV_STORAGE__ static uint64_t mixTob7Tox[256];
-#endif
-
 /**
- * Whirlpool CUDA kernel implementation.
+ * Whirlpool-512 CUDA implementation.
 *
 * ==========================(LICENSE BEGIN)============================
 *
- * Copyright (c) 2014 djm34 & tpruvot & SP
+ * Copyright (c) 2014-2016 djm34, tpruvot, SP
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
@@ -59,10 +25,40 @@ __DEV_STORAGE__ static uint64_t mixTob7Tox[256];
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
- * @author djm34
- * @author tpruvot
- * @author SP
+ * @author djm34 (initial draft)
+ * @author tpruvot (dual old/whirlpool modes, midstate)
+ * @author SP ("final" function opt and tuning)
 */
+#include <stdio.h>
+#include <memory.h>
+
+// don't change, used by shared mem fetch!
+#define threadsperblock 256
+
+#include "cuda_helper.h"
+#include "miner.h"
+
+extern __device__ __device_builtin__ void __threadfence_block(void);
+
+__device__ static uint64_t c_PaddedMessage80[16];
+__device__ static uint32_t pTarget[8];
+
+static uint32_t *h_wnounce[MAX_GPUS] = { 0 };
+static uint32_t *d_WNonce[MAX_GPUS] = { 0 };
+
+#define HOST_MIDSTATE 1
+#define USE_ALL_TABLES 1
+
+__constant__ static uint64_t mixTob0Tox[256];
+#if USE_ALL_TABLES
+__constant__ static uint64_t mixTob1Tox[256];
+__constant__ static uint64_t mixTob2Tox[256];
+__constant__ static uint64_t mixTob3Tox[256];
+__constant__ static uint64_t mixTob4Tox[256];
+__constant__ static uint64_t mixTob5Tox[256];
+__constant__ static uint64_t mixTob6Tox[256];
+__constant__ static uint64_t mixTob7Tox[256];
+#endif
 
 static const uint64_t old1_T0[256] = {
 	SPH_C64(0x78D8C07818281818), SPH_C64(0xAF2605AF23652323),
@@ -2181,7 +2177,7 @@ static const uint64_t plain_T7[256] = {
 
 /**
  * Round constants.
*/ -__DEV_STORAGE__ uint64_t InitVector_RC[10]; +__device__ uint64_t InitVector_RC[10]; static const uint64_t plain_RC[10] = { SPH_C64(0x4F01B887E8C62318), @@ -2291,7 +2287,7 @@ const int i0, const int i1, const int i2, const int i3, const int i4, const int __global__ -void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash) +void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash, int swab) { __shared__ uint64_t sharedMemory[2048]; @@ -2307,29 +2303,30 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x]; #endif } + __threadfence_block(); // ensure shared mem is ready uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = startNounce + thread; - union { - uint8_t h1[64]; - uint32_t h4[16]; - uint64_t h8[8]; - } hash; - uint64_t n[8]; uint64_t h[8]; + uint32_t nonce = startNounce + thread; + nonce = swab ? 
cuda_swab32(nonce) : nonce; +#if HOST_MIDSTATE + uint64_t state[8]; + #pragma unroll 8 + for (int i=0; i < 8; i++) { + state[i] = c_PaddedMessage80[i]; + } +#else #pragma unroll 8 for (int i=0; i<8; i++) { n[i] = c_PaddedMessage80[i]; // read data h[i] = 0; // read state } - __threadfence_block(); // ensure shared mem is ready - -// #pragma unroll 10 + #pragma unroll 1 for (unsigned r=0; r < 10; r++) { uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]); @@ -2341,11 +2338,11 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp for (int i=0; i < 8; i++) { state[i] = xor1(n[i],c_PaddedMessage80[i]); } - +#endif /// round 2 /////// ////////////////////////////////// n[0] = c_PaddedMessage80[8]; //read data - n[1] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nounce)); //whirlpool + n[1] = REPLACE_HIDWORD(c_PaddedMessage80[9], nonce); //whirlpool n[2] = 0x0000000000000080; //whirlpool n[3] = 0; n[4] = 0; @@ -2359,7 +2356,7 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp n[i] = xor1(n[i],h[i]); } -// #pragma unroll 10 +// #pragma unroll for (unsigned r=0; r < 10; r++) { uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]); @@ -2367,7 +2364,7 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp } state[0] = xor3(state[0], n[0], c_PaddedMessage80[8]); - state[1] = xor3(state[1], n[1], REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nounce)) ); + state[1] = xor3(state[1], n[1], REPLACE_HIDWORD(c_PaddedMessage80[9], nonce) ); state[2] = xor3(state[2], n[2], 0x0000000000000080); state[3] = xor1(state[3], n[3]); state[4] = xor1(state[4], n[4]); @@ -2375,15 +2372,10 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp state[6] = xor1(state[6], n[6]); state[7] = xor3(state[7], n[7], 0x8002000000000000); + uint64_t* outHash = 
&(((uint64_t*)outputHash)[(size_t)8 * thread]); #pragma unroll 8 for (unsigned i = 0; i < 8; i++) - hash.h8[i] = state[i]; - - uint32_t *outHash = (uint32_t *)outputHash + 16 * thread; - - #pragma unroll 16 - for (int i=0; i<16; i++) - outHash[i] = hash.h4[i]; + outHash[i] = state[i]; } // thread < threads } @@ -2405,6 +2397,7 @@ void x15_whirlpool_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x]; #endif } + __threadfence_block(); // ensure shared mem is ready uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -2465,7 +2458,7 @@ void x15_whirlpool_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t } __global__ -void oldwhirlpool_gpu_finalhash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint32_t *resNounce) +void oldwhirlpool_gpu_finalhash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resNounce) { __shared__ uint64_t sharedMemory[2048]; @@ -2482,14 +2475,13 @@ void oldwhirlpool_gpu_finalhash_64(uint32_t threads, uint32_t startNounce, uint6 sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x]; #endif } + __threadfence_block(); // ensure shared mem is ready uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = g_nonceVector ? 
g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint64_t *inpHash = (uint64_t*) &g_hash[8 * hashPosition]; + uint32_t nonce = startNounce + thread; + uint64_t *inpHash = (uint64_t*) &g_hash[(size_t)8 * thread]; uint64_t h8[8]; #pragma unroll 8 @@ -2529,7 +2521,7 @@ void oldwhirlpool_gpu_finalhash_64(uint32_t threads, uint32_t startNounce, uint6 n[i] = xor1(n[i], h[i]); } - #pragma unroll 10 +// #pragma unroll 10 for (unsigned r=0; r < 10; r++) { uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]); @@ -2546,8 +2538,8 @@ void oldwhirlpool_gpu_finalhash_64(uint32_t threads, uint32_t startNounce, uint6 state[7] = xor3(state[7], n[7], 0x2000000000000); bool rc = (state[3] <= ((uint64_t*)pTarget)[3]); - if (rc && resNounce[0] > nounce) - resNounce[0] = nounce; + if (rc && resNounce[0] > nonce) + resNounce[0] = nonce; } } @@ -2581,18 +2573,20 @@ extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int mode) cudaMemcpyToSymbol(mixTob6Tox, old1_T6, (256*8), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(mixTob7Tox, old1_T7, (256*8), 0, cudaMemcpyHostToDevice); #endif + cudaMalloc(&d_WNonce[thr_id], sizeof(uint32_t)); + cudaMallocHost(&h_wnounce[thr_id], sizeof(uint32_t)); break; } - - cudaMalloc(&d_WNonce[thr_id], sizeof(uint32_t)); - cudaMallocHost(&h_wnounce[thr_id], sizeof(uint32_t)); } __host__ extern void x15_whirlpool_cpu_free(int thr_id) { - cudaFree(d_WNonce[thr_id]); - cudaFreeHost(h_wnounce[thr_id]); + if (h_wnounce[thr_id]) { + cudaFree(d_WNonce[thr_id]); + cudaFreeHost(h_wnounce[thr_id]); + h_wnounce[thr_id] = NULL; + } } __host__ @@ -2614,13 +2608,10 @@ extern uint32_t whirlpool512_cpu_finalhash_64(int thr_id, uint32_t threads, uint dim3 grid((threads + threadsperblock-1) / threadsperblock); dim3 block(threadsperblock); - size_t shared_size = 0; - cudaMemset(d_WNonce[thr_id], 0xff, sizeof(uint32_t)); - oldwhirlpool_gpu_finalhash_64<<>>(threads, 
startNounce, (uint64_t*)d_hash, d_nonceVector,d_WNonce[thr_id]);
+	oldwhirlpool_gpu_finalhash_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_WNonce[thr_id]);
 
-	MyStreamSynchronize(NULL, order, thr_id);
 	cudaMemcpy(h_wnounce[thr_id], d_WNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 
 	result = *h_wnounce[thr_id];
@@ -2634,18 +2625,30 @@ void whirlpool512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce
 	dim3 grid((threads + threadsperblock-1) / threadsperblock);
 	dim3 block(threadsperblock);
 
-	oldwhirlpool_gpu_hash_80 <<<grid, block>>> (threads, startNounce, d_outputHash);
+	if (threads < 256)
+		applog(LOG_WARNING, "whirlpool requires a minimum of 256 threads to fetch constant tables!");
 
-	MyStreamSynchronize(NULL, order, thr_id);
+	oldwhirlpool_gpu_hash_80<<<grid, block>>>(threads, startNounce, d_outputHash, 1);
 }
 
+extern void whirl_midstate(void *state, const void *input);
+
 __host__
 void whirlpool512_setBlock_80(void *pdata, const void *ptarget)
 {
 	unsigned char PaddedMessage[128];
+
 	memcpy(PaddedMessage, pdata, 80);
 	memset(PaddedMessage+80, 0, 48);
 	PaddedMessage[80] = 0x80; /* ending */
-	cudaMemcpyToSymbol(pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
-	cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
+
+#if HOST_MIDSTATE
+	// compute constant first block
+	unsigned char midstate[64] = { 0 };
+	whirl_midstate(midstate, pdata);
+	memcpy(PaddedMessage, midstate, 64);
+#endif
+
+	cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 128, 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice);
 }
diff --git a/x15/whirlpool.cu b/x15/whirlpool.cu
index 4769baa..44a0f2b 100644
--- a/x15/whirlpool.cu
+++ b/x15/whirlpool.cu
@@ -19,6 +19,9 @@ extern void whirlpool512_setBlock_80(void *pdata, const void *ptarget);
 extern void whirlpool512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
 extern uint32_t
whirlpool512_cpu_finalhash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +//#define _DEBUG +#define _DEBUG_PREFIX "whirl" +#include "cuda_debug.cuh" // CPU Hash function extern "C" void wcoinhash(void *state, const void *input) @@ -49,6 +52,16 @@ extern "C" void wcoinhash(void *state, const void *input) memcpy(state, hash, 32); } +void whirl_midstate(void *state, const void *input) +{ + sph_whirlpool_context ctx; + + sph_whirlpool1_init(&ctx); + sph_whirlpool1(&ctx, input, 64); + + memcpy(state, ctx.state, 64); +} + static bool init[MAX_GPUS] = { 0 }; extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) @@ -60,6 +73,7 @@ extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 19=256*256*8; if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + if (init[thr_id]) throughput = max(throughput, 256); // shared mem requirement if (opt_benchmark) ((uint32_t*)ptarget)[7] = 0x0000ff; @@ -91,8 +105,11 @@ extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, *hashes_done = pdata[19] - first_nonce + throughput; whirlpool512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE64(" 80 :", d_hash); x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE64(" 64 :", d_hash); x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE64(" 64 :", d_hash); foundNonce = whirlpool512_cpu_finalhash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); if (foundNonce != UINT32_MAX && bench_algo < 0) @@ -115,7 +132,7 @@ extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, pdata[19] = foundNonce; return res; } else { - applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", 
device_map[thr_id], foundNonce); + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce); } } if ((uint64_t) throughput + pdata[19] >= max_nonce) {