From c17d11e37758c37762a7664a731fda6e9a5454b1 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 31 Aug 2014 08:57:48 +0200 Subject: [PATCH 01/44] add "blake" 256, 14 rounds (for NEOS blake, not BlakeCoin) also remove "missing" file, its old and not compatible with ubuntu 14.04 --- Makefile.am | 3 +- blake32.cu | 494 ++++++++++++++++++++++++++++++++++++++++ build.sh | 2 +- ccminer.vcxproj | 6 +- ccminer.vcxproj.filters | 3 + configure.sh | 2 +- cpu-miner.c | 48 +++- miner.h | 5 + missing | 367 ----------------------------- quark/cuda_checkhash.cu | 4 +- util.c | 4 + 11 files changed, 554 insertions(+), 384 deletions(-) create mode 100644 blake32.cu delete mode 100644 missing diff --git a/Makefile.am b/Makefile.am index 5e539a9..3935afe 100644 --- a/Makefile.am +++ b/Makefile.am @@ -32,7 +32,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \ quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu quark/quarkcoin.cu quark/animecoin.cu \ quark/cuda_quark_compactionTest.cu \ - cuda_nist5.cu \ + cuda_nist5.cu blake32.cu \ sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \ sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \ sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \ @@ -43,6 +43,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu x15/whirlpool.cu \ x17/x17.cu x17/cuda_x17_haval512.cu x17/cuda_x17_sha512.cu + ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@ ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@ ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME diff --git a/blake32.cu b/blake32.cu new file mode 100644 index 0000000..882594f --- /dev/null +++ b/blake32.cu @@ 
-0,0 +1,494 @@ +/** + * Blake-256 Cuda Kernel (Tested on SM 5.0) + * + * Tanguy Pruvot - Aug. 2014 + */ + +#include "miner.h" + +extern "C" { +#include "sph/sph_blake.h" +#include +#include +} + +/* hash by cpu with blake 256 */ +extern "C" void blake32hash(void *output, const void *input) +{ + unsigned char hash[64]; + sph_blake256_context ctx; + sph_blake256_init(&ctx); + sph_blake256(&ctx, input, 80); + sph_blake256_close(&ctx, hash); + memcpy(output, hash, 32); +} + +#include "cuda_helper.h" + +#if __CUDA_ARCH__ < 350 + // Kepler (Compute 3.0) + Host + #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +#else + // Kepler (Compute 3.5 / 5.0) + #define ROTR32(x, n) __funnelshift_r( (x), (x), (n) ) +#endif + +// in cpu-miner.c +extern bool opt_benchmark; +extern bool opt_debug; +extern int device_map[8]; + +extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); + +// shared for 8 threads of addresses (cudaMalloc) +uint32_t* d_hash[8]; + +__constant__ +static uint32_t pTarget[8]; + +__constant__ +static uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding) +static uint32_t *d_resNounce[8]; +static uint32_t *h_resNounce[8]; + +__constant__ +static uint8_t c_sigma[16][16]; +const uint8_t host_sigma[16][16] = +{ + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 
}, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } +}; + +__device__ __constant__ +static const uint32_t c_IV256[8] = { + SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85), + SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A), + SPH_C32(0x510E527F), SPH_C32(0x9B05688C), + SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19) +}; + +__device__ __constant__ + +static const uint32_t c_u256[16] = { + SPH_C32(0x243F6A88), SPH_C32(0x85A308D3), + SPH_C32(0x13198A2E), SPH_C32(0x03707344), + SPH_C32(0xA4093822), SPH_C32(0x299F31D0), + SPH_C32(0x082EFA98), SPH_C32(0xEC4E6C89), + SPH_C32(0x452821E6), SPH_C32(0x38D01377), + SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C), + SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD), + SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917) +}; + +#if 0 +#define GS(m0, m1, c0, c1, a, b, c, d) do { \ + a = SPH_T32(a + b + (m0 ^ c1)); \ + d = SPH_ROTR32(d ^ a, 16); \ + c = SPH_T32(c + d); \ + b = SPH_ROTR32(b ^ c, 12); \ + a = SPH_T32(a + b + (m1 ^ c0)); \ + d = SPH_ROTR32(d ^ a, 8); \ + c = SPH_T32(c + d); \ + b = SPH_ROTR32(b ^ c, 7); \ + } while (0) + +#define ROUND_S(r) do { \ + GS(Mx(r, 0x0), Mx(r, 0x1), CSx(r, 0x0), CSx(r, 0x1), v[0], v[4], v[0x8], v[0xC]); \ + GS(Mx(r, 0x2), Mx(r, 0x3), CSx(r, 0x2), CSx(r, 0x3), v[1], v[5], v[0x9], v[0xD]); \ + GS(Mx(r, 0x4), Mx(r, 0x5), CSx(r, 0x4), CSx(r, 0x5), v[2], v[6], v[0xA], v[0xE]); \ + GS(Mx(r, 0x6), Mx(r, 0x7), CSx(r, 0x6), CSx(r, 0x7), v[3], v[7], v[0xB], v[0xF]); \ + GS(Mx(r, 0x8), Mx(r, 0x9), CSx(r, 0x8), CSx(r, 0x9), v[0], v[5], v[0xA], v[0xF]); \ + GS(Mx(r, 0xA), Mx(r, 0xB), CSx(r, 0xA), CSx(r, 0xB), v[1], v[6], v[0xB], v[0xC]); \ + GS(Mx(r, 0xC), Mx(r, 0xD), CSx(r, 0xC), CSx(r, 0xD), v[2], v[7], v[0x8], v[0xD]); \ + GS(Mx(r, 0xE), Mx(r, 0xF), CSx(r, 0xE), CSx(r, 0xF), v[3], v[4], v[0x9], v[0xE]); \ +} while (0) +#endif + +#define GS(a,b,c,d,e) { \ + v[a] += 
(m[sigma[i][e]] ^ u256[sigma[i][e+1]]) + v[b]; \ + v[d] = ROTR32(v[d] ^ v[a], 16); \ + v[c] += v[d]; \ + v[b] = ROTR32(v[b] ^ v[c], 12); \ +\ + v[a] += (m[sigma[i][e+1]] ^ u256[sigma[i][e]]) + v[b]; \ + v[d] = ROTR32(v[d] ^ v[a], 8); \ + v[c] += v[d]; \ + v[b] = ROTR32(v[b] ^ v[c], 7); \ +} + +__device__ static +void blake256_compress(uint32_t *h, uint32_t *block, uint8_t ((*sigma)[16]), const uint32_t *u256, const uint32_t T0, uint8_t nullt = 1) +{ + uint32_t /* __align__(8) */ v[16]; + uint32_t /* __align__(8) */ m[16]; + + //#pragma unroll + for (int i = 0; i < 16; ++i) { + m[i] = cuda_swab32(block[i]); + //m[i] = block[i]; + } + + #pragma unroll + for(int i = 0; i < 8; i++) + v[i] = h[i]; + + v[ 8] = u256[0]; + v[ 9] = u256[1]; + v[10] = u256[2]; + v[11] = u256[3]; + + v[12] = u256[4] ^ T0; + v[13] = u256[5] ^ T0; + v[14] = u256[6]; + v[15] = u256[7]; + + // on a 80-bytes null buffer : + // first : v = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, ...} + // second : v = {0xb5bfb2f9, 0x14cfcc63, 0xb85c549c, 0xc9b4184e, ..., 0x299f3350, 0x082efa98, 0xec4e6c89} + + //#pragma unroll + for (int i = 0; i < 14; i++) { + /* column step */ + GS(0, 4, 0x8, 0xC, 0); + GS(1, 5, 0x9, 0xD, 2); + GS(2, 6, 0xA, 0xE, 4); + GS(3, 7, 0xB, 0xF, 6); + /* diagonal step */ + GS(0, 5, 0xA, 0xF, 0x8); + GS(1, 6, 0xB, 0xC, 0xA); + GS(2, 7, 0x8, 0xD, 0xC); + GS(3, 4, 0x9, 0xE, 0xE); + } + + //#pragma unroll 16 + for(int i = 0; i < 16; i++) + h[i % 8] ^= v[i]; + + //second H0 = 0x0c7b1594 ... 
H7 = 0x9051b305 +} + +#if __CUDA_ARCH__ >= 200 +#if (__NV_POINTER_SIZE == 64) +# define SZCT uint64_t +#else +# define SZCT uint32_t +#endif +extern __device__ __device_builtin__ void __nvvm_memset(uint8_t *, unsigned char, SZCT, int); +#endif + +__global__ +void blake256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t /* __align__(16) */ h[8]; + uint32_t /* __align__(16) */ msg[16]; + const uint32_t nounce = startNounce + thread; + + #pragma unroll + for(int i=0; i<8; i++) + h[i] = c_IV256[i]; + + blake256_compress(h, c_PaddedMessage80, c_sigma, c_u256, 0x200); /* 512 = 0x200 */ + + // ------ Close: Bytes 64 to 80 ------ + +#if 0 /* __CUDA_ARCH__ >= 200 */ + __nvvm_memset((uint8_t*)(&msg[4]), 0, sizeof(msg)-16, 16); +#else + msg[5] = 0; + msg[6] = 0; + msg[7] = 0; + msg[8] = 0; + msg[9] = 0; + msg[10] = 0; + msg[11] = 0; + msg[12] = 0; + msg[14] = 0; +#endif + msg[0] = c_PaddedMessage80[16]; + msg[1] = c_PaddedMessage80[17]; + msg[2] = c_PaddedMessage80[18]; + msg[3] = cuda_swab32(nounce); // here or at 80 ? 
+ + msg[4] = 0x80; // uchar[16] after buffer + msg[13] = 0x01000000; //((uint8_t*)msg)[55] = 1; // uchar[17 to 55] + msg[15] = 0x80020000; // 60-63 0x280 + + //h => {0xb5bfb2f9, 0x14cfcc63, 0xb85c549c, 0xc9b4184e, 0x67dfc6ce, 0x29e9904b, 0xd59ee74e, 0xfaa9c653} + //msg {0, 0, 0, 0, 0x80, 0...} + + blake256_compress(h, msg, c_sigma, c_u256, 0x280); // or 0x80 + //h => {0x0c7b1594, 0x52328517, 0x463db487, 0xdf5e39b7, 0x1322afaf, 0x14ed562c, 0xe9d18d7d, 0x9051b305} + + uint32_t *outHash = (uint32_t*) outputHash + 16*thread; // 16 = 4 x sizeof(uint32) + //#pragma unroll + for (int i=0; i < 8; i++) { + outHash[i] = cuda_swab32(h[i]); + } + } +} + +__host__ +void blake256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order) +{ + const int threadsperblock = 256; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + size_t shared_size = 0; + + blake256_gpu_hash_80<<>>(threads, startNounce, d_outputHash); + + MyStreamSynchronize(NULL, order, thr_id); +} + +__global__ +void gpu_check_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread); + + int hashPosition = nounce - startNounce; + uint32_t *inpHash = &g_hash[16 * hashPosition]; + uint32_t hash[8]; + + #pragma unroll 8 + for (int i=0; i < 8; i++) + hash[i] = inpHash[i]; + + int i, position = -1; + bool rc = true; + + #pragma unroll 8 + for (i = 7; i >= 0; i--) { + if (hash[i] > pTarget[i] && position < i) { + position = i; + rc = false; + } + if (hash[i] < pTarget[i] && position < i) { + position = i; + rc = true; + } + } + + if(rc && resNounce[0] > nounce) + resNounce[0] = nounce; + } +} + +__host__ +uint32_t cpu_check_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order) +{ + uint32_t result = 0xffffffff; + const int threadsperblock = 256; + + cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t)); + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + size_t shared_size = 0; + + gpu_check_hash_64 <<>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]); + + MyStreamSynchronize(NULL, order, thr_id); + + CUDA_SAFE_CALL(cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost)); + + // cudaMemcpy() is asynch! 
+ cudaThreadSynchronize(); + result = *h_resNounce[thr_id]; + + return result; +} + +__host__ +void blake256_cpu_init(int thr_id) +{ + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sigma, host_sigma, sizeof(host_sigma), 0, cudaMemcpyHostToDevice)); + + CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t))); +} + +__host__ +void blake256_cpu_setBlock_80(uint32_t *pdata, const void *ptarget) +{ + uint32_t PaddedMessage[32]; + memcpy(PaddedMessage, pdata, 80); + memset(&PaddedMessage[20], 0, 48); + //for (int i=0; i<20; i++) + // PaddedMessage[i] = cuda_swab32(pdata[i]); + + CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, sizeof(PaddedMessage), 0, cudaMemcpyHostToDevice)); +} + +#define NULLTEST 0 + +extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t endiandata[20]; + const uint32_t first_nonce = pdata[19]; + const int throughput = 256*256*2; + static bool init[8] = {0,0,0,0,0,0,0,0}; + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x00000f; + + uint32_t Htarg = ptarget[7]; + + if (!init[thr_id]) { + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput)); + + blake256_cpu_init(thr_id); + + init[thr_id] = true; + } + +#if NULLTEST + // dev test with a null buffer 0x00000... 
+ for (int k = 0; k < 20; k++) + pdata[k] = 0; + uint32_t vhash[8]; + blake32hash(vhash, pdata); +#endif + + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + blake256_cpu_setBlock_80(endiandata, (void*)ptarget); + + do { + int order = 0; + uint32_t foundNonce; + + // GPU + blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + +#if NULLTEST + uint32_t buf[8]; memset(buf, 0, sizeof buf); + CUDA_SAFE_CALL(cudaMemcpy(buf, d_hash[thr_id], sizeof buf, cudaMemcpyDeviceToHost)); + CUDA_SAFE_CALL(cudaThreadSynchronize()); + //applog_hash((unsigned char*)buf); +#endif + foundNonce = cpu_check_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + if (foundNonce != 0xffffffff) + { + uint32_t vhashcpu[8]; + be32enc(&endiandata[19], foundNonce); + + blake32hash(vhashcpu, endiandata); + + if (opt_debug) + applog(LOG_DEBUG, "foundNonce = %08x",foundNonce); + + if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) + { + pdata[19] = foundNonce; + *hashes_done = pdata[19] - first_nonce + 1; + return 1; + } else { + applog(LOG_INFO, "GPU #%d: result for nonce %08x does not validate on CPU!", thr_id, foundNonce); + } + } + + pdata[19] += throughput; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce + 1; + return 0; +} + +//#define DEBUG_ALGO + +__host__ +int scanhash_blake256_cpu(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, uint64_t *hashes_done) +{ + uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; + + uint32_t __align__(32) hash64[8]; + uint32_t endiandata[32]; + + uint64_t htmax[] = { + 0, + 0xF, + 0xFF, + 0xFFF, + 0xFFFF, + 0x10000000 + }; + uint32_t masks[] = { + 0xFFFFFFFF, + 0xFFFFFFF0, + 0xFFFFFF00, + 0xFFFFF000, + 0xFFFF0000, + 0 + }; + + // we need bigendian data... 
+ for (int kk=0; kk < 32; kk++) { + be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]); + }; +#ifdef DEBUG_ALGO + if (Htarg != 0) + printf("[%d] Htarg=%X\n", thr_id, Htarg); +#endif + for (int m=0; m < 6; m++) { + if (Htarg <= htmax[m]) { + uint32_t mask = masks[m]; + do { + pdata[19] = ++n; + be32enc(&endiandata[19], n); + blake32hash(hash64, endiandata); +#ifndef DEBUG_ALGO + if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) { + *hashes_done = n - first_nonce + 1; + return true; + } +#else + if (!(n % 0x1000) && !thr_id) printf("."); + if (!(hash64[7] & mask)) { + printf("[%d]",thr_id); + if (fulltest(hash64, ptarget)) { + *hashes_done = n - first_nonce + 1; + return true; + } + } +#endif + } while (n < max_nonce && !work_restart[thr_id].restart); + // see blake.c if else to understand the loop on htmax => mask + break; + } + } + + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + return 0; +} diff --git a/build.sh b/build.sh index 17935f3..2905734 100755 --- a/build.sh +++ b/build.sh @@ -4,7 +4,7 @@ # export PATH="$PATH:/usr/local/cuda/bin/" -make distclean || echo clean +#make distclean || echo clean rm -f Makefile.in rm -f config.status diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 8bada54..d3ec423 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -397,6 +397,10 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)" %(AdditionalOptions) 64 + + --ptxas-options=-O2 %(AdditionalOptions) + %(AdditionalOptions) + --ptxas-options=-O2 %(AdditionalOptions) %(AdditionalOptions) @@ -556,4 +560,4 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)" - \ No newline at end of file + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index c972707..55c69aa 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -436,5 +436,8 @@ Source Files\CUDA\x17 + + Source Files\CUDA + diff --git a/configure.sh b/configure.sh index 134abd1..c0cdd0d 100755 --- a/configure.sh +++ b/configure.sh @@ -1 +1 @@ -./configure "CFLAGS=-O3" 
"CXXFLAGS=-O3" --with-cuda=/usr/local/cuda +./configure "CFLAGS=-O2" "CXXFLAGS=-O2" --with-cuda=/usr/local/cuda diff --git a/cpu-miner.c b/cpu-miner.c index a55f051..98d7daf 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -135,6 +135,7 @@ typedef enum { ALGO_QUARK, ALGO_ANIME, ALGO_FRESH, + ALGO_BLAKE, ALGO_NIST5, ALGO_WHC, ALGO_X11, @@ -155,6 +156,7 @@ static const char *algo_names[] = { "quark", "anime", "fresh", + "blake", "nist5", "whirl", "x11", @@ -235,6 +237,7 @@ Options:\n\ jackpot Jackpot hash\n\ quark Quark hash\n\ anime Animecoin hash\n\ + blake Blake 256 (like NEOS blake)\n\ fresh Freshcoin hash (shavite 80)\n\ nist5 NIST5 (TalkCoin) hash\n\ whirl Whirlcoin (old whirlpool)\n\ @@ -842,18 +845,23 @@ static void *miner_thread(void *userdata) int64_t max64; int rc; + // &work.data[19] + int wcmplen = 76; + uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen); + if (have_stratum) { while (time(NULL) >= g_work_time + 120) sleep(1); pthread_mutex_lock(&g_work_lock); - if (work.data[19] >= end_nonce) + if ((*nonceptr) >= end_nonce) stratum_gen_work(&stratum, &g_work); } else { + int min_scantime = have_longpoll ? 
LP_SCANTIME : opt_scantime; /* obtain new work from internal workio thread */ pthread_mutex_lock(&g_work_lock); - if (!have_stratum && (!have_longpoll || - time(NULL) >= g_work_time + LP_SCANTIME*3/4 || - work.data[19] >= end_nonce)) { + if (!have_stratum && + (time(NULL) - g_work_time >= min_scantime || + (*nonceptr) >= end_nonce)) { if (unlikely(!get_work(mythr, &g_work))) { applog(LOG_ERR, "work retrieval failed, exiting " "mining thread %d", mythr->id); @@ -867,11 +875,11 @@ static void *miner_thread(void *userdata) continue; } } - if (memcmp(work.data, g_work.data, 76)) { + if (memcmp(work.data, g_work.data, wcmplen)) { memcpy(&work, &g_work, sizeof(struct work)); - work.data[19] = 0xffffffffU / opt_n_threads * thr_id; + (*nonceptr) = 0xffffffffU / opt_n_threads * thr_id; } else - work.data[19]++; + (*nonceptr)++; pthread_mutex_unlock(&g_work_lock); work_restart[thr_id].restart = 0; @@ -881,13 +889,26 @@ static void *miner_thread(void *userdata) else max64 = g_work_time + (have_longpoll ? LP_SCANTIME : opt_scantime) - time(NULL); + max64 *= (int64_t)thr_hashrates[thr_id]; - if (max64 <= 0) - max64 = (opt_algo == ALGO_JACKPOT) ? 
0x1fffLL : 0xfffffLL; - if ((int64_t)work.data[19] + max64 > end_nonce) + + if (max64 <= 0) { + switch (opt_algo) { + case ALGO_JACKPOT: + max64 = 0x1fffLL; + break; + case ALGO_BLAKE: + max64 = 0xffffffLL; + break; + default: + max64 = 0xfffffLL; + break; + } + } + if ((int64_t)(*nonceptr) + max64 > end_nonce) max_nonce = end_nonce; else - max_nonce = (uint32_t)(work.data[19] + max64); + max_nonce = (uint32_t)((*nonceptr) + max64); hashes_done = 0; gettimeofday(&tv_start, NULL); @@ -931,6 +952,11 @@ static void *miner_thread(void *userdata) max_nonce, &hashes_done); break; + case ALGO_BLAKE: + rc = scanhash_blake32(thr_id, work.data, work.target, + max_nonce, &hashes_done); + break; + case ALGO_ANIME: rc = scanhash_anime(thr_id, work.data, work.target, max_nonce, &hashes_done); diff --git a/miner.h b/miner.h index f3d4299..a23df96 100644 --- a/miner.h +++ b/miner.h @@ -241,6 +241,10 @@ extern int scanhash_fresh(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_blake32(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done); + extern int scanhash_nist5(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); @@ -402,6 +406,7 @@ void heavycoin_hash(unsigned char* output, const unsigned char* input, int len); void groestlhash(void *state, const void *input); void myriadhash(void *state, const void *input); void fresh_hash(void *state, const void *input); +void blake32hash(void *output, const void *input); void nist5hash(void *state, const void *input); void quarkhash(void *state, const void *input); void wcoinhash(void *state, const void *input); diff --git a/missing b/missing deleted file mode 100644 index 1c8ff70..0000000 --- a/missing +++ /dev/null @@ -1,367 +0,0 @@ -#! /bin/sh -# Common stub for a few missing GNU programs while installing. 
- -scriptversion=2006-05-10.23 - -# Copyright (C) 1996, 1997, 1999, 2000, 2002, 2003, 2004, 2005, 2006 -# Free Software Foundation, Inc. -# Originally by Fran,cois Pinard , 1996. - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2, or (at your option) -# any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA -# 02110-1301, USA. - -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. - -if test $# -eq 0; then - echo 1>&2 "Try \`$0 --help' for more information" - exit 1 -fi - -run=: -sed_output='s/.* --output[ =]\([^ ]*\).*/\1/p' -sed_minuso='s/.* -o \([^ ]*\).*/\1/p' - -# In the cases where this matters, `missing' is being run in the -# srcdir already. -if test -f configure.ac; then - configure_ac=configure.ac -else - configure_ac=configure.in -fi - -msg="missing on your system" - -case $1 in ---run) - # Try to run requested program, and just exit if it succeeds. - run= - shift - "$@" && exit 0 - # Exit code 63 means version mismatch. This often happens - # when the user try to use an ancient version of a tool on - # a file that requires a minimum version. In this case we - # we should proceed has if the program had been absent, or - # if --run hadn't been passed. - if test $? 
= 63; then - run=: - msg="probably too old" - fi - ;; - - -h|--h|--he|--hel|--help) - echo "\ -$0 [OPTION]... PROGRAM [ARGUMENT]... - -Handle \`PROGRAM [ARGUMENT]...' for when PROGRAM is missing, or return an -error status if there is no known handling for PROGRAM. - -Options: - -h, --help display this help and exit - -v, --version output version information and exit - --run try to run the given command, and emulate it if it fails - -Supported PROGRAM values: - aclocal touch file \`aclocal.m4' - autoconf touch file \`configure' - autoheader touch file \`config.h.in' - autom4te touch the output file, or create a stub one - automake touch all \`Makefile.in' files - bison create \`y.tab.[ch]', if possible, from existing .[ch] - flex create \`lex.yy.c', if possible, from existing .c - help2man touch the output file - lex create \`lex.yy.c', if possible, from existing .c - makeinfo touch the output file - tar try tar, gnutar, gtar, then tar without non-portable flags - yacc create \`y.tab.[ch]', if possible, from existing .[ch] - -Send bug reports to ." - exit $? - ;; - - -v|--v|--ve|--ver|--vers|--versi|--versio|--version) - echo "missing $scriptversion (GNU Automake)" - exit $? - ;; - - -*) - echo 1>&2 "$0: Unknown \`$1' option" - echo 1>&2 "Try \`$0 --help' for more information" - exit 1 - ;; - -esac - -# Now exit if we have it, but it failed. Also exit now if we -# don't have it and --version was passed (most likely to detect -# the program). -case $1 in - lex|yacc) - # Not GNU programs, they don't have --version. - ;; - - tar) - if test -n "$run"; then - echo 1>&2 "ERROR: \`tar' requires --run" - exit 1 - elif test "x$2" = "x--version" || test "x$2" = "x--help"; then - exit 1 - fi - ;; - - *) - if test -z "$run" && ($1 --version) > /dev/null 2>&1; then - # We have it, but it failed. - exit 1 - elif test "x$2" = "x--version" || test "x$2" = "x--help"; then - # Could not run --version or --help. 
This is probably someone - # running `$TOOL --version' or `$TOOL --help' to check whether - # $TOOL exists and not knowing $TOOL uses missing. - exit 1 - fi - ;; -esac - -# If it does not exist, or fails to run (possibly an outdated version), -# try to emulate it. -case $1 in - aclocal*) - echo 1>&2 "\ -WARNING: \`$1' is $msg. You should only need it if - you modified \`acinclude.m4' or \`${configure_ac}'. You might want - to install the \`Automake' and \`Perl' packages. Grab them from - any GNU archive site." - touch aclocal.m4 - ;; - - autoconf) - echo 1>&2 "\ -WARNING: \`$1' is $msg. You should only need it if - you modified \`${configure_ac}'. You might want to install the - \`Autoconf' and \`GNU m4' packages. Grab them from any GNU - archive site." - touch configure - ;; - - autoheader) - echo 1>&2 "\ -WARNING: \`$1' is $msg. You should only need it if - you modified \`acconfig.h' or \`${configure_ac}'. You might want - to install the \`Autoconf' and \`GNU m4' packages. Grab them - from any GNU archive site." - files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER(\([^)]*\)).*/\1/p' ${configure_ac}` - test -z "$files" && files="config.h" - touch_files= - for f in $files; do - case $f in - *:*) touch_files="$touch_files "`echo "$f" | - sed -e 's/^[^:]*://' -e 's/:.*//'`;; - *) touch_files="$touch_files $f.in";; - esac - done - touch $touch_files - ;; - - automake*) - echo 1>&2 "\ -WARNING: \`$1' is $msg. You should only need it if - you modified \`Makefile.am', \`acinclude.m4' or \`${configure_ac}'. - You might want to install the \`Automake' and \`Perl' packages. - Grab them from any GNU archive site." - find . -type f -name Makefile.am -print | - sed 's/\.am$/.in/' | - while read f; do touch "$f"; done - ;; - - autom4te) - echo 1>&2 "\ -WARNING: \`$1' is needed, but is $msg. - You might have modified some files without having the - proper tools for further handling them. - You can get \`$1' as part of \`Autoconf' from any GNU - archive site." 
- - file=`echo "$*" | sed -n "$sed_output"` - test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` - if test -f "$file"; then - touch $file - else - test -z "$file" || exec >$file - echo "#! /bin/sh" - echo "# Created by GNU Automake missing as a replacement of" - echo "# $ $@" - echo "exit 0" - chmod +x $file - exit 1 - fi - ;; - - bison|yacc) - echo 1>&2 "\ -WARNING: \`$1' $msg. You should only need it if - you modified a \`.y' file. You may need the \`Bison' package - in order for those modifications to take effect. You can get - \`Bison' from any GNU archive site." - rm -f y.tab.c y.tab.h - if test $# -ne 1; then - eval LASTARG="\${$#}" - case $LASTARG in - *.y) - SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'` - if test -f "$SRCFILE"; then - cp "$SRCFILE" y.tab.c - fi - SRCFILE=`echo "$LASTARG" | sed 's/y$/h/'` - if test -f "$SRCFILE"; then - cp "$SRCFILE" y.tab.h - fi - ;; - esac - fi - if test ! -f y.tab.h; then - echo >y.tab.h - fi - if test ! -f y.tab.c; then - echo 'main() { return 0; }' >y.tab.c - fi - ;; - - lex|flex) - echo 1>&2 "\ -WARNING: \`$1' is $msg. You should only need it if - you modified a \`.l' file. You may need the \`Flex' package - in order for those modifications to take effect. You can get - \`Flex' from any GNU archive site." - rm -f lex.yy.c - if test $# -ne 1; then - eval LASTARG="\${$#}" - case $LASTARG in - *.l) - SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'` - if test -f "$SRCFILE"; then - cp "$SRCFILE" lex.yy.c - fi - ;; - esac - fi - if test ! -f lex.yy.c; then - echo 'main() { return 0; }' >lex.yy.c - fi - ;; - - help2man) - echo 1>&2 "\ -WARNING: \`$1' is $msg. You should only need it if - you modified a dependency of a manual page. You may need the - \`Help2man' package in order for those modifications to take - effect. You can get \`Help2man' from any GNU archive site." 
- - file=`echo "$*" | sed -n "$sed_output"` - test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` - if test -f "$file"; then - touch $file - else - test -z "$file" || exec >$file - echo ".ab help2man is required to generate this page" - exit 1 - fi - ;; - - makeinfo) - echo 1>&2 "\ -WARNING: \`$1' is $msg. You should only need it if - you modified a \`.texi' or \`.texinfo' file, or any other file - indirectly affecting the aspect of the manual. The spurious - call might also be the consequence of using a buggy \`make' (AIX, - DU, IRIX). You might want to install the \`Texinfo' package or - the \`GNU make' package. Grab either from any GNU archive site." - # The file to touch is that specified with -o ... - file=`echo "$*" | sed -n "$sed_output"` - test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` - if test -z "$file"; then - # ... or it is the one specified with @setfilename ... - infile=`echo "$*" | sed 's/.* \([^ ]*\) *$/\1/'` - file=`sed -n ' - /^@setfilename/{ - s/.* \([^ ]*\) *$/\1/ - p - q - }' $infile` - # ... or it is derived from the source name (dir/f.texi becomes f.info) - test -z "$file" && file=`echo "$infile" | sed 's,.*/,,;s,.[^.]*$,,'`.info - fi - # If the file does not exist, the user really needs makeinfo; - # let's fail without touching anything. - test -f $file || exit 1 - touch $file - ;; - - tar) - shift - - # We have already tried tar in the generic part. - # Look for gnutar/gtar before invocation to avoid ugly error - # messages. - if (gnutar --version > /dev/null 2>&1); then - gnutar "$@" && exit 0 - fi - if (gtar --version > /dev/null 2>&1); then - gtar "$@" && exit 0 - fi - firstarg="$1" - if shift; then - case $firstarg in - *o*) - firstarg=`echo "$firstarg" | sed s/o//` - tar "$firstarg" "$@" && exit 0 - ;; - esac - case $firstarg in - *h*) - firstarg=`echo "$firstarg" | sed s/h//` - tar "$firstarg" "$@" && exit 0 - ;; - esac - fi - - echo 1>&2 "\ -WARNING: I can't seem to be able to run \`tar' with the given arguments. 
- You may want to install GNU tar or Free paxutils, or check the - command line arguments." - exit 1 - ;; - - *) - echo 1>&2 "\ -WARNING: \`$1' is needed, and is $msg. - You might have modified some files without having the - proper tools for further handling them. Check the \`README' file, - it often tells you about the needed prerequisites for installing - this package. You may also peek at any GNU archive site, in case - some other package would contain this missing \`$1' program." - exit 1 - ;; -esac - -exit 0 - -# Local variables: -# eval: (add-hook 'write-file-hooks 'time-stamp) -# time-stamp-start: "scriptversion=" -# time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-end: "$" -# End: diff --git a/quark/cuda_checkhash.cu b/quark/cuda_checkhash.cu index 3c41a02..1ce25ec 100644 --- a/quark/cuda_checkhash.cu +++ b/quark/cuda_checkhash.cu @@ -6,8 +6,8 @@ // Hash Target gegen das wir testen sollen __constant__ uint32_t pTarget[8]; -uint32_t *d_resNounce[8]; -uint32_t *h_resNounce[8]; +static uint32_t *d_resNounce[8]; +static uint32_t *h_resNounce[8]; // aus heavy.cu extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); diff --git a/util.c b/util.c index 79a7a24..d9363dc 100644 --- a/util.c +++ b/util.c @@ -1393,6 +1393,10 @@ void print_hash_tests(void) myriadhash(&hash[0], &buf[0]); printpfx("myriad", hash); + memset(hash, 0, sizeof hash); + blake32hash(&hash[0], &buf[0]); + printpfx("blake", hash); + memset(hash, 0, sizeof hash); nist5hash(&hash[0], &buf[0]); printpfx("nist5", hash); From bfe96c49b0bf321ed0776cb1cf31c4fe8a0a8b8d Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 25 Aug 2014 11:21:06 +0200 Subject: [PATCH 02/44] release 1.4, update README... 
--- README.md | 32 ++++++++++++++++++++++++++++++-- README.txt | 15 +++++++++++++++ configure.ac | 2 +- cpu-miner.c | 6 +++--- 4 files changed, 49 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index af8fd12..715b387 100644 --- a/README.md +++ b/README.md @@ -3,5 +3,33 @@ ccminer Christian Buchner's & Christian H.'s CUDA miner project -Fork by tpruvot@github with X14 support - BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo \ No newline at end of file +Fork by tpruvot@github with X14,X15,X17,WHIRL and M7 support + BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo + +A big part of my recent additions were wrote by [djm34](https://github.com/djm34), +You can also donate some beers (or redbulls) with these addresses : + XjPqpkCPoYJJYdQRrVByU7ySpVyeqJmSGU + +This variant was tested and built on Linux (ubuntu server 14.04) +and VStudio 2013. + +Note that the x86 releases are faster than x64 ones on Windows. + +About source code dependencies +------------------------------ + +This project requires some libraries to be built : + +- OpenSSL + +- Curl + +- pthreads + +- [mpir math library](http://www.mpir.org) + +You can download prebuilt .lib and dll on the [bitcointalk forum thread](https://bitcointalk.org/?topic=167229.0) + + +There is also a [Tutorial for windows](http://cudamining.co.uk/url/tutorials/id/3) on [CudaMining](http://cudamining.co.uk) website. + diff --git a/README.txt b/README.txt index 32b2599..e6fe248 100644 --- a/README.txt +++ b/README.txt @@ -63,11 +63,14 @@ its command line interface and options. jackpot use to mine Jackpotcoin quark use to mine Quarkcoin anime use to mine Animecoin + blake use to mine NEOS (Blake 256) nist5 use to mine TalkCoin fresh use to mine Freshcoin + whirl use to mine Whirlcoin x11 use to mine DarkCoin x14 use to mine X14Coin x15 use to mine Halcyon + x17 use to mine X17 -d, --devices gives a comma separated list of CUDA device IDs to operate on. Device IDs start counting from 0! 
@@ -98,6 +101,7 @@ its command line interface and options. --benchmark run in offline benchmark mode --cputest debug hashes from cpu algorithms -c, --config=FILE load a JSON-format configuration file + -C, --color display colored output in a linux Terminal -V, --version display version information and exit -h, --help display this help text and exit @@ -148,6 +152,14 @@ features. >>> RELEASE HISTORY <<< + Sep. 1st 2014 add X17, optimized x15 and whirl + add blake (256 variant) + color support on Windows, + remove some dll dependencies (pthreads, msvcp) + + Aug. 18th 2014 add X14, X15, Whirl, and Fresh algos, + also add colors and nvprof cmd line support + June 15th 2014 add X13 and Diamond Groestl support. Thanks to tsiv and to Bombadil for the contributions! @@ -214,6 +226,9 @@ Notable contributors to this application are: Christian Buchner, Christian H. (Germany): CUDA implementation +Tanguy Pruvot : CUDA, blake, general code cleanup, tuneup for linux (Makefiles) + and some vstudio 2013 stuff... 
+ and also many thanks to anyone else who contributed to the original cpuminer application (Jeff Garzik, pooler), it's original HVC-fork and the HVC-fork available at hvc.1gh.com diff --git a/configure.ac b/configure.ac index 14e9468..2f52cdf 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2014.08.12]) +AC_INIT([ccminer], [2014.09.01]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 98d7daf..69bafe9 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -47,7 +47,7 @@ BOOL WINAPI ConsoleHandler(DWORD); #pragma comment(lib, "winmm.lib") #endif -#define PROGRAM_NAME "minerd" +#define PROGRAM_NAME "ccminer" #define LP_SCANTIME 60 #define HEAVYCOIN_BLKHDR_SZ 84 #define MNR_BLKHDR_SZ 80 @@ -1238,7 +1238,7 @@ out: return NULL; } -#define PROGRAM_VERSION "1.3" +#define PROGRAM_VERSION "1.4" static void show_version_and_exit(void) { printf("%s v%s\n" @@ -1618,7 +1618,7 @@ int main(int argc, char *argv[]) printf("\t and HVC extension from http://hvc.1gh.com/" "\n\n"); printf("\tCuda additions Copyright 2014 Christian Buchner, Christian H.\n"); printf("\t BTC donation address: 16hJF5mceSojnTD3ZTUDqdRhDyPJzoRakM\n"); - printf("\tCleaned and optimized by Tanguy Pruvot\n"); + printf("\tInclude some of djm34 additions, cleaned by Tanguy Pruvot\n"); printf("\t BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo\n\n"); rpc_user = strdup(""); From 1fb9becc1f2b6a15d8ccea4d8314df9ddf0af4ed Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 1 Sep 2014 08:44:19 +0200 Subject: [PATCH 03/44] cpu-miner: sort algos by name, show reject reason --- cpu-miner.c | 54 +++++++++++++++++++++++----------------------- miner.h | 7 +++--- quark/animecoin.cu | 2 +- util.c | 20 ++++++++++------- 4 files changed, 44 insertions(+), 39 deletions(-) diff --git a/cpu-miner.c b/cpu-miner.c index 69bafe9..ef981d1 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -126,16 +126,16 @@ struct workio_cmd { }; typedef enum { - ALGO_HEAVY, /* 
Heavycoin hash */ - ALGO_MJOLLNIR, /* Mjollnir hash */ + ALGO_ANIME, + ALGO_BLAKE, + ALGO_FRESH, ALGO_FUGUE256, /* Fugue256 */ ALGO_GROESTL, - ALGO_MYR_GR, + ALGO_HEAVY, /* Heavycoin hash */ ALGO_JACKPOT, + ALGO_MJOLLNIR, /* Mjollnir hash */ + ALGO_MYR_GR, ALGO_QUARK, - ALGO_ANIME, - ALGO_FRESH, - ALGO_BLAKE, ALGO_NIST5, ALGO_WHC, ALGO_X11, @@ -147,17 +147,17 @@ typedef enum { } sha256_algos; static const char *algo_names[] = { - "heavy", - "mjollnir", + "anime", + "blake", + "fresh", "fugue256", "groestl", - "myr-gr", + "heavy", "jackpot", - "quark", - "anime", - "fresh", - "blake", + "mjollnir", + "myr-gr", "nist5", + "quark", "whirl", "x11", "x13", @@ -229,17 +229,17 @@ static char const usage[] = "\ Usage: " PROGRAM_NAME " [OPTIONS]\n\ Options:\n\ -a, --algo=ALGO specify the algorithm to use\n\ + anime Animecoin hash\n\ + blake Blake 256 (like NEOS blake)\n\ + fresh Freshcoin hash (shavite 80)\n\ fugue256 Fuguecoin hash\n\ + groestl Groestlcoin hash\n\ heavy Heavycoin hash\n\ + jackpot Jackpot hash\n\ mjollnir Mjollnircoin hash\n\ - groestl Groestlcoin hash\n\ myr-gr Myriad-Groestl hash\n\ - jackpot Jackpot hash\n\ - quark Quark hash\n\ - anime Animecoin hash\n\ - blake Blake 256 (like NEOS blake)\n\ - fresh Freshcoin hash (shavite 80)\n\ nist5 NIST5 (TalkCoin) hash\n\ + quark Quark hash\n\ whirl Whirlcoin (old whirlpool)\n\ x11 X11 (DarkCoin) hash\n\ x13 X13 (MaruCoin) hash\n\ @@ -420,11 +420,11 @@ static void share_result(int result, const char *reason) 100. * accepted_count / (accepted_count + rejected_count), s, use_colors ? - (result ? CL_GRN "(yay!!!)" : CL_RED "(booooo)") + (result ? CL_GRN "yay!!!" : CL_RED "booooo") : (result ? 
"(yay!!!)" : "(booooo)")); - if (opt_debug && reason) - applog(LOG_DEBUG, "DEBUG: reject reason: %s", reason); + if (reason) + applog(LOG_WARNING, "reject reason: %s", reason); } static bool submit_upstream_work(CURL *curl, struct work *work) @@ -856,7 +856,7 @@ static void *miner_thread(void *userdata) if ((*nonceptr) >= end_nonce) stratum_gen_work(&stratum, &g_work); } else { - int min_scantime = have_longpoll ? LP_SCANTIME : opt_scantime; + int min_scantime = have_longpoll ? (LP_SCANTIME*3)/4 : opt_scantime; /* obtain new work from internal workio thread */ pthread_mutex_lock(&g_work_lock); if (!have_stratum && @@ -952,13 +952,13 @@ static void *miner_thread(void *userdata) max_nonce, &hashes_done); break; - case ALGO_BLAKE: - rc = scanhash_blake32(thr_id, work.data, work.target, + case ALGO_ANIME: + rc = scanhash_anime(thr_id, work.data, work.target, max_nonce, &hashes_done); break; - case ALGO_ANIME: - rc = scanhash_anime(thr_id, work.data, work.target, + case ALGO_BLAKE: + rc = scanhash_blake32(thr_id, work.data, work.target, max_nonce, &hashes_done); break; diff --git a/miner.h b/miner.h index a23df96..3ce6571 100644 --- a/miner.h +++ b/miner.h @@ -400,13 +400,14 @@ extern void tq_thaw(struct thread_q *tq); void applog_hash(unsigned char *hash); void print_hash_tests(void); -unsigned int jackpothash(void *state, const void *input); +void animehash(void *state, const void *input); +void blake32hash(void *output, const void *input); +void fresh_hash(void *state, const void *input); void fugue256_hash(unsigned char* output, const unsigned char* input, int len); void heavycoin_hash(unsigned char* output, const unsigned char* input, int len); +unsigned int jackpothash(void *state, const void *input); void groestlhash(void *state, const void *input); void myriadhash(void *state, const void *input); -void fresh_hash(void *state, const void *input); -void blake32hash(void *output, const void *input); void nist5hash(void *state, const void *input); void 
quarkhash(void *state, const void *input); void wcoinhash(void *state, const void *input); diff --git a/quark/animecoin.cu b/quark/animecoin.cu index c19275d..4b2d097 100644 --- a/quark/animecoin.cu +++ b/quark/animecoin.cu @@ -57,7 +57,7 @@ extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, int order); // Original Quarkhash Funktion aus einem miner Quelltext -inline void animehash(void *state, const void *input) +extern "C" void animehash(void *state, const void *input) { sph_blake512_context ctx_blake; sph_bmw512_context ctx_bmw; diff --git a/util.c b/util.c index d9363dc..f947d37 100644 --- a/util.c +++ b/util.c @@ -1373,6 +1373,18 @@ void print_hash_tests(void) printf(CL_WHT "CPU HASH ON EMPTY BUFFER RESULTS:" CL_N "\n"); + memset(hash, 0, sizeof hash); + animehash(&hash[0], &buf[0]); + printpfx("anime", hash); + + memset(hash, 0, sizeof hash); + blake32hash(&hash[0], &buf[0]); + printpfx("blake", hash); + + memset(hash, 0, sizeof hash); + fresh_hash(&hash[0], &buf[0]); + printpfx("fresh", hash); + memset(hash, 0, sizeof hash); fugue256_hash(&hash[0], &buf[0], 32); printpfx("fugue256", hash); @@ -1393,10 +1405,6 @@ void print_hash_tests(void) myriadhash(&hash[0], &buf[0]); printpfx("myriad", hash); - memset(hash, 0, sizeof hash); - blake32hash(&hash[0], &buf[0]); - printpfx("blake", hash); - memset(hash, 0, sizeof hash); nist5hash(&hash[0], &buf[0]); printpfx("nist5", hash); @@ -1405,10 +1413,6 @@ void print_hash_tests(void) quarkhash(&hash[0], &buf[0]); printpfx("quark", hash); - memset(hash, 0, sizeof hash); - fresh_hash(&hash[0], &buf[0]); - printpfx("fresh", hash); - memset(hash, 0, sizeof hash); wcoinhash(&hash[0], &buf[0]); printpfx("whirl", hash); From 4a52d0553b0076b984be480725fa67689c544647 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 1 Sep 2014 10:22:32 +0200 Subject: [PATCH 04/44] debug: show json methods, hide hash/target if ok --- util.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff 
--git a/util.c b/util.c index f947d37..6fd2b71 100644 --- a/util.c +++ b/util.c @@ -559,7 +559,7 @@ bool fulltest(const uint32_t *hash, const uint32_t *target) } } - if (opt_debug) { + if (!rc || opt_debug) { uint32_t hash_be[8], target_be[8]; char *hash_str, *target_str; @@ -572,7 +572,7 @@ bool fulltest(const uint32_t *hash, const uint32_t *target) applog(LOG_DEBUG, "DEBUG: %s\nHash: %s\nTarget: %s", rc ? "hash <= target" - : "hash > target (false positive)", + : CL_YLW "hash > target (false positive)" CL_N, hash_str, target_str); @@ -1205,6 +1205,10 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s) id = json_object_get(val, "id"); params = json_object_get(val, "params"); + if (opt_debug) { + applog(LOG_DEBUG, "method: %s", s); + } + if (!strcasecmp(method, "mining.notify")) { ret = stratum_notify(sctx, params); goto out; @@ -1368,7 +1372,8 @@ extern void applog_hash(unsigned char *hash) void print_hash_tests(void) { - unsigned char buf[128], hash[128], s[128]; + char s[128] = {'\0'}; + unsigned char buf[128], hash[128]; memset(buf, 0, sizeof buf); printf(CL_WHT "CPU HASH ON EMPTY BUFFER RESULTS:" CL_N "\n"); From 0aeac878ef60840f3123354037cd56a89d2e94e6 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 1 Sep 2014 06:12:55 +0200 Subject: [PATCH 05/44] blake: tune up and cleanup, ~100 MH/s on a normal 750Ti tested on linux and windows (x86 binary)... but there is a high number of duplicated shares... 
weird --- blake32.cu | 197 +++++++++++++++------------------------------------- cpu-miner.c | 6 +- 2 files changed, 59 insertions(+), 144 deletions(-) diff --git a/blake32.cu b/blake32.cu index 882594f..4468368 100644 --- a/blake32.cu +++ b/blake32.cu @@ -25,14 +25,6 @@ extern "C" void blake32hash(void *output, const void *input) #include "cuda_helper.h" -#if __CUDA_ARCH__ < 350 - // Kepler (Compute 3.0) + Host - #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) -#else - // Kepler (Compute 3.5 / 5.0) - #define ROTR32(x, n) __funnelshift_r( (x), (x), (n) ) -#endif - // in cpu-miner.c extern bool opt_benchmark; extern bool opt_debug; @@ -47,9 +39,11 @@ __constant__ static uint32_t pTarget[8]; __constant__ -static uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding) +static uint32_t __align__(32) c_PaddedMessage80[32]; // padded message (80 bytes + padding) + static uint32_t *d_resNounce[8]; static uint32_t *h_resNounce[8]; +static bool init_made = false; __constant__ static uint8_t c_sigma[16][16]; @@ -120,14 +114,14 @@ static const uint32_t c_u256[16] = { #define GS(a,b,c,d,e) { \ v[a] += (m[sigma[i][e]] ^ u256[sigma[i][e+1]]) + v[b]; \ - v[d] = ROTR32(v[d] ^ v[a], 16); \ + v[d] = SPH_ROTR32(v[d] ^ v[a], 16); \ v[c] += v[d]; \ - v[b] = ROTR32(v[b] ^ v[c], 12); \ + v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \ \ v[a] += (m[sigma[i][e+1]] ^ u256[sigma[i][e]]) + v[b]; \ - v[d] = ROTR32(v[d] ^ v[a], 8); \ + v[d] = SPH_ROTR32(v[d] ^ v[a], 8); \ v[c] += v[d]; \ - v[b] = ROTR32(v[b] ^ v[c], 7); \ + v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \ } __device__ static @@ -138,11 +132,10 @@ void blake256_compress(uint32_t *h, uint32_t *block, uint8_t ((*sigma)[16]), con //#pragma unroll for (int i = 0; i < 16; ++i) { - m[i] = cuda_swab32(block[i]); - //m[i] = block[i]; + m[i] = block[i]; } - #pragma unroll + #pragma unroll 8 for(int i = 0; i < 8; i++) v[i] = h[i]; @@ -156,10 +149,6 @@ void blake256_compress(uint32_t *h, uint32_t *block, uint8_t ((*sigma)[16]), con v[14] 
= u256[6]; v[15] = u256[7]; - // on a 80-bytes null buffer : - // first : v = {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, ...} - // second : v = {0xb5bfb2f9, 0x14cfcc63, 0xb85c549c, 0xc9b4184e, ..., 0x299f3350, 0x082efa98, 0xec4e6c89} - //#pragma unroll for (int i = 0; i < 14; i++) { /* column step */ @@ -177,11 +166,10 @@ void blake256_compress(uint32_t *h, uint32_t *block, uint8_t ((*sigma)[16]), con //#pragma unroll 16 for(int i = 0; i < 16; i++) h[i % 8] ^= v[i]; - - //second H0 = 0x0c7b1594 ... H7 = 0x9051b305 } #if __CUDA_ARCH__ >= 200 +/* memory should be aligned to use __nvvm_memset */ #if (__NV_POINTER_SIZE == 64) # define SZCT uint64_t #else @@ -196,9 +184,9 @@ void blake256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash) int thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t /* __align__(16) */ h[8]; - uint32_t /* __align__(16) */ msg[16]; const uint32_t nounce = startNounce + thread; + uint32_t /* __align__(8) */ msg[16]; + uint32_t h[8]; #pragma unroll for(int i=0; i<8; i++) @@ -209,9 +197,9 @@ void blake256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash) // ------ Close: Bytes 64 to 80 ------ #if 0 /* __CUDA_ARCH__ >= 200 */ - __nvvm_memset((uint8_t*)(&msg[4]), 0, sizeof(msg)-16, 16); + __nvvm_memset((uint8_t*)(&msg[4]), 0, sizeof(msg)-16, 8); #else - msg[5] = 0; + msg[5] = 0; // uchar[17 to 55] msg[6] = 0; msg[7] = 0; msg[8] = 0; @@ -219,25 +207,22 @@ void blake256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash) msg[10] = 0; msg[11] = 0; msg[12] = 0; + msg[14] = 0; #endif msg[0] = c_PaddedMessage80[16]; msg[1] = c_PaddedMessage80[17]; msg[2] = c_PaddedMessage80[18]; - msg[3] = cuda_swab32(nounce); // here or at 80 ? 
- - msg[4] = 0x80; // uchar[16] after buffer - msg[13] = 0x01000000; //((uint8_t*)msg)[55] = 1; // uchar[17 to 55] - msg[15] = 0x80020000; // 60-63 0x280 + msg[3] = nounce; /* our tested value */ + msg[4] = 0x80000000; //cuda_swab32(0x80U); - //h => {0xb5bfb2f9, 0x14cfcc63, 0xb85c549c, 0xc9b4184e, 0x67dfc6ce, 0x29e9904b, 0xd59ee74e, 0xfaa9c653} - //msg {0, 0, 0, 0, 0x80, 0...} + msg[13] = 1; + msg[15] = 0x280; // 60-63 blake256_compress(h, msg, c_sigma, c_u256, 0x280); // or 0x80 - //h => {0x0c7b1594, 0x52328517, 0x463db487, 0xdf5e39b7, 0x1322afaf, 0x14ed562c, 0xe9d18d7d, 0x9051b305} - uint32_t *outHash = (uint32_t*) outputHash + 16*thread; // 16 = 4 x sizeof(uint32) - //#pragma unroll + uint32_t *outHash = (uint32_t*) outputHash + thread; + //#pragma unroll 8 for (int i=0; i < 8; i++) { outHash[i] = cuda_swab32(h[i]); } @@ -247,7 +232,7 @@ void blake256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash) __host__ void blake256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order) { - const int threadsperblock = 256; + const int threadsperblock = 128; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); @@ -265,28 +250,25 @@ void gpu_check_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVecto int thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + const uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread); int hashPosition = nounce - startNounce; - uint32_t *inpHash = &g_hash[16 * hashPosition]; + uint32_t *inpHash = &g_hash[hashPosition]; uint32_t hash[8]; #pragma unroll 8 for (int i=0; i < 8; i++) hash[i] = inpHash[i]; - int i, position = -1; - bool rc = true; - - #pragma unroll 8 + /* to enhance ? 
*/ int i, rc = 1, position = -1; for (i = 7; i >= 0; i--) { + // rc &= (hash[i] <= pTarget[i]); if (hash[i] > pTarget[i] && position < i) { - position = i; - rc = false; + rc = false; position = i; } if (hash[i] < pTarget[i] && position < i) { - position = i; - rc = true; + rc = true; position = i; } } @@ -298,8 +280,8 @@ void gpu_check_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVecto __host__ uint32_t cpu_check_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order) { + const int threadsperblock = 128; uint32_t result = 0xffffffff; - const int threadsperblock = 256; cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t)); @@ -309,14 +291,12 @@ uint32_t cpu_check_hash_64(int thr_id, int threads, uint32_t startNounce, uint32 size_t shared_size = 0; gpu_check_hash_64 <<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]); - MyStreamSynchronize(NULL, order, thr_id); - CUDA_SAFE_CALL(cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost)); - - // cudaMemcpy() is asynch! 
- cudaThreadSynchronize(); - result = *h_resNounce[thr_id]; + if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost)) { + cudaThreadSynchronize(); + result = *h_resNounce[thr_id]; + } return result; } @@ -325,9 +305,9 @@ __host__ void blake256_cpu_init(int thr_id) { CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sigma, host_sigma, sizeof(host_sigma), 0, cudaMemcpyHostToDevice)); - - CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t))); - CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], sizeof(uint32_t))); + init_made = true; } __host__ @@ -336,8 +316,6 @@ void blake256_cpu_setBlock_80(uint32_t *pdata, const void *ptarget) uint32_t PaddedMessage[32]; memcpy(PaddedMessage, pdata, 80); memset(&PaddedMessage[20], 0, 48); - //for (int i=0; i<20; i++) - // PaddedMessage[i] = cuda_swab32(pdata[i]); CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice)); CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, sizeof(PaddedMessage), 0, cudaMemcpyHostToDevice)); @@ -348,19 +326,19 @@ void blake256_cpu_setBlock_80(uint32_t *pdata, const void *ptarget) extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) { - uint32_t endiandata[20]; const uint32_t first_nonce = pdata[19]; - const int throughput = 256*256*2; + const int throughput = 128 * 2048; static bool init[8] = {0,0,0,0,0,0,0,0}; + uint32_t endiandata[20]; + uint32_t Htarg = ptarget[7]; + int rc = 0; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x00000f; - - uint32_t Htarg = ptarget[7]; + ((uint32_t*)ptarget)[7] = Htarg = 0x00000f; if (!init[thr_id]) { CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput)); + 
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 32 * throughput)); blake256_cpu_init(thr_id); @@ -375,11 +353,11 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta blake32hash(vhash, pdata); #endif + blake256_cpu_setBlock_80(pdata, (void*)ptarget); + for (int k=0; k < 20; k++) be32enc(&endiandata[k], pdata[k]); - blake256_cpu_setBlock_80(endiandata, (void*)ptarget); - do { int order = 0; uint32_t foundNonce; @@ -401,14 +379,14 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta blake32hash(vhashcpu, endiandata); - if (opt_debug) - applog(LOG_DEBUG, "foundNonce = %08x",foundNonce); + //if (opt_debug) + // applog(LOG_DEBUG, "foundNonce = %08x",foundNonce); if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) { pdata[19] = foundNonce; - *hashes_done = pdata[19] - first_nonce + 1; - return 1; + rc = 1; + goto exit_scan; } else { applog(LOG_INFO, "GPU #%d: result for nonce %08x does not validate on CPU!", thr_id, foundNonce); } @@ -418,77 +396,12 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); +exit_scan: *hashes_done = pdata[19] - first_nonce + 1; - return 0; -} - -//#define DEBUG_ALGO - -__host__ -int scanhash_blake256_cpu(int thr_id, uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, uint64_t *hashes_done) -{ - uint32_t n = pdata[19] - 1; - const uint32_t first_nonce = pdata[19]; - const uint32_t Htarg = ptarget[7]; - - uint32_t __align__(32) hash64[8]; - uint32_t endiandata[32]; - - uint64_t htmax[] = { - 0, - 0xF, - 0xFF, - 0xFFF, - 0xFFFF, - 0x10000000 - }; - uint32_t masks[] = { - 0xFFFFFFFF, - 0xFFFFFFF0, - 0xFFFFFF00, - 0xFFFFF000, - 0xFFFF0000, - 0 - }; - - // we need bigendian data... 
- for (int kk=0; kk < 32; kk++) { - be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]); - }; -#ifdef DEBUG_ALGO - if (Htarg != 0) - printf("[%d] Htarg=%X\n", thr_id, Htarg); -#endif - for (int m=0; m < 6; m++) { - if (Htarg <= htmax[m]) { - uint32_t mask = masks[m]; - do { - pdata[19] = ++n; - be32enc(&endiandata[19], n); - blake32hash(hash64, endiandata); -#ifndef DEBUG_ALGO - if ((!(hash64[7] & mask)) && fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } -#else - if (!(n % 0x1000) && !thr_id) printf("."); - if (!(hash64[7] & mask)) { - printf("[%d]",thr_id); - if (fulltest(hash64, ptarget)) { - *hashes_done = n - first_nonce + 1; - return true; - } - } -#endif - } while (n < max_nonce && !work_restart[thr_id].restart); - // see blake.c if else to understand the loop on htmax => mask - break; - } + if (init_made && opt_debug && h_resNounce[thr_id]) { + // made auto ??? + //applog(LOG_DEBUG, "%08x", h_resNounce[thr_id]); + //cudaFreeHost(h_resNounce[thr_id]); } - - *hashes_done = n - first_nonce + 1; - pdata[19] = n; - return 0; + return rc; } diff --git a/cpu-miner.c b/cpu-miner.c index ef981d1..2b409d5 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -805,6 +805,8 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty)); else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_DMD_GR || opt_algo == ALGO_FRESH) diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty)); + else if (opt_algo == ALGO_BLAKE) + diff_to_target(work->target, sctx->job.diff / (4.0 * opt_difficulty)); else diff_to_target(work->target, sctx->job.diff / opt_difficulty); } @@ -898,8 +900,8 @@ static void *miner_thread(void *userdata) max64 = 0x1fffLL; break; case ALGO_BLAKE: - max64 = 0xffffffLL; - break; + //max64 = 0x1000000LL; + //break; default: max64 = 0xfffffLL; break; From 530732458add6c4c3836606d028930f3581c0a5f Mon Sep 17 
00:00:00 2001 From: Tanguy Pruvot Date: Mon, 1 Sep 2014 12:22:51 +0200 Subject: [PATCH 06/44] blake: use a constant for threads, reduce mallocated d_hash size and clean a bit more... --- blake32.cu | 37 +++++++++++++++---------------------- cpuminer-config.h | 6 +++--- 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/blake32.cu b/blake32.cu index 4468368..e3d0bf8 100644 --- a/blake32.cu +++ b/blake32.cu @@ -12,6 +12,9 @@ extern "C" { #include } +/* threads per block */ +#define TPB 128 + /* hash by cpu with blake 256 */ extern "C" void blake32hash(void *output, const void *input) { @@ -43,7 +46,6 @@ static uint32_t __align__(32) c_PaddedMessage80[32]; // padded message (80 bytes static uint32_t *d_resNounce[8]; static uint32_t *h_resNounce[8]; -static bool init_made = false; __constant__ static uint8_t c_sigma[16][16]; @@ -214,7 +216,7 @@ void blake256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash) msg[1] = c_PaddedMessage80[17]; msg[2] = c_PaddedMessage80[18]; msg[3] = nounce; /* our tested value */ - msg[4] = 0x80000000; //cuda_swab32(0x80U); + msg[4] = 0x80000000UL; //cuda_swab32(0x80U); msg[13] = 1; msg[15] = 0x280; // 60-63 @@ -232,7 +234,7 @@ void blake256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash) __host__ void blake256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order) { - const int threadsperblock = 128; + const int threadsperblock = TPB; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); @@ -280,7 +282,7 @@ void gpu_check_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVecto __host__ uint32_t cpu_check_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order) { - const int threadsperblock = 128; + const int threadsperblock = TPB; uint32_t result = 0xffffffff; cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t)); @@ -307,7 +309,6 @@ void 
blake256_cpu_init(int thr_id) CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sigma, host_sigma, sizeof(host_sigma), 0, cudaMemcpyHostToDevice)); CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], sizeof(uint32_t))); CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], sizeof(uint32_t))); - init_made = true; } __host__ @@ -327,7 +328,7 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta uint32_t max_nonce, unsigned long *hashes_done) { const uint32_t first_nonce = pdata[19]; - const int throughput = 128 * 2048; + const int throughput = TPB * 2048; static bool init[8] = {0,0,0,0,0,0,0,0}; uint32_t endiandata[20]; uint32_t Htarg = ptarget[7]; @@ -338,10 +339,8 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta if (!init[thr_id]) { CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 32 * throughput)); - + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 48 * throughput)); // not sure for this size... blake256_cpu_init(thr_id); - init[thr_id] = true; } @@ -349,8 +348,6 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta // dev test with a null buffer 0x00000... 
for (int k = 0; k < 20; k++) pdata[k] = 0; - uint32_t vhash[8]; - blake32hash(vhash, pdata); #endif blake256_cpu_setBlock_80(pdata, (void*)ptarget); @@ -362,7 +359,7 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta int order = 0; uint32_t foundNonce; - // GPU + // GPU HASH blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); #if NULLTEST @@ -379,16 +376,17 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta blake32hash(vhashcpu, endiandata); - //if (opt_debug) - // applog(LOG_DEBUG, "foundNonce = %08x",foundNonce); - if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) { pdata[19] = foundNonce; rc = 1; goto exit_scan; - } else { - applog(LOG_INFO, "GPU #%d: result for nonce %08x does not validate on CPU!", thr_id, foundNonce); + } + else if (vhashcpu[7] > Htarg) { + applog(LOG_WARNING, "GPU #%d: result for %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[7], Htarg); + } + else { + applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce); } } @@ -398,10 +396,5 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta exit_scan: *hashes_done = pdata[19] - first_nonce + 1; - if (init_made && opt_debug && h_resNounce[thr_id]) { - // made auto ??? - //applog(LOG_DEBUG, "%08x", h_resNounce[thr_id]); - //cudaFreeHost(h_resNounce[thr_id]); - } return rc; } diff --git a/cpuminer-config.h b/cpuminer-config.h index 0d0f042..0fafa85 100644 --- a/cpuminer-config.h +++ b/cpuminer-config.h @@ -156,7 +156,7 @@ #define PACKAGE_NAME "ccminer" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "ccminer 2014.08.12" +#define PACKAGE_STRING "ccminer 2014.09.01" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "ccminer" @@ -165,7 +165,7 @@ #define PACKAGE_URL "" /* Define to the version of this package. 
*/ -#define PACKAGE_VERSION "2014.08.12" +#define PACKAGE_VERSION "2014.09.01" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be @@ -188,7 +188,7 @@ #define USE_XOP 1 /* Version number of package */ -#define VERSION "2014.08.12" +#define VERSION "2014.09.01" /* Define curl_free() as free() if our version of curl lacks curl_free. */ /* #undef curl_free */ From 1f99aae0ff621f4f85f119d811a3f1a8d2204f60 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 1 Sep 2014 18:49:23 +0200 Subject: [PATCH 07/44] exit on repeated duplicate shares (to enhance) create a new function proper_exit() to do common stuff on exit... --- cpu-miner.c | 47 ++++++++++++++++++++++++++++------------------- miner.h | 1 + 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/cpu-miner.c b/cpu-miner.c index 2b409d5..d92c7e0 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -209,7 +209,7 @@ static struct stratum_ctx stratum; pthread_mutex_t applog_lock; static pthread_mutex_t stats_lock; - +static uint8_t duplicate_shares = 0; static unsigned long accepted_count = 0L; static unsigned long rejected_count = 0L; static double *thr_hashrates; @@ -349,6 +349,13 @@ static struct work g_work; static time_t g_work_time; static pthread_mutex_t g_work_lock; + +void proper_exit(int reason) +{ + cuda_devicereset(); + exit(reason); +} + static bool jobj_binary(const json_t *obj, const char *key, void *buf, size_t buflen) { @@ -423,8 +430,17 @@ static void share_result(int result, const char *reason) (result ? CL_GRN "yay!!!" : CL_RED "booooo") : (result ? 
"(yay!!!)" : "(booooo)")); - if (reason) + if (reason) { + if (!strcmp(reason, "Duplicate share")) { + duplicate_shares++; + if (duplicate_shares > 3) { + // exit from app (until auto restart) + applog(LOG_WARNING, "Auto exit to prevent stratum bans: %s", reason); + proper_exit(1); + } + } applog(LOG_WARNING, "reject reason: %s", reason); + } } static bool submit_upstream_work(CURL *curl, struct work *work) @@ -1253,7 +1269,7 @@ static void show_version_and_exit(void) PTW32_VERSION_STRING, #endif curl_version()); - exit(0); + proper_exit(0); } static void show_usage_and_exit(int status) @@ -1262,7 +1278,7 @@ static void show_usage_and_exit(int status) fprintf(stderr, "Try `" PROGRAM_NAME " --help' for more information.\n"); else printf(usage); - exit(status); + proper_exit(status); } static void parse_arg (int key, char *arg) @@ -1297,7 +1313,7 @@ static void parse_arg (int key, char *arg) #endif if (!json_is_object(opt_config)) { applog(LOG_ERR, "JSON decode of %s failed", arg); - exit(1); + proper_exit(1); } break; } @@ -1440,7 +1456,7 @@ static void parse_arg (int key, char *arg) break; case 1006: print_hash_tests(); - exit(0); + proper_exit(0); break; case 1003: want_longpoll = false; @@ -1462,7 +1478,7 @@ static void parse_arg (int key, char *arg) device_map[opt_n_threads++] = atoi(pch); else { applog(LOG_ERR, "Non-existant CUDA device #%d specified in -d option", atoi(pch)); - exit(1); + proper_exit(1); } } else { int device = cuda_finddevice(pch); @@ -1470,7 +1486,7 @@ static void parse_arg (int key, char *arg) device_map[opt_n_threads++] = device; else { applog(LOG_ERR, "Non-existant CUDA device '%s' specified in -d option", pch); - exit(1); + proper_exit(1); } } pch = strtok (NULL, ","); @@ -1572,13 +1588,11 @@ static void signal_handler(int sig) case SIGINT: signal(sig, SIG_IGN); applog(LOG_INFO, "SIGINT received, exiting"); - cuda_devicereset(); - exit(0); + proper_exit(0); break; case SIGTERM: applog(LOG_INFO, "SIGTERM received, exiting"); - 
cuda_devicereset(); - exit(0); + proper_exit(0); break; } } @@ -1588,13 +1602,11 @@ BOOL WINAPI ConsoleHandler(DWORD dwType) switch (dwType) { case CTRL_C_EVENT: applog(LOG_INFO, "CTRL_C_EVENT received, exiting"); - cuda_devicereset(); - exit(0); + proper_exit(0); break; case CTRL_BREAK_EVENT: applog(LOG_INFO, "CTRL_BREAK_EVENT received, exiting"); - cuda_devicereset(); - exit(0); + proper_exit(0); break; default: return false; @@ -1785,8 +1797,5 @@ int main(int argc, char *argv[]) applog(LOG_INFO, "workio thread dead, exiting."); - // nvprof requires this - cuda_devicereset(); - return 0; } diff --git a/miner.h b/miner.h index 3ce6571..e1e2d8d 100644 --- a/miner.h +++ b/miner.h @@ -396,6 +396,7 @@ extern void *tq_pop(struct thread_q *tq, const struct timespec *abstime); extern void tq_freeze(struct thread_q *tq); extern void tq_thaw(struct thread_q *tq); +void proper_exit(int reason); void applog_hash(unsigned char *hash); From 1b8c3c12fa5bb83afbb02f9d5f60586939f36d86 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Tue, 2 Sep 2014 03:38:57 +0200 Subject: [PATCH 08/44] debug: a new boolean to log or not json rpc data --- cpu-miner.c | 6 ++++++ miner.h | 1 + util.c | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/cpu-miner.c b/cpu-miner.c index d92c7e0..9ff8375 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -168,6 +168,7 @@ static const char *algo_names[] = { }; bool opt_debug = false; +bool opt_debug_rpc = false; bool opt_protocol = false; bool opt_benchmark = false; bool want_longpoll = true; @@ -522,6 +523,10 @@ static bool submit_upstream_work(CURL *curl, struct work *work) json_decref(val); } + if (opt_debug_rpc) { + applog(LOG_DEBUG, "submit: %s", s); + } + rc = true; out: @@ -1325,6 +1330,7 @@ static void parse_arg (int key, char *arg) break; case 'D': opt_debug = true; + opt_debug_rpc = true; break; case 'p': free(rpc_pass); diff --git a/miner.h b/miner.h index e1e2d8d..c9b2e44 100644 --- a/miner.h +++ b/miner.h @@ -285,6 +285,7 @@ 
struct work_restart { }; extern bool opt_debug; +extern bool opt_debug_rpc; extern bool opt_protocol; extern int opt_timeout; extern bool want_longpoll; diff --git a/util.c b/util.c index 6fd2b71..a9e0ae2 100644 --- a/util.c +++ b/util.c @@ -1205,7 +1205,7 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s) id = json_object_get(val, "id"); params = json_object_get(val, "params"); - if (opt_debug) { + if (opt_debug_rpc) { applog(LOG_DEBUG, "method: %s", s); } From 2d42ae6de586a6ae8cbfd01806a273fd5cc4b262 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Tue, 2 Sep 2014 05:09:31 +0200 Subject: [PATCH 09/44] stratum: handle a small cache of submitted jobs Prevent to send duplicated shares on some pools like hashharder.. This cache keeps submitted job/nounces of the last 15 minutes so, remove exit on repeated duplicate shares, the submitted cache now handles this problem. Signed-off-by: Tanguy Pruvot --- Makefile.am | 1 + README.md | 6 +-- ccminer.vcxproj | 3 +- ccminer.vcxproj.filters | 5 ++- cpu-miner.c | 29 ++++++++------ hashlog.cpp | 84 +++++++++++++++++++++++++++++++++++++++++ miner.h | 6 +++ 7 files changed, 117 insertions(+), 17 deletions(-) create mode 100644 hashlog.cpp diff --git a/Makefile.am b/Makefile.am index 3935afe..c73d9d2 100644 --- a/Makefile.am +++ b/Makefile.am @@ -17,6 +17,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ compat/inttypes.h compat/stdbool.h compat/unistd.h \ compat/sys/time.h compat/getopt/getopt.h \ cpu-miner.c util.c hefty1.c scrypt.c \ + hashlog.cpp \ heavy/heavy.cu \ heavy/cuda_blake512.cu heavy/cuda_blake512.h \ heavy/cuda_combine.cu heavy/cuda_combine.h \ diff --git a/README.md b/README.md index 715b387..2a2485b 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,12 @@ ccminer Christian Buchner's & Christian H.'s CUDA miner project -Fork by tpruvot@github with X14,X15,X17,WHIRL and M7 support +Fork by tpruvot@github with X14,X15,X17,WHIRL and Blake256 support BTC donation address: 
1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo + [![tip for next commit](https://tip4commit.com/projects/927.svg)](https://tip4commit.com/github/tpruvot/ccminer) A big part of my recent additions were wrote by [djm34](https://github.com/djm34), -You can also donate some beers (or redbulls) with these addresses : - XjPqpkCPoYJJYdQRrVByU7ySpVyeqJmSGU +You can also donate some beers (or redbulls) This variant was tested and built on Linux (ubuntu server 14.04) and VStudio 2013. diff --git a/ccminer.vcxproj b/ccminer.vcxproj index d3ec423..509715b 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -243,6 +243,7 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)" + @@ -560,4 +561,4 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)" - + \ No newline at end of file diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 55c69aa..93e331c 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -180,6 +180,9 @@ Source Files + + Source Files + @@ -440,4 +443,4 @@ Source Files\CUDA - + \ No newline at end of file diff --git a/cpu-miner.c b/cpu-miner.c index 9ff8375..513d4f8 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -210,7 +210,6 @@ static struct stratum_ctx stratum; pthread_mutex_t applog_lock; static pthread_mutex_t stats_lock; -static uint8_t duplicate_shares = 0; static unsigned long accepted_count = 0L; static unsigned long rejected_count = 0L; static double *thr_hashrates; @@ -354,6 +353,7 @@ static pthread_mutex_t g_work_lock; void proper_exit(int reason) { cuda_devicereset(); + hashlog_purge_all(); exit(reason); } @@ -432,14 +432,6 @@ static void share_result(int result, const char *reason) : (result ? 
"(yay!!!)" : "(booooo)")); if (reason) { - if (!strcmp(reason, "Duplicate share")) { - duplicate_shares++; - if (duplicate_shares > 3) { - // exit from app (until auto restart) - applog(LOG_WARNING, "Auto exit to prevent stratum bans: %s", reason); - proper_exit(1); - } - } applog(LOG_WARNING, "reject reason: %s", reason); } } @@ -460,6 +452,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work) } if (have_stratum) { + uint32_t sent; uint32_t ntime, nonce; uint16_t nvote; char *ntimestr, *noncestr, *xnonce2str, *nvotestr; @@ -472,6 +465,16 @@ static bool submit_upstream_work(CURL *curl, struct work *work) noncestr = bin2hex((const unsigned char *)(&nonce), 4); xnonce2str = bin2hex(work->xnonce2, work->xnonce2_len); nvotestr = bin2hex((const unsigned char *)(&nvote), 2); + + sent = hashlog_already_submittted(work->job_id, nonce); + if (sent > 0) { + sent = (uint32_t) time(NULL) - sent; + if (!opt_quiet) + applog(LOG_WARNING, "skip submit, nonce %s was already sent %u seconds ago", noncestr, sent); + rc = true; + goto out; + } + if (opt_algo == ALGO_HEAVY) { sprintf(s, "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", @@ -490,6 +493,9 @@ static bool submit_upstream_work(CURL *curl, struct work *work) applog(LOG_ERR, "submit_upstream_work stratum_send_line failed"); goto out; } + + hashlog_remember_submit(work->job_id, nonce); + } else { /* build hex string */ @@ -826,8 +832,6 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty)); else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_DMD_GR || opt_algo == ALGO_FRESH) diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty)); - else if (opt_algo == ALGO_BLAKE) - diff_to_target(work->target, sctx->job.diff / (4.0 * opt_difficulty)); else diff_to_target(work->target, sctx->job.diff / opt_difficulty); } @@ -1237,8 
+1241,9 @@ static void *stratum_thread(void *userdata) pthread_mutex_unlock(&g_work_lock); if (stratum.job.clean) { if (!opt_quiet) - applog(LOG_BLUE, "%s send a new %s block", short_url, algo_names[opt_algo]); + applog(LOG_BLUE, "%s send a new %s job", short_url, algo_names[opt_algo]); restart_threads(); + hashlog_purge_old(); } } diff --git a/hashlog.cpp b/hashlog.cpp new file mode 100644 index 0000000..ded566e --- /dev/null +++ b/hashlog.cpp @@ -0,0 +1,84 @@ +#include +#include +#include + +#include "miner.h" + +static std::map tlastshares; + +/** + * Purge entries after 15 minutes + */ +#define LOG_PURGE_TIMEOUT 15*60 + +/** + * Store submitted nounces of a job + */ +extern "C" void hashlog_remember_submit(char* jobid, uint32_t nounce) +{ + char *ptr; + uint64_t njobid = (uint64_t) strtoul(jobid, &ptr, 16); + uint64_t key = (njobid << 32) + nounce; + tlastshares[key] = (uint32_t) time(NULL); +} + +/** + * @return time of submission + */ +extern "C" uint32_t hashlog_already_submittted(char* jobid, uint32_t nounce) +{ + char *ptr; + uint32_t ret = 0; + uint64_t njobid = (uint64_t) strtoul(jobid, &ptr, 16); + uint64_t key = (njobid << 32) + nounce; + std::map::iterator i = tlastshares.find(key); + if (i != tlastshares.end()) + ret = (uint32_t) tlastshares[key]; + return ret; +} + +/** + * Remove entries of a job... 
not used yet + */ +extern "C" void hashlog_purge_job(char* jobid) +{ + char *ptr; + uint64_t njobid = strtoul(jobid, &ptr, 16); + uint64_t keypfx = (njobid << 32); + std::map::iterator i = tlastshares.begin(); + while (i != tlastshares.end()) { + if ((keypfx & i->first) != 0) + tlastshares.erase(i); + i++; + } +} + +/** + * Remove old entries to reduce memory usage + */ +extern "C" void hashlog_purge_old(void) +{ + int deleted = 0; + uint32_t now = (uint32_t) time(NULL); + std::map::iterator i = tlastshares.begin(); + while (i != tlastshares.end()) { + if ((now - i->second) > LOG_PURGE_TIMEOUT) { + deleted++; + tlastshares.erase(i); + } + i++; + } + if (opt_debug && deleted) { + applog(LOG_DEBUG, "hashlog: %d/%d purged", + deleted, tlastshares.size()); + } +} + +/** + * Reset the submitted nounce cache + */ +extern "C" void hashlog_purge_all(void) +{ + tlastshares.clear(); +} + diff --git a/miner.h b/miner.h index c9b2e44..b986197 100644 --- a/miner.h +++ b/miner.h @@ -388,6 +388,12 @@ bool stratum_subscribe(struct stratum_ctx *sctx); bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass); bool stratum_handle_method(struct stratum_ctx *sctx, const char *s); +void hashlog_remember_submit(char* jobid, uint32_t nounce); +uint32_t hashlog_already_submittted(char* jobid, uint32_t nounce); +void hashlog_purge_old(void); +void hashlog_purge_job(char* jobid); +void hashlog_purge_all(void); + struct thread_q; extern struct thread_q *tq_new(void); From de80c7e9d1448f15541d08c5dbbf372d5bfeba48 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Tue, 2 Sep 2014 12:40:44 +0200 Subject: [PATCH 10/44] blake: remove unused parameter and fix index in d_hash that reduce the speed to 92MH/s but the next commit give us 30 more so, todo: merge the whole checkhash proc in gpu_hash and remove this d_hash buffer... 
--- blake32.cu | 62 +++++++++++++++++------------------------------------- 1 file changed, 19 insertions(+), 43 deletions(-) diff --git a/blake32.cu b/blake32.cu index e3d0bf8..814be2d 100644 --- a/blake32.cu +++ b/blake32.cu @@ -181,10 +181,10 @@ extern __device__ __device_builtin__ void __nvvm_memset(uint8_t *, unsigned char #endif __global__ -void blake256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash) +void blake256_gpu_hash_80(int threads, uint32_t startNounce, uint32_t* outputHash) { - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < (uint32_t) threads) { const uint32_t nounce = startNounce + thread; uint32_t /* __align__(8) */ msg[16]; @@ -223,7 +223,7 @@ void blake256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash) blake256_compress(h, msg, c_sigma, c_u256, 0x280); // or 0x80 - uint32_t *outHash = (uint32_t*) outputHash + thread; + uint32_t *outHash = &outputHash[thread<<3]; //#pragma unroll 8 for (int i=0; i < 8; i++) { outHash[i] = cuda_swab32(h[i]); @@ -247,40 +247,30 @@ void blake256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_ } __global__ -void gpu_check_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce) +void gpu_check_hash_64(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *resNounce) { - int thread = (blockDim.x * blockIdx.x + threadIdx.x); + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - const uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t *inpHash = &g_hash[hashPosition]; - uint32_t hash[8]; - - #pragma unroll 8 - for (int i=0; i < 8; i++) - hash[i] = inpHash[i]; - - /* to enhance ? 
*/ - int i, rc = 1, position = -1; - for (i = 7; i >= 0; i--) { - // rc &= (hash[i] <= pTarget[i]); - if (hash[i] > pTarget[i] && position < i) { - rc = false; position = i; + uint32_t* pHash = &g_hash[thread<<3]; + for (int i = 7; i >= 0; i--) { + uint32_t hash = pHash[i]; + if (hash > pTarget[i]) { + return; } - if (hash[i] < pTarget[i] && position < i) { - rc = true; position = i; + if (hash < pTarget[i]) { + break; } } - if(rc && resNounce[0] > nounce) + uint32_t nounce = startNounce + thread; + if(resNounce[0] > nounce) resNounce[0] = nounce; } } __host__ -uint32_t cpu_check_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order) +uint32_t cpu_check_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, int order) { const int threadsperblock = TPB; uint32_t result = 0xffffffff; @@ -292,7 +282,7 @@ uint32_t cpu_check_hash_64(int thr_id, int threads, uint32_t startNounce, uint32 size_t shared_size = 0; - gpu_check_hash_64 <<>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]); + gpu_check_hash_64 <<>>(threads, startNounce, d_inputHash, d_resNounce[thr_id]); MyStreamSynchronize(NULL, order, thr_id); if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost)) { @@ -322,8 +312,6 @@ void blake256_cpu_setBlock_80(uint32_t *pdata, const void *ptarget) CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, sizeof(PaddedMessage), 0, cudaMemcpyHostToDevice)); } -#define NULLTEST 0 - extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) { @@ -339,17 +327,11 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta if (!init[thr_id]) { CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 48 * throughput)); // not sure for this size... 
+ CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 32 * throughput)); /* 32 bytes x 256K Threads (to be removed soon) */ blake256_cpu_init(thr_id); init[thr_id] = true; } -#if NULLTEST - // dev test with a null buffer 0x00000... - for (int k = 0; k < 20; k++) - pdata[k] = 0; -#endif - blake256_cpu_setBlock_80(pdata, (void*)ptarget); for (int k=0; k < 20; k++) @@ -362,13 +344,7 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta // GPU HASH blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); -#if NULLTEST - uint32_t buf[8]; memset(buf, 0, sizeof buf); - CUDA_SAFE_CALL(cudaMemcpy(buf, d_hash[thr_id], sizeof buf, cudaMemcpyDeviceToHost)); - CUDA_SAFE_CALL(cudaThreadSynchronize()); - //applog_hash((unsigned char*)buf); -#endif - foundNonce = cpu_check_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + foundNonce = cpu_check_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++); if (foundNonce != 0xffffffff) { uint32_t vhashcpu[8]; From 7e595a36ea69027c8a28023399540a761e7686c3 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Tue, 2 Sep 2014 21:13:37 +0200 Subject: [PATCH 11/44] blake: cleanup, remove d_hash buf, not in a chain host: only bencode if gpu hash was found --- Makefile.am | 3 ++ blake32.cu | 121 +++++++++++++++------------------------------------- 2 files changed, 38 insertions(+), 86 deletions(-) diff --git a/Makefile.am b/Makefile.am index c73d9d2..520dff0 100644 --- a/Makefile.am +++ b/Makefile.am @@ -60,6 +60,9 @@ nvcc_FLAGS += $(JANSSON_INCLUDES) .cu.o: $(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=128 -o $@ -c $< +blake32.o: blake32.cu + $(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=64 -o $@ -c $< + # Luffa and Echo are faster with 80 registers than 128 x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu $(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=80 -o $@ -c $< diff --git a/blake32.cu b/blake32.cu index 814be2d..9755f93 100644 --- a/blake32.cu +++ b/blake32.cu @@ 
-35,11 +35,8 @@ extern int device_map[8]; extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); -// shared for 8 threads of addresses (cudaMalloc) -uint32_t* d_hash[8]; - __constant__ -static uint32_t pTarget[8]; +static uint32_t c_Target[8]; __constant__ static uint32_t __align__(32) c_PaddedMessage80[32]; // padded message (80 bytes + padding) @@ -181,7 +178,7 @@ extern __device__ __device_builtin__ void __nvvm_memset(uint8_t *, unsigned char #endif __global__ -void blake256_gpu_hash_80(int threads, uint32_t startNounce, uint32_t* outputHash) +void blake256_gpu_hash_80(int threads, uint32_t startNounce, uint32_t *resNounce) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < (uint32_t) threads) @@ -198,9 +195,12 @@ void blake256_gpu_hash_80(int threads, uint32_t startNounce, uint32_t* outputHas // ------ Close: Bytes 64 to 80 ------ -#if 0 /* __CUDA_ARCH__ >= 200 */ - __nvvm_memset((uint8_t*)(&msg[4]), 0, sizeof(msg)-16, 8); -#else + msg[0] = c_PaddedMessage80[16]; + msg[1] = c_PaddedMessage80[17]; + msg[2] = c_PaddedMessage80[18]; + msg[3] = nounce; /* our tested value */ + msg[4] = 0x80000000UL; //cuda_swab32(0x80U); + msg[5] = 0; // uchar[17 to 55] msg[6] = 0; msg[7] = 0; @@ -210,144 +210,93 @@ void blake256_gpu_hash_80(int threads, uint32_t startNounce, uint32_t* outputHas msg[11] = 0; msg[12] = 0; - msg[14] = 0; -#endif - msg[0] = c_PaddedMessage80[16]; - msg[1] = c_PaddedMessage80[17]; - msg[2] = c_PaddedMessage80[18]; - msg[3] = nounce; /* our tested value */ - msg[4] = 0x80000000UL; //cuda_swab32(0x80U); - msg[13] = 1; - msg[15] = 0x280; // 60-63 - - blake256_compress(h, msg, c_sigma, c_u256, 0x280); // or 0x80 - - uint32_t *outHash = &outputHash[thread<<3]; - //#pragma unroll 8 - for (int i=0; i < 8; i++) { - outHash[i] = cuda_swab32(h[i]); - } - } -} - -__host__ -void blake256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order) -{ - const int 
threadsperblock = TPB; - - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - size_t shared_size = 0; + msg[14] = 0; + msg[15] = 0x280; - blake256_gpu_hash_80<<>>(threads, startNounce, d_outputHash); + blake256_compress(h, msg, c_sigma, c_u256, 0x280); - MyStreamSynchronize(NULL, order, thr_id); -} - -__global__ -void gpu_check_hash_64(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *resNounce) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t* pHash = &g_hash[thread<<3]; for (int i = 7; i >= 0; i--) { - uint32_t hash = pHash[i]; - if (hash > pTarget[i]) { + uint32_t hash = cuda_swab32(h[i]); + if (hash > c_Target[i]) { return; } - if (hash < pTarget[i]) { + if (hash < c_Target[i]) { break; } } - uint32_t nounce = startNounce + thread; + /* keep the smallest nounce, hmm... */ if(resNounce[0] > nounce) resNounce[0] = nounce; } } __host__ -uint32_t cpu_check_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, int order) +uint32_t blake256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce) { const int threadsperblock = TPB; - uint32_t result = 0xffffffff; - - cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t)); dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - size_t shared_size = 0; - gpu_check_hash_64 <<>>(threads, startNounce, d_inputHash, d_resNounce[thr_id]); - MyStreamSynchronize(NULL, order, thr_id); + uint32_t result = 0xffffffffU; + cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t)); + + blake256_gpu_hash_80<<>>(threads, startNounce, d_resNounce[thr_id]); + MyStreamSynchronize(NULL, 1, thr_id); if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost)) { cudaThreadSynchronize(); result = *h_resNounce[thr_id]; } - return result; } -__host__ -void blake256_cpu_init(int thr_id) -{ - 
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sigma, host_sigma, sizeof(host_sigma), 0, cudaMemcpyHostToDevice)); - CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], sizeof(uint32_t))); - CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], sizeof(uint32_t))); -} - __host__ void blake256_cpu_setBlock_80(uint32_t *pdata, const void *ptarget) { uint32_t PaddedMessage[32]; memcpy(PaddedMessage, pdata, 80); memset(&PaddedMessage[20], 0, 48); - - CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice)); CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, sizeof(PaddedMessage), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sigma, host_sigma, sizeof(host_sigma), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Target, ptarget, 32, 0, cudaMemcpyHostToDevice)); } extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) { const uint32_t first_nonce = pdata[19]; - const int throughput = TPB * 2048; - static bool init[8] = {0,0,0,0,0,0,0,0}; - uint32_t endiandata[20]; - uint32_t Htarg = ptarget[7]; + const int throughput = TPB * 2048; /* 2048 threads is the max on a 750Ti */ + static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; int rc = 0; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = Htarg = 0x00000f; + ((uint32_t*)ptarget)[7] = 0x00000f; if (!init[thr_id]) { CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 32 * throughput)); /* 32 bytes x 256K Threads (to be removed soon) */ - blake256_cpu_init(thr_id); + CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], sizeof(uint32_t))); init[thr_id] = true; } blake256_cpu_setBlock_80(pdata, (void*)ptarget); - for (int k=0; k < 20; k++) - be32enc(&endiandata[k], pdata[k]); - do { - int order = 0; - uint32_t foundNonce; - // GPU HASH - blake256_cpu_hash_80(thr_id, throughput, 
pdata[19], d_hash[thr_id], order++); - - foundNonce = cpu_check_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19]); if (foundNonce != 0xffffffff) { + uint32_t endiandata[20]; uint32_t vhashcpu[8]; + uint32_t Htarg = ptarget[7]; + + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + be32enc(&endiandata[19], foundNonce); blake32hash(vhashcpu, endiandata); From 43d3e93e1a97e569ead2437f759c6b8423d30c0a Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Wed, 3 Sep 2014 09:29:51 +0200 Subject: [PATCH 12/44] blake: set a max throughput --- blake32.cu | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/blake32.cu b/blake32.cu index 9755f93..68123f8 100644 --- a/blake32.cu +++ b/blake32.cu @@ -123,6 +123,8 @@ static const uint32_t c_u256[16] = { v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \ } +#define BLAKE256_ROUNDS 14 + __device__ static void blake256_compress(uint32_t *h, uint32_t *block, uint8_t ((*sigma)[16]), const uint32_t *u256, const uint32_t T0, uint8_t nullt = 1) { @@ -134,7 +136,7 @@ void blake256_compress(uint32_t *h, uint32_t *block, uint8_t ((*sigma)[16]), con m[i] = block[i]; } - #pragma unroll 8 + //#pragma unroll 8 for(int i = 0; i < 8; i++) v[i] = h[i]; @@ -149,7 +151,7 @@ void blake256_compress(uint32_t *h, uint32_t *block, uint8_t ((*sigma)[16]), con v[15] = u256[7]; //#pragma unroll - for (int i = 0; i < 14; i++) { + for (int i = 0; i < BLAKE256_ROUNDS; i++) { /* column step */ GS(0, 4, 0x8, 0xC, 0); GS(1, 5, 0x9, 0xD, 2); @@ -178,10 +180,10 @@ extern __device__ __device_builtin__ void __nvvm_memset(uint8_t *, unsigned char #endif __global__ -void blake256_gpu_hash_80(int threads, uint32_t startNounce, uint32_t *resNounce) +void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < (uint32_t) threads) + if (thread < threads) 
{ const uint32_t nounce = startNounce + thread; uint32_t /* __align__(8) */ msg[16]; @@ -233,7 +235,7 @@ void blake256_gpu_hash_80(int threads, uint32_t startNounce, uint32_t *resNounce } __host__ -uint32_t blake256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce) +uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce) { const int threadsperblock = TPB; @@ -269,8 +271,8 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta uint32_t max_nonce, unsigned long *hashes_done) { const uint32_t first_nonce = pdata[19]; - const int throughput = TPB * 2048; /* 2048 threads is the max on a 750Ti */ static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + uint32_t throughput = min(TPB * 2048, max_nonce - first_nonce); int rc = 0; if (opt_benchmark) @@ -294,6 +296,8 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta uint32_t vhashcpu[8]; uint32_t Htarg = ptarget[7]; + applog(LOG_WARNING, "throughput=%u, start=%x, max=%x", throughput, first_nonce, max_nonce); + for (int k=0; k < 20; k++) be32enc(&endiandata[k], pdata[k]); From 049e57730116685755bd3ff214f0793cce7c773b Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Wed, 3 Sep 2014 09:49:14 +0200 Subject: [PATCH 13/44] tmp blake log --- blake32.cu | 15 +++++++++++++-- cpu-miner.c | 2 +- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/blake32.cu b/blake32.cu index 68123f8..a86287e 100644 --- a/blake32.cu +++ b/blake32.cu @@ -285,6 +285,14 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta init[thr_id] = true; } + if (throughput < (TPB * 2048)) + applog(LOG_WARNING, "throughput=%u, start=%x, max=%x", throughput, first_nonce, max_nonce); + + if (max_nonce < first_nonce) { + applog(LOG_ERR, "start=%x > end=%x !", first_nonce, max_nonce); + return 0; + } + blake256_cpu_setBlock_80(pdata, (void*)ptarget); do { @@ -312,10 +320,13 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, 
const uint32_t *pta goto exit_scan; } else if (vhashcpu[7] > Htarg) { - applog(LOG_WARNING, "GPU #%d: result for %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[7], Htarg); + applog(LOG_WARNING, "GPU #%d: result for nounce %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[7], Htarg); + } + else if (vhashcpu[6] > ptarget[6]) { + applog(LOG_WARNING, "GPU #%d: hash[6] for nounce %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[6], ptarget[6]); } else { - applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce); + applog(LOG_WARNING, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce); } } diff --git a/cpu-miner.c b/cpu-miner.c index 513d4f8..a16c3b7 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -842,7 +842,7 @@ static void *miner_thread(void *userdata) int thr_id = mythr->id; struct work work; uint32_t max_nonce; - uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20; + uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 2; unsigned char *scratchbuf = NULL; char s[16]; int i; From 1a4391d7ff21397a128abf031f92733a8ac47437 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Tue, 2 Sep 2014 12:40:52 +0200 Subject: [PATCH 14/44] hashlog: prevent double computing on jobs already done --- blake32.cu | 2 +- cpu-miner.c | 41 +++++++++++++++++++++++-------- hashlog.cpp | 71 ++++++++++++++++++++++++++++++++++++----------------- miner.h | 4 +++ util.c | 26 ++++++++++++++++++++ 5 files changed, 111 insertions(+), 33 deletions(-) diff --git a/blake32.cu b/blake32.cu index a86287e..b50a3ca 100644 --- a/blake32.cu +++ b/blake32.cu @@ -304,7 +304,7 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta uint32_t vhashcpu[8]; uint32_t Htarg = ptarget[7]; - applog(LOG_WARNING, "throughput=%u, start=%x, max=%x", throughput, first_nonce, max_nonce); + applog(LOG_WARNING, "throughput=%u, start=%x, max=%x, pdata=%x", throughput, 
first_nonce, max_nonce, pdata[0]); for (int k=0; k < 20; k++) be32enc(&endiandata[k], pdata[k]); diff --git a/cpu-miner.c b/cpu-miner.c index a16c3b7..6da1465 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -822,9 +822,11 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) pthread_mutex_unlock(&sctx->work_lock); if (opt_debug) { + char *tm = atime2str(swab32(work->data[17])); char *xnonce2str = bin2hex(work->xnonce2, sctx->xnonce2_size); - applog(LOG_DEBUG, "DEBUG: job_id='%s' extranonce2=%s ntime=%08x", - work->job_id, xnonce2str, swab32(work->data[17])); + applog(LOG_DEBUG, "DEBUG: job_id=%s xnonce2=%s time=%s", + work->job_id, xnonce2str, tm); + free(tm); free(xnonce2str); } @@ -842,10 +844,9 @@ static void *miner_thread(void *userdata) int thr_id = mythr->id; struct work work; uint32_t max_nonce; - uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 2; + uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - (thr_id + 1); unsigned char *scratchbuf = NULL; char s[16]; - int i; memset(&work, 0, sizeof(work)); // prevent work from being used uninitialized @@ -870,6 +871,7 @@ static void *miner_thread(void *userdata) unsigned long hashes_done; struct timeval tv_start, tv_end, diff; int64_t max64; + uint64_t umax64; int rc; // &work.data[19] @@ -877,13 +879,17 @@ static void *miner_thread(void *userdata) uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen); if (have_stratum) { - while (time(NULL) >= g_work_time + 120) - sleep(1); + while (time(NULL) >= g_work_time + opt_scantime) + usleep(500*1000); pthread_mutex_lock(&g_work_lock); + nonceptr = (uint32_t*) (((char*)work.data) + wcmplen); if ((*nonceptr) >= end_nonce) stratum_gen_work(&stratum, &g_work); } else { - int min_scantime = have_longpoll ? (LP_SCANTIME*3)/4 : opt_scantime; + int min_scantime = have_longpoll ? 
LP_SCANTIME : opt_scantime; + if (!opt_quiet) + applog(LOG_DEBUG, "have_longpoll=%d, have_stratum=%d, min_scantime=%d, g_work_time=%d", + have_longpoll, have_stratum, min_scantime, g_work_time); /* obtain new work from internal workio thread */ pthread_mutex_lock(&g_work_lock); if (!have_stratum && @@ -904,7 +910,7 @@ static void *miner_thread(void *userdata) } if (memcmp(work.data, g_work.data, wcmplen)) { memcpy(&work, &g_work, sizeof(struct work)); - (*nonceptr) = 0xffffffffU / opt_n_threads * thr_id; + (*nonceptr) = 0xffffffffU / opt_n_threads * thr_id; // 0 if single thr } else (*nonceptr)++; pthread_mutex_unlock(&g_work_lock); @@ -932,10 +938,24 @@ static void *miner_thread(void *userdata) break; } } - if ((int64_t)(*nonceptr) + max64 > end_nonce) + + umax64 = (uint64_t) max64; + if (end_nonce < (umax64 + (*nonceptr))) max_nonce = end_nonce; else - max_nonce = (uint32_t)((*nonceptr) + max64); + max_nonce = umax64 + (*nonceptr); + + /* do not recompute something already scanned (and sent) ! */ + if (hashlog_already_submittted(work.job_id, 0)) { + uint32_t lastnonce = hashlog_get_last_sent(work.job_id); + if ((*nonceptr) < lastnonce && lastnonce <= max_nonce) { + applog(LOG_WARNING, "rescan of sent job? nonce=%x, last was %x", (*nonceptr), lastnonce); + max_nonce = lastnonce - 1; + } else if ((*nonceptr) == lastnonce) { + applog(LOG_WARNING, "rescan of sent job? 
start nonce = lastnonce"); + (*nonceptr) = lastnonce + 1; + } + } hashes_done = 0; gettimeofday(&tv_start, NULL); @@ -1051,6 +1071,7 @@ static void *miner_thread(void *userdata) } if (opt_benchmark && thr_id == opt_n_threads - 1) { double hashrate = 0.; + int i; for (i = 0; i < opt_n_threads && thr_hashrates[i]; i++) hashrate += thr_hashrates[i]; if (i == opt_n_threads) { diff --git a/hashlog.cpp b/hashlog.cpp index ded566e..3948751 100644 --- a/hashlog.cpp +++ b/hashlog.cpp @@ -1,39 +1,68 @@ -#include +//#include #include #include #include "miner.h" +#define HI_DWORD(u64) ((uint32_t) (u64 >> 32)) +#define LO_DWORD(u64) ((uint32_t) u64) + static std::map tlastshares; -/** - * Purge entries after 15 minutes - */ #define LOG_PURGE_TIMEOUT 15*60 /** - * Store submitted nounces of a job + * str hex to uint32 */ -extern "C" void hashlog_remember_submit(char* jobid, uint32_t nounce) +static uint64_t hextouint(char* jobid) { char *ptr; - uint64_t njobid = (uint64_t) strtoul(jobid, &ptr, 16); - uint64_t key = (njobid << 32) + nounce; + return strtoull(jobid, &ptr, 16); +} + +/** + * Store submitted nonces of a job + */ +extern "C" void hashlog_remember_submit(char* jobid, uint32_t nonce) +{ + uint64_t njobid = hextouint(jobid); + uint64_t key = (njobid << 32) + nonce; tlastshares[key] = (uint32_t) time(NULL); } /** - * @return time of submission + * Search last submitted nonce for a job + * @return max nonce */ -extern "C" uint32_t hashlog_already_submittted(char* jobid, uint32_t nounce) +extern "C" uint32_t hashlog_get_last_sent(char* jobid) { - char *ptr; uint32_t ret = 0; - uint64_t njobid = (uint64_t) strtoul(jobid, &ptr, 16); - uint64_t key = (njobid << 32) + nounce; - std::map::iterator i = tlastshares.find(key); - if (i != tlastshares.end()) + uint64_t njobid = hextouint(jobid); + uint64_t keypfx = (njobid << 32); + std::map::iterator i = tlastshares.begin(); + while (i != tlastshares.end()) { + if ((keypfx & i->first) == keypfx && LO_DWORD(i->first) > ret) { + ret 
= LO_DWORD(i->first); + } + i++; + } + return ret; +} + +/** + * @return time of a job/nonce submission (or last nonce if nonce is 0) + */ +extern "C" uint32_t hashlog_already_submittted(char* jobid, uint32_t nonce) +{ + uint32_t ret = 0; + uint64_t njobid = hextouint(jobid); + uint64_t key = (njobid << 32) + nonce; + if (nonce == 0) { + // search last submitted nonce for job + ret = hashlog_get_last_sent(jobid); + } else if (tlastshares.find(key) != tlastshares.end()) { ret = (uint32_t) tlastshares[key]; + } return ret; } @@ -42,12 +71,11 @@ extern "C" uint32_t hashlog_already_submittted(char* jobid, uint32_t nounce) */ extern "C" void hashlog_purge_job(char* jobid) { - char *ptr; - uint64_t njobid = strtoul(jobid, &ptr, 16); + uint64_t njobid = hextouint(jobid); uint64_t keypfx = (njobid << 32); std::map::iterator i = tlastshares.begin(); while (i != tlastshares.end()) { - if ((keypfx & i->first) != 0) + if ((keypfx & i->first) == keypfx) tlastshares.erase(i); i++; } @@ -60,6 +88,7 @@ extern "C" void hashlog_purge_old(void) { int deleted = 0; uint32_t now = (uint32_t) time(NULL); + uint32_t sz = tlastshares.size(); std::map::iterator i = tlastshares.begin(); while (i != tlastshares.end()) { if ((now - i->second) > LOG_PURGE_TIMEOUT) { @@ -69,16 +98,14 @@ extern "C" void hashlog_purge_old(void) i++; } if (opt_debug && deleted) { - applog(LOG_DEBUG, "hashlog: %d/%d purged", - deleted, tlastshares.size()); + applog(LOG_DEBUG, "hashlog: %d/%d purged", deleted, sz); } } /** - * Reset the submitted nounce cache + * Reset the submitted nonces cache */ extern "C" void hashlog_purge_all(void) { tlastshares.clear(); } - diff --git a/miner.h b/miner.h index b986197..3100371 100644 --- a/miner.h +++ b/miner.h @@ -390,6 +390,7 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s); void hashlog_remember_submit(char* jobid, uint32_t nounce); uint32_t hashlog_already_submittted(char* jobid, uint32_t nounce); +uint32_t hashlog_get_last_sent(char* jobid); void 
hashlog_purge_old(void); void hashlog_purge_job(char* jobid); void hashlog_purge_all(void); @@ -405,6 +406,9 @@ extern void tq_thaw(struct thread_q *tq); void proper_exit(int reason); +size_t time2str(char* buf, time_t timer); +char* atime2str(time_t timer); + void applog_hash(unsigned char *hash); void print_hash_tests(void); diff --git a/util.c b/util.c index a9e0ae2..275abf7 100644 --- a/util.c +++ b/util.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #ifdef WIN32 #include "compat/winansi.h" @@ -1350,6 +1351,31 @@ out: return rval; } +/** + * @param buf char[9] mini + * @param time_t timer to convert + */ +size_t time2str(char* buf, time_t timer) +{ + struct tm* tm_info; + tm_info = localtime(&timer); + return strftime(buf, 19, "%H:%M:%S", tm_info); +} + +/** + * Alloc and returns time string (to be freed) + * @param time_t timer to convert + */ +char* atime2str(time_t timer) +{ + struct tm* tm_info; + char* buf = malloc(16); + memset(buf, 0, 16); + tm_info = localtime(&timer); + strftime(buf, 19, "%H:%M:%S", tm_info); + return buf; +} + /* sprintf can be used in applog */ static char* format_hash(char* buf, unsigned char *hash) { From b1f5df374db13c597cd90fd3f8f4802f6b7b5f61 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Wed, 3 Sep 2014 12:54:13 +0200 Subject: [PATCH 15/44] stratum: store server time offset in context --- cpu-miner.c | 9 ++++++--- miner.h | 2 ++ util.c | 28 ++++++++++++++++++---------- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/cpu-miner.c b/cpu-miner.c index 6da1465..20bac21 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -179,7 +179,7 @@ static bool submit_old = false; bool use_syslog = false; bool use_colors = false; static bool opt_background = false; -static bool opt_quiet = false; +bool opt_quiet = false; static int opt_retries = -1; static int opt_fail_pause = 30; int opt_timeout = 270; @@ -789,7 +789,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) for (i = 0; i < 
(int)sctx->xnonce2_size && !++sctx->job.xnonce2[i]; i++); /* Assemble block header */ - memset(work->data, 0, 128); + memset(work->data, 0, sizeof(work->data)); work->data[0] = le32dec(sctx->job.version); for (i = 0; i < 8; i++) work->data[1 + i] = le32dec((uint32_t *)sctx->job.prevhash + i); @@ -822,7 +822,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) pthread_mutex_unlock(&sctx->work_lock); if (opt_debug) { - char *tm = atime2str(swab32(work->data[17])); + char *tm = atime2str(swab32(work->data[17]) - sctx->srvtime_diff); char *xnonce2str = bin2hex(work->xnonce2, sctx->xnonce2_size); applog(LOG_DEBUG, "DEBUG: job_id=%s xnonce2=%s time=%s", work->job_id, xnonce2str, tm); @@ -1690,6 +1690,9 @@ int main(int argc, char *argv[]) sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass); } + /* init stratum data.. */ + memset(&stratum.url, 0, sizeof(stratum)); + pthread_mutex_init(&stats_lock, NULL); pthread_mutex_init(&g_work_lock, NULL); pthread_mutex_init(&stratum.sock_lock, NULL); diff --git a/miner.h b/miner.h index 3100371..9101c61 100644 --- a/miner.h +++ b/miner.h @@ -377,6 +377,8 @@ struct stratum_ctx { size_t xnonce2_size; struct stratum_job job; pthread_mutex_t work_lock; + + int srvtime_diff; }; bool stratum_socket_full(struct stratum_ctx *sctx, int timeout); diff --git a/util.c b/util.c index 275abf7..73a1847 100644 --- a/util.c +++ b/util.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #ifdef WIN32 #include "compat/winansi.h" @@ -1012,12 +1011,13 @@ out: static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) { - const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *ntime, *nreward; + const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime, *nreward; size_t coinb1_size, coinb2_size; bool clean, ret = false; int merkle_count, i; json_t *merkle_arr; unsigned char **merkle; + int ntime; job_id = json_string_value(json_array_get(params, 0)); prevhash = 
json_string_value(json_array_get(params, 1)); @@ -1029,16 +1029,26 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) merkle_count = json_array_size(merkle_arr); version = json_string_value(json_array_get(params, 5)); nbits = json_string_value(json_array_get(params, 6)); - ntime = json_string_value(json_array_get(params, 7)); + stime = json_string_value(json_array_get(params, 7)); clean = json_is_true(json_array_get(params, 8)); nreward = json_string_value(json_array_get(params, 9)); - if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !ntime || + if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !stime || strlen(prevhash) != 64 || strlen(version) != 8 || - strlen(nbits) != 8 || strlen(ntime) != 8) { + strlen(nbits) != 8 || strlen(stime) != 8) { applog(LOG_ERR, "Stratum notify: invalid parameters"); goto out; } + + /* store stratum server time diff */ + hex2bin((unsigned char *)&ntime, stime, 4); + ntime = swab32(ntime) - time(0); + if (ntime > sctx->srvtime_diff) { + sctx->srvtime_diff = ntime; + if (!opt_quiet) + applog(LOG_DEBUG, "stratum time is at least %ds in the future", ntime); + } + merkle = (unsigned char**)malloc(merkle_count * sizeof(char *)); for (i = 0; i < merkle_count; i++) { const char *s = json_string_value(json_array_get(merkle_arr, i)); @@ -1079,7 +1089,7 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) hex2bin(sctx->job.version, version, 4); hex2bin(sctx->job.nbits, nbits, 4); - hex2bin(sctx->job.ntime, ntime, 4); + hex2bin(sctx->job.ntime, stime, 4); if(nreward != NULL) { if(strlen(nreward) == 4) @@ -1368,11 +1378,9 @@ size_t time2str(char* buf, time_t timer) */ char* atime2str(time_t timer) { - struct tm* tm_info; - char* buf = malloc(16); + char* buf = (char*) malloc(16); memset(buf, 0, 16); - tm_info = localtime(&timer); - strftime(buf, 19, "%H:%M:%S", tm_info); + time2str(buf, timer); return buf; } From 69616b37ac447ec18d9592f43489196e4c702746 Mon Sep 17 
00:00:00 2001 From: Tanguy Pruvot Date: Wed, 3 Sep 2014 13:53:01 +0200 Subject: [PATCH 16/44] hashlog: prepare store of scanned range --- blake32.cu | 8 ++++++-- ccminer.vcxproj | 3 ++- cpu-miner.c | 4 +++- hashlog.cpp | 27 +++++++++++++++++++-------- miner.h | 3 ++- 5 files changed, 32 insertions(+), 13 deletions(-) diff --git a/blake32.cu b/blake32.cu index b50a3ca..2b63ccf 100644 --- a/blake32.cu +++ b/blake32.cu @@ -304,11 +304,15 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta uint32_t vhashcpu[8]; uint32_t Htarg = ptarget[7]; - applog(LOG_WARNING, "throughput=%u, start=%x, max=%x, pdata=%x", throughput, first_nonce, max_nonce, pdata[0]); - for (int k=0; k < 20; k++) be32enc(&endiandata[k], pdata[k]); + if (opt_debug && !opt_quiet) { + applog(LOG_DEBUG, "throughput=%u, start=%x, max=%x, pdata=%08x...%08x", + throughput, first_nonce, max_nonce, endiandata[0], endiandata[7]); + applog_hash((unsigned char *)pdata); + } + be32enc(&endiandata[19], foundNonce); blake32hash(vhashcpu, endiandata); diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 509715b..7590d94 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -399,6 +399,7 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)" 64 + 64 --ptxas-options=-O2 %(AdditionalOptions) %(AdditionalOptions) @@ -561,4 +562,4 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)" - \ No newline at end of file + diff --git a/cpu-miner.c b/cpu-miner.c index 20bac21..e239081 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -494,7 +494,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work) goto out; } - hashlog_remember_submit(work->job_id, nonce); + hashlog_remember_submit(work->job_id, nonce, 0); } else { @@ -834,6 +834,8 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty)); else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_DMD_GR || opt_algo == 
ALGO_FRESH) diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty)); + else if (opt_algo == ALGO_BLAKE) + diff_to_target(work->target, sctx->job.diff / (16.0 * opt_difficulty)); else diff_to_target(work->target, sctx->job.diff / opt_difficulty); } diff --git a/hashlog.cpp b/hashlog.cpp index 3948751..0b8b574 100644 --- a/hashlog.cpp +++ b/hashlog.cpp @@ -7,7 +7,13 @@ #define HI_DWORD(u64) ((uint32_t) (u64 >> 32)) #define LO_DWORD(u64) ((uint32_t) u64) -static std::map tlastshares; +struct hashlog_data { + uint32_t ntime; + uint32_t scanned_from; + uint32_t scanned_to; +}; + +static std::map tlastshares; #define LOG_PURGE_TIMEOUT 15*60 @@ -23,11 +29,15 @@ static uint64_t hextouint(char* jobid) /** * Store submitted nonces of a job */ -extern "C" void hashlog_remember_submit(char* jobid, uint32_t nonce) +extern "C" void hashlog_remember_submit(char* jobid, uint32_t nonce, uint64_t range) { uint64_t njobid = hextouint(jobid); uint64_t key = (njobid << 32) + nonce; - tlastshares[key] = (uint32_t) time(NULL); + struct hashlog_data data; + data.ntime = (uint32_t) time(NULL); + data.scanned_from = LO_DWORD(range); + data.scanned_to = HI_DWORD(range); + tlastshares[key] = data; } /** @@ -39,7 +49,7 @@ extern "C" uint32_t hashlog_get_last_sent(char* jobid) uint32_t ret = 0; uint64_t njobid = hextouint(jobid); uint64_t keypfx = (njobid << 32); - std::map::iterator i = tlastshares.begin(); + std::map::iterator i = tlastshares.begin(); while (i != tlastshares.end()) { if ((keypfx & i->first) == keypfx && LO_DWORD(i->first) > ret) { ret = LO_DWORD(i->first); @@ -61,7 +71,8 @@ extern "C" uint32_t hashlog_already_submittted(char* jobid, uint32_t nonce) // search last submitted nonce for job ret = hashlog_get_last_sent(jobid); } else if (tlastshares.find(key) != tlastshares.end()) { - ret = (uint32_t) tlastshares[key]; + hashlog_data data = tlastshares[key]; + ret = data.ntime; } return ret; } @@ -73,7 +84,7 @@ extern "C" void hashlog_purge_job(char* jobid) { 
uint64_t njobid = hextouint(jobid); uint64_t keypfx = (njobid << 32); - std::map::iterator i = tlastshares.begin(); + std::map::iterator i = tlastshares.begin(); while (i != tlastshares.end()) { if ((keypfx & i->first) == keypfx) tlastshares.erase(i); @@ -89,9 +100,9 @@ extern "C" void hashlog_purge_old(void) int deleted = 0; uint32_t now = (uint32_t) time(NULL); uint32_t sz = tlastshares.size(); - std::map::iterator i = tlastshares.begin(); + std::map::iterator i = tlastshares.begin(); while (i != tlastshares.end()) { - if ((now - i->second) > LOG_PURGE_TIMEOUT) { + if ((now - i->second.ntime) > LOG_PURGE_TIMEOUT) { deleted++; tlastshares.erase(i); } diff --git a/miner.h b/miner.h index 9101c61..79e3a15 100644 --- a/miner.h +++ b/miner.h @@ -286,6 +286,7 @@ struct work_restart { extern bool opt_debug; extern bool opt_debug_rpc; +extern bool opt_quiet; extern bool opt_protocol; extern int opt_timeout; extern bool want_longpoll; @@ -390,7 +391,7 @@ bool stratum_subscribe(struct stratum_ctx *sctx); bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass); bool stratum_handle_method(struct stratum_ctx *sctx, const char *s); -void hashlog_remember_submit(char* jobid, uint32_t nounce); +void hashlog_remember_submit(char* jobid, uint32_t nounce, uint64_t range); uint32_t hashlog_already_submittted(char* jobid, uint32_t nounce); uint32_t hashlog_get_last_sent(char* jobid); void hashlog_purge_old(void); From 124ddee2fe804fdac6e67c60965e423a95e8a57a Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Wed, 3 Sep 2014 14:56:51 +0200 Subject: [PATCH 17/44] blake: fix of bad difficulty --- cpu-miner.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpu-miner.c b/cpu-miner.c index e239081..adf3f4d 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -835,7 +835,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_DMD_GR || 
opt_algo == ALGO_FRESH) diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty)); else if (opt_algo == ALGO_BLAKE) - diff_to_target(work->target, sctx->job.diff / (16.0 * opt_difficulty)); + diff_to_target(work->target, sctx->job.diff / (2.0 * opt_difficulty)); else diff_to_target(work->target, sctx->job.diff / opt_difficulty); } @@ -945,7 +945,7 @@ static void *miner_thread(void *userdata) if (end_nonce < (umax64 + (*nonceptr))) max_nonce = end_nonce; else - max_nonce = umax64 + (*nonceptr); + max_nonce = (uint32_t) umax64 + (*nonceptr); /* do not recompute something already scanned (and sent) ! */ if (hashlog_already_submittted(work.job_id, 0)) { From a270adc4b6a88b93aaadbb7f9b924f8f0fbca2b0 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Wed, 3 Sep 2014 21:05:15 +0200 Subject: [PATCH 18/44] to test on windows --- cpu-miner.c | 122 +++++++++++++++++++++++++++++++++++++--------------- hashlog.cpp | 96 +++++++++++++++++++++++++++++++---------- miner.h | 4 +- util.c | 2 +- 4 files changed, 164 insertions(+), 60 deletions(-) diff --git a/cpu-miner.c b/cpu-miner.c index adf3f4d..cb4d365 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -343,6 +343,9 @@ struct work { char job_id[128]; size_t xnonce2_len; unsigned char xnonce2[32]; + + uint32_t scanned_from; + uint32_t scanned_to; }; static struct work g_work; @@ -494,7 +497,8 @@ static bool submit_upstream_work(CURL *curl, struct work *work) goto out; } - hashlog_remember_submit(work->job_id, nonce, 0); + hashlog_remember_submit(work->job_id, nonce); + hashlog_remember_scan_range(work->job_id, work->scanned_from, work->scanned_to); } else { @@ -834,8 +838,6 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty)); else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_DMD_GR || opt_algo == ALGO_FRESH) diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty)); - else if 
(opt_algo == ALGO_BLAKE) - diff_to_target(work->target, sctx->job.diff / (2.0 * opt_difficulty)); else diff_to_target(work->target, sctx->job.diff / opt_difficulty); } @@ -848,6 +850,7 @@ static void *miner_thread(void *userdata) uint32_t max_nonce; uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - (thr_id + 1); unsigned char *scratchbuf = NULL; + bool work_done = false; char s[16]; memset(&work, 0, sizeof(work)); // prevent work from being used uninitialized @@ -871,6 +874,7 @@ static void *miner_thread(void *userdata) while (1) { unsigned long hashes_done; + uint32_t start_nonce; struct timeval tv_start, tv_end, diff; int64_t max64; uint64_t umax64; @@ -880,41 +884,51 @@ static void *miner_thread(void *userdata) int wcmplen = 76; uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen); + applog(LOG_WARNING, "job %s %08x", g_work.job_id, (*nonceptr)); + if (have_stratum) { - while (time(NULL) >= g_work_time + opt_scantime) + while (time(NULL) >= (g_work_time + opt_scantime) && !work_done) usleep(500*1000); + work_done = false; pthread_mutex_lock(&g_work_lock); nonceptr = (uint32_t*) (((char*)work.data) + wcmplen); if ((*nonceptr) >= end_nonce) stratum_gen_work(&stratum, &g_work); } else { int min_scantime = have_longpoll ? LP_SCANTIME : opt_scantime; - if (!opt_quiet) - applog(LOG_DEBUG, "have_longpoll=%d, have_stratum=%d, min_scantime=%d, g_work_time=%d", - have_longpoll, have_stratum, min_scantime, g_work_time); /* obtain new work from internal workio thread */ pthread_mutex_lock(&g_work_lock); - if (!have_stratum && - (time(NULL) - g_work_time >= min_scantime || - (*nonceptr) >= end_nonce)) { + if (time(NULL) - g_work_time >= min_scantime || + (*nonceptr) >= end_nonce) { if (unlikely(!get_work(mythr, &g_work))) { applog(LOG_ERR, "work retrieval failed, exiting " "mining thread %d", mythr->id); pthread_mutex_unlock(&g_work_lock); goto out; } - g_work_time = have_stratum ? 
0 : time(NULL); - } - if (have_stratum) { - pthread_mutex_unlock(&g_work_lock); - continue; + g_work_time = time(NULL); } } if (memcmp(work.data, g_work.data, wcmplen)) { + /* + applog(LOG_NOTICE, "job %s %08x work change", g_work.job_id, (*nonceptr)); + for (int n=0; n 0 && range.scanned[1] > 0) { + /* continue scan the end */ + start_nonce = range.scanned[1] + 1; + applog(LOG_WARNING, "scan the next part %x + 1", range.scanned[1]); + } else if (range.scanned[0] > 1) { + /* dont scan the beginning... make loops */ + //end_nonce = range.scanned[0] - 1; + //applog(LOG_WARNING, "scan the missing part 0 -> %x", end_nonce); + } + if (start_nonce == work.scanned_from) { + /* to prevent stales, if last was in the same range */ + applog(LOG_ERR, "detected a staled job!"); + //(*nonceptr) = end_nonce + 1; + //work_done = true; + //continue; + start_nonce = range.scanned[1] + 1; + } + } + } + umax64 = (uint64_t) max64; - if (end_nonce < (umax64 + (*nonceptr))) + if ((umax64 + start_nonce) >= end_nonce) max_nonce = end_nonce; else - max_nonce = (uint32_t) umax64 + (*nonceptr); - - /* do not recompute something already scanned (and sent) ! */ - if (hashlog_already_submittted(work.job_id, 0)) { - uint32_t lastnonce = hashlog_get_last_sent(work.job_id); - if ((*nonceptr) < lastnonce && lastnonce <= max_nonce) { - applog(LOG_WARNING, "rescan of sent job? nonce=%x, last was %x", (*nonceptr), lastnonce); - max_nonce = lastnonce - 1; - } else if ((*nonceptr) == lastnonce) { - applog(LOG_WARNING, "rescan of sent job? 
start nonce = lastnonce"); - (*nonceptr) = lastnonce + 1; - } - } + max_nonce = (uint32_t) umax64 + start_nonce; + + work.scanned_from = start_nonce; + (*nonceptr) = start_nonce; hashes_done = 0; gettimeofday(&tv_start, NULL); @@ -1058,6 +1097,10 @@ static void *miner_thread(void *userdata) /* record scanhash elapsed time */ gettimeofday(&tv_end, NULL); + + if (rc && opt_debug) + applog(LOG_NOTICE, CL_CYN "found => %08x" CL_GRN " %08x", *nonceptr, swab32(*nonceptr)); + timeval_subtract(&diff, &tv_end, &tv_start); if (diff.tv_usec || diff.tv_sec) { pthread_mutex_lock(&stats_lock); @@ -1068,7 +1111,7 @@ static void *miner_thread(void *userdata) if (!opt_quiet) { sprintf(s, thr_hashrates[thr_id] >= 1e6 ? "%.0f" : "%.2f", 1e-3 * thr_hashrates[thr_id]); - applog(LOG_INFO, "GPU #%d: %s, %s khash/s", + applog(LOG_INFO, "GPU #%d: %s, %s kH/s", device_map[thr_id], device_name[thr_id], s); } if (opt_benchmark && thr_id == opt_n_threads - 1) { @@ -1078,10 +1121,19 @@ static void *miner_thread(void *userdata) hashrate += thr_hashrates[i]; if (i == opt_n_threads) { sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", hashrate / 1000.); - applog(LOG_NOTICE, "Total: %s khash/s", s); + applog(LOG_NOTICE, "Total: %s kH/s", s); } } + if (rc) { + work.scanned_to = *nonceptr; + } else { + work.scanned_to = max_nonce; + } + + // could be used to store speeds too.. 
+ hashlog_remember_scan_range(work.job_id, work.scanned_from, work.scanned_to); + /* if nonce found, submit work */ if (rc && !opt_benchmark && !submit_work(mythr, &work)) break; diff --git a/hashlog.cpp b/hashlog.cpp index 0b8b574..645fc88 100644 --- a/hashlog.cpp +++ b/hashlog.cpp @@ -1,21 +1,23 @@ -//#include #include +#include #include #include "miner.h" #define HI_DWORD(u64) ((uint32_t) (u64 >> 32)) #define LO_DWORD(u64) ((uint32_t) u64) +#define MK_HI64(u32) (0x100000000ULL * u32) struct hashlog_data { uint32_t ntime; uint32_t scanned_from; uint32_t scanned_to; + uint32_t last_from; }; static std::map tlastshares; -#define LOG_PURGE_TIMEOUT 15*60 +#define LOG_PURGE_TIMEOUT 5*60 /** * str hex to uint32 @@ -27,32 +29,79 @@ static uint64_t hextouint(char* jobid) } /** - * Store submitted nonces of a job + * @return time of a job/nonce submission (or last nonce if nonce is 0) */ -extern "C" void hashlog_remember_submit(char* jobid, uint32_t nonce, uint64_t range) +extern "C" uint32_t hashlog_already_submittted(char* jobid, uint32_t nonce) { + uint32_t ret = 0; uint64_t njobid = hextouint(jobid); uint64_t key = (njobid << 32) + nonce; + if (nonce == 0) { + // search last submitted nonce for job + ret = hashlog_get_last_sent(jobid); + } else if (tlastshares.find(key) != tlastshares.end()) { + hashlog_data data = tlastshares[key]; + ret = data.ntime; + } + return ret; +} +/** + * Store submitted nonces of a job + */ +extern "C" void hashlog_remember_submit(char* jobid, uint32_t nonce) +{ + uint64_t njobid = hextouint(jobid); + uint64_t keyall = (njobid << 32); + uint64_t key = keyall + nonce; struct hashlog_data data; + + data = tlastshares[keyall]; data.ntime = (uint32_t) time(NULL); - data.scanned_from = LO_DWORD(range); - data.scanned_to = HI_DWORD(range); tlastshares[key] = data; } /** - * Search last submitted nonce for a job - * @return max nonce + * Update job scanned range */ -extern "C" uint32_t hashlog_get_last_sent(char* jobid) +extern "C" void 
hashlog_remember_scan_range(char* jobid, uint32_t scanned_from, uint32_t scanned_to) { - uint32_t ret = 0; + uint64_t njobid = hextouint(jobid); + uint64_t keyall = (njobid << 32); + struct hashlog_data data; + + // global scan range of a job + data = tlastshares[keyall]; + if (hashlog_get_scan_range(jobid) == 0) { + memset(&data, 0, sizeof(data)); + } + + if (data.scanned_from == 0 || scanned_to == (data.scanned_from - 1)) + data.scanned_from = scanned_from ? scanned_from : 1; // min 1 + if (data.scanned_to == 0 || scanned_from == data.scanned_to + 1) + data.scanned_to = scanned_to; + + data.last_from = scanned_from; + + tlastshares[keyall] = data; + applog(LOG_BLUE, "job %s range : %x %x -> %x %x (%x)", jobid, + scanned_from, scanned_to, data.scanned_from, data.scanned_to, data.ntime);/* */ +} + +/** + * Returns the range of a job + * @return uint64_t to|from + */ +extern "C" uint64_t hashlog_get_scan_range(char* jobid) +{ + uint64_t ret = 0; uint64_t njobid = hextouint(jobid); uint64_t keypfx = (njobid << 32); std::map::iterator i = tlastshares.begin(); while (i != tlastshares.end()) { - if ((keypfx & i->first) == keypfx && LO_DWORD(i->first) > ret) { - ret = LO_DWORD(i->first); + if ((keypfx & i->first) == keypfx) { + hashlog_data data = i->second; + ret = data.scanned_from; + ret += MK_HI64(data.scanned_to); } i++; } @@ -60,21 +109,22 @@ extern "C" uint32_t hashlog_get_last_sent(char* jobid) } /** - * @return time of a job/nonce submission (or last nonce if nonce is 0) + * Search last submitted nonce for a job + * @return max nonce */ -extern "C" uint32_t hashlog_already_submittted(char* jobid, uint32_t nonce) +extern "C" uint32_t hashlog_get_last_sent(char* jobid) { - uint32_t ret = 0; + uint32_t nonce = 0; uint64_t njobid = hextouint(jobid); - uint64_t key = (njobid << 32) + nonce; - if (nonce == 0) { - // search last submitted nonce for job - ret = hashlog_get_last_sent(jobid); - } else if (tlastshares.find(key) != tlastshares.end()) { - hashlog_data data = 
tlastshares[key]; - ret = data.ntime; + uint64_t keypfx = (njobid << 32); + std::map::iterator i = tlastshares.begin(); + while (i != tlastshares.end()) { + if ((keypfx & i->first) == keypfx && i->second.ntime > 0) { + nonce = LO_DWORD(i->first); + } + i++; } - return ret; + return nonce; } /** diff --git a/miner.h b/miner.h index 79e3a15..5f9e8ac 100644 --- a/miner.h +++ b/miner.h @@ -391,9 +391,11 @@ bool stratum_subscribe(struct stratum_ctx *sctx); bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass); bool stratum_handle_method(struct stratum_ctx *sctx, const char *s); -void hashlog_remember_submit(char* jobid, uint32_t nounce, uint64_t range); +void hashlog_remember_submit(char* jobid, uint32_t nounce); +void hashlog_remember_scan_range(char* jobid, uint32_t scanned_from, uint32_t scanned_to); uint32_t hashlog_already_submittted(char* jobid, uint32_t nounce); uint32_t hashlog_get_last_sent(char* jobid); +uint64_t hashlog_get_scan_range(char* jobid); void hashlog_purge_old(void); void hashlog_purge_job(char* jobid); void hashlog_purge_all(void); diff --git a/util.c b/util.c index 73a1847..5567459 100644 --- a/util.c +++ b/util.c @@ -559,7 +559,7 @@ bool fulltest(const uint32_t *hash, const uint32_t *target) } } - if (!rc || opt_debug) { + if (!rc && opt_debug) { uint32_t hash_be[8], target_be[8]; char *hash_str, *target_str; From 806c3e8691215d15fa1dc3f56ba5fcbb6bc21291 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Thu, 4 Sep 2014 08:55:00 +0200 Subject: [PATCH 19/44] enhance double scan checks --- blake32.cu | 19 +++++++---- cpu-miner.c | 93 ++++++++++++++++++++++++++++++++--------------------- hashlog.cpp | 89 +++++++++++++++++++++++++++++++++++++++----------- miner.h | 1 + util.c | 2 +- 5 files changed, 141 insertions(+), 63 deletions(-) diff --git a/blake32.cu b/blake32.cu index 2b63ccf..ccba68a 100644 --- a/blake32.cu +++ b/blake32.cu @@ -29,8 +29,9 @@ extern "C" void blake32hash(void *output, const void *input) #include 
"cuda_helper.h" // in cpu-miner.c +extern bool opt_n_threads; extern bool opt_benchmark; -extern bool opt_debug; +//extern bool opt_debug; extern int device_map[8]; extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); @@ -279,7 +280,9 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta ((uint32_t*)ptarget)[7] = 0x00000f; if (!init[thr_id]) { - CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + if (opt_n_threads > 1) { + CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); + } CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], sizeof(uint32_t))); CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], sizeof(uint32_t))); init[thr_id] = true; @@ -288,11 +291,6 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta if (throughput < (TPB * 2048)) applog(LOG_WARNING, "throughput=%u, start=%x, max=%x", throughput, first_nonce, max_nonce); - if (max_nonce < first_nonce) { - applog(LOG_ERR, "start=%x > end=%x !", first_nonce, max_nonce); - return 0; - } - blake256_cpu_setBlock_80(pdata, (void*)ptarget); do { @@ -340,5 +338,12 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta exit_scan: *hashes_done = pdata[19] - first_nonce + 1; + // reset the device to allow multiple instances + if (opt_n_threads == 1) { + CUDA_SAFE_CALL(cudaDeviceReset()); + init[thr_id] = false; + } + // wait proper end of all threads + cudaDeviceSynchronize(); return rc; } diff --git a/cpu-miner.c b/cpu-miner.c index cb4d365..3f772c9 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -187,7 +187,7 @@ static int opt_scantime = 5; static json_t *opt_config; static const bool opt_time = true; static sha256_algos opt_algo = ALGO_HEAVY; -static int opt_n_threads = 0; +int opt_n_threads = 0; static double opt_difficulty = 1; // CH bool opt_trust_pool = false; uint16_t opt_vote = 9999; @@ -411,11 +411,11 @@ err_out: return false; } -static void share_result(int result, const char *reason) 
+static int share_result(int result, const char *reason) { char s[345]; double hashrate; - int i; + int i, ret = 0; hashrate = 0.; pthread_mutex_lock(&stats_lock); @@ -434,9 +434,15 @@ static void share_result(int result, const char *reason) (result ? CL_GRN "yay!!!" : CL_RED "booooo") : (result ? "(yay!!!)" : "(booooo)")); - if (reason) { + if (reason && !opt_quiet) { applog(LOG_WARNING, "reject reason: %s", reason); + if (strncmp(reason, "low difficulty share", 20) == 0) { + opt_difficulty = (opt_difficulty * 2.0) / 3.0; + applog(LOG_WARNING, "factor reduced to : %0.2f", opt_difficulty); + return 0; + } } + return 1; } static bool submit_upstream_work(CURL *curl, struct work *work) @@ -472,8 +478,10 @@ static bool submit_upstream_work(CURL *curl, struct work *work) sent = hashlog_already_submittted(work->job_id, nonce); if (sent > 0) { sent = (uint32_t) time(NULL) - sent; - if (!opt_quiet) + if (!opt_quiet) { applog(LOG_WARNING, "skip submit, nonce %s was already sent %u seconds ago", noncestr, sent); + hashlog_dump_job(work->job_id); + } rc = true; goto out; } @@ -481,11 +489,11 @@ static bool submit_upstream_work(CURL *curl, struct work *work) if (opt_algo == ALGO_HEAVY) { sprintf(s, "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", - rpc_user, work->job_id, xnonce2str, ntimestr, noncestr, nvotestr); + rpc_user, work->job_id + 8, xnonce2str, ntimestr, noncestr, nvotestr); } else { sprintf(s, "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", - rpc_user, work->job_id, xnonce2str, ntimestr, noncestr); + rpc_user, work->job_id + 8, xnonce2str, ntimestr, noncestr); } free(ntimestr); free(noncestr); @@ -528,7 +536,8 @@ static bool submit_upstream_work(CURL *curl, struct work *work) res = json_object_get(val, "result"); reason = json_object_get(val, "reject-reason"); - share_result(json_is_true(res), reason ? 
json_string_value(reason) : NULL); + if (!share_result(json_is_true(res), reason ? json_string_value(reason) : NULL)) + hashlog_purge_job(work->job_id); json_decref(val); } @@ -768,7 +777,9 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) pthread_mutex_lock(&sctx->work_lock); - strcpy(work->job_id, sctx->job.job_id); + // store the job ntime as high part of jobid + snprintf(work->job_id, sizeof(work->job_id), "%07x %s", + be32dec(sctx->job.ntime) & 0xfffffff, sctx->job.job_id); work->xnonce2_len = sctx->xnonce2_size; memcpy(work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size); @@ -884,8 +895,6 @@ static void *miner_thread(void *userdata) int wcmplen = 76; uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen); - applog(LOG_WARNING, "job %s %08x", g_work.job_id, (*nonceptr)); - if (have_stratum) { while (time(NULL) >= (g_work_time + opt_scantime) && !work_done) usleep(500*1000); @@ -909,29 +918,34 @@ static void *miner_thread(void *userdata) g_work_time = time(NULL); } } - if (memcmp(work.data, g_work.data, wcmplen)) { - /* - applog(LOG_NOTICE, "job %s %08x work change", g_work.job_id, (*nonceptr)); - for (int n=0; n 0 && range.scanned[1] > 0) { + } else if (range.scanned[0] > 0 && range.scanned[1] > 0 && range.scanned[1] < 0xFFFFFFF0UL) { /* continue scan the end */ start_nonce = range.scanned[1] + 1; - applog(LOG_WARNING, "scan the next part %x + 1", range.scanned[1]); - } else if (range.scanned[0] > 1) { - /* dont scan the beginning... 
make loops */ - //end_nonce = range.scanned[0] - 1; - //applog(LOG_WARNING, "scan the missing part 0 -> %x", end_nonce); + //applog(LOG_DEBUG, "scan the next part %x + 1 (%x-%x)", range.scanned[1], range.scanned[0], range.scanned[1]); } - if (start_nonce == work.scanned_from) { - /* to prevent stales, if last was in the same range */ - applog(LOG_ERR, "detected a staled job!"); - //(*nonceptr) = end_nonce + 1; - //work_done = true; - //continue; - start_nonce = range.scanned[1] + 1; + + stall = (start_nonce == work.scanned_from && end_nonce == work.scanned_to); + stall |= (start_nonce == work.scanned_from && start_nonce == range.scanned[1] + 1); + stall |= (start_nonce > range.scanned[0] && start_nonce < range.scanned[1]); + + if (stall) { + if (opt_algo) + applog(LOG_DEBUG, "job done, wait for a new one..."); + work_restart[thr_id].restart = 1; + hashlog_purge_old(); + // wait a bit for a new job... + usleep(1500*1000); + (*nonceptr) = end_nonce + 1; + work_done = true; + continue; } } } diff --git a/hashlog.cpp b/hashlog.cpp index 645fc88..b069d26 100644 --- a/hashlog.cpp +++ b/hashlog.cpp @@ -9,10 +9,12 @@ #define MK_HI64(u32) (0x100000000ULL * u32) struct hashlog_data { - uint32_t ntime; + uint32_t tm_sent; uint32_t scanned_from; uint32_t scanned_to; uint32_t last_from; + uint32_t tm_add; + uint32_t tm_upd; }; static std::map tlastshares; @@ -41,7 +43,7 @@ extern "C" uint32_t hashlog_already_submittted(char* jobid, uint32_t nonce) ret = hashlog_get_last_sent(jobid); } else if (tlastshares.find(key) != tlastshares.end()) { hashlog_data data = tlastshares[key]; - ret = data.ntime; + ret = data.tm_sent; } return ret; } @@ -56,7 +58,9 @@ extern "C" void hashlog_remember_submit(char* jobid, uint32_t nonce) struct hashlog_data data; data = tlastshares[keyall]; - data.ntime = (uint32_t) time(NULL); + data.tm_upd = data.tm_sent = (uint32_t) time(NULL); + if (data.tm_add == 0) + data.tm_add = data.tm_upd; tlastshares[key] = data; } @@ -67,24 +71,38 @@ extern "C" void 
hashlog_remember_scan_range(char* jobid, uint32_t scanned_from, { uint64_t njobid = hextouint(jobid); uint64_t keyall = (njobid << 32); + uint64_t range = hashlog_get_scan_range(jobid); struct hashlog_data data; // global scan range of a job data = tlastshares[keyall]; - if (hashlog_get_scan_range(jobid) == 0) { + if (range == 0) { memset(&data, 0, sizeof(data)); + } else { + // get min and max from all sent records + data.scanned_from = LO_DWORD(range); + data.scanned_to = HI_DWORD(range); } - if (data.scanned_from == 0 || scanned_to == (data.scanned_from - 1)) - data.scanned_from = scanned_from ? scanned_from : 1; // min 1 - if (data.scanned_to == 0 || scanned_from == data.scanned_to + 1) - data.scanned_to = scanned_to; + if (data.tm_add == 0) + data.tm_add = (uint32_t) time(NULL); data.last_from = scanned_from; + if (scanned_from < scanned_to) { + if (data.scanned_from == 0) + data.scanned_from = scanned_from ? scanned_from : 1; // min 1 + else if (scanned_from < data.scanned_from) // || scanned_to == (data.scanned_from - 1) + data.scanned_from = scanned_from; + if (data.scanned_to == 0 || scanned_from == data.scanned_to + 1) + data.scanned_to = scanned_to; + } + + data.tm_upd = (uint32_t) time(NULL); + tlastshares[keyall] = data; - applog(LOG_BLUE, "job %s range : %x %x -> %x %x (%x)", jobid, - scanned_from, scanned_to, data.scanned_from, data.scanned_to, data.ntime);/* */ + applog(LOG_BLUE, "job %s range : %x %x -> %x %x", jobid, + scanned_from, scanned_to, data.scanned_from, data.scanned_to);/* */ } /** @@ -96,15 +114,21 @@ extern "C" uint64_t hashlog_get_scan_range(char* jobid) uint64_t ret = 0; uint64_t njobid = hextouint(jobid); uint64_t keypfx = (njobid << 32); + struct hashlog_data data; + data.scanned_from = 0; + data.scanned_to = 0; std::map::iterator i = tlastshares.begin(); while (i != tlastshares.end()) { - if ((keypfx & i->first) == keypfx) { - hashlog_data data = i->second; - ret = data.scanned_from; - ret += MK_HI64(data.scanned_to); + if 
((keypfx & i->first) == keypfx && i->second.scanned_to > ret) { + if (i->second.scanned_to > data.scanned_to) + data.scanned_to = i->second.scanned_to; + if (i->second.scanned_from < data.scanned_from || data.scanned_from == 0) + data.scanned_from = i->second.scanned_from; } i++; } + ret = data.scanned_from; + ret += MK_HI64(data.scanned_to); return ret; } @@ -119,7 +143,7 @@ extern "C" uint32_t hashlog_get_last_sent(char* jobid) uint64_t keypfx = (njobid << 32); std::map::iterator i = tlastshares.begin(); while (i != tlastshares.end()) { - if ((keypfx & i->first) == keypfx && i->second.ntime > 0) { + if ((keypfx & i->first) == keypfx && i->second.tm_sent > 0) { nonce = LO_DWORD(i->first); } i++; @@ -128,18 +152,25 @@ extern "C" uint32_t hashlog_get_last_sent(char* jobid) } /** - * Remove entries of a job... not used yet + * Remove entries of a job... */ extern "C" void hashlog_purge_job(char* jobid) { + int deleted = 0; uint64_t njobid = hextouint(jobid); uint64_t keypfx = (njobid << 32); + uint32_t sz = tlastshares.size(); std::map::iterator i = tlastshares.begin(); while (i != tlastshares.end()) { - if ((keypfx & i->first) == keypfx) + if ((keypfx & i->first) == keypfx) { + deleted++; tlastshares.erase(i); + } i++; } + if (opt_debug && deleted) { + applog(LOG_DEBUG, "hashlog: purge job %s, del %d/%d", jobid, deleted, sz); + } } /** @@ -152,7 +183,7 @@ extern "C" void hashlog_purge_old(void) uint32_t sz = tlastshares.size(); std::map::iterator i = tlastshares.begin(); while (i != tlastshares.end()) { - if ((now - i->second.ntime) > LOG_PURGE_TIMEOUT) { + if ((now - i->second.tm_sent) > LOG_PURGE_TIMEOUT) { deleted++; tlastshares.erase(i); } @@ -170,3 +201,25 @@ extern "C" void hashlog_purge_all(void) { tlastshares.clear(); } + + +/** + * Can be used to debug... 
+ */ +extern "C" void hashlog_dump_job(char* jobid) +{ + int deleted = 0; + uint64_t njobid = hextouint(jobid); + uint64_t keypfx = (njobid << 32); + uint32_t sz = tlastshares.size(); + std::map::iterator i = tlastshares.begin(); + while (i != tlastshares.end()) { + if ((keypfx & i->first) == keypfx) { + applog(LOG_BLUE, "job %s range : %x %x %s added %x upd %x", jobid, + i->second.scanned_from, i->second.scanned_to, + i->second.tm_sent ? "sent" : "", + i->second.tm_add, i->second.tm_upd);/* */ + } + i++; + } +} \ No newline at end of file diff --git a/miner.h b/miner.h index 5f9e8ac..098b6d5 100644 --- a/miner.h +++ b/miner.h @@ -399,6 +399,7 @@ uint64_t hashlog_get_scan_range(char* jobid); void hashlog_purge_old(void); void hashlog_purge_job(char* jobid); void hashlog_purge_all(void); +void hashlog_dump_job(char* jobid); struct thread_q; diff --git a/util.c b/util.c index 5567459..9afc308 100644 --- a/util.c +++ b/util.c @@ -115,7 +115,7 @@ void applog(int prio, const char *fmt, ...) case LOG_WARNING: color = CL_YLW; break; case LOG_NOTICE: color = CL_WHT; break; case LOG_INFO: color = ""; break; - case LOG_DEBUG: color = ""; break; + case LOG_DEBUG: color = CL_SIL; break; case LOG_BLUE: prio = LOG_NOTICE; From 415945eb201f11ccdd04c576b374c48360ed623c Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Thu, 4 Sep 2014 11:34:45 +0200 Subject: [PATCH 20/44] Makefile: use the CUDA_CFLAGS var --- Makefile.am | 18 +++++++++--------- build.sh | 2 +- configure.ac | 4 ++-- configure.sh | 10 +++++++++- 4 files changed, 21 insertions(+), 13 deletions(-) diff --git a/Makefile.am b/Makefile.am index 520dff0..875f8b1 100644 --- a/Makefile.am +++ b/Makefile.am @@ -53,33 +53,33 @@ nvcc_ARCH = -gencode=arch=compute_50,code=\"sm_50,compute_50\" #nvcc_ARCH += -gencode=arch=compute_35,code=\"sm_35,compute_35\" #nvcc_ARCH += -gencode=arch=compute_30,code=\"sm_30,compute_30\" -nvcc_FLAGS = $(nvcc_ARCH) -I . --ptxas-options=-v --use_fast_math +nvcc_FLAGS = $(nvcc_ARCH) -I . 
@CUDA_CFLAGS@ nvcc_FLAGS += $(JANSSON_INCLUDES) # we're now targeting all major compute architectures within one binary. .cu.o: - $(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=128 -o $@ -c $< + $(NVCC) $(nvcc_FLAGS) --maxrregcount=128 -o $@ -c $< blake32.o: blake32.cu - $(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=64 -o $@ -c $< + $(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $< # Luffa and Echo are faster with 80 registers than 128 x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu - $(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=80 -o $@ -c $< + $(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $< x11/cuda_x11_echo.o: x11/cuda_x11_echo.cu - $(NVCC) $(nvcc_FLAGS) @CFLAGS@ --maxrregcount=80 -o $@ -c $< + $(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $< # Shavite compiles faster with 128 regs x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu - $(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include @CFLAGS@ --maxrregcount=128 -o $@ -c $< + $(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=128 -o $@ -c $< x17/cuda_x17_sha512.o: x17/cuda_x17_sha512.cu - $(NVCC) $(nvcc_FLAGS) -O2 --maxrregcount=80 -o $@ -c $< + $(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $< # ABI requiring code modules quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu - $(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" --maxrregcount=80 -o $@ -c $< + $(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $< JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu - $(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" --maxrregcount=80 -o $@ -c $< + $(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $< diff --git a/build.sh b/build.sh index 2905734..17935f3 100755 --- a/build.sh +++ b/build.sh @@ -4,7 +4,7 @@ # export PATH="$PATH:/usr/local/cuda/bin/" -#make distclean || echo clean +make distclean || echo clean rm -f Makefile.in rm -f config.status diff --git a/configure.ac b/configure.ac 
index 2f52cdf..f7924d4 100644 --- a/configure.ac +++ b/configure.ac @@ -144,12 +144,12 @@ AC_ARG_WITH([cuda], if test -n "$with_cuda" then - CUDA_CFLAGS="-I$with_cuda/include" + CUDA_CFLAGS="-I$with_cuda/include $CUDA_CFLAGS" CUDA_LIBS="-lcudart" CUDA_LDFLAGS="-L$with_cuda/lib$SUFFIX" NVCC="$with_cuda/bin/nvcc" else - CUDA_CFLAGS="-I/usr/local/cuda/include" + CUDA_CFLAGS="-I/usr/local/cuda/include $CUDA_CFLAGS" CUDA_LIBS="-lcudart -static-libstdc++" CUDA_LDFLAGS="-L/usr/local/cuda/lib$SUFFIX" NVCC="nvcc" diff --git a/configure.sh b/configure.sh index c0cdd0d..142b59e 100755 --- a/configure.sh +++ b/configure.sh @@ -1 +1,9 @@ -./configure "CFLAGS=-O2" "CXXFLAGS=-O2" --with-cuda=/usr/local/cuda +# possible additional CUDA_CFLAGS +#-gencode=arch=compute_50,code=\"sm_50,compute_50\" +#-gencode=arch=compute_35,code=\"sm_35,compute_35\" +#-gencode=arch=compute_30,code=\"sm_30,compute_30\" + +#--ptxas-options=\"-v -dlcm=cg\"" + +CUDA_CFLAGS="-O3" ./configure "CFLAGS=-O3" "CXXFLAGS=-O3" --with-cuda=/usr/local/cuda + From 2ebfb546a60f547b76549323f8babffeb3c6429d Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Thu, 4 Sep 2014 12:41:49 +0200 Subject: [PATCH 21/44] clean extra logs, show bloc height on new jobs --- configure.sh | 2 +- cpu-miner.c | 17 +++++++++++------ hashlog.cpp | 4 ++-- miner.h | 1 + util.c | 3 +++ 5 files changed, 18 insertions(+), 9 deletions(-) diff --git a/configure.sh b/configure.sh index 142b59e..9c8b021 100755 --- a/configure.sh +++ b/configure.sh @@ -5,5 +5,5 @@ #--ptxas-options=\"-v -dlcm=cg\"" -CUDA_CFLAGS="-O3" ./configure "CFLAGS=-O3" "CXXFLAGS=-O3" --with-cuda=/usr/local/cuda +CUDA_CFLAGS="-O2" ./configure "CFLAGS=-O2" "CXXFLAGS=-O2" --with-cuda=/usr/local/cuda diff --git a/cpu-miner.c b/cpu-miner.c index 3f772c9..d0dc418 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -918,9 +918,9 @@ static void *miner_thread(void *userdata) g_work_time = time(NULL); } } - if (memcmp(work.data, g_work.data, 72)) { // wcmplen)) { + if (memcmp(work.data, 
g_work.data, wcmplen)) { if (opt_debug) { - applog(LOG_DEBUG, "job %s %08x work updated", g_work.job_id, (*nonceptr)); + applog(LOG_DEBUG, "job %s work updated", g_work.job_id); for (int n=0; n %x %x", jobid, - scanned_from, scanned_to, data.scanned_from, data.scanned_to);/* */ +/* applog(LOG_BLUE, "job %s range : %x %x -> %x %x", jobid, + scanned_from, scanned_to, data.scanned_from, data.scanned_to); */ } /** diff --git a/miner.h b/miner.h index 098b6d5..5e36442 100644 --- a/miner.h +++ b/miner.h @@ -380,6 +380,7 @@ struct stratum_ctx { pthread_mutex_t work_lock; int srvtime_diff; + int bloc_height; }; bool stratum_socket_full(struct stratum_ctx *sctx, int timeout); diff --git a/util.c b/util.c index 9afc308..7927d18 100644 --- a/util.c +++ b/util.c @@ -1069,10 +1069,13 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) coinb2_size = strlen(coinb2) / 2; sctx->job.coinbase_size = coinb1_size + sctx->xnonce1_size + sctx->xnonce2_size + coinb2_size; + sctx->job.coinbase = (unsigned char*)realloc(sctx->job.coinbase, sctx->job.coinbase_size); sctx->job.xnonce2 = sctx->job.coinbase + coinb1_size + sctx->xnonce1_size; hex2bin(sctx->job.coinbase, coinb1, coinb1_size); memcpy(sctx->job.coinbase + coinb1_size, sctx->xnonce1, sctx->xnonce1_size); + + sctx->bloc_height = le16dec((uint8_t*) sctx->job.coinbase + 43); if (!sctx->job.job_id || strcmp(sctx->job.job_id, job_id)) memset(sctx->job.xnonce2, 0, sctx->xnonce2_size); hex2bin(sctx->job.xnonce2 + sctx->xnonce2_size, coinb2, coinb2_size); From 3341e0324ff083112c9e4af8d7b7112a7243b9ee Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Thu, 4 Sep 2014 16:17:11 +0200 Subject: [PATCH 22/44] blake: speed +10%, no more size conversions --- blake32.cu | 91 +++++++++++++++++++++++++---------------------------- cpu-miner.c | 2 +- 2 files changed, 44 insertions(+), 49 deletions(-) diff --git a/blake32.cu b/blake32.cu index ccba68a..e0e6814 100644 --- a/blake32.cu +++ b/blake32.cu @@ -31,44 +31,45 @@ extern "C" 
void blake32hash(void *output, const void *input) // in cpu-miner.c extern bool opt_n_threads; extern bool opt_benchmark; -//extern bool opt_debug; extern int device_map[8]; extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); __constant__ -static uint32_t c_Target[8]; +static uint32_t __align__(32) c_PaddedMessage80[32]; // padded message (80 bytes + padding) __constant__ -static uint32_t __align__(32) c_PaddedMessage80[32]; // padded message (80 bytes + padding) +static uint32_t __align__(32) c_Target[8]; + +#define MAXU 0xffffffffU static uint32_t *d_resNounce[8]; static uint32_t *h_resNounce[8]; __constant__ -static uint8_t c_sigma[16][16]; -const uint8_t host_sigma[16][16] = -{ - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 }, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } +static uint32_t __align__(32) c_sigma[16][16]; +/* prefer uint32_t to prevent size conversions = speed +5/10 % */ +const uint32_t host_sigma[16][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 
7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } }; __device__ __constant__ -static const uint32_t c_IV256[8] = { +static const uint32_t __align__(32) c_IV256[8] = { SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85), SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A), SPH_C32(0x510E527F), SPH_C32(0x9B05688C), @@ -76,8 +77,7 @@ static const uint32_t c_IV256[8] = { }; __device__ __constant__ - -static const uint32_t c_u256[16] = { +static const uint32_t __align__(32) c_u256[16] = { SPH_C32(0x243F6A88), SPH_C32(0x85A308D3), SPH_C32(0x13198A2E), SPH_C32(0x03707344), SPH_C32(0xA4093822), SPH_C32(0x299F31D0), @@ -112,13 +112,15 @@ static const uint32_t c_u256[16] = { } while (0) #endif -#define GS(a,b,c,d,e) { \ - v[a] += (m[sigma[i][e]] ^ u256[sigma[i][e+1]]) + v[b]; \ - v[d] = SPH_ROTR32(v[d] ^ v[a], 16); \ +#define GS(a,b,c,d,x) { \ + const uint32_t idx1 = c_sigma[i][x]; \ + const uint32_t idx2 = c_sigma[i][x+1]; \ + v[a] += (m[idx1] ^ u256[idx2]) + v[b]; \ + v[d] = SPH_ROTL32(v[d] ^ v[a], 16); \ v[c] += v[d]; \ v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \ \ - v[a] += (m[sigma[i][e+1]] ^ u256[sigma[i][e]]) + v[b]; \ + v[a] += (m[idx2] ^ u256[idx1]) + v[b]; \ v[d] = SPH_ROTR32(v[d] ^ v[a], 8); \ v[c] += v[d]; \ v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \ @@ -127,11 +129,13 @@ 
static const uint32_t c_u256[16] = { #define BLAKE256_ROUNDS 14 __device__ static -void blake256_compress(uint32_t *h, uint32_t *block, uint8_t ((*sigma)[16]), const uint32_t *u256, const uint32_t T0, uint8_t nullt = 1) +void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0) { uint32_t /* __align__(8) */ v[16]; uint32_t /* __align__(8) */ m[16]; + const uint32_t* u256 = c_u256; + //#pragma unroll for (int i = 0; i < 16; ++i) { m[i] = block[i]; @@ -170,16 +174,6 @@ void blake256_compress(uint32_t *h, uint32_t *block, uint8_t ((*sigma)[16]), con h[i % 8] ^= v[i]; } -#if __CUDA_ARCH__ >= 200 -/* memory should be aligned to use __nvvm_memset */ -#if (__NV_POINTER_SIZE == 64) -# define SZCT uint64_t -#else -# define SZCT uint32_t -#endif -extern __device__ __device_builtin__ void __nvvm_memset(uint8_t *, unsigned char, SZCT, int); -#endif - __global__ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce) { @@ -194,7 +188,7 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN for(int i=0; i<8; i++) h[i] = c_IV256[i]; - blake256_compress(h, c_PaddedMessage80, c_sigma, c_u256, 0x200); /* 512 = 0x200 */ + blake256_compress(h, c_PaddedMessage80, 0x200); /* 512 = 0x200 */ // ------ Close: Bytes 64 to 80 ------ @@ -217,7 +211,7 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN msg[14] = 0; msg[15] = 0x280; - blake256_compress(h, msg, c_sigma, c_u256, 0x280); + blake256_compress(h, msg, 0x280); for (int i = 7; i >= 0; i--) { uint32_t hash = cuda_swab32(h[i]); @@ -239,17 +233,18 @@ __host__ uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce) { const int threadsperblock = TPB; + uint32_t result = MAXU; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); size_t shared_size = 0; - uint32_t result = 0xffffffffU; - cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t)); + /* Check error on Ctrl+C or 
kill to prevent segfaults on exit */ + if (cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t)) != cudaSuccess) + return result; blake256_gpu_hash_80<<>>(threads, startNounce, d_resNounce[thr_id]); - MyStreamSynchronize(NULL, 1, thr_id); - + cudaDeviceSynchronize(); if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost)) { cudaThreadSynchronize(); result = *h_resNounce[thr_id]; @@ -258,7 +253,7 @@ uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce } __host__ -void blake256_cpu_setBlock_80(uint32_t *pdata, const void *ptarget) +void blake256_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget) { uint32_t PaddedMessage[32]; memcpy(PaddedMessage, pdata, 80); @@ -291,7 +286,7 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta if (throughput < (TPB * 2048)) applog(LOG_WARNING, "throughput=%u, start=%x, max=%x", throughput, first_nonce, max_nonce); - blake256_cpu_setBlock_80(pdata, (void*)ptarget); + blake256_cpu_setBlock_80(pdata, ptarget); do { // GPU HASH diff --git a/cpu-miner.c b/cpu-miner.c index d0dc418..dc86a3e 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -962,7 +962,7 @@ static void *miner_thread(void *userdata) max64 = 0x1fffLL; break; case ALGO_BLAKE: - /* based on the 750Ti hashrate */ + /* based on the 750Ti hashrate (100kH) */ max64 = 0x3ffffffLL; break; default: From 746398f435a03b96651efa914e1935a2cfa63d34 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Thu, 4 Sep 2014 17:34:30 +0200 Subject: [PATCH 23/44] blake: fix reduced speed on windows, wtf --- blake32.cu | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/blake32.cu b/blake32.cu index e0e6814..8be5205 100644 --- a/blake32.cu +++ b/blake32.cu @@ -47,9 +47,16 @@ static uint32_t *d_resNounce[8]; static uint32_t *h_resNounce[8]; __constant__ -static uint32_t __align__(32) c_sigma[16][16]; +#ifdef WIN32 +/* what the fuck ! 
*/ +static uint8_t c_sigma[16][16]; +const uint8_t host_sigma[16][16] = +#else /* prefer uint32_t to prevent size conversions = speed +5/10 % */ -const uint32_t host_sigma[16][16] = { +static uint32_t __align__(32) c_sigma[16][16]; +const uint32_t host_sigma[16][16] +#endif += { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, From 9bf927a496e5aead1d9669c58887bec67d85ed6d Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Thu, 4 Sep 2014 18:18:53 +0200 Subject: [PATCH 24/44] hashlog: fix erase while iterating exception --- hashlog.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hashlog.cpp b/hashlog.cpp index 06720b0..811fed2 100644 --- a/hashlog.cpp +++ b/hashlog.cpp @@ -164,9 +164,9 @@ extern "C" void hashlog_purge_job(char* jobid) while (i != tlastshares.end()) { if ((keypfx & i->first) == keypfx) { deleted++; - tlastshares.erase(i); + tlastshares.erase(i++); } - i++; + else ++i; } if (opt_debug && deleted) { applog(LOG_DEBUG, "hashlog: purge job %s, del %d/%d", jobid, deleted, sz); @@ -185,9 +185,9 @@ extern "C" void hashlog_purge_old(void) while (i != tlastshares.end()) { if ((now - i->second.tm_sent) > LOG_PURGE_TIMEOUT) { deleted++; - tlastshares.erase(i); + tlastshares.erase(i++); } - i++; + else ++i; } if (opt_debug && deleted) { applog(LOG_DEBUG, "hashlog: %d/%d purged", deleted, sz); From 033fb5745c3c4d9eae20b10690780d797a70dbcb Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Thu, 4 Sep 2014 20:04:55 +0200 Subject: [PATCH 25/44] Release v1.4 with blake --- README.md | 3 +-- hashlog.cpp | 29 +++++++++++++++-------------- miner.h | 4 ++-- util.c | 2 +- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 2a2485b..d3836eb 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ ccminer Christian Buchner's & Christian H.'s CUDA miner project Fork by tpruvot@github with 
X14,X15,X17,WHIRL and Blake256 support + BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo [![tip for next commit](https://tip4commit.com/projects/927.svg)](https://tip4commit.com/github/tpruvot/ccminer) @@ -26,8 +27,6 @@ This project requires some libraries to be built : - pthreads -- [mpir math library](http://www.mpir.org) - You can download prebuilt .lib and dll on the [bitcointalk forum thread](https://bitcointalk.org/?topic=167229.0) diff --git a/hashlog.cpp b/hashlog.cpp index 811fed2..7a679e8 100644 --- a/hashlog.cpp +++ b/hashlog.cpp @@ -202,24 +202,25 @@ extern "C" void hashlog_purge_all(void) tlastshares.clear(); } - /** - * Can be used to debug... + * Used to debug ranges... */ extern "C" void hashlog_dump_job(char* jobid) { - int deleted = 0; - uint64_t njobid = hextouint(jobid); - uint64_t keypfx = (njobid << 32); - uint32_t sz = tlastshares.size(); - std::map::iterator i = tlastshares.begin(); - while (i != tlastshares.end()) { - if ((keypfx & i->first) == keypfx) { - applog(LOG_BLUE, "job %s range : %x %x %s added %x upd %x", jobid, - i->second.scanned_from, i->second.scanned_to, - i->second.tm_sent ? "sent" : "", - i->second.tm_add, i->second.tm_upd);/* */ + if (opt_debug) { + int deleted = 0; + uint64_t njobid = hextouint(jobid); + uint64_t keypfx = (njobid << 32); + uint32_t sz = tlastshares.size(); + std::map::iterator i = tlastshares.begin(); + while (i != tlastshares.end()) { + if ((keypfx & i->first) == keypfx) { + applog(LOG_BLUE, "job %s range : %x %x %s added %x upd %x", jobid, + i->second.scanned_from, i->second.scanned_to, + i->second.tm_sent ? 
"sent" : "", + i->second.tm_add, i->second.tm_upd);/* */ + } + i++; } - i++; } } \ No newline at end of file diff --git a/miner.h b/miner.h index 5e36442..6ce4ca8 100644 --- a/miner.h +++ b/miner.h @@ -317,7 +317,7 @@ extern uint16_t opt_vote; #define CL_BLK "\x1B[22;30m" /* black */ #define CL_RD2 "\x1B[22;31m" /* red */ #define CL_GR2 "\x1B[22;32m" /* green */ -#define CL_BRW "\x1B[22;33m" /* brown */ +#define CL_YL2 "\x1B[22;33m" /* dark yellow */ #define CL_BL2 "\x1B[22;34m" /* blue */ #define CL_MA2 "\x1B[22;35m" /* magenta */ #define CL_CY2 "\x1B[22;36m" /* cyan */ @@ -326,7 +326,7 @@ extern uint16_t opt_vote; #define CL_GRY "\x1B[01;30m" /* dark gray */ #define CL_LRD "\x1B[01;31m" /* light red */ #define CL_LGR "\x1B[01;32m" /* light green */ -#define CL_YL2 "\x1B[01;33m" /* yellow */ +#define CL_LYL "\x1B[01;33m" /* tooltips */ #define CL_LBL "\x1B[01;34m" /* light blue */ #define CL_LMA "\x1B[01;35m" /* light magenta */ #define CL_LCY "\x1B[01;36m" /* light cyan */ diff --git a/util.c b/util.c index 7927d18..f451d95 100644 --- a/util.c +++ b/util.c @@ -115,7 +115,7 @@ void applog(int prio, const char *fmt, ...) 
case LOG_WARNING: color = CL_YLW; break; case LOG_NOTICE: color = CL_WHT; break; case LOG_INFO: color = ""; break; - case LOG_DEBUG: color = CL_SIL; break; + case LOG_DEBUG: color = CL_GRY; break; case LOG_BLUE: prio = LOG_NOTICE; From e1159629b4e34e3b4e23c15a6acb610fda1c5677 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Fri, 5 Sep 2014 09:46:45 +0200 Subject: [PATCH 26/44] blake: typo for windows on last commit --- README.txt | 32 +++++++++++++++----------------- blake32.cu | 2 +- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/README.txt b/README.txt index e6fe248..15c7c72 100644 --- a/README.txt +++ b/README.txt @@ -1,27 +1,24 @@ -ccMiner release 1.3-tpruvot (Aug 21th 2014) - "X14 X15 Fresh" -------------------------------------------------------------- +ccMiner release 1.4-tpruvot (Sept 04th 2014) - "X17 Blake NEOS" +--------------------------------------------------------------- *************************************************************** If you find this tool useful and like to support its continued development, then consider a donation. 
- LTC donation address: LKS1WDKGED647msBQfLBHV3Ls8sveGncnm - BTC donation address: 16hJF5mceSojnTD3ZTUDqdRhDyPJzoRakM - YAC donation address: Y87sptDEcpLkLeAuex6qZioDbvy1qXZEj4 - VTC donation address: VrjeFzMgvteCGarLw85KivBzmsiH9fqp4a - MAX donation address: mHrhQP9EFArechWxTFJ97s9D3jvcCvEEnt - DOGE donation address: DT9ghsGmez6ojVdEZgvaZbT2Z3TruXG6yP - HVC donation address: HNN3PyyTMkDo4RkEjkWSGMwqia1yD8mwJN - GRS donation address: FmJKJAhvyHWPeEVeLQHefr2naqgWc9ABTM - MYR donation address: MNHM7Q7HVfGpKDJgVJrY8ofwvmeugNewyf - JPC donation address: JYFBypVDkk583yKWY4M46TG5vXG8hfgD2U - SFR donation address: SR4b87aEnPfTs77bo9NnnaV21fiF6jQpAp - MNC donation address: MShgNUSYwybEbXLvJUtdNg1a7rUeiNgooK - BTQ donation address: 13GFwLiZL2DaA9XeE733PNrQX5QYLFsonS - tpruvot@github: - BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo + BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo + DRK : XeVrkPrWB7pDbdFLfKhF1Z3xpqhsx6wkH3 + NEO$ : NaEcVrdzoCWHUYXb7X8QoafoKS9UV69Yk4 + +DJM34: + XCN donation address: CNh6F4h1byX7vvbmfQn4LMtsC4TYb8mgmn + BTC donation address: 1NENYmxwZGHsKFmyjTc5WferTn5VTFb7Ze + TAC donation address: TuqNvPoQxghHfzwnPpAxSTiYoN6FM8LM5p + +cbuchner v1.2: + LTC donation address: LKS1WDKGED647msBQfLBHV3Ls8sveGncnm + BTC donation address: 16hJF5mceSojnTD3ZTUDqdRhDyPJzoRakM *************************************************************** @@ -36,6 +33,7 @@ JackpotCoin QuarkCoin family & AnimeCoin TalkCoin DarkCoin and other X11 coins +NEOS blake (256 14-rounds) where some of these coins have a VERY NOTABLE nVidia advantage over competing AMD (OpenCL) implementations. diff --git a/blake32.cu b/blake32.cu index 8be5205..b24c5f7 100644 --- a/blake32.cu +++ b/blake32.cu @@ -50,7 +50,7 @@ __constant__ #ifdef WIN32 /* what the fuck ! 
*/ static uint8_t c_sigma[16][16]; -const uint8_t host_sigma[16][16] = +const uint8_t host_sigma[16][16] #else /* prefer uint32_t to prevent size conversions = speed +5/10 % */ static uint32_t __align__(32) c_sigma[16][16]; From 416f7f3708ec0d6f46a16f457f8a223c3371bfe4 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Fri, 5 Sep 2014 18:16:40 +0200 Subject: [PATCH 27/44] hashlog: keep compat with VS2012 --- hashlog.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/hashlog.cpp b/hashlog.cpp index 7a679e8..aad202e 100644 --- a/hashlog.cpp +++ b/hashlog.cpp @@ -27,7 +27,8 @@ static std::map tlastshares; static uint64_t hextouint(char* jobid) { char *ptr; - return strtoull(jobid, &ptr, 16); + /* dont use strtoull(), only since VS2013 */ + return (uint64_t) strtoul(jobid, &ptr, 16); } /** @@ -38,6 +39,7 @@ extern "C" uint32_t hashlog_already_submittted(char* jobid, uint32_t nonce) uint32_t ret = 0; uint64_t njobid = hextouint(jobid); uint64_t key = (njobid << 32) + nonce; + if (nonce == 0) { // search last submitted nonce for job ret = hashlog_get_last_sent(jobid); @@ -55,7 +57,7 @@ extern "C" void hashlog_remember_submit(char* jobid, uint32_t nonce) uint64_t njobid = hextouint(jobid); uint64_t keyall = (njobid << 32); uint64_t key = keyall + nonce; - struct hashlog_data data; + hashlog_data data; data = tlastshares[keyall]; data.tm_upd = data.tm_sent = (uint32_t) time(NULL); @@ -72,7 +74,7 @@ extern "C" void hashlog_remember_scan_range(char* jobid, uint32_t scanned_from, uint64_t njobid = hextouint(jobid); uint64_t keyall = (njobid << 32); uint64_t range = hashlog_get_scan_range(jobid); - struct hashlog_data data; + hashlog_data data; // global scan range of a job data = tlastshares[keyall]; @@ -114,7 +116,8 @@ extern "C" uint64_t hashlog_get_scan_range(char* jobid) uint64_t ret = 0; uint64_t njobid = hextouint(jobid); uint64_t keypfx = (njobid << 32); - struct hashlog_data data; + hashlog_data data; + data.scanned_from = 0; 
data.scanned_to = 0; std::map::iterator i = tlastshares.begin(); From 5682b7d241a17890273488b4544934ac974e4b20 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Fri, 5 Sep 2014 10:52:04 +0200 Subject: [PATCH 28/44] blake: add also blakecoin (8-rounds) variant --- blake32.cu | 33 ++++++++++++++++++--------------- ccminer.vcxproj | 5 +++-- cpu-miner.c | 13 +++++++++++-- miner.h | 8 ++++---- sph/blake.c | 4 +++- sph/sph_blake.h | 5 +++++ util.c | 8 ++++++-- 7 files changed, 50 insertions(+), 26 deletions(-) diff --git a/blake32.cu b/blake32.cu index b24c5f7..877c319 100644 --- a/blake32.cu +++ b/blake32.cu @@ -15,11 +15,17 @@ extern "C" { /* threads per block */ #define TPB 128 +extern "C" int blake256_rounds = 14; + /* hash by cpu with blake 256 */ -extern "C" void blake32hash(void *output, const void *input) +extern "C" void blake256hash(void *output, const void *input, int rounds = 14) { unsigned char hash[64]; sph_blake256_context ctx; + + /* in sph_blake.c */ + blake256_rounds = rounds; + sph_blake256_init(&ctx); sph_blake256(&ctx, input, 80); sph_blake256_close(&ctx, hash); @@ -133,10 +139,8 @@ static const uint32_t __align__(32) c_u256[16] = { v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \ } -#define BLAKE256_ROUNDS 14 - __device__ static -void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0) +void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, int blakerounds) { uint32_t /* __align__(8) */ v[16]; uint32_t /* __align__(8) */ m[16]; @@ -162,8 +166,7 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0) v[14] = u256[6]; v[15] = u256[7]; - //#pragma unroll - for (int i = 0; i < BLAKE256_ROUNDS; i++) { + for (int i = 0; i < blakerounds; i++) { /* column step */ GS(0, 4, 0x8, 0xC, 0); GS(1, 5, 0x9, 0xD, 2); @@ -182,7 +185,7 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0) } __global__ -void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t 
*resNounce) +void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, int blakerounds) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -195,7 +198,7 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN for(int i=0; i<8; i++) h[i] = c_IV256[i]; - blake256_compress(h, c_PaddedMessage80, 0x200); /* 512 = 0x200 */ + blake256_compress(h, c_PaddedMessage80, 0x200, blakerounds); /* 512 = 0x200 */ // ------ Close: Bytes 64 to 80 ------ @@ -218,7 +221,7 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN msg[14] = 0; msg[15] = 0x280; - blake256_compress(h, msg, 0x280); + blake256_compress(h, msg, 0x280, blakerounds); for (int i = 7; i >= 0; i--) { uint32_t hash = cuda_swab32(h[i]); @@ -237,7 +240,7 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN } __host__ -uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce) +uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, int blakerounds) { const int threadsperblock = TPB; uint32_t result = MAXU; @@ -250,7 +253,7 @@ uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce if (cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t)) != cudaSuccess) return result; - blake256_gpu_hash_80<<>>(threads, startNounce, d_resNounce[thr_id]); + blake256_gpu_hash_80<<>>(threads, startNounce, d_resNounce[thr_id], blakerounds); cudaDeviceSynchronize(); if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost)) { cudaThreadSynchronize(); @@ -270,8 +273,8 @@ void blake256_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget) CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Target, ptarget, 32, 0, cudaMemcpyHostToDevice)); } -extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *ptarget, - uint32_t max_nonce, unsigned long *hashes_done) 
+extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done, uint32_t blakerounds=14) { const uint32_t first_nonce = pdata[19]; static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; @@ -297,7 +300,7 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta do { // GPU HASH - uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19]); + uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], blakerounds); if (foundNonce != 0xffffffff) { uint32_t endiandata[20]; @@ -315,7 +318,7 @@ extern "C" int scanhash_blake32(int thr_id, uint32_t *pdata, const uint32_t *pta be32enc(&endiandata[19], foundNonce); - blake32hash(vhashcpu, endiandata); + blake256hash(vhashcpu, endiandata, blakerounds); if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) { diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 7590d94..cb633ad 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -400,8 +400,9 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)" 64 - --ptxas-options=-O2 %(AdditionalOptions) + --ptxas-options="-O2 -dlcm=cg" %(AdditionalOptions) %(AdditionalOptions) + true --ptxas-options=-O2 %(AdditionalOptions) @@ -562,4 +563,4 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)" - + \ No newline at end of file diff --git a/cpu-miner.c b/cpu-miner.c index dc86a3e..e3b77e8 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -128,6 +128,7 @@ struct workio_cmd { typedef enum { ALGO_ANIME, ALGO_BLAKE, + ALGO_BLAKECOIN, ALGO_FRESH, ALGO_FUGUE256, /* Fugue256 */ ALGO_GROESTL, @@ -149,6 +150,7 @@ typedef enum { static const char *algo_names[] = { "anime", "blake", + "blakecoin", "fresh", "fugue256", "groestl", @@ -231,6 +233,7 @@ Options:\n\ -a, --algo=ALGO specify the algorithm to use\n\ anime Animecoin hash\n\ blake Blake 256 (like NEOS blake)\n\ + blakecoin Old Blake 256 (8 rounds)\n\ fresh Freshcoin hash (shavite 80)\n\ fugue256 Fuguecoin hash\n\ groestl 
Groestlcoin hash\n\ @@ -961,6 +964,7 @@ static void *miner_thread(void *userdata) case ALGO_JACKPOT: max64 = 0x1fffLL; break; + case ALGO_BLAKECOIN: case ALGO_BLAKE: /* based on the 750Ti hashrate (100kH) */ max64 = 0x3ffffffLL; @@ -1065,9 +1069,14 @@ static void *miner_thread(void *userdata) max_nonce, &hashes_done); break; + case ALGO_BLAKECOIN: + rc = scanhash_blake256(thr_id, work.data, work.target, + max_nonce, &hashes_done, 8); + break; + case ALGO_BLAKE: - rc = scanhash_blake32(thr_id, work.data, work.target, - max_nonce, &hashes_done); + rc = scanhash_blake256(thr_id, work.data, work.target, + max_nonce, &hashes_done, 14); break; case ALGO_FRESH: diff --git a/miner.h b/miner.h index 6ce4ca8..0e281da 100644 --- a/miner.h +++ b/miner.h @@ -237,11 +237,11 @@ extern int scanhash_anime(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); -extern int scanhash_fresh(int thr_id, uint32_t *pdata, +extern int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done); + unsigned long *hashes_done, uint32_t blakerounds); -extern int scanhash_blake32(int thr_id, uint32_t *pdata, +extern int scanhash_fresh(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); @@ -420,7 +420,7 @@ void applog_hash(unsigned char *hash); void print_hash_tests(void); void animehash(void *state, const void *input); -void blake32hash(void *output, const void *input); +void blake256hash(void *output, const void *input, int rounds); void fresh_hash(void *state, const void *input); void fugue256_hash(unsigned char* output, const unsigned char* input, int len); void heavycoin_hash(unsigned char* output, const unsigned char* input, int len); diff --git a/sph/blake.c b/sph/blake.c index 0650b9c..ea829f0 100644 --- a/sph/blake.c +++ b/sph/blake.c @@ -548,7 +548,7 @@ static const sph_u64 CB[16] = { M[0xD] = sph_dec32be_aligned(buf + 52); \ 
M[0xE] = sph_dec32be_aligned(buf + 56); \ M[0xF] = sph_dec32be_aligned(buf + 60); \ - for (r = 0; r < 14; r ++) \ + for (r = 0; r < blake256_rounds; r ++) \ ROUND_S(r); \ H0 ^= S0 ^ V0 ^ V8; \ H1 ^= S1 ^ V1 ^ V9; \ @@ -592,6 +592,7 @@ static const sph_u64 CB[16] = { M6 = sph_dec32be_aligned(buf + 24); \ M7 = sph_dec32be_aligned(buf + 28); \ M8 = sph_dec32be_aligned(buf + 32); \ + if (blake256_rounds == 14) { \ M9 = sph_dec32be_aligned(buf + 36); \ MA = sph_dec32be_aligned(buf + 40); \ MB = sph_dec32be_aligned(buf + 44); \ @@ -599,6 +600,7 @@ static const sph_u64 CB[16] = { MD = sph_dec32be_aligned(buf + 52); \ ME = sph_dec32be_aligned(buf + 56); \ MF = sph_dec32be_aligned(buf + 60); \ + } \ ROUND_S(0); \ ROUND_S(1); \ ROUND_S(2); \ diff --git a/sph/sph_blake.h b/sph/sph_blake.h index d8d7943..24aa89d 100644 --- a/sph/sph_blake.h +++ b/sph/sph_blake.h @@ -181,6 +181,11 @@ void sph_blake224_close(void *cc, void *dst); void sph_blake224_addbits_and_close( void *cc, unsigned ub, unsigned n, void *dst); +/** + * Switch for the number of rounds (old blake was 8) + */ +extern int blake256_rounds; + /** * Initialize a BLAKE-256 context. This process performs no memory allocation. 
* diff --git a/util.c b/util.c index f451d95..fe9168b 100644 --- a/util.c +++ b/util.c @@ -1042,7 +1042,7 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) /* store stratum server time diff */ hex2bin((unsigned char *)&ntime, stime, 4); - ntime = swab32(ntime) - time(0); + ntime = swab32(ntime) - (uint32_t) time(0); if (ntime > sctx->srvtime_diff) { sctx->srvtime_diff = ntime; if (!opt_quiet) @@ -1420,7 +1420,11 @@ void print_hash_tests(void) printpfx("anime", hash); memset(hash, 0, sizeof hash); - blake32hash(&hash[0], &buf[0]); + blake256hash(&hash[0], &buf[0], 8); + printpfx("blakecoin", hash); + + memset(hash, 0, sizeof hash); + blake256hash(&hash[0], &buf[0], 14); printpfx("blake", hash); memset(hash, 0, sizeof hash); From 12fefe5de0362b46155627788e3cfbf28e5a8c4a Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Fri, 5 Sep 2014 18:17:11 +0200 Subject: [PATCH 29/44] blake: add a few more MH/s, prepare blakecoin --- blake32.cu | 97 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 51 insertions(+), 46 deletions(-) diff --git a/blake32.cu b/blake32.cu index 877c319..a592b4d 100644 --- a/blake32.cu +++ b/blake32.cu @@ -42,10 +42,10 @@ extern int device_map[8]; extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); __constant__ -static uint32_t __align__(32) c_PaddedMessage80[32]; // padded message (80 bytes + padding) +static uint32_t __align__(32) c_Target[8]; __constant__ -static uint32_t __align__(32) c_Target[8]; +static uint32_t __align__(32) c_data[20]; #define MAXU 0xffffffffU @@ -128,50 +128,70 @@ static const uint32_t __align__(32) c_u256[16] = { #define GS(a,b,c,d,x) { \ const uint32_t idx1 = c_sigma[i][x]; \ const uint32_t idx2 = c_sigma[i][x+1]; \ - v[a] += (m[idx1] ^ u256[idx2]) + v[b]; \ + v[a] += (m[idx1] ^ c_u256[idx2]) + v[b]; \ v[d] = SPH_ROTL32(v[d] ^ v[a], 16); \ v[c] += v[d]; \ v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \ \ - v[a] += (m[idx2] ^ u256[idx1]) + v[b]; \ + v[a] 
+= (m[idx2] ^ c_u256[idx1]) + v[b]; \ v[d] = SPH_ROTR32(v[d] ^ v[a], 8); \ v[c] += v[d]; \ v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \ } +/* Second part (64-80) msg never change, store it */ +__device__ __constant__ +static const uint32_t __align__(32) c_Padding[16] = { + 0, 0, 0, 0, + 0x80000000UL, 0, 0, 0, + 0, 0, 0, 0, + 0, 1, 0, 640, +}; + __device__ static void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, int blakerounds) { - uint32_t /* __align__(8) */ v[16]; uint32_t /* __align__(8) */ m[16]; - const uint32_t* u256 = c_u256; + m[0] = block[0]; + m[1] = block[1]; + m[2] = block[2]; + m[3] = block[3]; - //#pragma unroll - for (int i = 0; i < 16; ++i) { - m[i] = block[i]; + if (T0 == 0x200) { + //#pragma unroll 12 + for (int i = 4; i < 16; ++i) { + m[i] = block[i]; + } + } else { + //#pragma unroll 12 + for (int i = 4; i < 16; ++i) { + m[i] = c_Padding[i]; + } } + uint32_t /* __align__(8) */ v[16]; + //#pragma unroll 8 for(int i = 0; i < 8; i++) v[i] = h[i]; - v[ 8] = u256[0]; - v[ 9] = u256[1]; - v[10] = u256[2]; - v[11] = u256[3]; + v[ 8] = c_u256[0]; + v[ 9] = c_u256[1]; + v[10] = c_u256[2]; + v[11] = c_u256[3]; - v[12] = u256[4] ^ T0; - v[13] = u256[5] ^ T0; - v[14] = u256[6]; - v[15] = u256[7]; + v[12] = c_u256[4] ^ T0; + v[13] = c_u256[5] ^ T0; + v[14] = c_u256[6]; + v[15] = c_u256[7]; for (int i = 0; i < blakerounds; i++) { /* column step */ - GS(0, 4, 0x8, 0xC, 0); - GS(1, 5, 0x9, 0xD, 2); - GS(2, 6, 0xA, 0xE, 4); - GS(3, 7, 0xB, 0xF, 6); + GS(0, 4, 0x8, 0xC, 0x0); + GS(1, 5, 0x9, 0xD, 0x2); + GS(2, 6, 0xA, 0xE, 0x4); + GS(3, 7, 0xB, 0xF, 0x6); /* diagonal step */ GS(0, 5, 0xA, 0xF, 0x8); GS(1, 6, 0xB, 0xC, 0xA); @@ -191,37 +211,23 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN if (thread < threads) { const uint32_t nounce = startNounce + thread; - uint32_t /* __align__(8) */ msg[16]; uint32_t h[8]; #pragma unroll for(int i=0; i<8; i++) h[i] = c_IV256[i]; - blake256_compress(h, c_PaddedMessage80, 
0x200, blakerounds); /* 512 = 0x200 */ + blake256_compress(h, c_data, 512, blakerounds); // ------ Close: Bytes 64 to 80 ------ - msg[0] = c_PaddedMessage80[16]; - msg[1] = c_PaddedMessage80[17]; - msg[2] = c_PaddedMessage80[18]; - msg[3] = nounce; /* our tested value */ - msg[4] = 0x80000000UL; //cuda_swab32(0x80U); - - msg[5] = 0; // uchar[17 to 55] - msg[6] = 0; - msg[7] = 0; - msg[8] = 0; - msg[9] = 0; - msg[10] = 0; - msg[11] = 0; - msg[12] = 0; - - msg[13] = 1; - msg[14] = 0; - msg[15] = 0x280; + uint32_t ending[4]; + ending[0] = c_data[16]; + ending[1] = c_data[17]; + ending[2] = c_data[18]; + ending[3] = nounce; /* our tested value */ - blake256_compress(h, msg, 0x280, blakerounds); + blake256_compress(h, ending, 640, blakerounds); for (int i = 7; i >= 0; i--) { uint32_t hash = cuda_swab32(h[i]); @@ -265,10 +271,9 @@ uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce __host__ void blake256_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget) { - uint32_t PaddedMessage[32]; - memcpy(PaddedMessage, pdata, 80); - memset(&PaddedMessage[20], 0, 48); - CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, sizeof(PaddedMessage), 0, cudaMemcpyHostToDevice)); + uint32_t data[20]; + memcpy(data, pdata, 80); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice)); CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sigma, host_sigma, sizeof(host_sigma), 0, cudaMemcpyHostToDevice)); CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Target, ptarget, 32, 0, cudaMemcpyHostToDevice)); } From 3356e6f8bfba816874d0a897ed9f6374bb069ac4 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Fri, 5 Sep 2014 20:00:07 +0200 Subject: [PATCH 30/44] blake: some more KH/s on linux --- blake32.cu | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/blake32.cu b/blake32.cu index a592b4d..a1403c8 100644 --- a/blake32.cu +++ b/blake32.cu @@ -315,12 +315,6 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t 
*pdata, const uint32_t *pt for (int k=0; k < 20; k++) be32enc(&endiandata[k], pdata[k]); - if (opt_debug && !opt_quiet) { - applog(LOG_DEBUG, "throughput=%u, start=%x, max=%x, pdata=%08x...%08x", - throughput, first_nonce, max_nonce, endiandata[0], endiandata[7]); - applog_hash((unsigned char *)pdata); - } - be32enc(&endiandata[19], foundNonce); blake256hash(vhashcpu, endiandata, blakerounds); @@ -348,11 +342,14 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt exit_scan: *hashes_done = pdata[19] - first_nonce + 1; - // reset the device to allow multiple instances +#if 0 + /* reset the device to allow multiple instances + * could be made in cpu-miner... check later if required */ if (opt_n_threads == 1) { CUDA_SAFE_CALL(cudaDeviceReset()); init[thr_id] = false; } +#endif // wait proper end of all threads cudaDeviceSynchronize(); return rc; From b98239ec2a75108d7c4e71ebff2af50629cbf9f2 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Fri, 5 Sep 2014 20:00:48 +0200 Subject: [PATCH 31/44] hashlog: enhance scan range store and debug dump --- cpu-miner.c | 3 +-- hashlog.cpp | 33 +++++++++++++++++++++------------ miner.h | 2 +- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/cpu-miner.c b/cpu-miner.c index e3b77e8..9110d74 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -508,8 +508,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work) goto out; } - hashlog_remember_submit(work->job_id, nonce); - hashlog_remember_scan_range(work->job_id, work->scanned_from, work->scanned_to); + hashlog_remember_submit(work->job_id, nonce, work->scanned_from); } else { diff --git a/hashlog.cpp b/hashlog.cpp index aad202e..9bcd04b 100644 --- a/hashlog.cpp +++ b/hashlog.cpp @@ -1,3 +1,11 @@ +/** + * Hash log of submitted job nonces + * Prevent duplicate shares and could be used for RPC stats later + * + * Note: this source is C++ (requires std::map) + * + * tpruvot@github 2014 + */ #include #include #include @@ -52,17 +60,17 @@ 
extern "C" uint32_t hashlog_already_submittted(char* jobid, uint32_t nonce) /** * Store submitted nonces of a job */ -extern "C" void hashlog_remember_submit(char* jobid, uint32_t nonce) +extern "C" void hashlog_remember_submit(char* jobid, uint32_t nonce, uint32_t scanned_from) { uint64_t njobid = hextouint(jobid); uint64_t keyall = (njobid << 32); uint64_t key = keyall + nonce; hashlog_data data; - data = tlastshares[keyall]; - data.tm_upd = data.tm_sent = (uint32_t) time(NULL); - if (data.tm_add == 0) - data.tm_add = data.tm_upd; + memset(&data, 0, sizeof(data)); + data.scanned_from = scanned_from; + data.scanned_to = nonce; + data.tm_add = data.tm_upd = data.tm_sent = (uint32_t) time(NULL); tlastshares[key] = data; } @@ -92,12 +100,12 @@ extern "C" void hashlog_remember_scan_range(char* jobid, uint32_t scanned_from, data.last_from = scanned_from; if (scanned_from < scanned_to) { + if (data.scanned_to == 0 || scanned_from == data.scanned_to + 1) + data.scanned_to = scanned_to; if (data.scanned_from == 0) data.scanned_from = scanned_from ? scanned_from : 1; // min 1 - else if (scanned_from < data.scanned_from) // || scanned_to == (data.scanned_from - 1) + else if (scanned_from < data.scanned_from || scanned_to == (data.scanned_from - 1)) data.scanned_from = scanned_from; - if (data.scanned_to == 0 || scanned_from == data.scanned_to + 1) - data.scanned_to = scanned_to; } data.tm_upd = (uint32_t) time(NULL); @@ -218,10 +226,11 @@ extern "C" void hashlog_dump_job(char* jobid) std::map::iterator i = tlastshares.begin(); while (i != tlastshares.end()) { if ((keypfx & i->first) == keypfx) { - applog(LOG_BLUE, "job %s range : %x %x %s added %x upd %x", jobid, - i->second.scanned_from, i->second.scanned_to, - i->second.tm_sent ? 
"sent" : "", - i->second.tm_add, i->second.tm_upd);/* */ + if (i->first != keypfx) + applog(LOG_DEBUG, CL_YLW "job %s, found %08x ", jobid, LO_DWORD(i->first)); + else + applog(LOG_DEBUG, CL_YLW "job %s scanned range : %08x-%08x", jobid, + i->second.scanned_from, i->second.scanned_to); } i++; } diff --git a/miner.h b/miner.h index 0e281da..d9951e9 100644 --- a/miner.h +++ b/miner.h @@ -392,7 +392,7 @@ bool stratum_subscribe(struct stratum_ctx *sctx); bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass); bool stratum_handle_method(struct stratum_ctx *sctx, const char *s); -void hashlog_remember_submit(char* jobid, uint32_t nounce); +void hashlog_remember_submit(char* jobid, uint32_t nounce, uint32_t scanned_from); void hashlog_remember_scan_range(char* jobid, uint32_t scanned_from, uint32_t scanned_to); uint32_t hashlog_already_submittted(char* jobid, uint32_t nounce); uint32_t hashlog_get_last_sent(char* jobid); From ecc86af102d0f8816811184b65566c9eeca32adf Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Fri, 5 Sep 2014 21:12:38 +0200 Subject: [PATCH 32/44] blake: sometimes faster, or not --- blake32.cu | 42 ++++++++++++++---------------------------- 1 file changed, 14 insertions(+), 28 deletions(-) diff --git a/blake32.cu b/blake32.cu index a1403c8..638cfbe 100644 --- a/blake32.cu +++ b/blake32.cu @@ -34,6 +34,8 @@ extern "C" void blake256hash(void *output, const void *input, int rounds = 14) #include "cuda_helper.h" +#define MAXU 0xffffffffU + // in cpu-miner.c extern bool opt_n_threads; extern bool opt_benchmark; @@ -47,22 +49,13 @@ static uint32_t __align__(32) c_Target[8]; __constant__ static uint32_t __align__(32) c_data[20]; -#define MAXU 0xffffffffU - static uint32_t *d_resNounce[8]; static uint32_t *h_resNounce[8]; -__constant__ -#ifdef WIN32 -/* what the fuck ! 
*/ -static uint8_t c_sigma[16][16]; -const uint8_t host_sigma[16][16] -#else /* prefer uint32_t to prevent size conversions = speed +5/10 % */ +__constant__ static uint32_t __align__(32) c_sigma[16][16]; -const uint32_t host_sigma[16][16] -#endif -= { +const uint32_t host_sigma[16][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, @@ -152,28 +145,19 @@ __device__ static void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, int blakerounds) { uint32_t /* __align__(8) */ m[16]; + uint32_t /* __align__(8) */ v[16]; m[0] = block[0]; m[1] = block[1]; m[2] = block[2]; m[3] = block[3]; - if (T0 == 0x200) { - //#pragma unroll 12 - for (int i = 4; i < 16; ++i) { - m[i] = block[i]; - } - } else { - //#pragma unroll 12 - for (int i = 4; i < 16; ++i) { - m[i] = c_Padding[i]; - } + for (uint32_t i = 4; i < 16; i++) { + m[i] = (T0 == 0x200) ? block[i] : c_Padding[i]; } - uint32_t /* __align__(8) */ v[16]; - //#pragma unroll 8 - for(int i = 0; i < 8; i++) + for(uint32_t i = 0; i < 8; i++) v[i] = h[i]; v[ 8] = c_u256[0]; @@ -200,8 +184,10 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, in } //#pragma unroll 16 - for(int i = 0; i < 16; i++) - h[i % 8] ^= v[i]; + for (uint32_t i = 0; i < 16; i++) { + uint32_t j = i % 8; + h[j] ^= v[i]; + } } __global__ @@ -306,13 +292,13 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt do { // GPU HASH uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], blakerounds); - if (foundNonce != 0xffffffff) + if (foundNonce != MAXU) { uint32_t endiandata[20]; uint32_t vhashcpu[8]; uint32_t Htarg = ptarget[7]; - for (int k=0; k < 20; k++) + for (int k=0; k < 19; k++) be32enc(&endiandata[k], pdata[k]); be32enc(&endiandata[19], foundNonce); From 52ec8830b1a9231b35bf44286a8f07d6ac0eb5b1 Mon Sep 17 00:00:00 2001 From: 
Tanguy Pruvot Date: Sat, 6 Sep 2014 01:21:30 +0200 Subject: [PATCH 33/44] blake: blakecoin variant now works --- README.txt | 1 + blake32.cu | 4 ++-- configure.ac | 2 +- cpu-miner.c | 5 +++-- sph/blake.c | 4 ++-- 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/README.txt b/README.txt index 15c7c72..ea4cd5b 100644 --- a/README.txt +++ b/README.txt @@ -62,6 +62,7 @@ its command line interface and options. quark use to mine Quarkcoin anime use to mine Animecoin blake use to mine NEOS (Blake 256) + blakecoin use to mine Old Blake 256 nist5 use to mine TalkCoin fresh use to mine Freshcoin whirl use to mine Whirlcoin diff --git a/blake32.cu b/blake32.cu index 638cfbe..2ce2acd 100644 --- a/blake32.cu +++ b/blake32.cu @@ -191,7 +191,7 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, in } __global__ -void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, int blakerounds) +void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, const int blakerounds) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -232,7 +232,7 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN } __host__ -uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, int blakerounds) +uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, const int blakerounds) { const int threadsperblock = TPB; uint32_t result = MAXU; diff --git a/configure.ac b/configure.ac index f7924d4..2a554f1 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2014.09.01]) +AC_INIT([ccminer], [2014.09.06]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 9110d74..b3a6ba7 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -789,7 +789,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) if (opt_algo == ALGO_HEAVY || opt_algo == 
ALGO_MJOLLNIR) heavycoin_hash(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); else - if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_WHC) + if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_WHC || opt_algo == ALGO_BLAKECOIN) SHA256((unsigned char*)sctx->job.coinbase, sctx->job.coinbase_size, (unsigned char*)merkle_root); else sha256d(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); @@ -964,6 +964,7 @@ static void *miner_thread(void *userdata) max64 = 0x1fffLL; break; case ALGO_BLAKECOIN: + max64 = 0x3ffffffLL; case ALGO_BLAKE: /* based on the 750Ti hashrate (100kH) */ max64 = 0x3ffffffLL; @@ -1373,7 +1374,7 @@ out: return NULL; } -#define PROGRAM_VERSION "1.4" +#define PROGRAM_VERSION "1.4.1" static void show_version_and_exit(void) { printf("%s v%s\n" diff --git a/sph/blake.c b/sph/blake.c index ea829f0..c89de5e 100644 --- a/sph/blake.c +++ b/sph/blake.c @@ -592,7 +592,6 @@ static const sph_u64 CB[16] = { M6 = sph_dec32be_aligned(buf + 24); \ M7 = sph_dec32be_aligned(buf + 28); \ M8 = sph_dec32be_aligned(buf + 32); \ - if (blake256_rounds == 14) { \ M9 = sph_dec32be_aligned(buf + 36); \ MA = sph_dec32be_aligned(buf + 40); \ MB = sph_dec32be_aligned(buf + 44); \ @@ -600,7 +599,6 @@ static const sph_u64 CB[16] = { MD = sph_dec32be_aligned(buf + 52); \ ME = sph_dec32be_aligned(buf + 56); \ MF = sph_dec32be_aligned(buf + 60); \ - } \ ROUND_S(0); \ ROUND_S(1); \ ROUND_S(2); \ @@ -609,12 +607,14 @@ static const sph_u64 CB[16] = { ROUND_S(5); \ ROUND_S(6); \ ROUND_S(7); \ + if (blake256_rounds == 14) { \ ROUND_S(8); \ ROUND_S(9); \ ROUND_S(0); \ ROUND_S(1); \ ROUND_S(2); \ ROUND_S(3); \ + } \ H0 ^= S0 ^ V0 ^ V8; \ H1 ^= S1 ^ V1 ^ V9; \ H2 ^= S2 ^ V2 ^ VA; \ From 65909ec3b778fdba97c97146ff8900795b972526 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 6 Sep 2014 10:55:44 +0200 Subject: [PATCH 34/44] blake: handle case when 2 hashes are found in a call --- blake32.cu | 50 
+++++++++++++++++++++++++++++++++++++---------- cpu-miner.c | 2 +- cpuminer-config.h | 6 +++--- util.c | 6 ++++-- 4 files changed, 48 insertions(+), 16 deletions(-) diff --git a/blake32.cu b/blake32.cu index 2ce2acd..5013de7 100644 --- a/blake32.cu +++ b/blake32.cu @@ -52,6 +52,8 @@ static uint32_t __align__(32) c_data[20]; static uint32_t *d_resNounce[8]; static uint32_t *h_resNounce[8]; +static uint32_t extra_results[2] = { MAXU, MAXU }; + /* prefer uint32_t to prevent size conversions = speed +5/10 % */ __constant__ static uint32_t __align__(32) c_sigma[16][16]; @@ -225,9 +227,13 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN } } - /* keep the smallest nounce, hmm... */ - if(resNounce[0] > nounce) + /* keep the smallest nounce, + extra one if found */ + if (resNounce[0] > nounce) { + resNounce[1] = resNounce[0]; resNounce[0] = nounce; + } + else + resNounce[1] = nounce; } } @@ -242,14 +248,15 @@ uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce size_t shared_size = 0; /* Check error on Ctrl+C or kill to prevent segfaults on exit */ - if (cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t)) != cudaSuccess) + if (cudaMemset(d_resNounce[thr_id], 0xff, 2*sizeof(uint32_t)) != cudaSuccess) return result; blake256_gpu_hash_80<<>>(threads, startNounce, d_resNounce[thr_id], blakerounds); cudaDeviceSynchronize(); - if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost)) { + if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { cudaThreadSynchronize(); - result = *h_resNounce[thr_id]; + result = h_resNounce[thr_id][0]; + extra_results[0] = h_resNounce[thr_id][1]; } return result; } @@ -269,9 +276,20 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt { const uint32_t first_nonce = pdata[19]; static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; - 
uint32_t throughput = min(TPB * 2048, max_nonce - first_nonce); + uint32_t throughput = min(TPB * 4096, max_nonce - first_nonce); int rc = 0; + if (extra_results[0] != MAXU) { + // possible extra result found in previous call + if (first_nonce <= extra_results[0] && max_nonce >= extra_results[0]) { + pdata[19] = extra_results[0]; + *hashes_done = pdata[19] - first_nonce + 1; + extra_results[0] = MAXU; + rc = 1; + goto exit_scan; + } + } + if (opt_benchmark) ((uint32_t*)ptarget)[7] = 0x00000f; @@ -279,13 +297,13 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt if (opt_n_threads > 1) { CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); } - CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], sizeof(uint32_t))); - CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], 2*sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], 2*sizeof(uint32_t))); init[thr_id] = true; } - if (throughput < (TPB * 2048)) - applog(LOG_WARNING, "throughput=%u, start=%x, max=%x", throughput, first_nonce, max_nonce); + if (opt_debug && throughput < (TPB * 4096)) + applog(LOG_DEBUG, "throughput=%u, start=%x, max=%x", throughput, first_nonce, max_nonce); blake256_cpu_setBlock_80(pdata, ptarget); @@ -309,6 +327,18 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt { pdata[19] = foundNonce; rc = 1; + + if (extra_results[0] != MAXU) { + // Rare but possible if the throughput is big + be32enc(&endiandata[19], extra_results[0]); + blake256hash(vhashcpu, endiandata, blakerounds); + if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) { + applog(LOG_NOTICE, "GPU found more than one result yippee!"); + } else { + extra_results[0] = MAXU; + } + } + goto exit_scan; } else if (vhashcpu[7] > Htarg) { diff --git a/cpu-miner.c b/cpu-miner.c index b3a6ba7..7f70a6e 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -1005,7 +1005,7 @@ static void *miner_thread(void 
*userdata) work_restart[thr_id].restart = 1; hashlog_purge_old(); // wait a bit for a new job... - usleep(1500*1000); + sleep(1); (*nonceptr) = end_nonce + 1; work_done = true; continue; diff --git a/cpuminer-config.h b/cpuminer-config.h index 0fafa85..11edf82 100644 --- a/cpuminer-config.h +++ b/cpuminer-config.h @@ -156,7 +156,7 @@ #define PACKAGE_NAME "ccminer" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "ccminer 2014.09.01" +#define PACKAGE_STRING "ccminer 2014.09.06" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "ccminer" @@ -165,7 +165,7 @@ #define PACKAGE_URL "" /* Define to the version of this package. */ -#define PACKAGE_VERSION "2014.09.01" +#define PACKAGE_VERSION "2014.09.06" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be @@ -188,7 +188,7 @@ #define USE_XOP 1 /* Version number of package */ -#define VERSION "2014.09.01" +#define VERSION "2014.09.06" /* Define curl_free() as free() if our version of curl lacks curl_free. 
*/ /* #undef curl_free */ diff --git a/util.c b/util.c index fe9168b..b2c0b0f 100644 --- a/util.c +++ b/util.c @@ -557,6 +557,9 @@ bool fulltest(const uint32_t *hash, const uint32_t *target) rc = true; break; } + if (hash[0] == target[0]) { + applog(LOG_NOTICE, "We found an exact match!"); + } } if (!rc && opt_debug) { @@ -1122,8 +1125,7 @@ static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params) sctx->next_diff = diff; pthread_mutex_unlock(&sctx->work_lock); - if (opt_debug) - applog(LOG_DEBUG, "Stratum difficulty set to %g", diff); + applog(LOG_INFO, "Stratum difficulty set to %g", diff); return true; } From 5ccd1669161e174211f916ffd2e17d2117fa6d1c Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 6 Sep 2014 16:26:53 +0200 Subject: [PATCH 35/44] blake: introduce pdata head cache (speed x2) --- blake32.cu | 103 ++++++++++++++++++++++++++++++++++++++++++++++++---- cpu-miner.c | 10 ++--- util.c | 4 +- 3 files changed, 102 insertions(+), 15 deletions(-) diff --git a/blake32.cu b/blake32.cu index 5013de7..a0f502b 100644 --- a/blake32.cu +++ b/blake32.cu @@ -41,7 +41,7 @@ extern bool opt_n_threads; extern bool opt_benchmark; extern int device_map[8]; -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); +uint32_t crc32(const uint32_t *buf, size_t size); __constant__ static uint32_t __align__(32) c_Target[8]; @@ -51,9 +51,16 @@ static uint32_t __align__(32) c_data[20]; static uint32_t *d_resNounce[8]; static uint32_t *h_resNounce[8]; - static uint32_t extra_results[2] = { MAXU, MAXU }; +#define USE_CACHE 1 +#if USE_CACHE +__device__ +static uint32_t cache[8]; +__device__ +static uint32_t prevsum = 0; +#endif + /* prefer uint32_t to prevent size conversions = speed +5/10 % */ __constant__ static uint32_t __align__(32) c_sigma[16][16]; @@ -193,7 +200,7 @@ void blake256_compress(uint32_t *h, const uint32_t *block, const uint32_t T0, in } __global__ -void blake256_gpu_hash_80(uint32_t threads, uint32_t 
startNounce, uint32_t *resNounce, const int blakerounds) +void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce, const int blakerounds, const int crcsum) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -202,11 +209,27 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN uint32_t h[8]; #pragma unroll - for(int i=0; i<8; i++) + for(int i=0; i<8; i++) { h[i] = c_IV256[i]; + } +#if !USE_CACHE blake256_compress(h, c_data, 512, blakerounds); - +#else + if (crcsum != prevsum) { + prevsum = crcsum; + blake256_compress(h, c_data, 512, blakerounds); + #pragma unroll + for(int i=0; i<8; i++) { + cache[i] = h[i]; + } + } else { + #pragma unroll + for(int i=0; i<8; i++) { + h[i] = cache[i]; + } + } +#endif // ------ Close: Bytes 64 to 80 ------ uint32_t ending[4]; @@ -238,7 +261,7 @@ void blake256_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resN } __host__ -uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, const int blakerounds) +uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, const int blakerounds, const uint32_t crcsum) { const int threadsperblock = TPB; uint32_t result = MAXU; @@ -251,7 +274,7 @@ uint32_t blake256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce if (cudaMemset(d_resNounce[thr_id], 0xff, 2*sizeof(uint32_t)) != cudaSuccess) return result; - blake256_gpu_hash_80<<>>(threads, startNounce, d_resNounce[thr_id], blakerounds); + blake256_gpu_hash_80<<>>(threads, startNounce, d_resNounce[thr_id], blakerounds, crcsum); cudaDeviceSynchronize(); if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { cudaThreadSynchronize(); @@ -277,6 +300,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt const uint32_t first_nonce = pdata[19]; static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; 
uint32_t throughput = min(TPB * 4096, max_nonce - first_nonce); + uint32_t crcsum = MAXU; int rc = 0; if (extra_results[0] != MAXU) { @@ -306,10 +330,13 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt applog(LOG_DEBUG, "throughput=%u, start=%x, max=%x", throughput, first_nonce, max_nonce); blake256_cpu_setBlock_80(pdata, ptarget); +#if USE_CACHE + crcsum = crc32(pdata, 64); +#endif do { // GPU HASH - uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], blakerounds); + uint32_t foundNonce = blake256_cpu_hash_80(thr_id, throughput, pdata[19], blakerounds, crcsum); if (foundNonce != MAXU) { uint32_t endiandata[20]; @@ -370,3 +397,63 @@ exit_scan: cudaDeviceSynchronize(); return rc; } + +static uint32_t crc32_tab[] = { + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, + 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, + 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2, + 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, + 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, + 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, + 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c, + 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, + 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, + 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, + 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106, + 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, + 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, + 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, + 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, + 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, + 0x4db26158, 0x3ab551ce, 
0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, + 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, + 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, + 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, + 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, + 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84, + 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, + 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, + 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, + 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e, + 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, + 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, + 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, + 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28, + 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, + 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, + 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, + 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, + 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, + 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, + 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, + 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, + 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, + 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, + 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d +}; + +uint32_t crc32(const uint32_t *buf, size_t 
size) +{ + const uint8_t *p; + uint32_t crc = 0; + + p = (uint8_t *) buf; + crc = crc ^ ~0U; + + while (size--) + crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8); + + return crc ^ ~0U; +} \ No newline at end of file diff --git a/cpu-miner.c b/cpu-miner.c index 7f70a6e..6cf40d5 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -582,8 +582,8 @@ static bool get_upstream_work(CURL *curl, struct work *work) if (opt_debug && rc) { timeval_subtract(&diff, &tv_end, &tv_start); - applog(LOG_DEBUG, "DEBUG: got new work in %d ms", - diff.tv_sec * 1000 + diff.tv_usec / 1000); + applog(LOG_DEBUG, "DEBUG: got new work in %u µs", + diff.tv_sec * 1000000 + diff.tv_usec); } json_decref(val); @@ -1345,12 +1345,12 @@ static void *stratum_thread(void *userdata) pthread_mutex_unlock(&g_work_lock); if (stratum.job.clean) { if (!opt_quiet) - applog(LOG_BLUE, "%s requested %s job %d restart, block %d", short_url, algo_names[opt_algo], - strtoul(stratum.job.job_id, NULL, 16), stratum.bloc_height); + applog(LOG_BLUE, "%s send a new %s block %d", short_url, algo_names[opt_algo], + stratum.bloc_height); restart_threads(); hashlog_purge_old(); } else if (!opt_quiet) { - applog(LOG_BLUE, "%s send %s job %d, block %d", short_url, algo_names[opt_algo], + applog(LOG_BLUE, "%s send job %d for block %d", short_url, strtoul(stratum.job.job_id, NULL, 16), stratum.bloc_height); } } diff --git a/util.c b/util.c index b2c0b0f..dfe98ab 100644 --- a/util.c +++ b/util.c @@ -557,8 +557,8 @@ bool fulltest(const uint32_t *hash, const uint32_t *target) rc = true; break; } - if (hash[0] == target[0]) { - applog(LOG_NOTICE, "We found an exact match!"); + if (hash[1] == target[1]) { + applog(LOG_NOTICE, "We found a close match!"); } } From 11b04d82ffd7940e8ec2f5324cfcff57b88bbdf3 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 6 Sep 2014 17:16:07 +0200 Subject: [PATCH 36/44] update readme, tag v1.4.1 --- README.md | 2 +- README.txt | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git 
a/README.md b/README.md index d3836eb..4ca9fab 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ ccminer Christian Buchner's & Christian H.'s CUDA miner project -Fork by tpruvot@github with X14,X15,X17,WHIRL and Blake256 support +Fork by tpruvot@github with X14,X15,X17,WHIRL and Blake256 support (NEOS + BlakeCoin) BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo [![tip for next commit](https://tip4commit.com/projects/927.svg)](https://tip4commit.com/github/tpruvot/ccminer) diff --git a/README.txt b/README.txt index ea4cd5b..36a3e00 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,5 @@ -ccMiner release 1.4-tpruvot (Sept 04th 2014) - "X17 Blake NEOS" +ccMiner release 1.4.1-tpruvot (Sep 06th 2014) - "Cached Blake" --------------------------------------------------------------- *************************************************************** @@ -34,6 +34,7 @@ QuarkCoin family & AnimeCoin TalkCoin DarkCoin and other X11 coins NEOS blake (256 14-rounds) +BlakeCoin (256 8-rounds) where some of these coins have a VERY NOTABLE nVidia advantage over competing AMD (OpenCL) implementations. 
From 42eafcbe85bb4594c9d0d8dacc90aed6312a4c87 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 6 Sep 2014 19:22:35 +0200 Subject: [PATCH 37/44] Put CRC-32 function in a new unit Signed-off-by: Tanguy Pruvot --- Makefile.am | 2 +- blake32.cu | 67 ++-------------------- ccminer.vcxproj | 1 + ccminer.vcxproj.filters | 3 + crc32.c | 119 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 128 insertions(+), 64 deletions(-) create mode 100644 crc32.c diff --git a/Makefile.am b/Makefile.am index 875f8b1..bb60063 100644 --- a/Makefile.am +++ b/Makefile.am @@ -16,7 +16,7 @@ bin_PROGRAMS = ccminer ccminer_SOURCES = elist.h miner.h compat.h \ compat/inttypes.h compat/stdbool.h compat/unistd.h \ compat/sys/time.h compat/getopt/getopt.h \ - cpu-miner.c util.c hefty1.c scrypt.c \ + cpu-miner.c util.c crc32.c hefty1.c scrypt.c \ hashlog.cpp \ heavy/heavy.cu \ heavy/cuda_blake512.cu heavy/cuda_blake512.h \ diff --git a/blake32.cu b/blake32.cu index a0f502b..f5c0f6b 100644 --- a/blake32.cu +++ b/blake32.cu @@ -15,6 +15,9 @@ extern "C" { /* threads per block */ #define TPB 128 +/* crc32.c */ +extern "C" uint32_t crc32_u32t(const uint32_t *buf, size_t size); + extern "C" int blake256_rounds = 14; /* hash by cpu with blake 256 */ @@ -41,8 +44,6 @@ extern bool opt_n_threads; extern bool opt_benchmark; extern int device_map[8]; -uint32_t crc32(const uint32_t *buf, size_t size); - __constant__ static uint32_t __align__(32) c_Target[8]; @@ -331,7 +332,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt blake256_cpu_setBlock_80(pdata, ptarget); #if USE_CACHE - crcsum = crc32(pdata, 64); + crcsum = crc32_u32t(pdata, 64); #endif do { @@ -397,63 +398,3 @@ exit_scan: cudaDeviceSynchronize(); return rc; } - -static uint32_t crc32_tab[] = { - 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, - 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, - 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 
0x6ab020f2, - 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, - 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, - 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, - 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c, - 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, - 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, - 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, - 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106, - 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, - 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, - 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, - 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, - 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, - 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, - 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, - 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, - 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, - 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, - 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, - 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84, - 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, - 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, - 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, - 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e, - 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, - 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, - 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 
0x5268e236, - 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28, - 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, - 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, - 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, - 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, - 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, - 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, - 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, - 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, - 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, - 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, - 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, - 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d -}; - -uint32_t crc32(const uint32_t *buf, size_t size) -{ - const uint8_t *p; - uint32_t crc = 0; - - p = (uint8_t *) buf; - crc = crc ^ ~0U; - - while (size--) - crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8); - - return crc ^ ~0U; -} \ No newline at end of file diff --git a/ccminer.vcxproj b/ccminer.vcxproj index cb633ad..0ab1d60 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -241,6 +241,7 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)" false Full + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 93e331c..bc990e0 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -96,6 +96,9 @@ Source Files + + Source Files + Source Files diff --git a/crc32.c b/crc32.c new file mode 100644 index 0000000..f036bcb --- /dev/null +++ b/crc32.c @@ -0,0 +1,119 @@ +/*- + * COPYRIGHT (C) 1986 Gary S. Brown. You may use this program, or + * code or tables extracted from it, as desired without restriction. + * + * First, the polynomial itself and its table of feedback terms. 
The + * polynomial is + * X^32+X^26+X^23+X^22+X^16+X^12+X^11+X^10+X^8+X^7+X^5+X^4+X^2+X^1+X^0 + * + * Note that we take it "backwards" and put the highest-order term in + * the lowest-order bit. The X^32 term is "implied"; the LSB is the + * X^31 term, etc. The X^0 term (usually shown as "+1") results in + * the MSB being 1 + * + * Note that the usual hardware shift register implementation, which + * is what we're using (we're merely optimizing it by doing eight-bit + * chunks at a time) shifts bits into the lowest-order term. In our + * implementation, that means shifting towards the right. Why do we + * do it this way? Because the calculated CRC must be transmitted in + * order from highest-order term to lowest-order term. UARTs transmit + * characters in order from LSB to MSB. By storing the CRC this way + * we hand it to the UART in the order low-byte to high-byte; the UART + * sends each low-bit to hight-bit; and the result is transmission bit + * by bit from highest- to lowest-order term without requiring any bit + * shuffling on our part. Reception works similarly + * + * The feedback terms table consists of 256, 32-bit entries. Notes + * + * The table can be generated at runtime if desired; code to do so + * is shown later. It might not be obvious, but the feedback + * terms simply represent the results of eight shift/xor opera + * tions for all combinations of data and CRC register values + * + * The values must be right-shifted by eight bits by the "updcrc + * logic; the shift must be unsigned (bring in zeroes). On some + * hardware you could probably optimize the shift in assembler by + * using byte-swap instructions + * polynomial $edb88320 + * + * + * CRC32 code derived from work by Gary S. Brown. 
+ */ + +#include +#include + +static uint32_t crc32_tab[] = { + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, + 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, + 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2, + 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, + 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, + 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, + 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c, + 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, + 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, + 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, + 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106, + 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, + 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, + 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, + 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950, + 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, + 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, + 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, + 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, + 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, + 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, + 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84, + 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, + 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, + 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, + 0xf9b9df6f, 
0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e, + 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, + 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, + 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, + 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28, + 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, + 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, + 0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, + 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242, + 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, + 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, + 0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, + 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, + 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, + 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, + 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d +}; + +/* Real CRC32 Function */ +extern uint32_t crc32(uint32_t crc, const void *buf, size_t size) +{ + const uint8_t *p; + + p = buf; + crc = crc ^ ~0U; + + while (size--) + crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8); + + return crc ^ ~0U; +} + +/* CRC32 Function simplified for ccminer */ +extern uint32_t crc32_u32t(const uint32_t *buf, size_t size) +{ + const uint8_t *p; + uint32_t crc = 0; + + p = (uint8_t *) buf; + crc = crc ^ ~0U; + + while (size--) + crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8); + + return crc ^ ~0U; +} From 95ac1d0f194a36695f60fe2da627a38baa21f38d Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 6 Sep 2014 20:54:41 +0200 Subject: [PATCH 38/44] x11: adapt some blake 256 opts to 512 one blake512: for the moment 6.2ms vs 7.12 
before (+10%) --- cuda_nist5.cu | 9 +- quark/cuda_quark_blake512.cu | 156 ++++++++++++++++------------------- quark/quarkcoin.cu | 31 ++----- x11/x11.cu | 18 ++-- x15/x14.cu | 4 +- x15/x15.cu | 31 +------ x17/x17.cu | 18 +--- 7 files changed, 98 insertions(+), 169 deletions(-) diff --git a/cuda_nist5.cu b/cuda_nist5.cu index 2feb32e..419c1a5 100644 --- a/cuda_nist5.cu +++ b/cuda_nist5.cu @@ -5,9 +5,11 @@ extern "C" #include "sph/sph_skein.h" #include "sph/sph_jh.h" #include "sph/sph_keccak.h" +} + #include "miner.h" + #include "cuda_helper.h" -} // aus cpu-miner.c extern int device_map[8]; @@ -74,9 +76,6 @@ extern "C" void nist5hash(void *state, const void *input) memcpy(state, hash, 32); } - -extern bool opt_benchmark; - extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) @@ -84,7 +83,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata, const uint32_t first_nonce = pdata[19]; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; + ((uint32_t*)ptarget)[7] = 0x00FF; const uint32_t Htarg = ptarget[7]; diff --git a/quark/cuda_quark_blake512.cu b/quark/cuda_quark_blake512.cu index e3d299d..787b8a0 100644 --- a/quark/cuda_quark_blake512.cu +++ b/quark/cuda_quark_blake512.cu @@ -50,59 +50,60 @@ const uint64_t c_u512[16] = 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL }; -#define G(a,b,c,d,e) \ - v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\ - v[d] = ROTR( v[d] ^ v[a],32); \ - v[c] += v[d]; \ - v[b] = ROTR( v[b] ^ v[c],25); \ - v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b]; \ - v[d] = ROTR( v[d] ^ v[a],16); \ - v[c] += v[d]; \ - v[b] = ROTR( v[b] ^ v[c],11); - +#define G(a,b,c,d,x) { \ + uint32_t idx1 = sigma[i][x]; \ + uint32_t idx2 = sigma[i][x+1]; \ + v[a] += (m[idx1] ^ u512[idx2]) + v[b]; \ + v[d] = ROTR( v[d] ^ v[a], 32); \ + v[c] += v[d]; \ + v[b] = ROTR( v[b] ^ v[c], 25); \ + v[a] += (m[idx2] ^ u512[idx1]) + v[b]; \ + v[d] = ROTR( v[d] ^ v[a], 16); \ + v[c] 
+= v[d]; \ + v[b] = ROTR( v[b] ^ v[c], 11); \ +} __device__ static -void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits ) +void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int T0) { uint64_t v[16], m[16], i; -#pragma unroll 16 - for( i = 0; i < 16; ++i ) { - m[i] = cuda_swab64(block[i]); - } - -#pragma unroll 8 - for( i = 0; i < 8; ++i ) v[i] = h[i]; - - v[ 8] = u512[0]; - v[ 9] = u512[1]; - v[10] = u512[2]; - v[11] = u512[3]; - v[12] = u512[4]; - v[13] = u512[5]; - v[14] = u512[6]; - v[15] = u512[7]; - - v[12] ^= bits; - v[13] ^= bits; - -//#pragma unroll 16 - for( i = 0; i < 16; ++i ) - { - /* column step */ - G( 0, 4, 8, 12, 0 ); - G( 1, 5, 9, 13, 2 ); - G( 2, 6, 10, 14, 4 ); - G( 3, 7, 11, 15, 6 ); - /* diagonal step */ - G( 0, 5, 10, 15, 8 ); - G( 1, 6, 11, 12, 10 ); - G( 2, 7, 8, 13, 12 ); - G( 3, 4, 9, 14, 14 ); - } - -#pragma unroll 16 - for( i = 0; i < 16; ++i ) h[i % 8] ^= v[i]; + #pragma unroll 16 + for( i = 0; i < 16; i++) { + m[i] = cuda_swab64(block[i]); + } + + #pragma unroll 8 + for (i = 0; i < 8; i++) + v[i] = h[i]; + + v[ 8] = u512[0]; + v[ 9] = u512[1]; + v[10] = u512[2]; + v[11] = u512[3]; + v[12] = u512[4] ^ T0; + v[13] = u512[5] ^ T0; + v[14] = u512[6]; + v[15] = u512[7]; + + //#pragma unroll 16 + for( i = 0; i < 16; ++i ) + { + /* column step */ + G( 0, 4, 8, 12, 0 ); + G( 1, 5, 9, 13, 2 ); + G( 2, 6, 10, 14, 4 ); + G( 3, 7, 11, 15, 6 ); + /* diagonal step */ + G( 0, 5, 10, 15, 8 ); + G( 1, 6, 11, 12, 10 ); + G( 2, 7, 8, 13, 12 ); + G( 3, 4, 9, 14, 14 ); + } + + #pragma unroll 16 + for( i = 0; i < 16; ++i ) + h[i % 8] ^= v[i]; } __device__ __constant__ @@ -114,7 +115,8 @@ static const uint64_t d_constMem[8] = { 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, 0x1f83d9abfb41bd6bULL, - 0x5be0cd19137e2179ULL }; + 0x5be0cd19137e2179ULL +}; // Hash-Padding __device__ __constant__ @@ -126,7 +128,8 
@@ static const uint64_t d_constHashPadding[8] = { 0, 0x0100000000000000ull, 0, - 0x0002000000000000ull }; + 0x0002000000000000ull +}; __global__ __launch_bounds__(256, 4) void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash) @@ -145,48 +148,42 @@ void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_n if (thread < threads) #endif { - uint8_t i; - // bestimme den aktuellen Zähler uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); int hashPosition = nounce - startNounce; uint64_t *inpHash = &g_hash[hashPosition<<3]; // hashPosition * 8 - // 128 Byte für die Message + // 128 Bytes uint64_t buf[16]; - // State vorbereiten + // State uint64_t h[8]; #pragma unroll 8 - for (i=0;i<8;i++) + for (int i=0;i<8;i++) h[i] = d_constMem[i]; - // Message für die erste Runde in Register holen + // Message for first round #pragma unroll 8 - for (i=0; i < 8; ++i) + for (int i=0; i < 8; ++i) buf[i] = inpHash[i]; #pragma unroll 8 - for (i=0; i < 8; i++) + for (int i=0; i < 8; i++) buf[i+8] = d_constHashPadding[i]; - // die einzige Hashing-Runde + // Ending round quark_blake512_compress( h, buf, c_sigma, c_u512, 512 ); -#if __CUDA_ARCH__ >= 130 - // ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind +#if __CUDA_ARCH__ <= 350 uint32_t *outHash = (uint32_t*)&g_hash[8 * hashPosition]; #pragma unroll 8 - for (i=0; i < 8; ++i) { + for (int i=0; i < 8; i++) { outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) ); outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) ); } #else - // in dieser Version passieren auch ein paar 64 Bit Shifts uint64_t *outHash = &g_hash[8 * hashPosition]; - #pragma unroll 8 - for (i=0; i < 8; ++i) - { + for (int i=0; i < 8; i++) { outHash[i] = cuda_swab64(h[i]); } #endif @@ -198,45 +195,38 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo int thread = (blockDim.x * blockIdx.x + threadIdx.x); if 
(thread < threads) { - // State vorbereiten uint64_t h[8]; - // 128 Byte für die Message uint64_t buf[16]; - uint8_t i; - // bestimme den aktuellen Zähler uint32_t nounce = startNounce + thread; #pragma unroll 8 - for(i=0;i<8;i++) + for(int i=0; i<8; i++) h[i] = d_constMem[i]; // Message für die erste Runde in Register holen #pragma unroll 16 - for (i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i]; + for (int i=0; i < 16; ++i) + buf[i] = c_PaddedMessage80[i]; - // die Nounce durch die thread-spezifische ersetzen - buf[9] = REPLACE_HIWORD(buf[9], cuda_swab32(nounce)); + // The test Nonce + ((uint32_t*)buf)[19] = cuda_swab32(nounce); - // die einzige Hashing-Runde quark_blake512_compress( h, buf, c_sigma, c_u512, 640 ); - // Hash rauslassen -#if __CUDA_ARCH__ >= 130 - // ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind +#if __CUDA_ARCH__ <= 350 uint32_t *outHash = (uint32_t *)outputHash + 16 * thread; #pragma unroll 8 - for (i=0; i < 8; ++i) { - outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) ); + for (uint32_t i=0; i < 8; i++) { + outHash[2*i] = cuda_swab32( _HIWORD(h[i]) ); outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) ); } #else - // in dieser Version passieren auch ein paar 64 Bit Shifts uint64_t *outHash = (uint64_t *)outputHash + 8 * thread; - #pragma unroll 8 - for (i=0; i < 8; ++i) { + for (uint32_t i=0; i < 8; i++) { outHash[i] = cuda_swab64( h[i] ); } #endif + } } diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu index be6eda8..a905ec4 100644 --- a/quark/quarkcoin.cu +++ b/quark/quarkcoin.cu @@ -6,12 +6,12 @@ extern "C" #include "sph/sph_skein.h" #include "sph/sph_jh.h" #include "sph/sph_keccak.h" +} + #include "miner.h" #include "cuda_helper.h" -} -// aus cpu-miner.c extern int device_map[8]; // Speicher für Input/Output der verketteten Hashfunktionen @@ -70,76 +70,64 @@ extern "C" void quarkhash(void *state, const void *input) unsigned char hash[64]; sph_blake512_init(&ctx_blake); - // ZBLAKE; sph_blake512 (&ctx_blake, 
input, 80); sph_blake512_close(&ctx_blake, (void*) hash); sph_bmw512_init(&ctx_bmw); - // ZBMW; sph_bmw512 (&ctx_bmw, (const void*) hash, 64); sph_bmw512_close(&ctx_bmw, (void*) hash); if (hash[0] & 0x8) { sph_groestl512_init(&ctx_groestl); - // ZGROESTL; sph_groestl512 (&ctx_groestl, (const void*) hash, 64); sph_groestl512_close(&ctx_groestl, (void*) hash); } else { sph_skein512_init(&ctx_skein); - // ZSKEIN; sph_skein512 (&ctx_skein, (const void*) hash, 64); sph_skein512_close(&ctx_skein, (void*) hash); } sph_groestl512_init(&ctx_groestl); - // ZGROESTL; sph_groestl512 (&ctx_groestl, (const void*) hash, 64); sph_groestl512_close(&ctx_groestl, (void*) hash); sph_jh512_init(&ctx_jh); - // ZJH; sph_jh512 (&ctx_jh, (const void*) hash, 64); sph_jh512_close(&ctx_jh, (void*) hash); if (hash[0] & 0x8) { sph_blake512_init(&ctx_blake); - // ZBLAKE; sph_blake512 (&ctx_blake, (const void*) hash, 64); sph_blake512_close(&ctx_blake, (void*) hash); } else { sph_bmw512_init(&ctx_bmw); - // ZBMW; sph_bmw512 (&ctx_bmw, (const void*) hash, 64); sph_bmw512_close(&ctx_bmw, (void*) hash); } sph_keccak512_init(&ctx_keccak); - // ZKECCAK; sph_keccak512 (&ctx_keccak, (const void*) hash, 64); sph_keccak512_close(&ctx_keccak, (void*) hash); sph_skein512_init(&ctx_skein); - // SKEIN; sph_skein512 (&ctx_skein, (const void*) hash, 64); sph_skein512_close(&ctx_skein, (void*) hash); if (hash[0] & 0x8) { sph_keccak512_init(&ctx_keccak); - // ZKECCAK; sph_keccak512 (&ctx_keccak, (const void*) hash, 64); sph_keccak512_close(&ctx_keccak, (void*) hash); } else { sph_jh512_init(&ctx_jh); - // ZJH; sph_jh512 (&ctx_jh, (const void*) hash, 64); sph_jh512_close(&ctx_jh, (void*) hash); } @@ -147,23 +135,17 @@ extern "C" void quarkhash(void *state, const void *input) memcpy(state, hash, 32); } - -extern bool opt_benchmark; - extern "C" int scanhash_quark(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) { const uint32_t first_nonce = pdata[19]; + const 
int throughput = 256*4096; // 100; + static bool init[8] = {0,0,0,0,0,0,0,0}; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; - - const uint32_t Htarg = ptarget[7]; + ((uint32_t*)ptarget)[7] = 0x00FF; - const int throughput = 256*4096; // 100; - - static bool init[8] = {0,0,0,0,0,0,0,0}; if (!init[thr_id]) { cudaSetDevice(device_map[thr_id]); @@ -252,11 +234,12 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata, uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); if (foundNonce != 0xffffffff) { + const uint32_t Htarg = ptarget[7]; uint32_t vhash64[8]; be32enc(&endiandata[19], foundNonce); quarkhash(vhash64, endiandata); - if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) { + if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { pdata[19] = foundNonce; *hashes_done = (foundNonce - first_nonce + 1)/2; diff --git a/x11/x11.cu b/x11/x11.cu index 3c18030..dc2f97f 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -21,10 +21,9 @@ extern "C" #include } -// aus cpu-miner.c +// in cpu-miner.c extern int device_map[8]; -// Speicher für Input/Output der verketteten Hashfunktionen static uint32_t *d_hash[8]; extern void quark_blake512_cpu_init(int thr_id, int threads); @@ -140,22 +139,17 @@ extern "C" void x11hash(void *output, const void *input) } -extern bool opt_benchmark; - extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) { const uint32_t first_nonce = pdata[19]; + const int throughput = 256*256*8; + static bool init[8] = {0,0,0,0,0,0,0,0}; if (opt_benchmark) ((uint32_t*)ptarget)[7] = 0x0000ff; - const uint32_t Htarg = ptarget[7]; - - const int throughput = 256*256*8; - - static bool init[8] = {0,0,0,0,0,0,0,0}; if (!init[thr_id]) { CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); @@ -186,8 +180,10 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, cuda_check_cpu_setTarget(ptarget); do { - 
uint32_t foundNonce; + const uint32_t Htarg = ptarget[7]; + int order = 0; + uint32_t foundNonce; // Hash with CUDA quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); @@ -204,7 +200,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, // Scan nach Gewinner Hashes auf der GPU foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - if (foundNonce != 0xffffffff) + if (foundNonce != 0xffffffff) { uint32_t vhash64[8]; be32enc(&endiandata[19], foundNonce); diff --git a/x15/x14.cu b/x15/x14.cu index 0b56584..b3519cd 100644 --- a/x15/x14.cu +++ b/x15/x14.cu @@ -20,11 +20,11 @@ extern "C" { #include "sph/sph_hamsi.h" #include "sph/sph_fugue.h" #include "sph/sph_shabal.h" +} #include "miner.h" #include "cuda_helper.h" -} // from cpu-miner.c extern int device_map[8]; @@ -167,8 +167,6 @@ extern "C" void x14hash(void *output, const void *input) } -extern bool opt_benchmark; - extern "C" int scanhash_x14(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) diff --git a/x15/x15.cu b/x15/x15.cu index 50e2080..faea354 100644 --- a/x15/x15.cu +++ b/x15/x15.cu @@ -21,14 +21,11 @@ extern "C" { #include "sph/sph_fugue.h" #include "sph/sph_shabal.h" #include "sph/sph_whirlpool.h" +} #include "miner.h" #include "cuda_helper.h" -} - -// to test gpu hash on a null buffer -#define NULLTEST 0 // from cpu-miner.c extern int device_map[8]; @@ -92,8 +89,6 @@ extern void quark_compactTest_cpu_init(int thr_id, int threads); extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order); -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - // X15 CPU Hash function extern "C" void x15hash(void *output, const void *input) { @@ -181,17 +176,6 @@ extern "C" void x15hash(void *output, const 
void *input) memcpy(output, hash, 32); } -#if NULLTEST -static void print_hash(unsigned char *hash) -{ - for (int i=0; i < 32; i += 4) { - printf("%02x%02x%02x%02x ", hash[i], hash[i+1], hash[i+2], hash[i+3]); - } -} -#endif - -extern bool opt_benchmark; - extern "C" int scanhash_x15(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) @@ -203,12 +187,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata, uint32_t Htarg = ptarget[7]; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = Htarg = 0x0000ff; - -#if NULLTEST - for (int k=0; k < 20; k++) - pdata[k] = 0; -#endif + ((uint32_t*)ptarget)[7] = Htarg = 0x00FF; if (!init[thr_id]) { @@ -259,12 +238,6 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata, x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); -#if NULLTEST - uint32_t buf[8]; memset(buf, 0, sizeof buf); - CUDA_SAFE_CALL(cudaMemcpy(buf, d_hash[thr_id], sizeof buf, cudaMemcpyDeviceToHost)); - CUDA_SAFE_CALL(cudaThreadSynchronize()); - print_hash((unsigned char*)buf); printf("\n"); -#endif /* Scan with GPU */ uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); diff --git a/x17/x17.cu b/x17/x17.cu index ffcd57a..65d2259 100644 --- a/x17/x17.cu +++ b/x17/x17.cu @@ -26,17 +26,15 @@ extern "C" #include "sph/sph_sha2.h" #include "sph/sph_haval.h" +} #include "miner.h" -} +#include "cuda_helper.h" static uint32_t *d_hash[8]; - -// cpu-miner.c +// in cpu-miner.c extern int device_map[8]; -extern bool opt_benchmark; - extern void quark_blake512_cpu_init(int thr_id, int threads); extern void quark_blake512_cpu_setBlock_80(void *pdata); @@ -204,20 +202,12 @@ extern "C" int scanhash_x17(int thr_id, uint32_t *pdata, unsigned long *hashes_done) { const uint32_t first_nonce = pdata[19]; - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] 
= 0x0000ff; - const int throughput = 256*256*8; - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; - static bool init[8] = {0,0,0,0,0,0,0,0}; uint32_t Htarg = ptarget[7]; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = Htarg = 0x0000ff; + ((uint32_t*)ptarget)[7] = Htarg = 0x00FF; if (!init[thr_id]) { From 402e4168534e4eaab97d024369ea7516a0153d56 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 6 Sep 2014 21:54:46 +0200 Subject: [PATCH 39/44] Add pentablake algo (-a penta) Signed-off-by: Tanguy Pruvot --- Makefile.am | 2 +- blake32.cu | 10 +- ccminer.vcxproj | 6 + ccminer.vcxproj.filters | 3 + cpu-miner.c | 10 +- miner.h | 6 + pentablake.cu | 600 ++++++++++++++++++++++++++++++++++++++++ util.c | 4 + 8 files changed, 637 insertions(+), 4 deletions(-) create mode 100644 pentablake.cu diff --git a/Makefile.am b/Makefile.am index bb60063..c2fa11d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -33,7 +33,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \ quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu quark/quarkcoin.cu quark/animecoin.cu \ quark/cuda_quark_compactionTest.cu \ - cuda_nist5.cu blake32.cu \ + cuda_nist5.cu blake32.cu pentablake.cu \ sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \ sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \ sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \ diff --git a/blake32.cu b/blake32.cu index f5c0f6b..96a78a0 100644 --- a/blake32.cu +++ b/blake32.cu @@ -362,6 +362,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt blake256hash(vhashcpu, endiandata, blakerounds); if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) { applog(LOG_NOTICE, "GPU found more than one result yippee!"); + rc = 2; } else { extra_results[0] = MAXU; } @@ -380,9 +381,14 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt } 
} + if ((uint64_t) pdata[19] + throughput > (uint64_t) max_nonce) { + pdata[19] = max_nonce - first_nonce + 1; + break; + } + pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); exit_scan: *hashes_done = pdata[19] - first_nonce + 1; @@ -395,6 +401,6 @@ exit_scan: } #endif // wait proper end of all threads - cudaDeviceSynchronize(); + //cudaDeviceSynchronize(); return rc; } diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 0ab1d60..06ba665 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -405,6 +405,12 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)" %(AdditionalOptions) true + + 80 + --ptxas-options="-O2 -dlcm=cg" %(AdditionalOptions) + %(AdditionalOptions) + true + --ptxas-options=-O2 %(AdditionalOptions) %(AdditionalOptions) diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index bc990e0..065e196 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -445,5 +445,8 @@ Source Files\CUDA + + Source Files\CUDA + \ No newline at end of file diff --git a/cpu-miner.c b/cpu-miner.c index 6cf40d5..85d4d2b 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -136,8 +136,9 @@ typedef enum { ALGO_JACKPOT, ALGO_MJOLLNIR, /* Mjollnir hash */ ALGO_MYR_GR, - ALGO_QUARK, ALGO_NIST5, + ALGO_PENTABLAKE, + ALGO_QUARK, ALGO_WHC, ALGO_X11, ALGO_X13, @@ -159,6 +160,7 @@ static const char *algo_names[] = { "mjollnir", "myr-gr", "nist5", + "penta", "quark", "whirl", "x11", @@ -242,6 +244,7 @@ Options:\n\ mjollnir Mjollnircoin hash\n\ myr-gr Myriad-Groestl hash\n\ nist5 NIST5 (TalkCoin) hash\n\ + penta Pentablake hash (5x Blake 512)\n\ quark Quark hash\n\ whirl Whirlcoin (old whirlpool)\n\ x11 X11 (DarkCoin) hash\n\ @@ -1089,6 +1092,11 @@ static void *miner_thread(void *userdata) max_nonce, &hashes_done); break; + case ALGO_PENTABLAKE: + rc = scanhash_pentablake(thr_id, work.data, work.target, + max_nonce, &hashes_done); + break; + case ALGO_WHC: rc = 
scanhash_whc(thr_id, work.data, work.target, max_nonce, &hashes_done); diff --git a/miner.h b/miner.h index d9951e9..e33bfff 100644 --- a/miner.h +++ b/miner.h @@ -249,6 +249,10 @@ extern int scanhash_nist5(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_pentablake(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, + unsigned long *hashes_done); + extern int scanhash_whc(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); @@ -284,6 +288,7 @@ struct work_restart { char padding[128 - sizeof(unsigned long)]; }; +extern bool opt_benchmark; extern bool opt_debug; extern bool opt_debug_rpc; extern bool opt_quiet; @@ -428,6 +433,7 @@ unsigned int jackpothash(void *state, const void *input); void groestlhash(void *state, const void *input); void myriadhash(void *state, const void *input); void nist5hash(void *state, const void *input); +void pentablakehash(void *output, const void *input); void quarkhash(void *state, const void *input); void wcoinhash(void *state, const void *input); void x11hash(void *output, const void *input); diff --git a/pentablake.cu b/pentablake.cu new file mode 100644 index 0000000..9958e53 --- /dev/null +++ b/pentablake.cu @@ -0,0 +1,600 @@ +/** + * Penta Blake-512 Cuda Kernel (Tested on SM 5.0) + * + * Tanguy Pruvot - Aug. 
2014
+ */
+
+#include "miner.h"
+
+extern "C" {
+#include "sph/sph_blake.h"
+#include <stdint.h>
+#include <memory.h>
+}
+
+/* threads per block */
+#define TPB 192
+
+/* CPU reference hash: five chained blake-512 passes over the 80-byte input; the first 32 bytes of the last digest are copied to output */
+extern "C" void pentablakehash(void *output, const void *input)
+{
+	unsigned char hash[128]; // two 64-byte digest slots, used alternately as input and output
+	#define hashB hash + 64
+	sph_blake512_context ctx;
+
+	sph_blake512_init(&ctx); // only explicit init: assumes sph close() re-initializes ctx for the next pass - TODO confirm
+	sph_blake512(&ctx, input, 80);
+	sph_blake512_close(&ctx, hash);
+
+	sph_blake512(&ctx, hash, 64);
+	sph_blake512_close(&ctx, hashB);
+
+	sph_blake512(&ctx, hashB, 64);
+	sph_blake512_close(&ctx, hash);
+
+	sph_blake512(&ctx, hash, 64);
+	sph_blake512_close(&ctx, hashB);
+
+	sph_blake512(&ctx, hashB, 64);
+	sph_blake512_close(&ctx, hash);
+
+	memcpy(output, hash, 32);
+}
+
+#include "cuda_helper.h"
+
+#define MAXU 0xffffffffU
+
+// in cpu-miner.c
+extern int opt_n_threads; // thread count (cpu-miner.c divides by it), so it must not be declared as bool
+extern bool opt_benchmark;
+extern int device_map[8];
+
+__constant__
+static uint32_t __align__(32) c_Target[8];
+
+__constant__
+static uint64_t __align__(32) c_data[32];
+
+static uint32_t *d_hash[8];
+static uint32_t *d_resNounce[8];
+static uint32_t *h_resNounce[8];
+static uint32_t extra_results[2] = { MAXU, MAXU };
+
+/* prefer uint32_t to prevent size conversions = speed +5/10 % */
+__constant__
+static uint32_t __align__(32) c_sigma[16][16];
+const uint32_t host_sigma[16][16] = {
+	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+	{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+	{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+	{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+	{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+	{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 },
+	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+	{14, 10,
4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } +}; + +__device__ __constant__ +static const uint64_t __align__(32) c_IV512[8] = { + 0x6a09e667f3bcc908ULL, + 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, + 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, + 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, + 0x5be0cd19137e2179ULL +}; + +__device__ __constant__ +const uint64_t c_u512[16] = +{ + 0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, + 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL, + 0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, + 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL, + 0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, + 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL, + 0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, + 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL +}; + +#define G(a,b,c,d,x) { \ + uint32_t idx1 = c_sigma[i][x]; \ + uint32_t idx2 = c_sigma[i][x+1]; \ + v[a] += (m[idx1] ^ c_u512[idx2]) + v[b]; \ + v[d] = ROTR64(v[d] ^ v[a], 32); \ + v[c] += v[d]; \ + v[b] = ROTR64(v[b] ^ v[c], 25); \ + v[a] += (m[idx2] ^ c_u512[idx1]) + v[b]; \ + v[d] = ROTR64(v[d] ^ v[a], 16); \ + v[c] += v[d]; \ + v[b] = ROTR64(v[b] ^ v[c], 11); \ +} + +// Hash-Padding +__device__ __constant__ +static const uint64_t d_constHashPadding[8] = { + 0x0000000000000080ull, + 0, + 0, + 0, + 0, + 0x0100000000000000ull, + 0, + 0x0002000000000000ull +}; + +#if 0 + +__device__ __constant__ +static const uint64_t __align__(32) c_Padding[16] = { + 0, 0, 0, 0, + 0x80000000ULL, 0, 0, 0, + 0, 0, 0, 0, + 0, 1, 0, 640, +}; + +__device__ static +void pentablake_compress(uint64_t *h, const uint64_t *block, const uint32_t T0) +{ + uint64_t v[16], m[16]; + + m[0] = block[0]; + m[1] = block[1]; + m[2] = block[2]; + m[3] = block[3]; + + for (uint32_t i = 4; i < 16; i++) { 
+ m[i] = (T0 == 0x200) ? block[i] : c_Padding[i]; + } + + //#pragma unroll 8 + for(uint32_t i = 0; i < 8; i++) + v[i] = h[i]; + + v[ 8] = c_u512[0]; + v[ 9] = c_u512[1]; + v[10] = c_u512[2]; + v[11] = c_u512[3]; + + v[12] = xor1(c_u512[4], T0); + v[13] = xor1(c_u512[5], T0); + v[14] = c_u512[6]; + v[15] = c_u512[7]; + + for (uint32_t i = 0; i < 16; i++) { + /* column step */ + G(0, 4, 0x8, 0xC, 0x0); + G(1, 5, 0x9, 0xD, 0x2); + G(2, 6, 0xA, 0xE, 0x4); + G(3, 7, 0xB, 0xF, 0x6); + /* diagonal step */ + G(0, 5, 0xA, 0xF, 0x8); + G(1, 6, 0xB, 0xC, 0xA); + G(2, 7, 0x8, 0xD, 0xC); + G(3, 4, 0x9, 0xE, 0xE); + } + + //#pragma unroll 16 + for (uint32_t i = 0; i < 16; i++) { + uint32_t j = i % 8; + h[j] ^= v[i]; + } +} + +__global__ +void pentablake_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t nounce = startNounce + thread; + uint64_t h[8]; + + #pragma unroll + for(int i=0; i<8; i++) { + h[i] = c_IV512[i]; + } + + uint64_t ending[4]; + ending[0] = c_data[16]; + ending[1] = c_data[17]; + ending[2] = c_data[18]; + ending[3] = nounce; /* our tested value */ + + pentablake_compress(h, ending, 640); + + // ----------------------------------- + + for (int r = 0; r < 4; r++) { + uint64_t data[8]; + for (int i = 0; i < 7; i++) { + data[i] = h[i]; + } + pentablake_compress(h, data, 512); /* todo: use h,h when ok*/ + } + } +} +#endif + +__device__ static +void pentablake_compress(uint64_t *h, const uint64_t *block, const uint64_t T0) +{ + uint64_t v[16], m[16], i; + + #pragma unroll 16 + for(i = 0; i < 16; i++) { + m[i] = cuda_swab64(block[i]); + } + + #pragma unroll 8 + for (i = 0; i < 8; i++) + v[i] = h[i]; + + v[ 8] = c_u512[0]; + v[ 9] = c_u512[1]; + v[10] = c_u512[2]; + v[11] = c_u512[3]; + v[12] = c_u512[4] ^ T0; + v[13] = c_u512[5] ^ T0; + v[14] = c_u512[6]; + v[15] = c_u512[7]; + + //#pragma unroll 16 + for( i = 0; i < 16; i++) + { + /* 
column step */
+		G(0, 4, 0x8, 0xC, 0x0);
+		G(1, 5, 0x9, 0xD, 0x2);
+		G(2, 6, 0xA, 0xE, 0x4);
+		G(3, 7, 0xB, 0xF, 0x6);
+		/* diagonal step */
+		G(0, 5, 0xA, 0xF, 0x8);
+		G(1, 6, 0xB, 0xC, 0xA);
+		G(2, 7, 0x8, 0xD, 0xC);
+		G(3, 4, 0x9, 0xE, 0xE);
+	}
+
+	//#pragma unroll 16
+	for (i = 0; i < 16; i++) {
+		uint32_t idx = i % 8;
+		h[idx] ^= v[i];
+	}
+}
+/* First pass: one thread per nonce; hashes the 128-byte block in c_data with that nonce and stores a 64-byte digest per thread */
+__global__
+void pentablake_gpu_hash_80(int threads, const uint32_t startNounce, void *outputHash)
+{
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint64_t h[8];
+		uint64_t buf[16];
+		uint32_t nounce = startNounce + thread;
+
+		//#pragma unroll 8
+		for(int i=0; i<8; i++)
+			h[i] = c_IV512[i];
+
+		//#pragma unroll 16
+		for (int i=0; i < 16; i++)
+			buf[i] = c_data[i];
+
+		// The test Nonce
+		((uint32_t*)buf)[19] = cuda_swab32(nounce);
+
+		pentablake_compress(h, buf, 640ULL);
+
+#if __CUDA_ARCH__ < 300
+		uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
+		#pragma unroll 8
+		for (uint32_t i=0; i < 8; i++) {
+			outHash[2*i] = cuda_swab32( _HIWORD(h[i]) );
+			outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
+		}
+#else
+		uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
+		for (uint32_t i=0; i < 8; i++) {
+			outHash[i] = cuda_swab64( h[i] );
+		}
+#endif
+
+	}
+}
+/* Host wrapper: launches pentablake_gpu_hash_80 over `threads` nonces in blocks of TPB and waits for completion (thr_id and order are unused here) */
+__host__
+void pentablake_cpu_hash_80(int thr_id, int threads, const uint32_t startNounce, uint32_t *d_outputHash, int order)
+{
+	const int threadsperblock = TPB;
+
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+	size_t shared_size = 0;
+
+	pentablake_gpu_hash_80 <<<grid, block, shared_size>>> (threads, startNounce, d_outputHash);
+
+	//MyStreamSynchronize(NULL, order, thr_id);
+	cudaDeviceSynchronize();
+}
+
+/* Chained pass: re-hashes in place the 64-byte digest each thread finds at g_hash[thread*8] */
+__global__
+void pentablake_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash)
+{
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+
+	if (thread < threads)
+	{
+		uint64_t *inpHash = &g_hash[thread<<3]; // hashPosition * 8
+		uint64_t buf[16]; // 128 Bytes
+		uint64_t
h[8]; // State
+
+		#pragma unroll 8
+		for (int i=0; i<8; i++)
+			h[i] = c_IV512[i];
+
+		// Message for first round
+		#pragma unroll 8
+		for (int i=0; i < 8; ++i)
+			buf[i] = inpHash[i];
+
+		#pragma unroll 8
+		for (int i=0; i < 8; i++)
+			buf[i+8] = d_constHashPadding[i];
+
+		// Ending round
+		pentablake_compress(h, buf, 512);
+
+#if __CUDA_ARCH__ < 300
+		uint32_t *outHash = (uint32_t*)&g_hash[thread<<3];
+		#pragma unroll 8
+		for (int i=0; i < 8; i++) {
+			outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
+			outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
+		}
+#else
+		uint64_t *outHash = &g_hash[thread<<3];
+		for (int i=0; i < 8; i++) {
+			outHash[i] = cuda_swab64(h[i]);
+		}
+#endif
+	}
+}
+/* Host wrapper: launches pentablake_gpu_hash_64 over `threads` digests in blocks of TPB and waits for completion (thr_id and order are unused here) */
+__host__
+void pentablake_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
+{
+	const int threadsperblock = TPB;
+
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+	size_t shared_size = 0;
+
+	pentablake_gpu_hash_64 <<<grid, block, shared_size>>> (threads, startNounce, (uint64_t*)d_outputHash);
+
+	//MyStreamSynchronize(NULL, order, thr_id);
+	cudaDeviceSynchronize();
+}
+
+#if 0
+
+__host__
+uint32_t pentablake_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce)
+{
+	const int threadsperblock = TPB;
+	uint32_t result = MAXU;
+
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+	size_t shared_size = 0;
+
+	/* Check error on Ctrl+C or kill to prevent segfaults on exit */
+	if (cudaMemset(d_resNounce[thr_id], 0xff, 2*sizeof(uint32_t)) != cudaSuccess)
+		return result;
+
+	pentablake_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_resNounce[thr_id]);
+	cudaDeviceSynchronize();
+	if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
+		cudaThreadSynchronize();
+		result = h_resNounce[thr_id][0];
+		extra_results[0] = h_resNounce[thr_id][1];
+	}
+	return result;
+}
+#endif
+
+__global__
+void pentablake_gpu_check_hash(uint32_t
threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *resNounce)
+{
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint32_t nounce = startNounce + thread;
+		uint32_t *inpHash = &g_hash[thread<<4]; // 16 words (64 bytes) per digest, only the first 8 are compared
+		uint32_t h[8];
+
+		#pragma unroll 8
+		for (int i=0; i < 8; i++)
+			h[i] = inpHash[i];
+
+		for (int i = 7; i >= 0; i--) { // compare against c_Target from word 7 down to word 0
+			uint32_t hash = h[i]; // cuda_swab32(h[i]);
+			if (hash > c_Target[i]) {
+				return;
+			}
+			if (hash < c_Target[i]) {
+				break;
+			}
+		}
+
+		/* keep the smallest nounce (atomically: several threads may match at once), + extra one if found */
+		uint32_t prev = atomicMin(&resNounce[0], nounce);
+		if (prev > nounce) {
+			if (prev != MAXU) resNounce[1] = prev; // MAXU is the "empty" marker set by the host memset
+		}
+		else
+			resNounce[1] = nounce;
+	}
+}
+/* Host side: runs the target check over `threads` digests; returns the smallest matching nonce or MAXU, a second match goes to extra_results[0] */
+__host__ static
+uint32_t pentablake_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, int order)
+{
+	const int threadsperblock = TPB;
+	uint32_t result = MAXU;
+
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+	size_t shared_size = 0;
+
+	/* Check error on Ctrl+C or kill to prevent segfaults on exit */
+	if (cudaMemset(d_resNounce[thr_id], 0xff, 2*sizeof(uint32_t)) != cudaSuccess)
+		return result;
+
+	pentablake_gpu_check_hash <<<grid, block, shared_size>>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]);
+
+	CUDA_SAFE_CALL(cudaDeviceSynchronize());
+	if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
+		cudaThreadSynchronize();
+		result = h_resNounce[thr_id][0];
+		extra_results[0] = h_resNounce[thr_id][1];
+	}
+	return result;
+}
+
+
+__host__
+void pentablake_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget)
+{
+	uint8_t data[128];
+	memcpy((void*) data, (void*) pdata, 80);
+	memset(data+80, 0, 48);
+
+	// to swab...
+	data[80] = 0x80;
+	data[111] = 1;
+	data[126] = 0x02;
+	data[127] = 0x80;
+
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sigma, host_sigma, sizeof(host_sigma), 0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Target, ptarget, 32, 0, cudaMemcpyHostToDevice));
+}
+/* Scans nonces from pdata[19] up to max_nonce on the GPU; returns 0 (none), 1 (nonce stored in pdata[19]) or 2 (plus one in extra_results[0]); sets *hashes_done */
+extern "C" int scanhash_pentablake(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
+	uint32_t max_nonce, unsigned long *hashes_done)
+{
+	const uint32_t first_nonce = pdata[19];
+	static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+	uint32_t throughput = min(128 * 2560, max_nonce - first_nonce); // batch size, clamped to the remaining range
+	uint32_t endiandata[20];
+	int rc = 0;
+
+	if (extra_results[0] != MAXU) {
+		// possible extra result found in previous call
+		if (first_nonce <= extra_results[0] && max_nonce >= extra_results[0]) {
+			pdata[19] = extra_results[0];
+			*hashes_done = pdata[19] - first_nonce + 1;
+			extra_results[0] = MAXU;
+			rc = 1;
+			goto exit_scan;
+		}
+	}
+
+	if (opt_benchmark)
+		((uint32_t*)ptarget)[7] = 0x000F;
+
+	if (!init[thr_id]) {
+		if (opt_n_threads > 1) {
+			CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
+		}
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64 * 128 * 2560)); // size for the largest batch: throughput may be clamped on this first call only
+		CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], 2*sizeof(uint32_t)));
+		CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], 2*sizeof(uint32_t)));
+
+		init[thr_id] = true;
+	}
+
+	for (int k=0; k < 20; k++)
+		be32enc(&endiandata[k], pdata[k]);
+
+	pentablake_cpu_setBlock_80(endiandata, ptarget);
+
+	do {
+		int order = 0;
+
+		// GPU HASH
+		pentablake_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+
+		pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+
+		uint32_t foundNonce = pentablake_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+
+		if (foundNonce != MAXU)
+		{
+			uint32_t vhashcpu[8];
+			uint32_t Htarg = ptarget[7];
+
+			be32enc(&endiandata[19], foundNonce);
+
+			pentablakehash(vhashcpu, endiandata); // re-check the GPU candidate on the CPU
+
+			if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget))
+			{
+				pdata[19] = foundNonce;
+				rc = 1;
+
+				// Rare but possible if the throughput is big
+				be32enc(&endiandata[19], extra_results[0]);
+				pentablakehash(vhashcpu, endiandata);
+				if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) {
+					applog(LOG_NOTICE, "GPU found more than one result yippee!");
+					rc = 2;
+				} else {
+					extra_results[0] = MAXU;
+				}
+
+				goto exit_scan;
+			}
+			else if (vhashcpu[7] > Htarg) {
+				applog(LOG_WARNING, "GPU #%d: result for nounce %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[7], Htarg);
+			}
+			else if (vhashcpu[6] > ptarget[6]) {
+				applog(LOG_WARNING, "GPU #%d: hash[6] for nounce %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[6], ptarget[6]);
+			}
+			else {
+				applog(LOG_WARNING, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce);
+			}
+		}
+		if ((uint64_t) pdata[19] + throughput > 0xffffffffULL) { pdata[19] = max_nonce; break; } // next batch would wrap the 32-bit nonce and restart the scan
+		pdata[19] += throughput;
+
+	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+
+exit_scan:
+	*hashes_done = pdata[19] - first_nonce + 1;
+#if 0
+	/* reset the device to allow multiple instances
+	 * could be made in cpu-miner...
check later if required */ + if (opt_n_threads == 1) { + CUDA_SAFE_CALL(cudaDeviceReset()); + init[thr_id] = false; + } +#endif + + cudaDeviceSynchronize(); + return rc; +} diff --git a/util.c b/util.c index dfe98ab..04209e0 100644 --- a/util.c +++ b/util.c @@ -1457,6 +1457,10 @@ void print_hash_tests(void) nist5hash(&hash[0], &buf[0]); printpfx("nist5", hash); + memset(hash, 0, sizeof hash); + pentablakehash(&hash[0], &buf[0]); + printpfx("pentablake", hash); + memset(hash, 0, sizeof hash); quarkhash(&hash[0], &buf[0]); printpfx("quark", hash); From 3ed36f285b17f07cd758d063178443593a095149 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 8 Sep 2014 09:31:00 +0200 Subject: [PATCH 40/44] try to prevent gpu pauses --- cpu-miner.c | 34 ++++++++++++++++++++++++++-------- util.c | 7 ++++--- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/cpu-miner.c b/cpu-miner.c index 85d4d2b..2b04c5c 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -460,11 +460,14 @@ static bool submit_upstream_work(CURL *curl, struct work *work) bool rc = false; /* pass if the previous hash is not the current previous hash */ + pthread_mutex_lock(&g_work_lock); if (memcmp(work->data + 1, g_work.data + 1, 32)) { + pthread_mutex_unlock(&g_work_lock); if (opt_debug) applog(LOG_DEBUG, "DEBUG: stale work detected, discarding"); return true; } + pthread_mutex_unlock(&g_work_lock); if (have_stratum) { uint32_t sent; @@ -894,6 +897,7 @@ static void *miner_thread(void *userdata) struct timeval tv_start, tv_end, diff; int64_t max64; uint64_t umax64; + bool extrajob = false; int rc; // &work.data[19] @@ -901,13 +905,24 @@ static void *miner_thread(void *userdata) uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen); if (have_stratum) { - while (time(NULL) >= (g_work_time + opt_scantime) && !work_done) - usleep(500*1000); - work_done = false; - pthread_mutex_lock(&g_work_lock); + uint32_t sleeptime = 0; + while (!work_done && time(NULL) >= (g_work_time + opt_scantime)) { + 
sleeptime++; + usleep(50*1000); + if (sleeptime > 5) { + extrajob = true; + break; + } + } + if (sleeptime) + applog(LOG_DEBUG, "sleeptime: %u ms", sleeptime*100); nonceptr = (uint32_t*) (((char*)work.data) + wcmplen); - if ((*nonceptr) >= end_nonce) + pthread_mutex_lock(&g_work_lock); + extrajob |= work_done; + if ((*nonceptr) >= end_nonce || extrajob) { + work_done = false; stratum_gen_work(&stratum, &g_work); + } } else { int min_scantime = have_longpoll ? LP_SCANTIME : opt_scantime; /* obtain new work from internal workio thread */ @@ -946,11 +961,11 @@ static void *miner_thread(void *userdata) (*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr } else (*nonceptr)++; //?? - pthread_mutex_unlock(&g_work_lock); work_restart[thr_id].restart = 0; if (opt_debug) applog(LOG_WARNING, "job %s %08x", g_work.job_id, (*nonceptr)); + pthread_mutex_unlock(&g_work_lock); /* adjust max_nonce to meet target scan time */ if (have_stratum) @@ -962,15 +977,18 @@ static void *miner_thread(void *userdata) max64 *= (int64_t)thr_hashrates[thr_id]; if (max64 <= 0) { + /* should not be set too high, + else you can miss multiple nounces */ switch (opt_algo) { case ALGO_JACKPOT: max64 = 0x1fffLL; break; case ALGO_BLAKECOIN: max64 = 0x3ffffffLL; + break; case ALGO_BLAKE: /* based on the 750Ti hashrate (100kH) */ - max64 = 0x3ffffffLL; + max64 = 0x1ffffffLL; break; default: max64 = 0xfffffLL; @@ -1008,7 +1026,7 @@ static void *miner_thread(void *userdata) work_restart[thr_id].restart = 1; hashlog_purge_old(); // wait a bit for a new job... 
- sleep(1); + usleep(500*1000); (*nonceptr) = end_nonce + 1; work_done = true; continue; diff --git a/util.c b/util.c index 04209e0..6ed44c0 100644 --- a/util.c +++ b/util.c @@ -1020,7 +1020,7 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) int merkle_count, i; json_t *merkle_arr; unsigned char **merkle; - int ntime; + int ntime, hoffset; job_id = json_string_value(json_array_get(params, 0)); prevhash = json_string_value(json_array_get(params, 1)); @@ -1078,7 +1078,8 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) hex2bin(sctx->job.coinbase, coinb1, coinb1_size); memcpy(sctx->job.coinbase + coinb1_size, sctx->xnonce1, sctx->xnonce1_size); - sctx->bloc_height = le16dec((uint8_t*) sctx->job.coinbase + 43); + hoffset = coinb1_size - 15; // 43; + sctx->bloc_height = le16dec((uint8_t*) sctx->job.coinbase + hoffset); if (!sctx->job.job_id || strcmp(sctx->job.job_id, job_id)) memset(sctx->job.xnonce2, 0, sctx->xnonce2_size); hex2bin(sctx->job.xnonce2 + sctx->xnonce2_size, coinb2, coinb2_size); @@ -1125,7 +1126,7 @@ static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params) sctx->next_diff = diff; pthread_mutex_unlock(&sctx->work_lock); - applog(LOG_INFO, "Stratum difficulty set to %g", diff); + applog(LOG_WARNING, "Stratum difficulty set to %g", diff); return true; } From cec5baea9527fcdb11c2db4f22cd7028e3e50357 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 8 Sep 2014 10:42:40 +0200 Subject: [PATCH 41/44] enable colors by default, except for syslog debug: show compared hash diffs in color --- blake32.cu | 2 +- cpu-miner.c | 22 +++++++++++----------- miner.h | 1 + util.c | 14 ++++++++++++++ 4 files changed, 27 insertions(+), 12 deletions(-) diff --git a/blake32.cu b/blake32.cu index 96a78a0..6b51e2e 100644 --- a/blake32.cu +++ b/blake32.cu @@ -361,7 +361,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt be32enc(&endiandata[19], extra_results[0]); 
blake256hash(vhashcpu, endiandata, blakerounds); if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) { - applog(LOG_NOTICE, "GPU found more than one result yippee!"); + applog(LOG_NOTICE, "GPU found more than one result " CL_GRN "yippee!"); rc = 2; } else { extra_results[0] = MAXU; diff --git a/cpu-miner.c b/cpu-miner.c index 2b04c5c..77a7526 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -181,7 +181,7 @@ bool want_stratum = true; bool have_stratum = false; static bool submit_old = false; bool use_syslog = false; -bool use_colors = false; +bool use_colors = true; static bool opt_background = false; bool opt_quiet = false; static int opt_retries = -1; @@ -940,12 +940,11 @@ static void *miner_thread(void *userdata) } if (memcmp(work.data, g_work.data, wcmplen)) { if (opt_debug) { - applog(LOG_DEBUG, "job %s work updated", g_work.job_id); - for (int n=0; n range.scanned[0] && start_nonce < range.scanned[1]); if (stall) { - if (opt_algo) + if (opt_debug && !opt_quiet) applog(LOG_DEBUG, "job done, wait for a new one..."); work_restart[thr_id].restart = 1; hashlog_purge_old(); @@ -1464,13 +1463,14 @@ static void parse_arg (int key, char *arg) case 'C': use_colors = true; break; - case 'q': - opt_quiet = true; - break; case 'D': opt_debug = true; opt_debug_rpc = true; break; + case 'q': + opt_quiet = true; + opt_debug_rpc = false; + break; case 'p': free(rpc_pass); rpc_pass = strdup(arg); diff --git a/miner.h b/miner.h index e33bfff..6bd0ae5 100644 --- a/miner.h +++ b/miner.h @@ -422,6 +422,7 @@ size_t time2str(char* buf, time_t timer); char* atime2str(time_t timer); void applog_hash(unsigned char *hash); +void applog_compare_hash(unsigned char *hash, unsigned char *hash2); void print_hash_tests(void); void animehash(void *state, const void *input); diff --git a/util.c b/util.c index 6ed44c0..eb4af11 100644 --- a/util.c +++ b/util.c @@ -1401,6 +1401,20 @@ static char* format_hash(char* buf, unsigned char *hash) return buf; } +/* to debug diff in data */ +extern void 
applog_compare_hash(unsigned char *hash, unsigned char *hash2) +{ + char s[256] = ""; + int len = 0; + for (int i=0; i < 32; i += 4) { + char *color = memcmp(hash+i, hash2+i, 4) ? CL_RED : CL_GRY; + len += sprintf(s+len, "%s%02x%02x%02x%02x " CL_GRY, color, + hash[i], hash[i+1], hash[i+2], hash[i+3]); + s[len] = '\0'; + } + applog(LOG_DEBUG, "%s", s); +} + extern void applog_hash(unsigned char *hash) { char s[128] = {'\0'}; From 9e5ec398b28f56ec88abdbae96538430a88c9ac6 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 8 Sep 2014 11:07:16 +0200 Subject: [PATCH 42/44] Purge anti-dup data on target change --- cpu-miner.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/cpu-miner.c b/cpu-miner.c index 77a7526..c03786b 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -938,6 +938,19 @@ static void *miner_thread(void *userdata) g_work_time = time(NULL); } } + if (memcmp(work.target, g_work.target, sizeof(work.target))) { + if (opt_debug) { + applog(LOG_DEBUG, "job %s target change:", g_work.job_id); + applog_hash((uint8_t*) work.target); + applog_compare_hash((uint8_t*) g_work.target, (uint8_t*) work.target); + } + memcpy(work.target, g_work.target, sizeof(work.target)); + (*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr + /* on new target, ignoring nonce, clear sent data (hashlog) */ + if (memcmp(work.target, g_work.target, sizeof(work.target) - 4)) { + hashlog_purge_job(work.job_id); + } + } if (memcmp(work.data, g_work.data, wcmplen)) { if (opt_debug) { for (int n=0; n <= (wcmplen-8); n+=8) { @@ -950,14 +963,6 @@ static void *miner_thread(void *userdata) } memcpy(&work, &g_work, sizeof(struct work)); (*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr - } else if (memcmp(work.target, g_work.target, sizeof(work.target))) { - if (opt_debug) { - applog(LOG_DEBUG, "job %s target change:", g_work.job_id); - applog_hash((uint8_t*) work.target); - applog_compare_hash((uint8_t*) 
g_work.target, (uint8_t*) work.target); - } - memcpy(work.target, g_work.target, sizeof(work.target)); - (*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr } else (*nonceptr)++; //?? work_restart[thr_id].restart = 0; From 13bb9d267ef17434392830c7fbf0c602f7f202fd Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 8 Sep 2014 17:43:45 +0200 Subject: [PATCH 43/44] Remove debug rpc, already exists with -P --- cpu-miner.c | 71 +++++++++++++++++++++++++++++++++------------------ cuda_helper.h | 2 ++ miner.h | 1 - util.c | 4 --- 4 files changed, 48 insertions(+), 30 deletions(-) diff --git a/cpu-miner.c b/cpu-miner.c index c03786b..0450ac7 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -172,7 +172,6 @@ static const char *algo_names[] = { }; bool opt_debug = false; -bool opt_debug_rpc = false; bool opt_protocol = false; bool opt_benchmark = false; bool want_longpoll = true; @@ -440,7 +439,7 @@ static int share_result(int result, const char *reason) (result ? CL_GRN "yay!!!" : CL_RED "booooo") : (result ? 
"(yay!!!)" : "(booooo)")); - if (reason && !opt_quiet) { + if (reason) { applog(LOG_WARNING, "reject reason: %s", reason); if (strncmp(reason, "low difficulty share", 20) == 0) { opt_difficulty = (opt_difficulty * 2.0) / 3.0; @@ -550,10 +549,6 @@ static bool submit_upstream_work(CURL *curl, struct work *work) json_decref(val); } - if (opt_debug_rpc) { - applog(LOG_DEBUG, "submit: %s", s); - } - rc = true; out: @@ -792,13 +787,20 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) memcpy(work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size); /* Generate merkle root */ - if (opt_algo == ALGO_HEAVY || opt_algo == ALGO_MJOLLNIR) - heavycoin_hash(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); - else - if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_WHC || opt_algo == ALGO_BLAKECOIN) - SHA256((unsigned char*)sctx->job.coinbase, sctx->job.coinbase_size, (unsigned char*)merkle_root); - else - sha256d(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); + switch (opt_algo) { + case ALGO_HEAVY: + case ALGO_MJOLLNIR: + heavycoin_hash(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); + break; + case ALGO_FUGUE256: + case ALGO_GROESTL: + case ALGO_BLAKECOIN: + case ALGO_WHC: + SHA256((uint8_t*)sctx->job.coinbase, sctx->job.coinbase_size, (uint8_t*)merkle_root); + break; + default: + sha256d(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); + } for (i = 0; i < sctx->job.merkle_count; i++) { memcpy(merkle_root + 32, sctx->job.merkle[i], 32); @@ -870,7 +872,9 @@ static void *miner_thread(void *userdata) uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - (thr_id + 1); unsigned char *scratchbuf = NULL; bool work_done = false; + bool extrajob = false; char s[16]; + int rc = 0; memset(&work, 0, sizeof(work)); // prevent work from being used uninitialized @@ -897,8 +901,6 @@ static void *miner_thread(void *userdata) struct timeval tv_start, tv_end, diff; int64_t 
max64; uint64_t umax64; - bool extrajob = false; - int rc; // &work.data[19] int wcmplen = 76; @@ -907,20 +909,21 @@ static void *miner_thread(void *userdata) if (have_stratum) { uint32_t sleeptime = 0; while (!work_done && time(NULL) >= (g_work_time + opt_scantime)) { - sleeptime++; - usleep(50*1000); - if (sleeptime > 5) { + usleep(100*1000); + if (sleeptime > 4) { extrajob = true; break; } + sleeptime++; } - if (sleeptime) + if (sleeptime && opt_debug && !opt_quiet) applog(LOG_DEBUG, "sleeptime: %u ms", sleeptime*100); nonceptr = (uint32_t*) (((char*)work.data) + wcmplen); pthread_mutex_lock(&g_work_lock); extrajob |= work_done; if ((*nonceptr) >= end_nonce || extrajob) { work_done = false; + extrajob = false; stratum_gen_work(&stratum, &g_work); } } else { @@ -938,6 +941,22 @@ static void *miner_thread(void *userdata) g_work_time = time(NULL); } } +#if 0 + if (!opt_benchmark && g_work.xnonce2_len == 0) { + applog(LOG_ERR, "work data not read yet"); + extrajob = true; + work_done = true; + sleep(1); + continue; + } +#endif + if (rc > 1) { + /* if we found more than one on last loop */ + /* todo: handle an array to get them directly */ + pthread_mutex_unlock(&g_work_lock); + goto continue_scan; + } + if (memcmp(work.target, g_work.target, sizeof(work.target))) { if (opt_debug) { applog(LOG_DEBUG, "job %s target change:", g_work.job_id); @@ -947,7 +966,7 @@ static void *miner_thread(void *userdata) memcpy(work.target, g_work.target, sizeof(work.target)); (*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr /* on new target, ignoring nonce, clear sent data (hashlog) */ - if (memcmp(work.target, g_work.target, sizeof(work.target) - 4)) { + if (memcmp(work.target, g_work.target, sizeof(work.target))) { hashlog_purge_job(work.job_id); } } @@ -1048,6 +1067,7 @@ static void *miner_thread(void *userdata) (*nonceptr) = start_nonce; hashes_done = 0; +continue_scan: gettimeofday(&tv_start, NULL); /* scan nonces for a proof-of-work hash */ @@ -1163,8 
+1183,11 @@ static void *miner_thread(void *userdata) timeval_subtract(&diff, &tv_end, &tv_start); if (diff.tv_usec || diff.tv_sec) { pthread_mutex_lock(&stats_lock); - thr_hashrates[thr_id] = - hashes_done / (diff.tv_sec + 1e-6 * diff.tv_usec); + if (diff.tv_sec + 1e-6 * diff.tv_usec > 0.0) { + thr_hashrates[thr_id] = hashes_done / (diff.tv_sec + 1e-6 * diff.tv_usec); + if (rc > 1) + thr_hashrates[thr_id] = (rc * hashes_done) / (diff.tv_sec + 1e-6 * diff.tv_usec); + } pthread_mutex_unlock(&stats_lock); } if (!opt_quiet) { @@ -1372,7 +1395,6 @@ static void *stratum_thread(void *userdata) pthread_mutex_lock(&g_work_lock); stratum_gen_work(&stratum, &g_work); time(&g_work_time); - pthread_mutex_unlock(&g_work_lock); if (stratum.job.clean) { if (!opt_quiet) applog(LOG_BLUE, "%s send a new %s block %d", short_url, algo_names[opt_algo], @@ -1383,6 +1405,7 @@ static void *stratum_thread(void *userdata) applog(LOG_BLUE, "%s send job %d for block %d", short_url, strtoul(stratum.job.job_id, NULL, 16), stratum.bloc_height); } + pthread_mutex_unlock(&g_work_lock); } if (!stratum_socket_full(&stratum, 120)) { @@ -1470,11 +1493,9 @@ static void parse_arg (int key, char *arg) break; case 'D': opt_debug = true; - opt_debug_rpc = true; break; case 'q': opt_quiet = true; - opt_debug_rpc = false; break; case 'p': free(rpc_pass); diff --git a/cuda_helper.h b/cuda_helper.h index fecf531..66c8e7d 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -12,6 +12,8 @@ #include +extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); + extern __device__ __device_builtin__ void __syncthreads(void); #ifndef __CUDA_ARCH__ diff --git a/miner.h b/miner.h index 6bd0ae5..d9d29a8 100644 --- a/miner.h +++ b/miner.h @@ -290,7 +290,6 @@ struct work_restart { extern bool opt_benchmark; extern bool opt_debug; -extern bool opt_debug_rpc; extern bool opt_quiet; extern bool opt_protocol; extern int opt_timeout; diff --git a/util.c b/util.c index eb4af11..fe733c0 100644 --- 
a/util.c +++ b/util.c @@ -1222,10 +1222,6 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s) id = json_object_get(val, "id"); params = json_object_get(val, "params"); - if (opt_debug_rpc) { - applog(LOG_DEBUG, "method: %s", s); - } - if (!strcasecmp(method, "mining.notify")) { ret = stratum_notify(sctx, params); goto out; From 429266346ce4a1376688bce56d3131c2d973c721 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 8 Sep 2014 20:48:51 +0200 Subject: [PATCH 44/44] Prepare version 1.4.2 --- README.txt | 4 +++- configure.ac | 2 +- cpu-miner.c | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/README.txt b/README.txt index 36a3e00..001ff3f 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,5 @@ -ccMiner release 1.4.1-tpruvot (Sep 06th 2014) - "Cached Blake" +ccMiner release 1.4.2-tpruvot (Sep 09th 2014) - "Pentablake" --------------------------------------------------------------- *************************************************************** @@ -35,6 +35,7 @@ TalkCoin DarkCoin and other X11 coins NEOS blake (256 14-rounds) BlakeCoin (256 8-rounds) +Pentablake (Blake 512 x5) where some of these coins have a VERY NOTABLE nVidia advantage over competing AMD (OpenCL) implementations. @@ -65,6 +66,7 @@ its command line interface and options. 
blake use to mine NEOS (Blake 256) blakecoin use to mine Old Blake 256 nist5 use to mine TalkCoin + penta use to mine Joincoin / Pentablake fresh use to mine Freshcoin whirl use to mine Whirlcoin x11 use to mine DarkCoin diff --git a/configure.ac b/configure.ac index 2a554f1..a4ef290 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2014.09.06]) +AC_INIT([ccminer], [2014.09.09]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 0450ac7..cb6e790 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -1427,7 +1427,7 @@ out: return NULL; } -#define PROGRAM_VERSION "1.4.1" +#define PROGRAM_VERSION "1.4.2" static void show_version_and_exit(void) { printf("%s v%s\n"