diff --git a/Makefile.am b/Makefile.am index 63cfb32..e8a35ca 100644 --- a/Makefile.am +++ b/Makefile.am @@ -7,8 +7,8 @@ else JANSSON_INCLUDES= endif -EXTRA_DIST = autogen.sh README.txt LICENSE.txt \ - cudaminer.sln cudaminer.vcxproj cudaminer.vcxproj.filters \ +EXTRA_DIST = autogen.sh README.md README.txt LICENSE.txt \ + ccminer.sln ccminer.vcxproj ccminer.vcxproj.filters \ compat/gettimeofday.c compat/getopt/getopt_long.c cpuminer-config.h.in SUBDIRS = compat @@ -121,3 +121,6 @@ JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu skein.o: skein.cu $(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $< +skein2.o: skein2.cu + $(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $< + diff --git a/README.md b/README.md index 073a7c9..a863fec 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ ccminer Based on Christian Buchner's & Christian H.'s CUDA project, no more active on github recently. -Fork by tpruvot@github with X14,X15,X17,WHIRL,Blake256,BlakeCoin,Lyra2RE and some others, check the [README.txt](README.txt) +Fork by tpruvot@github with X14,X15,X17,Blake256,BlakeCoin,Lyra2RE,Skein,ZR5 and others, check the [README.txt](README.txt) BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo [![tip for next commit](https://tip4commit.com/projects/927.svg)](https://tip4commit.com/github/tpruvot/ccminer) @@ -30,5 +30,7 @@ The tree now contains recent prebuilt openssl and curl .lib for both x86 and x64 To rebuild them, you need to clone this repository and its submodules : git clone https://github.com/peters/curl-for-windows.git compat/curl-for-windows -There is also a [Tutorial for windows](http://cudamining.co.uk/url/tutorials/id/3) on [CudaMining](http://cudamining.co.uk) website. +On Linux, you can use the helper ./build.sh (edit it if required) + +There is also an old [Tutorial for windows](http://cudamining.co.uk/url/tutorials/id/3) on [CudaMining](http://cudamining.co.uk) website. diff --git a/README.txt b/README.txt index f4b27d3..f169ee4 100644 --- a/README.txt +++ b/README.txt @@ -1,16 +1,15 @@ -ccMiner release 1.6.0-tpruvot (Mar 2015) - "ZR5, Pluck & WhirlX" +ccMiner release 1.6.1-tpruvot (Apr 2015) - "Skein512 Chainsaw" --------------------------------------------------------------- *************************************************************** -If you find this tool useful and like to support its continued +If you find this tool useful and like to support its continuous development, then consider a donation. tpruvot@github: - BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo + BTC : 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo DRK : XeVrkPrWB7pDbdFLfKhF1Z3xpqhsx6wkH3 - NEOS : NaEcVrdzoCWHUYXb7X8QoafoKS9UV69Yk4 - XST : S9TqZucWgT6ajZLDBxQnHUtmkotCEHn9z9 + ZRC : ZEcubH2xp2mpuwxMjy7wZThr5AzLGu3mqT DJM34: BTC donation address: 1NENYmxwZGHsKFmyjTc5WferTn5VTFb7Ze @@ -56,7 +55,7 @@ that the most of our comments are in german. >>> Command Line Interface <<< -This code is based on the pooler cpuminer 2.3.2 release and inherits +This code is based on the pooler cpuminer and inherits its command line interface and options. -a, --algo=ALGO specify the algorithm to use @@ -83,7 +82,6 @@ its command line interface and options. s3 use to mine 1coin skein use to mine Skeincoin skein2 use to mine Woodcoin - whirl use to mine Whirlcoin whirlpoolx use to mine Vanillacoin x11 use to mine DarkCoin x14 use to mine X14Coin @@ -113,20 +111,23 @@ its command line interface and options. -T, --timeout=N network timeout, in seconds (default: 270) -s, --scantime=N upper bound on time spent scanning current work when long polling is unavailable, in seconds (default: 5) + -n, --ndevs list cuda devices -N, --statsavg number of samples used to display hashrate (default: 30) --no-gbt disable getblocktemplate support (height check in solo) --no-longpoll disable X-Long-Polling support --no-stratum disable X-Stratum support -q, --quiet disable per-thread hashmeter output + --no-color disable colored output -D, --debug enable debug output -P, --protocol-dump verbose dump of protocol-level activities -b, --api-bind IP/Port for the miner API (default: 127.0.0.1:4068) + --api-remote Allow remote control + -B, --background run the miner in the background --benchmark run in offline benchmark mode --cputest debug hashes from cpu algorithms --cpu-affinity set process affinity to specific cpu core(s) mask --cpu-priority set process priority (default: 0 idle, 2 normal to 5 highest) -c, --config=FILE load a JSON-format configuration file - --no-color disable colored console output -V, --version display version information and exit -h, --help display this help text and exit @@ -189,6 +190,9 @@ features. >>> RELEASE HISTORY <<< + Apr. 14th 2015 v1.6.1 + Add the Double Skein Algo for Woodcoin + Mar. 27th 2015 v1.6.0 Add the ZR5 Algo for Ziftcoin Implement Skeincoin algo (skein + sha) @@ -347,8 +351,8 @@ Christian Buchner, Christian H. (Germany): Initial CUDA implementation djm34, tsiv, sp and klausT for cuda algos implementation and optimisation -Tanguy Pruvot : 750Ti tuning, blake, colors, general code cleanup/opts - API monitoring, linux Config/Makefile and vstudio stuff... +Tanguy Pruvot : 750Ti tuning, blake, colors, zr5, skein, general code cleanup + API monitoring, linux Config/Makefile and vstudio libs... and also many thanks to anyone else who contributed to the original cpuminer application (Jeff Garzik, pooler), it's original HVC-fork diff --git a/ccminer.cpp b/ccminer.cpp index a1ab2f4..def1420 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -263,7 +263,6 @@ Options:\n\ x14 X14\n\ x15 X15\n\ x17 X17 (peoplecurrency)\n\ - whirl Whirlcoin (old whirlpool)\n\ whirlpoolx Vanilla coin\n\ zr5 ZR5 (ZiftrCoin)\n\ -d, --devices Comma separated list of CUDA devices to use.\n\ @@ -1396,7 +1395,7 @@ static void *miner_thread(void *userdata) max64 = max(minmax-1, max64); } - // we can't scan more than uint capacity + // we can't scan more than uint32 capacity max64 = min(UINT32_MAX, max64); start_nonce = nonceptr[0]; @@ -1412,6 +1411,11 @@ static void *miner_thread(void *userdata) // todo: keep it rounded for gpu threads ? + if (unlikely(start_nonce > max_nonce)) { + // should not happen but seen in skein2 benchmark with 2 gpus + max_nonce = end_nonce = UINT32_MAX; + } + work.scanned_from = start_nonce; nonceptr[0] = start_nonce; @@ -1537,6 +1541,7 @@ static void *miner_thread(void *userdata) max_nonce, &hashes_done); break; + /* to be deleted */ case ALGO_WHIRLCOIN: rc = scanhash_whc(thr_id, work.data, work.target, max_nonce, &hashes_done); diff --git a/configure.ac b/configure.ac index 45b5254..61b47e7 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [1.6.1-git]) +AC_INIT([ccminer], [1.6.1]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/miner.h b/miner.h index 65ee5ae..71bf153 100644 --- a/miner.h +++ b/miner.h @@ -686,7 +686,6 @@ void qubithash(void *state, const void *input); void skeincoinhash(void *output, const void *input); void skein2hash(void *output, const void *input); void s3hash(void *output, const void *input); -void wcoinhash(void *state, const void *input); void whirlxHash(void *state, const void *input); void x11hash(void *output, const void *input); void x13hash(void *output, const void *input); diff --git a/quark/cuda_bmw512_30.cu b/quark/cuda_bmw512_30.cu index 358a3b3..8b2e858 100644 --- a/quark/cuda_bmw512_30.cu +++ b/quark/cuda_bmw512_30.cu @@ -12,8 +12,7 @@ q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \ q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]) -static __constant__ uint64_t d_constMem[16]; -static uint64_t h_constMem[16] = { +static __constant__ uint64_t d_constMem[16] = { SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F), SPH_C64(0x9091929394959697), diff --git a/skein.cu b/skein.cu index 6305ad5..76f8da9 100644 --- a/skein.cu +++ b/skein.cu @@ -3,9 +3,7 @@ * by tpruvot@github - 2015 */ -extern "C" { #include "sph/sph_skein.h" -} #include "miner.h" #include "cuda_helper.h" @@ -101,70 +99,10 @@ void sha256_transform_gpu(uint32_t *state, uint32_t *message) } /* 3. Mix. */ - RNDr(S, W, 0); - RNDr(S, W, 1); - RNDr(S, W, 2); - RNDr(S, W, 3); - RNDr(S, W, 4); - RNDr(S, W, 5); - RNDr(S, W, 6); - RNDr(S, W, 7); - RNDr(S, W, 8); - RNDr(S, W, 9); - RNDr(S, W, 10); - RNDr(S, W, 11); - RNDr(S, W, 12); - RNDr(S, W, 13); - RNDr(S, W, 14); - RNDr(S, W, 15); - RNDr(S, W, 16); - RNDr(S, W, 17); - RNDr(S, W, 18); - RNDr(S, W, 19); - RNDr(S, W, 20); - RNDr(S, W, 21); - RNDr(S, W, 22); - RNDr(S, W, 23); - RNDr(S, W, 24); - RNDr(S, W, 25); - RNDr(S, W, 26); - RNDr(S, W, 27); - RNDr(S, W, 28); - RNDr(S, W, 29); - RNDr(S, W, 30); - RNDr(S, W, 31); - RNDr(S, W, 32); - RNDr(S, W, 33); - RNDr(S, W, 34); - RNDr(S, W, 35); - RNDr(S, W, 36); - RNDr(S, W, 37); - RNDr(S, W, 38); - RNDr(S, W, 39); - RNDr(S, W, 40); - RNDr(S, W, 41); - RNDr(S, W, 42); - RNDr(S, W, 43); - RNDr(S, W, 44); - RNDr(S, W, 45); - RNDr(S, W, 46); - RNDr(S, W, 47); - RNDr(S, W, 48); - RNDr(S, W, 49); - RNDr(S, W, 50); - RNDr(S, W, 51); - RNDr(S, W, 52); - RNDr(S, W, 53); - RNDr(S, W, 54); - RNDr(S, W, 55); - RNDr(S, W, 56); - RNDr(S, W, 57); - RNDr(S, W, 58); - RNDr(S, W, 59); - RNDr(S, W, 60); - RNDr(S, W, 61); - RNDr(S, W, 62); - RNDr(S, W, 63); + #pragma unroll + for (int i = 0; i < 64; i++) { + RNDr(S, W, i); + } for (int i = 0; i < 8; i++) state[i] += S[i]; @@ -408,8 +346,8 @@ extern "C" int scanhash_skeincoin(int thr_id, uint32_t *pdata, const uint32_t first_nonce = pdata[19]; const int swap = 1; - uint32_t throughput = device_intensity(thr_id, __func__, 1 << 19); // 256*256*8 - throughput = min(throughput, (max_nonce - first_nonce)); + uint32_t throughput = device_intensity(thr_id, __func__, 1 << 19); // 256*256*8 + throughput = min(throughput, (max_nonce - first_nonce)); if (opt_benchmark) ((uint32_t*)ptarget)[7] = 0x07; diff --git a/skein2.cu b/skein2.cu index 82592de..17fd51e 100644 --- a/skein2.cu +++ b/skein2.cu @@ -1,6 +1,9 @@ -extern "C" { +/** + * SKEIN512 80 + SKEIN512 64 (Woodcoin) + * by tpruvot@github - 2015 + */ + #include "sph/sph_skein.h" -} #include "miner.h" #include "cuda_helper.h" @@ -8,21 +11,19 @@ extern "C" { static uint32_t *d_hash[MAX_GPUS]; extern void skein512_cpu_setBlock_80(void *pdata); -extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); +extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap); extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern "C" void skein2hash(void *output, const void *input) { + uint32_t _ALIGN(64) hash[16]; sph_skein512_context ctx_skein; - uint32_t hash[16]; - sph_skein512_init(&ctx_skein); sph_skein512(&ctx_skein, input, 80); sph_skein512_close(&ctx_skein, hash); - //applog_hash((uchar*)hash); - //applog_hash((uchar*)&hash[8]); + sph_skein512_init(&ctx_skein); sph_skein512(&ctx_skein, hash, 64); sph_skein512_close(&ctx_skein, hash); @@ -30,21 +31,6 @@ extern "C" void skein2hash(void *output, const void *input) memcpy(output, hash, 32); } -#ifdef _DEBUG -#define TRACE(algo) { \ - if (max_nonce == 1 && pdata[19] <= 1) { \ - uint32_t* debugbuf = NULL; \ - cudaMallocHost(&debugbuf, 16*sizeof(uint32_t)); \ - cudaMemcpy(debugbuf, d_hash[thr_id], 16*sizeof(uint32_t), cudaMemcpyDeviceToHost); \ - printf("SK2 %s %08x %08x %08x %08x...\n", algo, \ - swab32(debugbuf[0]), swab32(debugbuf[1]), swab32(debugbuf[2]), swab32(debugbuf[3])); \ - cudaFree(debugbuf); \ - } \ -} -#else -#define TRACE(algo) {} -#endif - static bool init[MAX_GPUS] = { 0 }; extern "C" int scanhash_skein2(int thr_id, uint32_t *pdata, @@ -53,25 +39,25 @@ extern "C" int scanhash_skein2(int thr_id, uint32_t *pdata, { const uint32_t first_nonce = pdata[19]; - uint32_t throughput = device_intensity(thr_id, __func__, 1 << 19); // 256*256*8 - throughput = min(throughput, (max_nonce - first_nonce)); + uint32_t throughput = device_intensity(thr_id, __func__, 1 << 19); // 256*256*8 + throughput = min(throughput, (max_nonce - first_nonce)); if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0FFF; + ((uint32_t*)ptarget)[7] = 0; if (!init[thr_id]) { cudaDeviceReset(); cudaSetDevice(device_map[thr_id]); - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64 * throughput)); + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64UL * throughput)); cuda_check_cpu_init(thr_id, throughput); init[thr_id] = true; } uint32_t endiandata[20]; - for (int k=0; k < 20; k++) + for (int k=0; k < 19; k++) be32enc(&endiandata[k], pdata[k]); skein512_cpu_setBlock_80((void*)endiandata); @@ -81,24 +67,22 @@ extern "C" int scanhash_skein2(int thr_id, uint32_t *pdata, int order = 0; // Hash with CUDA - skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - TRACE("80:"); + skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 0); quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - TRACE("64:"); + + *hashes_done = pdata[19] - first_nonce + throughput; uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); if (foundNonce != UINT32_MAX) { - uint32_t vhash64[8]; + uint32_t _ALIGN(64) vhash64[8]; endiandata[19] = foundNonce; skein2hash(vhash64, endiandata); - #define Htarg ptarget[7] - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { + if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) { int res = 1; uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); - *hashes_done = pdata[19] - first_nonce + throughput; if (secNonce != 0) { if (!opt_quiet) applog(LOG_BLUE, "GPU #%d: found second nonce %08x !", device_map[thr_id], swab32(secNonce)); @@ -113,10 +97,13 @@ extern "C" int scanhash_skein2(int thr_id, uint32_t *pdata, } } + if (((uint64_t) throughput + pdata[19]) > max_nonce) { + break; + } + pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce + 1; return 0; } diff --git a/util.cpp b/util.cpp index 0b9e012..366cc3e 100644 --- a/util.cpp +++ b/util.cpp @@ -1797,9 +1797,6 @@ void print_hash_tests(void) s3hash(&hash[0], &buf[0]); printpfx("S3", hash); - wcoinhash(&hash[0], &buf[0]); - printpfx("whirl", hash); - whirlxHash(&hash[0], &buf[0]); printpfx("whirlpoolx", hash); diff --git a/x11/x11.cu b/x11/x11.cu index 6be4c4d..d5802df 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -131,7 +131,7 @@ extern "C" void x11hash(void *output, const void *input) uint32_t* debugbuf = NULL; \ cudaMallocHost(&debugbuf, 8*sizeof(uint32_t)); \ cudaMemcpy(debugbuf, d_hash[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost); \ - printf("%s %08x %08x %08x %08x...\n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \ + printf("X11 %s %08x %08x %08x %08x...\n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \ swab32(debugbuf[2]), swab32(debugbuf[3])); \ cudaFreeHost(debugbuf); \ } \ @@ -209,7 +209,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("simd :"); x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - TRACE("echo X11 =>"); + TRACE("echo => "); foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); if (foundNonce != UINT32_MAX)