From cbede12c4701b98356e55bf32b75cb3bb82a9b24 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Wed, 19 Jul 2017 13:52:45 +0200 Subject: [PATCH] Add skunk algo, initial version --- Makefile.am | 2 +- README.txt | 13 ++- algos.h | 2 + bench.cpp | 1 + ccminer.cpp | 4 + ccminer.vcxproj | 9 +- ccminer.vcxproj.filters | 3 + compat/ccminer-config.h | 2 +- configure.ac | 2 +- miner.h | 3 + skunk.cu | 194 +++++++++++++++++++++++++++++++++++++++ util.cpp | 3 + x13/cuda_x13_fugue512.cu | 2 +- 13 files changed, 228 insertions(+), 12 deletions(-) create mode 100644 skunk.cu diff --git a/Makefile.am b/Makefile.am index a9fb339..febcc9b 100644 --- a/Makefile.am +++ b/Makefile.am @@ -55,7 +55,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ quark/nist5.cu \ quark/quarkcoin.cu quark/cuda_quark_compactionTest.cu \ neoscrypt/neoscrypt.cpp neoscrypt/neoscrypt-cpu.c neoscrypt/cuda_neoscrypt.cu \ - pentablake.cu skein.cu cuda_skeincoin.cu skein2.cpp zr5.cu \ + pentablake.cu skein.cu cuda_skeincoin.cu skein2.cpp skunk.cu zr5.cu \ sha256/sha256d.cu sha256/cuda_sha256d.cu sha256/sha256t.cu sha256/cuda_sha256t.cu \ sia/sia.cu sia/sia-rpc.cpp sph/blake2b.c \ sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \ diff --git a/README.txt b/README.txt index 9e4a9c9..4967b5a 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,5 @@ -ccminer 2.1-dev (June 2017) "Equihash" +ccminer 2.2-dev (July 2017) "Equihash, tribus and skunk" --------------------------------------------------------------- *************************************************************** @@ -9,7 +9,6 @@ If you find this tool useful and like to support its continuous tpruvot@github: BTC : 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo DCR : DsUCcACGcyP8McNMRXQwbtpDxaVUYLDQDeU - LBC : bKe6pLqELL3HHSbpJXxSdn5RrY2bfrkRhF Alexis: BTC : 14EgXD7fPYD4sHBXWUi46VeiTVXNq765B8 @@ -51,6 +50,8 @@ Scrypt and Scrypt:N Scrypt-Jane (Chacha) Sibcoin (sib) Skein (Skein + SHA) +Signatum (Skein cubehash fugue Streebog) +Tribus (JH, keccak, 
simd) Woodcoin (Double Skein) Vanilla (Blake256 8-rounds - double sha256) Vertcoin Lyra2RE @@ -116,6 +117,7 @@ its command line interface and options. sib use to mine Sibcoin skein use to mine Skeincoin skein2 use to mine Woodcoin + skunk use to mine Signatum timetravel use to mine MachineCoin tribus use to mine Denarius x11evo use to mine Revolver @@ -280,7 +282,12 @@ features. >>> RELEASE HISTORY <<< - v2.1 (unfinished) + v2.2 (under dev) + New skunk algo, using the heavy streebog algorithm + Enhance tribus algo (+10%) + equihash protocol enhancement on yiimp.ccminer.org and zpool.ca + + June 16th 2017 v2.1-tribus Interface equihash algo with djeZo solver (from nheqminer 0.5c) New api parameters (and multicast announces for local networks) New tribus algo diff --git a/algos.h b/algos.h index 77625f2..b7dd0f2 100644 --- a/algos.h +++ b/algos.h @@ -45,6 +45,7 @@ enum sha_algos { ALGO_SIB, ALGO_SKEIN, ALGO_SKEIN2, + ALGO_SKUNK, ALGO_S3, ALGO_TIMETRAVEL, ALGO_TRIBUS, @@ -109,6 +110,7 @@ static const char *algo_names[] = { "sib", "skein", "skein2", + "skunk", "s3", "timetravel", "tribus", diff --git a/bench.cpp b/bench.cpp index 71c0d70..500176f 100644 --- a/bench.cpp +++ b/bench.cpp @@ -80,6 +80,7 @@ void algo_free_all(int thr_id) free_qubit(thr_id); free_skeincoin(thr_id); free_skein2(thr_id); + free_skunk(thr_id); free_sha256d(thr_id); free_sha256t(thr_id); free_sia(thr_id); diff --git a/ccminer.cpp b/ccminer.cpp index e32cdbd..2f357cc 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -275,6 +275,7 @@ Options:\n\ scrypt-jane Scrypt-jane Chacha\n\ skein Skein SHA2 (Skeincoin)\n\ skein2 Double Skein (Woodcoin)\n\ + skunk Skein Cube Fugue Streebog\n\ s3 S3 (1Coin)\n\ timetravel Machinecoin permuted x8\n\ tribus Denerius\n\ @@ -2401,6 +2402,9 @@ static void *miner_thread(void *userdata) case ALGO_SKEIN2: rc = scanhash_skein2(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_SKUNK: + rc = scanhash_skunk(thr_id, &work, max_nonce, &hashes_done); + break; case 
ALGO_SHA256D: rc = scanhash_sha256d(thr_id, &work, max_nonce, &hashes_done); break; diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 552c595..a68f66b 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -532,6 +532,7 @@ 48 + @@ -563,10 +564,8 @@ - - - - + + @@ -600,4 +599,4 @@ - + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index f28d993..f6fef2c 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -754,6 +754,9 @@ Source Files\CUDA + + Source Files\CUDA + Source Files\CUDA diff --git a/compat/ccminer-config.h b/compat/ccminer-config.h index dfd973c..d07e736 100644 --- a/compat/ccminer-config.h +++ b/compat/ccminer-config.h @@ -164,7 +164,7 @@ #define PACKAGE_URL "http://github.com/tpruvot/ccminer" /* Define to the version of this package. */ -#define PACKAGE_VERSION "2.1" +#define PACKAGE_VERSION "2.2" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be diff --git a/configure.ac b/configure.ac index c92a6fc..c369201 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2.1], [], [ccminer], [http://github.com/tpruvot/ccminer]) +AC_INIT([ccminer], [2.2], [], [ccminer], [http://github.com/tpruvot/ccminer]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/miner.h b/miner.h index f3406dd..b9c7256 100644 --- a/miner.h +++ b/miner.h @@ -308,6 +308,7 @@ extern int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsig extern int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_skunk(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_s3(int thr_id, struct work* work, uint32_t 
max_nonce, unsigned long *hashes_done); extern int scanhash_timetravel(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_tribus(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); @@ -368,6 +369,7 @@ extern void free_sia(int thr_id); extern void free_sib(int thr_id); extern void free_skeincoin(int thr_id); extern void free_skein2(int thr_id); +extern void free_skunk(int thr_id); extern void free_s3(int thr_id); extern void free_timetravel(int thr_id); extern void free_tribus(int thr_id); @@ -909,6 +911,7 @@ void sha256t_hash(void *output, const void *input); void sibhash(void *output, const void *input); void skeincoinhash(void *output, const void *input); void skein2hash(void *output, const void *input); +void skunk_hash(void *state, const void *input); void s3hash(void *output, const void *input); void timetravel_hash(void *output, const void *input); void bitcore_hash(void *output, const void *input); diff --git a/skunk.cu b/skunk.cu new file mode 100644 index 0000000..d7d9deb --- /dev/null +++ b/skunk.cu @@ -0,0 +1,194 @@ +/** + * Skunk Algo for Signatum + * (skein, cube, fugue, gost streebog) + * + * tpruvot@github 06 2017 - GPLv3 + */ +extern "C" { +#include "sph/sph_skein.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_fugue.h" +#include "sph/sph_streebog.h" +} + +#include "miner.h" +#include "cuda_helper.h" + +extern void skein512_cpu_setBlock_80(void *pdata); +extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); +extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap); + +extern void x11_cubehash512_cpu_init(int thr_id, uint32_t threads); +extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); +extern void x13_fugue512_cpu_hash_64(int thr_id, 
uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x13_fugue512_cpu_free(int thr_id);
+
+extern void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
+extern void streebog_set_target(const uint32_t* ptarget);
+
+#include /* NOTE(review): header name is missing in this copy of the patch - restore it from the upstream commit */
+#include /* NOTE(review): header name is missing in this copy of the patch - restore it from the upstream commit */
+
+#define NBN 2 /* number of uint32_t result slots in d_resNonce / h_resNonce */
+static uint32_t *d_hash[MAX_GPUS]; // per-thread device buffer, allocated as 64 bytes * throughput
+static uint32_t *d_resNonce[MAX_GPUS]; // per-thread device buffer of NBN result slots
+
+// CPU Hash: skein512 over the 80-byte input, then cubehash512, fugue512 and gost512 (streebog), each over the previous 64-byte digest; the first 32 bytes of the result are copied to output
+extern "C" void skunk_hash(void *output, const void *input)
+{
+	unsigned char _ALIGN(128) hash[128] = { 0 };
+
+	sph_skein512_context ctx_skein;
+	sph_cubehash512_context ctx_cubehash;
+	sph_fugue512_context ctx_fugue;
+	sph_gost512_context ctx_gost;
+
+	sph_skein512_init(&ctx_skein);
+	sph_skein512(&ctx_skein, input, 80);
+	sph_skein512_close(&ctx_skein, (void*) hash);
+
+	sph_cubehash512_init(&ctx_cubehash);
+	sph_cubehash512(&ctx_cubehash, (const void*) hash, 64);
+	sph_cubehash512_close(&ctx_cubehash, (void*) hash);
+
+	sph_fugue512_init(&ctx_fugue);
+	sph_fugue512(&ctx_fugue, (const void*) hash, 64);
+	sph_fugue512_close(&ctx_fugue, (void*) hash);
+
+	sph_gost512_init(&ctx_gost);
+	sph_gost512(&ctx_gost, (const void*) hash, 64);
+	sph_gost512_close(&ctx_gost, (void*) hash);
+
+	memcpy(output, hash, 32);
+}
+
+static bool init[MAX_GPUS] = { 0 }; // set once a thread's device buffers are allocated, cleared by free_skunk
+// GPU scan: every pass runs the four hash stages over `throughput` nonces from pdata[19] and re-checks a reported candidate on the CPU; returns work->valid_nonces (1 or 2) on a validated find, 0 once max_nonce is reached or a work restart is flagged
+extern "C" int scanhash_skunk(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+{
+	int dev_id = device_map[thr_id];
+
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	const uint32_t first_nonce = pdata[19];
+	int intensity = (device_sm[dev_id] > 500) ? 18 : 17;
+	if (strstr(device_name[dev_id], "GTX 10")) intensity = 19;
+	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
+	//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+
+	if (opt_benchmark)
+		ptarget[7] = 0xf;
+
+	if (!init[thr_id])
+	{
+		cudaSetDevice(dev_id);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
+
+		quark_skein512_cpu_init(thr_id, throughput);
+		x11_cubehash512_cpu_init(thr_id, throughput);
+		x13_fugue512_cpu_init(thr_id, throughput);
+
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)), -1);
+
+		init[thr_id] = true;
+	}
+
+	uint32_t _ALIGN(64) h_resNonce[NBN];
+	uint32_t _ALIGN(64) endiandata[20];
+	for (int k=0; k < 20; k++)
+		be32enc(&endiandata[k], pdata[k]);
+
+	skein512_cpu_setBlock_80(endiandata);
+
+	cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)); // 0xff fill makes every slot UINT32_MAX, the "no result" value tested below
+	streebog_set_target(ptarget);
+
+	do {
+		int order = 0;
+		skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
+		x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		streebog_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
+
+		cudaMemcpy(h_resNonce, d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost);
+
+		*hashes_done = pdata[19] - first_nonce + throughput;
+
+		if (h_resNonce[0] != UINT32_MAX)
+		{
+			uint32_t _ALIGN(64) vhash[8];
+			const uint32_t Htarg = ptarget[7];
+			const uint32_t startNounce = pdata[19];
+
+			be32enc(&endiandata[19], startNounce + h_resNonce[0]); // slots hold offsets relative to this pass's start nonce
+			skunk_hash(vhash, endiandata);
+			if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
+			{
+				work->nonces[0] = startNounce + h_resNonce[0];
+				work->valid_nonces = 1;
+				work_set_target_ratio(work, vhash);
+				if (h_resNonce[1] != UINT32_MAX)
+				{
+					uint32_t secNonce = startNounce + h_resNonce[1];
+					be32enc(&endiandata[19], secNonce);
+					skunk_hash(vhash, endiandata);
+					work->nonces[1] = secNonce;
+					if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) {
+						work_set_target_ratio(work, vhash);
+						xchg(work->nonces[1], work->nonces[0]);
+					} else {
+						bn_set_target_ratio(work, vhash, work->valid_nonces);
+					}
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
+				}
+				return work->valid_nonces;
+			}
+			else {
+				if (vhash[7] > Htarg) gpu_increment_reject(thr_id); // only a miss on the top target word counts as a GPU reject
+				cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)); // clear on every CPU failure, else the stale offset is re-read against the next start nonce
+				pdata[19] = startNounce + h_resNonce[0] + 1;
+				continue;
+			}
+		}
+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+
+		pdata[19] += throughput;
+
+	} while (!work_restart[thr_id].restart);
+
+	*hashes_done = pdata[19] - first_nonce;
+
+	return 0;
+}
+
+// cleanup: no-op unless init[thr_id] is set; calls x13_fugue512_cpu_free, frees d_hash and d_resNonce for thr_id, then clears the init flag
+extern "C" void free_skunk(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaDeviceSynchronize();
+
+	x13_fugue512_cpu_free(thr_id);
+	cudaFree(d_hash[thr_id]);
+	cudaFree(d_resNonce[thr_id]);
+
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
diff --git a/util.cpp b/util.cpp
index 99187fd..e601756 100644
--- a/util.cpp
+++ b/util.cpp
@@ -2270,6 +2270,9 @@ void print_hash_tests(void)
 	skein2hash(&hash[0], &buf[0]);
 	printpfx("skein2", hash);
 
+	skunk_hash(&hash[0], &buf[0]);
+	printpfx("skunk", hash);
+
 	s3hash(&hash[0], &buf[0]);
 	printpfx("S3", hash);
 
diff --git a/x13/cuda_x13_fugue512.cu b/x13/cuda_x13_fugue512.cu
index ba1afd8..b69ddb6 100644
--- a/x13/cuda_x13_fugue512.cu
+++ b/x13/cuda_x13_fugue512.cu
@@ -254,7 +254,7 @@ void
x13_fugue512_gpu_hash_64(uint32_t threads, uint64_t *g_hash) mixtabs[thr+256] = ROR8(tmp); mixtabs[thr+512] = ROL16(tmp); mixtabs[thr+768] = ROL8(tmp); -#if TPB < 256 +#if TPB <= 256 if (blockDim.x < 256) { const uint32_t thr = (threadIdx.x + 0x80) & 0xFF; const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr);