From 5d0c0a665df683df2021767baa70bf0d41b263f8 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 7 Apr 2018 12:27:07 +0200 Subject: [PATCH 01/24] x17: apply echo512 improvement add a tiny 1% on x17, better than nothing... --- x16/x16r.cu | 3 ++- x16/x16s.cu | 3 ++- x17/x17.cu | 18 +++++++++++++++--- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/x16/x16r.cu b/x16/x16r.cu index 1319c22..0a42be0 100644 --- a/x16/x16r.cu +++ b/x16/x16r.cu @@ -491,8 +491,9 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, case ECHO: if (use_compat_kernels[thr_id]) x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - else + else { x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + } TRACE("echo :"); break; case HAMSI: diff --git a/x16/x16s.cu b/x16/x16s.cu index 36aeacb..080ff74 100644 --- a/x16/x16s.cu +++ b/x16/x16s.cu @@ -489,8 +489,9 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, case ECHO: if (use_compat_kernels[thr_id]) x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - else + else { x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + } TRACE("echo :"); break; case HAMSI: diff --git a/x17/x17.cu b/x17/x17.cu index 816e5e0..3536cdc 100644 --- a/x17/x17.cu +++ b/x17/x17.cu @@ -32,6 +32,8 @@ extern "C" { static uint32_t *d_hash[MAX_GPUS]; +extern void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); + extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -151,12 +153,14 @@ extern "C" void x17hash(void *output, const void *input) } static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; + const int dev_id = device_map[thr_id]; uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 19=256*256*8; //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); @@ -166,7 +170,7 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); + cudaSetDevice(dev_id); if (opt_cudaschedule == -1 && gpu_threads == 1) { cudaDeviceReset(); // reduce cpu usage @@ -174,6 +178,11 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u } gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); + quark_blake512_cpu_init(thr_id, throughput); quark_groestl512_cpu_init(thr_id, throughput); quark_skein512_cpu_init(thr_id, throughput); @@ -183,7 +192,6 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u x11_luffaCubehash512_cpu_init(thr_id, throughput); x11_shavite512_cpu_init(thr_id, throughput); x11_simd512_cpu_init(thr_id, throughput); - x11_echo512_cpu_init(thr_id, throughput); x13_hamsi512_cpu_init(thr_id, throughput); x13_fugue512_cpu_init(thr_id, throughput); x14_shabal512_cpu_init(thr_id, throughput); @@ -220,7 +228,11 @@ extern "C" 
int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + else { + x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + } x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); From 16ebe53b72d7c02f95fc948579876f08971bfc66 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 9 Apr 2018 17:45:20 +0200 Subject: [PATCH 02/24] x12: apply echo512 optimised kernel on recent cards --- x12/x12.cu | 20 ++++++++++++++++---- x16/x16r.cu | 2 +- x16/x16s.cu | 2 +- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/x12/x12.cu b/x12/x12.cu index 1cf862b..c0fd623 100644 --- a/x12/x12.cu +++ b/x12/x12.cu @@ -22,6 +22,8 @@ extern "C" { static uint32_t *d_hash[MAX_GPUS]; +extern void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); + extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -96,13 +98,15 @@ extern "C" void x12hash(void *output, const void *input) } static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; extern "C" int scanhash_x12(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; - int intensity = (device_sm[device_map[thr_id]] > 500 && !is_windows()) ? 20 : 19; + const int dev_id = device_map[thr_id]; + int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 
20 : 19; uint32_t throughput = cuda_default_throughput(thr_id, 1 << intensity); // 19=256*256*8; //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); @@ -111,7 +115,7 @@ extern "C" int scanhash_x12(int thr_id, struct work* work, uint32_t max_nonce, u if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); + cudaSetDevice(dev_id); if (opt_cudaschedule == -1 && gpu_threads == 1) { cudaDeviceReset(); // reduce cpu usage @@ -120,13 +124,17 @@ extern "C" int scanhash_x12(int thr_id, struct work* work, uint32_t max_nonce, u } gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); + quark_blake512_cpu_init(thr_id, throughput); x11_luffaCubehash512_cpu_init(thr_id, throughput); x11_shavite512_cpu_init(thr_id, throughput); if (x11_simd512_cpu_init(thr_id, throughput) != 0) { return 0; } - x11_echo512_cpu_init(thr_id, throughput); quark_groestl512_cpu_init(thr_id, throughput); quark_skein512_cpu_init(thr_id, throughput); quark_bmw512_cpu_init(thr_id, throughput); @@ -156,7 +164,11 @@ extern "C" int scanhash_x12(int thr_id, struct work* work, uint32_t max_nonce, u x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + else { + x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + } quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); diff --git a/x16/x16r.cu b/x16/x16r.cu index 0a42be0..2caa5d0 100644 --- a/x16/x16r.cu +++ b/x16/x16r.cu @@ -250,7 +250,7 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); + cudaSetDevice(dev_id); if (opt_cudaschedule == -1 && gpu_threads == 1) { cudaDeviceReset(); // reduce cpu usage diff --git a/x16/x16s.cu b/x16/x16s.cu index 080ff74..382de41 100644 --- a/x16/x16s.cu +++ b/x16/x16s.cu @@ -248,7 +248,7 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); + cudaSetDevice(dev_id); if (opt_cudaschedule == -1 && gpu_threads == 1) { cudaDeviceReset(); // reduce cpu usage From bfcf7a9e52e9bf8277893b6e66dd038998d88610 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 22 Apr 2018 23:12:37 +0200 Subject: [PATCH 03/24] neoscrypt: add extra space for recent vstudio madness --- neoscrypt/cuda_neoscrypt.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/neoscrypt/cuda_neoscrypt.cu b/neoscrypt/cuda_neoscrypt.cu index 9ea3b75..59d73b7 100644 --- a/neoscrypt/cuda_neoscrypt.cu +++ b/neoscrypt/cuda_neoscrypt.cu @@ -179,7 +179,7 @@ static void shift256R4(uint32_t* ret, const uint8 &vec4, const uint32_t shift2) idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ a += b; d = rotateL(d^a, 16); \ c += d; b = 
rotateR(b^c, 12); \ - idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \ + idx = BLAKE2S_SIGMA[idx0][idx1 + 1]; a += key[idx]; \ a += b; d = rotateR(d^a, 8); \ c += d; b = rotateR(b^c, 7); \ } @@ -392,7 +392,7 @@ void Blake2S(uint32_t *out, const uint32_t* const __restrict__ inout, const ui idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ a += b; d = __byte_perm(d^a, 0, 0x1032); \ c += d; b = rotateR(b^c, 12); \ - idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \ + idx = BLAKE2S_SIGMA[idx0][idx1 + 1]; a += key[idx]; \ a += b; d = __byte_perm(d^a, 0, 0x0321); \ c += d; b = rotateR(b^c, 7); \ } @@ -1260,7 +1260,7 @@ uint32_t fastkdf32_v3(uint32_t thread, const uint32_t nonce, uint32_t* const sal idx = BLAKE2S_SIGMA_host[idx0][idx1]; a += key[idx]; \ a += b; d = ROTR32(d^a,16); \ c += d; b = ROTR32(b^c, 12); \ - idx = BLAKE2S_SIGMA_host[idx0][idx1+1]; a += key[idx]; \ + idx = BLAKE2S_SIGMA_host[idx0][idx1 + 1]; a += key[idx]; \ a += b; d = ROTR32(d^a,8); \ c += d; b = ROTR32(b^c, 7); \ } From b97567a451f0069612d20db71912f7b85996d52f Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 6 May 2018 18:04:10 +0200 Subject: [PATCH 04/24] allium algo --- Makefile.am | 1 + algos.h | 2 + bench.cpp | 1 + ccminer.cpp | 5 + ccminer.vcxproj | 1 + ccminer.vcxproj.filters | 3 + configure.ac | 2 +- lyra2/allium.cu | 213 ++++++++++++++++++++++++++++++++++++++++ miner.h | 3 + util.cpp | 3 + 10 files changed, 233 insertions(+), 1 deletion(-) create mode 100644 lyra2/allium.cu diff --git a/Makefile.am b/Makefile.am index d7d2a0b..8f33d48 100644 --- a/Makefile.am +++ b/Makefile.am @@ -39,6 +39,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ lyra2/lyra2RE.cu lyra2/cuda_lyra2.cu \ lyra2/lyra2REv2.cu lyra2/cuda_lyra2v2.cu \ lyra2/Lyra2Z.c lyra2/lyra2Z.cu lyra2/cuda_lyra2Z.cu \ + lyra2/allium.cu \ Algo256/cuda_bmw256.cu Algo256/cuda_cubehash256.cu \ Algo256/cuda_blake256.cu Algo256/cuda_groestl256.cu \ Algo256/cuda_keccak256_sm3.cu Algo256/cuda_keccak256.cu Algo256/cuda_skein256.cu \ diff --git a/algos.h b/algos.h index ed0ff83..f141086 100644 --- a/algos.h +++ b/algos.h @@ -8,6 +8,7 @@ enum sha_algos { ALGO_BLAKECOIN = 0, ALGO_BLAKE, ALGO_BLAKE2S, + ALGO_ALLIUM, ALGO_BMW, ALGO_BASTION, ALGO_C11, @@ -80,6 +81,7 @@ static const char *algo_names[] = { "blakecoin", "blake", "blake2s", + "allium", "bmw", "bastion", "c11", diff --git a/bench.cpp b/bench.cpp index eeeee60..84f9bc5 100644 --- a/bench.cpp +++ b/bench.cpp @@ -49,6 +49,7 @@ void bench_free() void algo_free_all(int thr_id) { // only initialized algos will be freed + free_allium(thr_id); free_bastion(thr_id); free_bitcore(thr_id); free_blake256(thr_id); diff --git a/ccminer.cpp b/ccminer.cpp index 87cd26c..770d5d5 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -1698,6 +1698,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) case ALGO_SCRYPT_JANE: work_set_target(work, sctx->job.diff / (65536.0 * opt_difficulty)); break; + case ALGO_ALLIUM: case ALGO_DMD_GR: case ALGO_FRESH: case ALGO_FUGUE256: @@ -2234,6 +2235,7 @@ static void *miner_thread(void *userdata) case ALGO_TRIBUS: minmax = 0x1000000; break; + case ALGO_ALLIUM: case ALGO_C11: case ALGO_DEEP: case ALGO_HEAVY: @@ -2323,6 +2325,9 @@ static void *miner_thread(void *userdata) /* scan nonces for a proof-of-work hash */ switch (opt_algo) { + case ALGO_ALLIUM: + rc = scanhash_allium(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_BASTION: rc = scanhash_bastion(thr_id, &work, max_nonce, &hashes_done); break; diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 
f995f4a..1db063e 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -519,6 +519,7 @@ + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 4c1b8d6..b2ee453 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -910,6 +910,9 @@ Source Files\CUDA\Algo256 + + Source Files\CUDA\lyra2 + Source Files\CUDA\lyra2 diff --git a/configure.ac b/configure.ac index 08a340f..e164456 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2.2.5], [], [ccminer], [http://github.com/tpruvot/ccminer]) +AC_INIT([ccminer], [2.2.6], [], [ccminer], [http://github.com/tpruvot/ccminer]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/lyra2/allium.cu b/lyra2/allium.cu new file mode 100644 index 0000000..931e6bc --- /dev/null +++ b/lyra2/allium.cu @@ -0,0 +1,213 @@ +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_keccak.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_skein.h" +#include "sph/sph_groestl.h" +#include "lyra2/Lyra2.h" +} + +#include +#include + +static uint64_t* d_hash[MAX_GPUS]; +static uint64_t* d_matrix[MAX_GPUS]; + +extern void blake256_cpu_init(int thr_id, uint32_t threads); +extern void blake256_cpu_setBlock_80(uint32_t *pdata); +//extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); + +//extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); +//extern void keccak256_sm3_init(int thr_id, uint32_t threads); +//extern void keccak256_sm3_free(int thr_id); + +extern void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); + +extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); + +extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order); + +extern void skein256_cpu_init(int thr_id, uint32_t threads); + +extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); +extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti); + +extern void groestl256_cpu_init(int thr_id, uint32_t threads); +extern void groestl256_cpu_free(int thr_id); +extern void groestl256_setTarget(const void *ptarget); +extern uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order); +extern uint32_t groestl256_getSecNonce(int thr_id, int num); + + +extern "C" void allium_hash(void *state, const void *input) +{ + uint32_t hashA[8], hashB[8]; + + sph_blake256_context ctx_blake; + sph_keccak256_context ctx_keccak; + sph_cubehash256_context ctx_cube; + sph_skein256_context ctx_skein; + sph_groestl256_context ctx_groestl; + + sph_blake256_set_rounds(14); + + sph_blake256_init(&ctx_blake); + sph_blake256(&ctx_blake, input, 80); + sph_blake256_close(&ctx_blake, hashA); + + sph_keccak256_init(&ctx_keccak); + sph_keccak256(&ctx_keccak, hashA, 32); + sph_keccak256_close(&ctx_keccak, hashB); + + LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); + + sph_cubehash256_init(&ctx_cube); + sph_cubehash256(&ctx_cube, hashA, 32); + sph_cubehash256_close(&ctx_cube, hashB); + + LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8); + + sph_skein256_init(&ctx_skein); + sph_skein256(&ctx_skein, hashA, 32); + sph_skein256_close(&ctx_skein, hashB); + + sph_groestl256_init(&ctx_groestl); + 
sph_groestl256(&ctx_groestl, hashB, 32); + sph_groestl256_close(&ctx_groestl, hashA); + + memcpy(state, hashA, 32); +} + +static bool init[MAX_GPUS] = { 0 }; +static __thread uint32_t throughput = 0; + +extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + + if (opt_benchmark) + ptarget[7] = 0x00ff; + + static __thread bool gtx750ti; + if (!init[thr_id]) + { + int dev_id = device_map[thr_id]; + cudaSetDevice(dev_id); + CUDA_LOG_ERROR(); + + int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 17 : 16; + if (device_sm[device_map[thr_id]] == 500) intensity = 15; + throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4; + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + cudaDeviceProp props; + cudaGetDeviceProperties(&props, dev_id); + + if (strstr(props.name, "750 Ti")) gtx750ti = true; + else gtx750ti = false; + + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + blake256_cpu_init(thr_id, throughput); + //keccak256_sm3_init(thr_id, throughput); + skein256_cpu_init(thr_id, throughput); + groestl256_cpu_init(thr_id, throughput); + + //cuda_get_arch(thr_id); + if (device_sm[dev_id] >= 500) + { + size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4; + CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput)); + lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]); + } + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput)); + + init[thr_id] = true; + } + + uint32_t _ALIGN(128) endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + blake256_cpu_setBlock_80(pdata); + groestl256_setTarget(ptarget); + + do { + int order = 0; + + //blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + //keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti); + cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti); + skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + + be32enc(&endiandata[19], work->nonces[0]); + allium_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + work->nonces[1] = groestl256_getSecNonce(thr_id, 1); + if (work->nonces[1] != UINT32_MAX) { + be32enc(&endiandata[19], work->nonces[1]); + allium_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); 
+ pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_allium(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + cudaFree(d_matrix[thr_id]); + + //keccak256_sm3_free(thr_id); + groestl256_cpu_free(thr_id); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/miner.h b/miner.h index 6d90518..16f57ab 100644 --- a/miner.h +++ b/miner.h @@ -273,6 +273,7 @@ void sha256d(unsigned char *hash, const unsigned char *data, int len); struct work; +extern int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_bastion(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_blake256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int8_t blakerounds); extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); @@ -339,6 +340,7 @@ extern int scanhash_scrypt_jane(int thr_id, struct work *work, uint32_t max_nonc /* free device allocated memory per algo */ void algo_free_all(int thr_id); +extern void free_allium(int thr_id); extern void free_bastion(int thr_id); extern void free_bitcore(int thr_id); extern void free_blake256(int thr_id); @@ -887,6 +889,7 @@ void applog_hash64(void *hash); void applog_compare_hash(void *hash, void *hash_ref); void print_hash_tests(void); +void allium_hash(void *state, const void *input); void bastionhash(void* output, const unsigned char* input); void blake256hash(void *output, const void *input, int8_t rounds); void blake2b_hash(void *output, const void *input); diff --git a/util.cpp b/util.cpp index dc20c2a..70dc626 100644 --- a/util.cpp +++ b/util.cpp @@ -2164,6 +2164,9 @@ void print_hash_tests(void) printf(CL_WHT "CPU HASH ON EMPTY BUFFER RESULTS:" CL_N "\n"); + allium_hash(&hash[0], &buf[0]); + printpfx("allium", hash); + bastionhash(&hash[0], &buf[0]); printpfx("bastion", hash); From ffd6cf38bf43387cda0fab567ba360f637f8fa18 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 6 May 2018 18:16:35 +0200 Subject: [PATCH 05/24] update readme and win ver --- README.txt | 6 +++++- ccminer.cpp | 1 + compat/ccminer-config.h | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/README.txt b/README.txt index 59a2cec..af0718d 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,5 @@ -ccminer 2.2.5 (Apr 2018) "x12, x16r and x16s algos" +ccminer 2.2.6 (Under Dev) --------------------------------------------------------------- *************************************************************** @@ -73,6 +73,7 @@ This code is based on the pooler cpuminer and inherits its command line interface and options. -a, --algo=ALGO specify the algorithm to use + allium use to mine Garlic bastion use to mine Joincoin bitcore use to mine Bitcore's Timetravel10 blake use to mine Saffroncoin (Blake256) @@ -281,6 +282,9 @@ so we can more efficiently implement new algorithms using the latest hardware features. >>> RELEASE HISTORY <<< + 2018 v2.2.6 + New allium algo for Garlic + Apr. 
02nd 2018 v2.2.5 New x16r algo for Raven New x16s algo for Pigeon and Eden diff --git a/ccminer.cpp b/ccminer.cpp index 770d5d5..a48b194 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -236,6 +236,7 @@ static char const usage[] = "\ Usage: " PROGRAM_NAME " [OPTIONS]\n\ Options:\n\ -a, --algo=ALGO specify the hash algorithm to use\n\ + allium Garlic double lyra2\n\ bastion Hefty bastion\n\ bitcore Timetravel-10\n\ blake Blake 256 (SFR)\n\ diff --git a/compat/ccminer-config.h b/compat/ccminer-config.h index 17efd4c..5b36078 100644 --- a/compat/ccminer-config.h +++ b/compat/ccminer-config.h @@ -164,7 +164,7 @@ #define PACKAGE_URL "http://github.com/tpruvot/ccminer" /* Define to the version of this package. */ -#define PACKAGE_VERSION "2.2.5" +#define PACKAGE_VERSION "2.2.6" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be From 57f8f776fb1819d253e34f615d46a83c0490f77b Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Wed, 9 May 2018 14:57:49 +0200 Subject: [PATCH 06/24] timetravel: cleanup, remove unused algos + cubehash 80 midstate --- x11/cuda_x11_cubehash512.cu | 54 +++++++++++++++++++++++++----- x11/timetravel.cu | 67 +++---------------------------------- 2 files changed, 51 insertions(+), 70 deletions(-) diff --git a/x11/cuda_x11_cubehash512.cu b/x11/cuda_x11_cubehash512.cu index f7ce97c..b5aa534 100644 --- a/x11/cuda_x11_cubehash512.cu +++ b/x11/cuda_x11_cubehash512.cu @@ -259,16 +259,32 @@ void x11_cubehash512_cpu_init(int thr_id, uint32_t threads) { } /***************************************************/ -#define WANT_CUBEHASH80 -#ifdef WANT_CUBEHASH80 +/** + * Timetravel and x16 CUBEHASH-80 CUDA implementation + * by tpruvot@github - Jan 2017 / May 2018 + */ -__constant__ -static uint32_t c_PaddedMessage80[20]; +__constant__ static uint32_t c_midstate128[32]; +__constant__ static uint32_t c_PaddedMessage80[20]; + +#undef SPH_C32 +#undef SPH_C64 +#undef SPH_T32 +#undef SPH_T64 +#include "sph/sph_cubehash.h" __host__ void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata) { + sph_cubehash512_context ctx_cubehash; + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, (void*)endiandata, 64); +#ifndef NO_MIDSTATE + cudaMemcpyToSymbol(c_midstate128, ctx_cubehash.state, 128, 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_PaddedMessage80, &endiandata[16], 16, 0, cudaMemcpyHostToDevice); +#else cudaMemcpyToSymbol(c_PaddedMessage80, endiandata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice); +#endif } __global__ @@ -278,11 +294,11 @@ void cubehash512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, if (thread < threads) { const uint32_t nonce = startNounce + thread; - + uint32_t message[8]; uint32_t x[2][2][2][2][2]; +#ifdef NO_MIDSTATE Init(x); - uint32_t message[8]; // first 32 bytes AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[0]); AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage80[4]); @@ -293,8 +309,31 @@ void cubehash512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage80[12]); Update32(x, message); - // last 16 bytes + Padding + // last 16 bytes AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[16]); +#else + AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_midstate128[ 0]); + AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_midstate128[ 2]); + AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_midstate128[ 4]); + AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_midstate128[ 6]); + AS_UINT2(x[0][1][0][0]) = 
AS_UINT2(&c_midstate128[ 8]); + AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_midstate128[10]); + AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_midstate128[12]); + AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_midstate128[14]); + + AS_UINT2(x[1][0][0][0]) = AS_UINT2(&c_midstate128[16]); + AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_midstate128[18]); + AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_midstate128[20]); + AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_midstate128[22]); + AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_midstate128[24]); + AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_midstate128[26]); + AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_midstate128[28]); + AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_midstate128[30]); + + // last 16 bytes + AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[0]); +#endif + // nonce + Padding message[3] = cuda_swab32(nonce); message[4] = 0x80; message[5] = 0; @@ -317,4 +356,3 @@ void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const ui cubehash512_gpu_hash_80 <<>> (threads, startNounce, (uint64_t*) d_hash); } -#endif \ No newline at end of file diff --git a/x11/timetravel.cu b/x11/timetravel.cu index 93c3fd1..8d157f2 100644 --- a/x11/timetravel.cu +++ b/x11/timetravel.cu @@ -20,11 +20,6 @@ extern "C" { #include "sph/sph_keccak.h" #include "sph/sph_luffa.h" #include "sph/sph_cubehash.h" -#if HASH_FUNC_COUNT > 8 -#include "sph/sph_shavite.h" -#include "sph/sph_simd.h" -#include "sph/sph_echo.h" -#endif } #include "miner.h" @@ -42,11 +37,6 @@ enum Algo { KECCAK, LUFFA, CUBEHASH, -#if HASH_FUNC_COUNT > 8 - SHAVITE, - SIMD, - ECHO, -#endif MAX_ALGOS_COUNT }; @@ -153,11 +143,6 @@ extern "C" void timetravel_hash(void *output, const void *input) sph_keccak512_context ctx_keccak; sph_luffa512_context ctx_luffa1; sph_cubehash512_context ctx_cubehash1; -#if HASH_FUNC_COUNT > 8 - sph_shavite512_context ctx_shavite1; - sph_simd512_context ctx_simd1; - sph_echo512_context ctx_echo1; -#endif if (s_sequence == UINT32_MAX) { uint32_t *data = (uint32_t*) input; @@ -175,11 +160,6 @@ extern "C" void timetravel_hash(void *output, const void *input) const char elem = hashOrder[i]; uint8_t algo = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; - if (i > 0) { - in = (void*) hash; - size = 64; - } - switch (algo) { case BLAKE: sph_blake512_init(&ctx_blake); @@ -195,7 +175,6 @@ extern "C" void timetravel_hash(void *output, const void *input) sph_groestl512_init(&ctx_groestl); sph_groestl512(&ctx_groestl, in, size); sph_groestl512_close(&ctx_groestl, hash); - //applog_hex((void*)hash, 32); break; case SKEIN: sph_skein512_init(&ctx_skein); @@ -222,24 +201,10 @@ extern "C" void timetravel_hash(void *output, const void *input) sph_cubehash512(&ctx_cubehash1, in, size); sph_cubehash512_close(&ctx_cubehash1, hash); break; -#if HASH_FUNC_COUNT > 8 - case SHAVITE: - sph_shavite512_init(&ctx_shavite1); - sph_shavite512(&ctx_shavite1, in, size); - sph_shavite512_close(&ctx_shavite1, hash); - break; - case SIMD: - sph_simd512_init(&ctx_simd1); - sph_simd512(&ctx_simd1, in, size); - sph_simd512_close(&ctx_simd1, hash); - break; - case ECHO: - sph_echo512_init(&ctx_echo1); - sph_echo512(&ctx_echo1, in, size); - sph_echo512_close(&ctx_echo1, hash); - break; -#endif } + + in = (void*) hash; + size = 64; } memcpy(output, hash, 32); @@ -330,13 +295,7 @@ extern "C" int scanhash_timetravel(int thr_id, struct work* work, uint32_t max_n qubit_luffa512_cpu_init(thr_id, throughput); // only constants (480 bytes) x11_luffa512_cpu_init(thr_id, throughput); x11_cubehash512_cpu_init(thr_id, throughput); -#if HASH_FUNC_COUNT > 8 - x11_shavite512_cpu_init(thr_id, throughput); - x11_echo512_cpu_init(thr_id, throughput); - if (x11_simd512_cpu_init(thr_id, throughput) != 0) { - return 0; - } -#endif + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), -1); CUDA_CALL_OR_RET_X(cudaMemset(d_hash[thr_id], 0, (size_t) 64 * throughput), -1); @@ -471,20 +430,6 @@ extern "C" int scanhash_timetravel(int thr_id, struct work* work, uint32_t max_n x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("cube :"); break; -#if HASH_FUNC_COUNT > 8 - case SHAVITE: - x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - TRACE("shavite:"); - break; - case SIMD: - x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - TRACE("simd :"); - break; - case ECHO: - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - TRACE("echo :"); - break; -#endif } } @@ -544,9 +489,7 @@ extern "C" void free_timetravel(int thr_id) quark_blake512_cpu_free(thr_id); quark_groestl512_cpu_free(thr_id); -#if HASH_FUNC_COUNT > 8 - x11_simd512_cpu_free(thr_id); -#endif + cuda_check_cpu_free(thr_id); init[thr_id] = false; From a9357e1ec84fa0be584f126f9c852bc3372ada27 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Thu, 10 May 2018 06:31:25 +0200 Subject: [PATCH 07/24] lyra2: remove unused nonce param --- lyra2/allium.cu | 6 +++--- lyra2/cuda_lyra2.cu | 30 +++++++++++++++--------------- lyra2/cuda_lyra2_sm2.cuh | 4 ++-- lyra2/cuda_lyra2_sm5.cuh | 12 ++++++------ lyra2/lyra2RE.cu | 4 ++-- 5 files changed, 28 insertions(+), 28 deletions(-) diff --git a/lyra2/allium.cu b/lyra2/allium.cu index 931e6bc..6492c92 100644 --- a/lyra2/allium.cu +++ b/lyra2/allium.cu @@ -30,7 +30,7 @@ extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t start extern void skein256_cpu_init(int thr_id, uint32_t threads); extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); -extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti); 
+extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_outputHash, bool gtx750ti); extern void groestl256_cpu_init(int thr_id, uint32_t threads); extern void groestl256_cpu_free(int thr_id); @@ -141,9 +141,9 @@ extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce //blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); //keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti); + lyra2_cpu_hash_32(thr_id, throughput, d_hash[thr_id], gtx750ti); cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti); + lyra2_cpu_hash_32(thr_id, throughput, d_hash[thr_id], gtx750ti); skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); *hashes_done = pdata[19] - first_nonce + throughput; diff --git a/lyra2/cuda_lyra2.cu b/lyra2/cuda_lyra2.cu index 7905d23..a280200 100644 --- a/lyra2/cuda_lyra2.cu +++ b/lyra2/cuda_lyra2.cu @@ -409,7 +409,7 @@ __constant__ uint2x4 blake2b_IV[2] = { }; __global__ __launch_bounds__(64, 1) -void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); @@ -436,7 +436,7 @@ void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) __global__ __launch_bounds__(TPB52, 1) -void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +void lyra2_gpu_hash_32_2(uint32_t threads, uint64_t *g_hash) { const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; @@ -481,7 +481,7 @@ void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_has } __global__ __launch_bounds__(64, 1) -void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash) { const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; @@ -502,7 +502,7 @@ void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash) g_hash[thread + threads * 2] = state[0].z; g_hash[thread + threads * 3] = state[0].w; - } //thread + } } #else #if __CUDA_ARCH__ < 500 @@ -510,9 +510,9 @@ void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash) /* for unsupported SM arch */ __device__ void* DMatrix; #endif -__global__ void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} -__global__ void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {} -__global__ void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_2(uint32_t threads, uint64_t *g_hash) {} +__global__ void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash) {} #endif __host__ @@ -523,7 +523,7 @@ void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix) } __host__ -void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, bool gtx750ti) +void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_hash, bool gtx750ti) { int dev_id = device_map[thr_id % MAX_GPUS]; @@ -544,11 +544,11 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t 
startNounce, uint6 if (cuda_arch[dev_id] >= 520) { - lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash); - lyra2_gpu_hash_32_2 <<< grid1, block1, 24 * (8 - 0) * sizeof(uint2) * tpb >>> (threads, startNounce, d_hash); + lyra2_gpu_hash_32_2 <<< grid1, block1, 24 * (8 - 0) * sizeof(uint2) * tpb >>> (threads, d_hash); - lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash); } else if (cuda_arch[dev_id] >= 500) { @@ -561,12 +561,12 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint6 // suitable amount to adjust for 10warp shared_mem = 6144; - lyra2_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + lyra2_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash); - lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, startNounce, (uint2*)d_hash); + lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash); - lyra2_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + lyra2_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash); } else - lyra2_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, startNounce, d_hash); + lyra2_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, d_hash); } diff --git a/lyra2/cuda_lyra2_sm2.cuh b/lyra2/cuda_lyra2_sm2.cuh index 18263b2..da621d0 100644 --- a/lyra2/cuda_lyra2_sm2.cuh +++ b/lyra2/cuda_lyra2_sm2.cuh @@ -131,7 +131,7 @@ void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, } __global__ __launch_bounds__(TPB30, 1) -void lyra2_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +void lyra2_gpu_hash_32_sm2(uint32_t threads, uint64_t *g_hash) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -224,5 +224,5 @@ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_h #else /* if __CUDA_ARCH__ < 200 .. 
host */ -__global__ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {} +__global__ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint64_t *g_hash) {} #endif diff --git a/lyra2/cuda_lyra2_sm5.cuh b/lyra2/cuda_lyra2_sm5.cuh index fc13172..4a3caeb 100644 --- a/lyra2/cuda_lyra2_sm5.cuh +++ b/lyra2/cuda_lyra2_sm5.cuh @@ -589,7 +589,7 @@ void reduceDuplexRowV50_8(const int rowInOut, uint2 state[4], const uint32_t thr } __global__ __launch_bounds__(64, 1) -void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint2 *g_hash) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); @@ -622,7 +622,7 @@ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_ha } __global__ __launch_bounds__(TPB50, 1) -void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint2 *g_hash) { const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y); @@ -662,7 +662,7 @@ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_ha } __global__ __launch_bounds__(64, 1) -void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); @@ -687,7 +687,7 @@ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_ha #else /* if __CUDA_ARCH__ != 500 .. host */ -__global__ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} -__global__ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} -__global__ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash) {} #endif diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu index b3ad49f..b435371 100644 --- a/lyra2/lyra2RE.cu +++ b/lyra2/lyra2RE.cu @@ -26,7 +26,7 @@ extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNon extern void skein256_cpu_init(int thr_id, uint32_t threads); extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); -extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti); +extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_outputHash, bool gtx750ti); extern void groestl256_cpu_init(int thr_id, uint32_t threads); extern void groestl256_cpu_free(int thr_id); @@ -130,7 +130,7 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, //blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); //keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti); + lyra2_cpu_hash_32(thr_id, throughput, d_hash[thr_id], gtx750ti); skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); *hashes_done = pdata[19] - first_nonce + throughput; From b8190e4aa77c2925b0199f3fbfc22571f279925b Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Thu, 17 May 2018 19:26:00 +0200 Subject: 
[PATCH 08/24] allium: add missing device cpu flag for linux --- lyra2/allium.cu | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lyra2/allium.cu b/lyra2/allium.cu index 6492c92..65dbbe3 100644 --- a/lyra2/allium.cu +++ b/lyra2/allium.cu @@ -95,7 +95,11 @@ extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce { int dev_id = device_map[thr_id]; cudaSetDevice(dev_id); - CUDA_LOG_ERROR(); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 17 : 16; if (device_sm[device_map[thr_id]] == 500) intensity = 15; From 3d03a1b9fd0a14c65f231ed65f9cebb267e63a4f Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 28 May 2018 15:21:00 +0200 Subject: [PATCH 09/24] phi2 algo --- Makefile.am | 2 +- algos.h | 2 + ccminer.cpp | 8 +- ccminer.vcxproj | 5 +- ccminer.vcxproj.filters | 20 ++- lyra2/cuda_lyra2.cu | 122 ++++++++++++++--- lyra2/cuda_lyra2_sm2.cuh | 65 ++++++++- lyra2/cuda_lyra2_sm5.cuh | 64 ++++++++- miner.h | 5 +- phi/cuda_phi2.cu | 89 ++++++++++++ {x11 => phi}/phi.cu | 8 +- phi/phi2.cu | 255 +++++++++++++++++++++++++++++++++++ util.cpp | 2 +- x11/cuda_streebog_maxwell.cu | 21 ++- x16/cuda_x16_echo512_64.cu | 26 +++- 15 files changed, 648 insertions(+), 46 deletions(-) create mode 100644 phi/cuda_phi2.cu rename {x11 => phi}/phi.cu (97%) create mode 100644 phi/phi2.cu diff --git a/Makefile.am b/Makefile.am index 8f33d48..5d5652c 100644 --- a/Makefile.am +++ b/Makefile.am @@ -81,7 +81,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ x16/cuda_x16_shabal512.cu x16/cuda_x16_simd512_80.cu \ x16/cuda_x16_echo512_64.cu \ x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \ - x11/phi.cu x11/cuda_streebog_maxwell.cu \ + phi/phi.cu phi/phi2.cu phi/cuda_phi2.cu x11/cuda_streebog_maxwell.cu \ x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu # scrypt diff --git a/algos.h b/algos.h index f141086..229d8e9 100644 --- a/algos.h +++ b/algos.h @@ -39,6 +39,7 @@ enum sha_algos { ALGO_NIST5, ALGO_PENTABLAKE, ALGO_PHI, + ALGO_PHI2, ALGO_POLYTIMOS, ALGO_QUARK, ALGO_QUBIT, @@ -112,6 +113,7 @@ static const char *algo_names[] = { "nist5", "penta", "phi", + "phi2", "polytimos", "quark", "qubit", diff --git a/ccminer.cpp b/ccminer.cpp index a48b194..c1567a1 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -269,7 +269,8 @@ Options:\n\ neoscrypt FeatherCoin, Phoenix, UFO...\n\ nist5 NIST5 (TalkCoin)\n\ penta Pentablake hash (5x Blake 512)\n\ - phi BHCoin\n\ + phi LUX initial algo\n\ + phi2 LUX v2 with lyra2\n\ polytimos Politimos\n\ quark Quark\n\ qubit Qubit\n\ @@ -1708,6 +1709,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) case ALGO_LBRY: case ALGO_LYRA2v2: case ALGO_LYRA2Z: + case ALGO_PHI2: case ALGO_TIMETRAVEL: case ALGO_BITCORE: case ALGO_X16R: @@ -2245,6 +2247,7 @@ static void *miner_thread(void *userdata) case ALGO_HSR: case ALGO_LYRA2v2: case ALGO_PHI: + case ALGO_PHI2: case ALGO_POLYTIMOS: case ALGO_S3: case ALGO_SKUNK: @@ -2436,6 +2439,9 @@ static void *miner_thread(void *userdata) case ALGO_PHI: rc = scanhash_phi(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_PHI2: + rc = scanhash_phi2(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_POLYTIMOS: rc = scanhash_polytimos(thr_id, &work, max_nonce, &hashes_done); break; diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 1db063e..f20449a 100644 --- a/ccminer.vcxproj +++ 
b/ccminer.vcxproj @@ -525,6 +525,7 @@ + @@ -537,6 +538,9 @@ 48 + + + compute_50,sm_50;compute_52,sm_52 @@ -567,7 +571,6 @@ - diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index b2ee453..96220ae 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -115,7 +115,10 @@ {1e548d79-c217-4203-989a-a592fe2b2de3} - + + {311e8d79-1612-4f0f-8591-23a592f2b2d3} + + {xde48d89-fx12-1323-129a-b592fe2b2de3} @@ -545,6 +548,9 @@ Source Files\CUDA\lyra2 + + Source Files\CUDA\lyra2 + Source Files\CUDA\lyra2 @@ -781,6 +787,15 @@ Source Files\CUDA + + Source Files\CUDA\phi + + + Source Files\CUDA\phi + + + Source Files\CUDA\phi + Source Files\CUDA\skunk @@ -799,9 +814,6 @@ Source Files\CUDA\tribus - - Source Files\CUDA\x11 - Source Files\CUDA\x11 diff --git a/lyra2/cuda_lyra2.cu b/lyra2/cuda_lyra2.cu index a280200..5cdb6ee 100644 --- a/lyra2/cuda_lyra2.cu +++ b/lyra2/cuda_lyra2.cu @@ -1,6 +1,7 @@ /** * Lyra2 (v1) cuda implementation based on djm34 work * tpruvot@github 2015, Nanashi 08/2016 (from 1.8-r2) + * tpruvot@github 2018 for phi2 double lyra2-32 support */ #include @@ -228,9 +229,7 @@ void reduceDuplex(uint2 state[4], uint32_t thread, const uint32_t threads) { uint2 state1[3]; -#if __CUDA_ARCH__ > 500 -#pragma unroll -#endif + #pragma unroll for (int i = 0; i < Nrow; i++) { ST4S(0, Ncol - i - 1, state, thread, threads); @@ -305,7 +304,7 @@ void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uin LD4S(state1, rowIn, i, thread, threads); LD4S(state2, rowInOut, i, thread, threads); -#pragma unroll + #pragma unroll for (int j = 0; j < 3; j++) state[j] ^= state1[j] + state2[j]; @@ -334,7 +333,7 @@ void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uin LD4S(state1, rowOut, i, thread, threads); -#pragma unroll + #pragma unroll for (int j = 0; j < 3; j++) state1[j] ^= state[j]; @@ -412,11 +411,9 @@ __global__ __launch_bounds__(64, 1) void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) { uint2x4 state[4]; - state[0].x = state[1].x = __ldg(&g_hash[thread + threads * 0]); state[0].y = state[1].y = __ldg(&g_hash[thread + threads * 1]); state[0].z = state[1].z = __ldg(&g_hash[thread + threads * 2]); @@ -436,10 +433,9 @@ void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash) __global__ __launch_bounds__(TPB52, 1) -void lyra2_gpu_hash_32_2(uint32_t threads, uint64_t *g_hash) +void lyra2_gpu_hash_32_2(const uint32_t threads, uint64_t *g_hash) { const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; - if (thread < threads) { uint2 state[4]; @@ -484,11 +480,9 @@ __global__ __launch_bounds__(64, 1) void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash) { const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; - - uint28 state[4]; - if (thread < threads) { + uint2x4 state[4]; state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]); state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]); state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]); @@ -501,7 +495,57 @@ void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash) g_hash[thread + threads * 1] = state[0].y; g_hash[thread + threads * 2] = state[0].z; g_hash[thread + threads * 3] = state[0].w; + } +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_64_1(uint32_t threads, uint2* const d_hash_512, const uint32_t round) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint2x4 state[4]; + const 
size_t offset = (size_t)8 * thread + (round * 4U); + uint2 *psrc = (uint2*)(&d_hash_512[offset]); + state[0].x = state[1].x = __ldg(&psrc[0]); + state[0].y = state[1].y = __ldg(&psrc[1]); + state[0].z = state[1].z = __ldg(&psrc[2]); + state[0].w = state[1].w = __ldg(&psrc[3]); + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + for (int i = 0; i<24; i++) + round_lyra(state); + + ((uint2x4*)DMatrix)[threads * 0 + thread] = state[0]; + ((uint2x4*)DMatrix)[threads * 1 + thread] = state[1]; + ((uint2x4*)DMatrix)[threads * 2 + thread] = state[2]; + ((uint2x4*)DMatrix)[threads * 3 + thread] = state[3]; + } +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_64_3(uint32_t threads, uint2 *d_hash_512, const uint32_t round) +{ + // This kernel outputs 2x 256-bits hashes in 512-bits chain offsets in 2 rounds + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + if (thread < threads) + { + uint2x4 state[4]; + state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]); + state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]); + state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]); + state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]); + + for (int i = 0; i < 12; i++) + round_lyra(state); + + const size_t offset = (size_t)8 * thread + (round * 4U); + uint2 *pdst = (uint2*)(&d_hash_512[offset]); + pdst[0] = state[0].x; + pdst[1] = state[0].y; + pdst[2] = state[0].z; + pdst[3] = state[0].w; } } #else @@ -513,6 +557,8 @@ __device__ void* DMatrix; __global__ void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash) {} __global__ void lyra2_gpu_hash_32_2(uint32_t threads, uint64_t *g_hash) {} __global__ void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_64_1(uint32_t threads, uint2* const d_hash_512, const uint32_t round) {} +__global__ void lyra2_gpu_hash_64_3(uint32_t threads, uint2 *d_hash_512, const uint32_t round) {} #endif __host__ @@ -545,9 +591,7 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_hash, bool gtx7 if (cuda_arch[dev_id] >= 520) { lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash); - lyra2_gpu_hash_32_2 <<< grid1, block1, 24 * (8 - 0) * sizeof(uint2) * tpb >>> (threads, d_hash); - lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash); } else if (cuda_arch[dev_id] >= 500) @@ -562,11 +606,57 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_hash, bool gtx7 shared_mem = 6144; lyra2_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash); - lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash); - lyra2_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash); } else lyra2_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, d_hash); } + +__host__ +void lyra2_cuda_hash_64(int thr_id, const uint32_t threads, uint64_t* d_hash_256, uint32_t* d_hash_512, bool gtx750ti) +{ + int dev_id = device_map[thr_id % MAX_GPUS]; + uint32_t tpb = TPB52; + if (cuda_arch[dev_id] >= 520) tpb = TPB52; + else if (cuda_arch[dev_id] >= 500) tpb = TPB50; + else if (cuda_arch[dev_id] >= 200) tpb = TPB20; + + dim3 grid1((size_t(threads) * 4 + tpb - 1) / tpb); + dim3 block1(4, tpb >> 2); + + dim3 grid2((threads + 64 - 1) / 64); + dim3 block2(64); + + if (cuda_arch[dev_id] >= 520) + { + const size_t shared_mem = sizeof(uint2) * tpb * 192; // 49152; + lyra2_gpu_hash_64_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0); + lyra2_gpu_hash_32_2 <<< grid1, block1, shared_mem >>> (threads, d_hash_256); + 
lyra2_gpu_hash_64_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0); + + lyra2_gpu_hash_64_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1); + lyra2_gpu_hash_32_2 <<< grid1, block1, shared_mem >>> (threads, d_hash_256); + lyra2_gpu_hash_64_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1); + } + else if (cuda_arch[dev_id] >= 500) + { + size_t shared_mem = gtx750ti ? 8192 : 6144; // 8 or 10 warps + lyra2_gpu_hash_64_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0); + lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash_256); + lyra2_gpu_hash_64_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0); + + lyra2_gpu_hash_64_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1); + lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash_256); + lyra2_gpu_hash_64_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1); + } + else { + // alternative method for SM 3.x + hash64_to_lyra32(thr_id, threads, d_hash_512, d_hash_256, 0); + lyra2_cpu_hash_32(thr_id, threads, d_hash_256, gtx750ti); + hash64_from_lyra32(thr_id, threads, d_hash_512, d_hash_256, 0); + hash64_to_lyra32(thr_id, threads, d_hash_512, d_hash_256, 1); + lyra2_cpu_hash_32(thr_id, threads, d_hash_256, gtx750ti); + hash64_from_lyra32(thr_id, threads, d_hash_512, d_hash_256, 1); + } +} diff --git a/lyra2/cuda_lyra2_sm2.cuh b/lyra2/cuda_lyra2_sm2.cuh index da621d0..cc0bd82 100644 --- a/lyra2/cuda_lyra2_sm2.cuh +++ b/lyra2/cuda_lyra2_sm2.cuh @@ -3,7 +3,7 @@ #ifdef __INTELLISENSE__ /* just for vstudio code colors, only uncomment that temporary, dont commit it */ //#undef __CUDA_ARCH__ -//#define __CUDA_ARCH__ 500 +//#define __CUDA_ARCH__ 300 #endif #include "cuda_helper.h" @@ -226,3 +226,66 @@ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint64_t *g_hash) /* if __CUDA_ARCH__ < 200 .. host */ __global__ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint64_t *g_hash) {} #endif + +// ------------------------------------------------------------------------------------------------------------------------- + +// lyra2 cant be used as-is in 512-bits hash chains, tx to djm for these weird offsets since first lyra2 algo... + +#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ <= 350 + +__global__ __launch_bounds__(128, 8) +void hash64_to_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const size_t offset = (size_t) 16 * thread + (round * 8U); + uint2 *psrc = (uint2*) (&d_hash64[offset]); + uint2 *pdst = (uint2*) (&d_hash_lyra[thread]); + pdst[threads*0] = __ldg(&psrc[0]); + pdst[threads*1] = __ldg(&psrc[1]); + pdst[threads*2] = __ldg(&psrc[2]); + pdst[threads*3] = __ldg(&psrc[3]); + } +} + +__global__ __launch_bounds__(128, 8) +void hash64_from_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const size_t offset = (size_t) 16 * thread + (round * 8U); + uint2 *psrc = (uint2*) (&d_hash_lyra[thread]); + uint2 *pdst = (uint2*) (&d_hash64[offset]); + pdst[0] = psrc[0]; + pdst[1] = psrc[threads*1]; + pdst[2] = psrc[threads*2]; + pdst[3] = psrc[threads*3]; + } +} +#else +/* if __CUDA_ARCH__ < 200 .. 
host */ +__global__ void hash64_to_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) {} +__global__ void hash64_from_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) {} +#endif + +__host__ +void hash64_to_lyra32(int thr_id, const uint32_t threads, uint32_t* d_hash64, uint64_t* d_hash_lyra, const uint32_t round) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + hash64_to_lyra32_gpu <<>> (threads, d_hash64, (uint2*) d_hash_lyra, round); +} + +__host__ +void hash64_from_lyra32(int thr_id, const uint32_t threads, uint32_t* d_hash64, uint64_t* d_hash_lyra, const uint32_t round) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + hash64_from_lyra32_gpu <<>> (threads, d_hash64, (uint2*) d_hash_lyra, round); +} diff --git a/lyra2/cuda_lyra2_sm5.cuh b/lyra2/cuda_lyra2_sm5.cuh index 4a3caeb..85adfd9 100644 --- a/lyra2/cuda_lyra2_sm5.cuh +++ b/lyra2/cuda_lyra2_sm5.cuh @@ -591,13 +591,12 @@ void reduceDuplexRowV50_8(const int rowInOut, uint2 state[4], const uint32_t thr __global__ __launch_bounds__(64, 1) void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint2 *g_hash) { - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - const uint2x4 blake2b_IV[2] = { { { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } }, { { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } } }; + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { uint2x4 state[4]; @@ -629,7 +628,6 @@ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint2 *g_hash) if (thread < threads) { uint2 state[4]; - state[0] = __ldg(&DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x]); state[1] = __ldg(&DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x]); state[2] = __ldg(&DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x]); @@ -669,7 +667,6 @@ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash) if (thread < threads) { uint2x4 state[4]; - state[0] = __ldg4(&((uint2x4*)DMatrix)[0 * threads + thread]); state[1] = __ldg4(&((uint2x4*)DMatrix)[1 * threads + thread]); state[2] = __ldg4(&((uint2x4*)DMatrix)[2 * threads + thread]); @@ -685,9 +682,68 @@ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash) } } +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_64_1_sm5(uint32_t threads, uint2* const d_hash_512, const uint32_t round) +{ + const uint2x4 blake2b_IV[2] = { + { { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } }, + { { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } } + }; + // This kernel loads 2x 256-bits hashes from 512-bits chain offsets in 2 steps + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint2x4 state[4]; + const size_t offset = (size_t)8 * thread + (round * 4U); + uint2 *psrc = (uint2*)(&d_hash_512[offset]); + state[0].x = state[1].x = __ldg(&psrc[0]); + state[0].y = state[1].y = __ldg(&psrc[1]); + state[0].z = state[1].z = __ldg(&psrc[2]); + state[0].w = state[1].w = __ldg(&psrc[3]); + + state[1] = state[0]; + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + + for (int i = 0; 
i<24; i++) + round_lyra(state); + + ((uint2x4*)DMatrix)[threads * 0 + thread] = state[0]; + ((uint2x4*)DMatrix)[threads * 1 + thread] = state[1]; + ((uint2x4*)DMatrix)[threads * 2 + thread] = state[2]; + ((uint2x4*)DMatrix)[threads * 3 + thread] = state[3]; + } +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_64_3_sm5(uint32_t threads, uint2 *d_hash_512, const uint32_t round) +{ + // This kernel outputs 2x 256-bits hashes in 512-bits chain offsets in 2 steps + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + if (thread < threads) + { + uint2x4 state[4]; + state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]); + state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]); + state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]); + state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]); + + for (int i = 0; i < 12; i++) + round_lyra(state); + + const size_t offset = (size_t)8 * thread + (round * 4U); + uint2 *pdst = (uint2*)(&d_hash_512[offset]); + pdst[0] = state[0].x; + pdst[1] = state[0].y; + pdst[2] = state[0].z; + pdst[3] = state[0].w; + } +} #else /* if __CUDA_ARCH__ != 500 .. host */ __global__ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint2 *g_hash) {} __global__ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint2 *g_hash) {} __global__ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_64_1_sm5(uint32_t threads, uint2* const d_hash_512, const uint32_t round) {} +__global__ void lyra2_gpu_hash_64_3_sm5(uint32_t threads, uint2 *d_hash_512, const uint32_t round) {} #endif diff --git a/miner.h b/miner.h index 16f57ab..d3118dc 100644 --- a/miner.h +++ b/miner.h @@ -303,6 +303,7 @@ extern int scanhash_neoscrypt(int thr_id, struct work *work, uint32_t max_nonce, extern int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_phi2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_polytimos(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_quark(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); @@ -371,6 +372,7 @@ extern void free_neoscrypt(int thr_id); extern void free_nist5(int thr_id); extern void free_pentablake(int thr_id); extern void free_phi(int thr_id); +extern void free_phi2(int thr_id); extern void free_polytimos(int thr_id); extern void free_quark(int thr_id); extern void free_qubit(int thr_id); @@ -918,7 +920,8 @@ void myriadhash(void *state, const void *input); void neoscrypt(uchar *output, const uchar *input, uint32_t profile); void nist5hash(void *state, const void *input); void pentablakehash(void *output, const void *input); -void phihash(void *output, const void *input); +void phi_hash(void *output, const void *input); +void phi2_hash(void *output, const void *input); void polytimos_hash(void *output, const void *input); void quarkhash(void *state, const void *input); void qubithash(void *state, const void *input); diff --git a/phi/cuda_phi2.cu b/phi/cuda_phi2.cu new file mode 100644 index 0000000..a0bcf6d --- /dev/null +++ b/phi/cuda_phi2.cu @@ -0,0 +1,89 @@ 
+#include <stdio.h> +#include <memory.h> + +#include "cuda_helper.h" + +__global__ __launch_bounds__(128, 8) +void phi_filter_gpu(const uint32_t threads, const uint32_t* d_hash, uint32_t* d_branch2, uint32_t* d_NonceBranch) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t offset = thread * 16U; // 64U / sizeof(uint32_t); + uint4 *psrc = (uint4*) (&d_hash[offset]); + d_NonceBranch[thread] = ((uint8_t*)psrc)[0] & 1; + if (d_NonceBranch[thread]) return; + if (d_branch2) { + uint4 *pdst = (uint4*)(&d_branch2[offset]); + uint4 data; + data = psrc[0]; pdst[0] = data; + data = psrc[1]; pdst[1] = data; + data = psrc[2]; pdst[2] = data; + data = psrc[3]; pdst[3] = data; + } + } +} + +__global__ __launch_bounds__(128, 8) +void phi_merge_gpu(const uint32_t threads, uint32_t* d_hash, uint32_t* d_branch2, uint32_t* const d_NonceBranch) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads && !d_NonceBranch[thread]) + { + const uint32_t offset = thread * 16U; + uint4 *psrc = (uint4*) (&d_branch2[offset]); + uint4 *pdst = (uint4*) (&d_hash[offset]); + uint4 data; + data = psrc[0]; pdst[0] = data; + data = psrc[1]; pdst[1] = data; + data = psrc[2]; pdst[2] = data; + data = psrc[3]; pdst[3] = data; + } +} + +__global__ +void phi_final_compress_gpu(const uint32_t threads, uint32_t* d_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t offset = thread * 16U; + uint2 *psrc = (uint2*) (&d_hash[offset]); + uint2 *pdst = (uint2*) (&d_hash[offset]); + uint2 data; + data = psrc[4]; pdst[0] ^= data; + data = psrc[5]; pdst[1] ^= data; + data = psrc[6]; pdst[2] ^= data; + data = psrc[7]; pdst[3] ^= data; + } +} + +__host__ +uint32_t phi_filter_cuda(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_br2, uint32_t* d_nonces) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + // extract algo permutation hashes to a second branch buffer + phi_filter_gpu <<<grid, block>>> (threads, inpHashes, d_br2, d_nonces); + return threads; +} + +__host__ +void phi_merge_cuda(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_br2, uint32_t* d_nonces) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + // copy the second branch hashes back to the common buffer d_hash + phi_merge_gpu <<<grid, block>>> (threads, outpHashes, d_br2, d_nonces); +} + +__host__ +void phi_final_compress_cuda(const int thr_id, const uint32_t threads, uint32_t *d_hashes) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + phi_final_compress_gpu <<<grid, block>>> (threads, d_hashes); +} diff --git a/x11/phi.cu b/phi/phi.cu similarity index 97% rename from x11/phi.cu rename to phi/phi.cu index ab1f308..ba2a967 100644 --- a/x11/phi.cu +++ b/phi/phi.cu @@ -19,7 +19,7 @@ extern "C" { #include "miner.h" #include "cuda_helper.h" -#include "cuda_x11.h" +#include "x11/cuda_x11.h" extern void skein512_cpu_setBlock_80(void *pdata); extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int swap); @@ -38,7 +38,7 @@ extern void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash, static uint32_t *d_hash[MAX_GPUS]; static uint32_t *d_resNonce[MAX_GPUS]; -extern "C" void phihash(void *output, 
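/*
 * Recap of the cuda_phi2.cu helpers above, which implement the phi2 branch
 * split: phi_filter_cuda() records (hash byte 0 & 1) per nonce and, when a
 * second buffer is passed (pre-SM50 compat path), snapshots the even-branch
 * hashes into it; the filtered gost/echo kernels then only touch their own
 * lanes; phi_merge_cuda() copies the echo results back into the main buffer.
 * Hedged usage sketch, mirroring the scanhash_phi2() loop later in this patch:
 *
 *   phi_filter_cuda(thr_id, throughput, d_hash_512[thr_id], NULL, d_nonce_br[thr_id]);
 *   phi_streebog_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]); // odd hashes
 *   phi_echo512_cpu_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]); // even, run twice
 */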
const void *input) +extern "C" void phi_hash(void *output, const void *input) { unsigned char _ALIGN(128) hash[128] = { 0 }; @@ -162,7 +162,7 @@ extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, u uint32_t _ALIGN(64) vhash[8]; if (!use_compat_kernels[thr_id]) work->nonces[0] += startNonce; be32enc(&endiandata[19], work->nonces[0]); - phihash(vhash, endiandata); + phi_hash(vhash, endiandata); if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { work->valid_nonces = 1; @@ -173,7 +173,7 @@ extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, u if (work->nonces[1] != UINT32_MAX) { work->nonces[1] += startNonce; be32enc(&endiandata[19], work->nonces[1]); - phihash(vhash, endiandata); + phi_hash(vhash, endiandata); bn_set_target_ratio(work, vhash, 1); work->valid_nonces++; pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; diff --git a/phi/phi2.cu b/phi/phi2.cu new file mode 100644 index 0000000..537217f --- /dev/null +++ b/phi/phi2.cu @@ -0,0 +1,255 @@ +// +// PHI2 algo +// CubeHash + Lyra2 x2 + JH + Gost or Echo + Skein +// +// Implemented by tpruvot in May 2018 +// + +extern "C" { +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_streebog.h" +#include "sph/sph_echo.h" +#include "lyra2/Lyra2.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include "x11/cuda_x11.h" + +#include +#include + +extern void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata); +extern void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); + +extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); +extern void lyra2_cuda_hash_64(int thr_id, const uint32_t threads, uint64_t* d_hash_256, uint32_t* d_hash_512, bool gtx750ti); + +extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); +extern void phi_streebog_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t *g_hash, uint32_t *d_filter); +extern void phi_echo512_cpu_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t* g_hash, uint32_t* d_filter); + +extern uint32_t phi_filter_cuda(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_br2, uint32_t* d_nonces); +extern void phi_merge_cuda(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_br2, uint32_t* d_nonces); +extern void phi_final_compress_cuda(const int thr_id, const uint32_t threads, uint32_t *d_hashes); + +static uint64_t* d_matrix[MAX_GPUS]; +static uint32_t* d_hash_512[MAX_GPUS]; +static uint64_t* d_hash_256[MAX_GPUS]; +static uint32_t* d_hash_br2[MAX_GPUS]; +static uint32_t* d_nonce_br[MAX_GPUS]; + +extern "C" void phi2_hash(void *output, const void *input) +{ + unsigned char _ALIGN(128) hash[128] = { 0 }; + unsigned char _ALIGN(128) hashA[64] = { 0 }; + unsigned char _ALIGN(128) hashB[64] = { 0 }; + + sph_cubehash512_context ctx_cubehash; + sph_jh512_context ctx_jh; + sph_gost512_context ctx_gost; + sph_echo512_context ctx_echo; + sph_skein512_context ctx_skein; + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, input, 80); + sph_cubehash512_close(&ctx_cubehash, (void*)hashB); + + LYRA2(&hashA[ 0], 32, &hashB[ 0], 32, &hashB[ 0], 32, 1, 8, 8); + LYRA2(&hashA[32], 32, &hashB[32], 32, &hashB[32], 32, 1, 8, 8); + + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, (const void*)hashA, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + if (hash[0] & 1) { + sph_gost512_init(&ctx_gost); + 
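/* phi2 branch point: the parity of JH's first output byte selects either this
   single streebog (GOST R 34.11-2012) pass or the two chained echo512 passes
   in the else branch below -- the CPU reference for the filtered GPU kernels
   added by this patch. */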
sph_gost512(&ctx_gost, (const void*)hash, 64); + sph_gost512_close(&ctx_gost, (void*)hash); + } else { + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + } + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + for (int i=0; i<32; i++) + hash[i] ^= hash[i+32]; + + memcpy(output, hash, 32); +} + +//#define _DEBUG +#define _DEBUG_PREFIX "phi-" +#include "cuda_debug.cuh" + +static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; +static __thread bool gtx750ti = false; + +extern "C" int scanhash_phi2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + + const uint32_t first_nonce = pdata[19]; + const int dev_id = device_map[thr_id]; + + int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 17 : 16; + if (device_sm[dev_id] == 500) intensity = 15; + if (device_sm[dev_id] == 600) intensity = 17; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + if (init[thr_id]) throughput = max(throughput & 0xffffff80, 128); // for shared mem + + if (opt_benchmark) + ptarget[7] = 0xff; + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + gtx750ti = (strstr(device_name[dev_id], "GTX 750 Ti") != NULL); + + size_t matrix_sz = device_sm[dev_id] > 500 ? 
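/* hedged sizing note: the second operand of this ternary,
   8 * 8 * 3 * 4 uint64_t (~6 KB per thread), is the full Lyra2
   nRows=8 x nCols=8 matrix of 12-word blocks used by the generic kernels,
   while on SM > 5.0 the matrix appears to stay on-chip and only a 16-word
   (128-byte) per-thread state spill is kept in global memory: */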
sizeof(uint64_t) * 16 : sizeof(uint64_t) * 8 * 8 * 3 * 4; + CUDA_CALL_OR_RET_X(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput), -1); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_256[thr_id], (size_t)32 * throughput), -1); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_512[thr_id], (size_t)64 * throughput), -1); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_nonce_br[thr_id], sizeof(uint32_t) * throughput), -1); + if (use_compat_kernels[thr_id]) { + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_br2[thr_id], (size_t)64 * throughput), -1); + } + + x11_cubehash512_cpu_init(thr_id, throughput); + lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]); + quark_jh512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + if (use_compat_kernels[thr_id]) x11_echo512_cpu_init(thr_id, throughput); + + cuda_check_cpu_init(thr_id, throughput); + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k = 0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + cuda_check_cpu_setTarget(ptarget); + cubehash512_setBlock_80(thr_id, endiandata); + + do { + int order = 0; + + cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash_512[thr_id]); order++; + TRACE("cube "); + + lyra2_cuda_hash_64(thr_id, throughput, d_hash_256[thr_id], d_hash_512[thr_id], gtx750ti); + order++; + TRACE("lyra "); + + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_512[thr_id], order++); + TRACE("jh "); + + order++; + if (!use_compat_kernels[thr_id]) { + phi_filter_cuda(thr_id, throughput, d_hash_512[thr_id], NULL, d_nonce_br[thr_id]); + phi_streebog_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]); + phi_echo512_cpu_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]); + phi_echo512_cpu_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]); + } else { + // todo: nonces vector to reduce amount of hashes to compute + phi_filter_cuda(thr_id, throughput, d_hash_512[thr_id], d_hash_br2[thr_id], d_nonce_br[thr_id]); + streebog_cpu_hash_64(thr_id, throughput, d_hash_512[thr_id]); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order); + phi_merge_cuda(thr_id, throughput, d_hash_512[thr_id], d_hash_br2[thr_id], d_nonce_br[thr_id]); + } + TRACE("mix "); + + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_512[thr_id], order++); + TRACE("skein "); + + phi_final_compress_cuda(thr_id, throughput, d_hash_512[thr_id]); + TRACE("xor "); + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash_512[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + phi2_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + *hashes_done = pdata[19] - first_nonce + throughput; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash_512[thr_id], 1); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + phi2_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + if (pdata[19] > max_nonce) pdata[19] = max_nonce; + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + 
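/* Every candidate nonce returned by cuda_check_hash() is re-hashed with the
   CPU phi2_hash() above; reaching this branch means the GPU and CPU results
   disagree (corrupted kernel output, unstable overclock...), so the share is
   counted as a hardware error below instead of being submitted. */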
gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! thr=%x", work->nonces[0], throughput); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_phi2(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + cudaFree(d_matrix[thr_id]); + cudaFree(d_hash_512[thr_id]); + cudaFree(d_hash_256[thr_id]); + cudaFree(d_nonce_br[thr_id]); + if (use_compat_kernels[thr_id]) cudaFree(d_hash_br2[thr_id]); + + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/util.cpp b/util.cpp index 70dc626..ee1c1ee 100644 --- a/util.cpp +++ b/util.cpp @@ -2250,7 +2250,7 @@ void print_hash_tests(void) pentablakehash(&hash[0], &buf[0]); printpfx("pentablake", hash); - phihash(&hash[0], &buf[0]); + phi2_hash(&hash[0], &buf[0]); printpfx("phi", hash); polytimos_hash(&hash[0], &buf[0]); diff --git a/x11/cuda_streebog_maxwell.cu b/x11/cuda_streebog_maxwell.cu index 6a06332..4ff580b 100644 --- a/x11/cuda_streebog_maxwell.cu +++ b/x11/cuda_streebog_maxwell.cu @@ -207,7 +207,7 @@ __launch_bounds__(TPB, 3) #else __launch_bounds__(TPB, 3) #endif -void streebog_gpu_hash_64_maxwell(uint64_t *g_hash) +void streebog_gpu_hash_64_sm5(uint64_t *g_hash, uint32_t* const d_filter, const uint32_t filter_val) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); uint2 buf[8], t[8], temp[8], K0[8], hash[8]; @@ -222,13 +222,16 @@ void streebog_gpu_hash_64_maxwell(uint64_t *g_hash) shared[6][threadIdx.x] = __ldg(&T62[threadIdx.x]); shared[7][threadIdx.x] = __ldg(&T72[threadIdx.x]); + //__threadfence_block(); + __syncthreads(); + + if (d_filter && d_filter[thread] != filter_val) return; + uint64_t* inout = &g_hash[thread<<3]; *(uint2x4*)&hash[0] = __ldg4((uint2x4*)&inout[0]); *(uint2x4*)&hash[4] = __ldg4((uint2x4*)&inout[4]); - __threadfence_block(); - K0[0] = vectorize(0x74a5d4ce2efc83b3); #pragma unroll 8 @@ -301,9 +304,17 @@ void streebog_gpu_hash_64_maxwell(uint64_t *g_hash) } __host__ -void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *d_hash) +void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *g_hash) +{ + dim3 grid((threads + TPB-1) / TPB); + dim3 block(TPB); + streebog_gpu_hash_64_sm5 <<<grid, block>>> ((uint64_t*)g_hash, NULL, 0); +} + +__host__ +void phi_streebog_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t *g_hash, uint32_t *d_filter) { dim3 grid((threads + TPB-1) / TPB); dim3 block(TPB); - streebog_gpu_hash_64_maxwell <<<grid, block>>> ((uint64_t*)d_hash); + streebog_gpu_hash_64_sm5 <<<grid, block>>> ((uint64_t*)g_hash, d_filter, 1); } diff --git a/x16/cuda_x16_echo512_64.cu b/x16/cuda_x16_echo512_64.cu index ac18ff6..3a0f268 100644 --- a/x16/cuda_x16_echo512_64.cu +++ b/x16/cuda_x16_echo512_64.cu @@ -79,11 +79,12 @@ static void echo_round_alexis(const uint32_t sharedMemory[4][256], uint32_t *W, } __global__ __launch_bounds__(128, 5) /* will force 80 registers */ -static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash) +static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t* g_hash, uint32_t* const d_filter, const uint32_t filter_val) { __shared__ uint32_t sharedMemory[4][256]; aes_gpu_init128(sharedMemory); + __syncthreads(); const uint32_t thread = (blockDim.x * blockIdx.x + 
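/* Same barrier discipline as streebog_gpu_hash_64_sm5 above: every thread of
   the block must pass the __syncthreads() that follows the shared AES table
   fill before any filtered thread may return, because exiting early around a
   barrier is undefined behaviour. The d_filter test therefore comes after
   the barrier, inside the thread-range guard below. */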
threadIdx.x); uint32_t k0; @@ -91,6 +92,9 @@ static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash) uint32_t hash[16]; if (thread < threads) { + // phi2 filter (2 hash chain branches) + if (d_filter && d_filter[thread] != filter_val) return; + uint32_t *Hash = &g_hash[thread<<4]; *(uint2x4*)&h[ 0] = __ldg4((uint2x4*)&Hash[ 0]); @@ -99,8 +103,6 @@ static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash) *(uint2x4*)&hash[ 0] = *(uint2x4*)&h[ 0]; *(uint2x4*)&hash[ 8] = *(uint2x4*)&h[ 8]; - __syncthreads(); - const uint32_t P[48] = { 0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, //8-12 @@ -217,7 +219,6 @@ static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash) W[48 + i + 4] = a ^ cd ^ bcx; W[48 + i + 8] = d ^ ab ^ cdx; W[48 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx; - } for (int k = 1; k < 10; k++) @@ -237,12 +238,23 @@ static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash) } __host__ -void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash){ - +void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash) +{ const uint32_t threadsperblock = 128; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x16_echo512_gpu_hash_64<<<grid, block>>>(threads, d_hash); + x16_echo512_gpu_hash_64 <<<grid, block>>> (threads, d_hash, NULL, 0); } + +__host__ +void phi_echo512_cpu_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t* g_hash, uint32_t* d_filter) +{ + const uint32_t threadsperblock = 128; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + x16_echo512_gpu_hash_64 <<<grid, block>>> (threads, g_hash, d_filter, 0); +} \ No newline at end of file From 07859f93cef68072d8011f3f9a60d443ae11289e Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 10 Jun 2018 18:32:37 +0200 Subject: [PATCH 10/24] update windows version + 2.2.6 readme --- README.txt | 7 ++++--- res/ccminer.rc | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/README.txt b/README.txt index af0718d..0862870 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,5 @@ -ccminer 2.2.6 (Under Dev) +ccminer 2.2.6 "phi2 and allium" --------------------------------------------------------------- *************************************************************** @@ -103,7 +103,7 @@ its command line interface and options. neoscrypt use to mine FeatherCoin, Trezarcoin, Orbitcoin, etc nist5 use to mine TalkCoin penta use to mine Joincoin / Pentablake - phi use to mine LUXCoin + phi2 use to mine LUXCoin polytimos use to mine Polytimos quark use to mine Quarkcoin qubit use to mine Qubit @@ -282,7 +282,8 @@ so we can more efficiently implement new algorithms using the latest hardware features. >>> RELEASE HISTORY <<< - 2018 v2.2.6 + June 10th 2018 v2.2.6 + New phi2 algo for LUX New allium algo for Garlic Apr. 
02nd 2018 v2.2.5 diff --git a/res/ccminer.rc b/res/ccminer.rc index e031f82..78be94c 100644 --- a/res/ccminer.rc +++ b/res/ccminer.rc @@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico" // VS_VERSION_INFO VERSIONINFO - FILEVERSION 2,2,5,0 - PRODUCTVERSION 2,2,5,0 + FILEVERSION 2,2,6,0 + PRODUCTVERSION 2,2,6,0 FILEFLAGSMASK 0x3fL #ifdef _DEBUG FILEFLAGS 0x21L @@ -76,10 +76,10 @@ BEGIN BEGIN BLOCK "040904e4" BEGIN - VALUE "FileVersion", "2.2.5" + VALUE "FileVersion", "2.2.6" VALUE "LegalCopyright", "Copyright (C) 2018" VALUE "ProductName", "ccminer" - VALUE "ProductVersion", "2.2.5" + VALUE "ProductVersion", "2.2.6" END END BLOCK "VarFileInfo" From 9fd5b04af628dc395a964ee4bb3126fcdd5f65da Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 18 Jun 2018 08:31:55 +0200 Subject: [PATCH 11/24] phi2: handle stratum sc hashes --- Makefile.am | 2 +- ccminer.cpp | 23 ++- ccminer.vcxproj | 1 + ccminer.vcxproj.filters | 3 + configure.ac | 2 +- equi/equi-stratum.cpp | 2 +- miner.h | 2 +- phi/cuda_phi2_cubehash512.cu | 319 +++++++++++++++++++++++++++++++++++ phi/phi2.cu | 35 ++-- util.cpp | 18 +- 10 files changed, 386 insertions(+), 21 deletions(-) create mode 100644 phi/cuda_phi2_cubehash512.cu diff --git a/Makefile.am b/Makefile.am index 5d5652c..80a80c8 100644 --- a/Makefile.am +++ b/Makefile.am @@ -81,7 +81,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ x16/cuda_x16_shabal512.cu x16/cuda_x16_simd512_80.cu \ x16/cuda_x16_echo512_64.cu \ x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \ - phi/phi.cu phi/phi2.cu phi/cuda_phi2.cu x11/cuda_streebog_maxwell.cu \ + phi/phi.cu phi/phi2.cu phi/cuda_phi2.cu phi/cuda_phi2_cubehash512.cu x11/cuda_streebog_maxwell.cu \ x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu # scrypt diff --git a/ccminer.cpp b/ccminer.cpp index c1567a1..7f01a80 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -103,6 +103,7 @@ bool submit_old = false; bool use_syslog = false; bool use_colors = true; int use_pok = 0; +int use_roots = 0; static bool opt_background = false; bool opt_quiet = false; int opt_maxlograte = 3; @@ -698,6 +699,10 @@ static bool work_decode(const json_t *val, struct work *work) data_size = 192; adata_sz = 180/4; break; + case ALGO_PHI2: + data_size = 144; + adata_sz = data_size / 4; + break; case ALGO_NEOSCRYPT: case ALGO_ZR5: data_size = 80; @@ -743,6 +748,12 @@ static bool work_decode(const json_t *val, struct work *work) for (i = 0; i < atarget_sz; i++) work->target[i] = le32dec(work->target + i); + if (opt_algo == ALGO_PHI2) { + for (i = 20; i < 36; i++) if (work->data[i]) { + use_roots = 1; break; + } + } + if ((opt_showdiff || opt_max_diff > 0.) 
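/* phi2 work layout recap: data[0..19] is the usual 80-byte header (nonce at
   data[19]) and data[20..35] carries the optional 64-byte LUX smart-contract
   roots (state root + UTXO root), hence data_size = 144 above; use_roots is
   raised on any non-zero word of that tail so submission keeps the long
   header too. */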
&& !allow_mininginfo) calc_network_diff(work); @@ -1066,6 +1077,9 @@ static bool submit_upstream_work(CURL *curl, struct work *work) else if (opt_algo == ALGO_DECRED) { data_size = 192; adata_sz = 180/4; } + else if (opt_algo == ALGO_PHI2 && use_roots) { + data_size = 144; adata_sz = 36; + } else if (opt_algo == ALGO_SIA) { return sia_submit(curl, pool, work); } @@ -1629,10 +1643,17 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) for (i = 0; i < 8; i++) work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); for (i = 0; i < 8; i++) - work->data[17 + i] = ((uint32_t*)sctx->job.claim)[i]; + work->data[17 + i] = ((uint32_t*)sctx->job.extra)[i]; work->data[25] = le32dec(sctx->job.ntime); work->data[26] = le32dec(sctx->job.nbits); work->data[28] = 0x80000000; + } else if (opt_algo == ALGO_PHI2) { + for (i = 0; i < 8; i++) + work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); + work->data[17] = le32dec(sctx->job.ntime); + work->data[18] = le32dec(sctx->job.nbits); + for (i = 0; i < 16; i++) + work->data[20 + i] = be32dec((uint32_t*)sctx->job.extra + i); } else if (opt_algo == ALGO_SIA) { uint32_t extra = 0; memcpy(&extra, &sctx->job.coinbase[32], 2); diff --git a/ccminer.vcxproj b/ccminer.vcxproj index f20449a..c0aa954 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -541,6 +541,7 @@ + compute_50,sm_50;compute_52,sm_52 diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 96220ae..667331a 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -796,6 +796,9 @@ Source Files\CUDA\phi + + Source Files\CUDA\phi + Source Files\CUDA\skunk diff --git a/configure.ac b/configure.ac index e164456..5489e9c 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2.2.6], [], [ccminer], [http://github.com/tpruvot/ccminer]) +AC_INIT([ccminer], [2.2.7], [], [ccminer], [http://github.com/tpruvot/ccminer]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/equi/equi-stratum.cpp b/equi/equi-stratum.cpp index 403c185..26433cc 100644 --- a/equi/equi-stratum.cpp +++ b/equi/equi-stratum.cpp @@ -101,7 +101,7 @@ bool equi_stratum_set_target(struct stratum_ctx *sctx, json_t *params) target_be[31-i] = target_bin[i]; if (target_bin[i]) filled++; } - memcpy(sctx->job.claim, target_be, 32); // hack, unused struct field + memcpy(sctx->job.extra, target_be, 32); pthread_mutex_lock(&stratum_work_lock); sctx->next_diff = target_to_diff_equi((uint32_t*) &target_be); diff --git a/miner.h b/miner.h index d3118dc..2853906 100644 --- a/miner.h +++ b/miner.h @@ -669,7 +669,7 @@ struct stratum_job { unsigned char version[4]; unsigned char nbits[4]; unsigned char ntime[4]; - unsigned char claim[32]; // lbry + unsigned char extra[64]; // like lbry claimtrie bool clean; unsigned char nreward[2]; uint32_t height; diff --git a/phi/cuda_phi2_cubehash512.cu b/phi/cuda_phi2_cubehash512.cu new file mode 100644 index 0000000..e0e7fd7 --- /dev/null +++ b/phi/cuda_phi2_cubehash512.cu @@ -0,0 +1,319 @@ +/* phi2 cubehash-512 144-bytes input (80 + 64) */ + +#include +#include + +#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ +#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ + +#if __CUDA_ARCH__ < 350 +#define LROT(x,bits) ((x << bits) | (x >> (32 - bits))) +#else +#define LROT(x, bits) __funnelshift_l(x, x, bits) +#endif + +#define ROTATEUPWARDS7(a) LROT(a,7) +#define ROTATEUPWARDS11(a) LROT(a,11) + +#define SWAP(a,b) { uint32_t u = a; a = b; b = u; } + +#ifdef NO_MIDSTATE + +__device__ __constant__ +static const uint32_t 
c_IV_512[32] = { + 0x2AEA2A61, 0x50F494D4, 0x2D538B8B, 0x4167D83E, + 0x3FEE2313, 0xC701CF8C, 0xCC39968E, 0x50AC5695, + 0x4D42C787, 0xA647A8B3, 0x97CF0BEF, 0x825B4537, + 0xEEF864D2, 0xF22090C4, 0xD0E5CD33, 0xA23911AE, + 0xFCD398D9, 0x148FE485, 0x1B017BEF, 0xB6444532, + 0x6A536159, 0x2FF5781C, 0x91FA7934, 0x0DBADEA9, + 0xD65C8A2B, 0xA5A70E75, 0xB1C62456, 0xBC796576, + 0x1921C8F7, 0xE7989AF1, 0x7795D246, 0xD43E3B44 +}; + +#endif + +__device__ __forceinline__ +static void rrounds(uint32_t x[2][2][2][2][2]) +{ + int r; + int j; + int k; + int l; + int m; + +//#pragma unroll 16 + for (r = 0;r < CUBEHASH_ROUNDS;++r) { + + /* "add x_0jklm into x_1jklmn modulo 2^32" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 7 bits" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]); + + /* "swap x_00klm with x_01klm" */ +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + SWAP(x[0][0][k][l][m],x[0][1][k][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jk0m with x_1jk1m" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (m = 0;m < 2;++m) + SWAP(x[1][j][k][0][m],x[1][j][k][1][m]) + + /* "add x_0jklm into x_1jklm modulo 2^32" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 11 bits" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]); + + /* "swap x_0j0lm with x_0j1lm" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + SWAP(x[0][j][0][l][m],x[0][j][1][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jkl0 with x_1jkl1" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) + SWAP(x[1][j][k][l][0],x[1][j][k][l][1]) + + } +} + +__device__ __forceinline__ +static void block_tox(uint32_t* const block, uint32_t x[2][2][2][2][2]) +{ + // read 32 bytes input from global mem with uint2 chunks + AS_UINT2(x[0][0][0][0]) ^= AS_UINT2(&block[0]); + AS_UINT2(x[0][0][0][1]) ^= AS_UINT2(&block[2]); + AS_UINT2(x[0][0][1][0]) ^= AS_UINT2(&block[4]); + AS_UINT2(x[0][0][1][1]) ^= AS_UINT2(&block[6]); +} + +__device__ __forceinline__ +static void hash_fromx(uint32_t hash[16], uint32_t const x[2][2][2][2][2]) +{ + // used to write final hash to global mem + AS_UINT2(&hash[ 0]) = 
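/* CubeHash-512 keeps a 1024-bit state in x[2][2][2][2][2]; the digest is
   simply its first 512 bits, i.e. the sixteen x[0][...] words streamed out
   here two at a time: */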
AS_UINT2(x[0][0][0][0]); + AS_UINT2(&hash[ 2]) = AS_UINT2(x[0][0][0][1]); + AS_UINT2(&hash[ 4]) = AS_UINT2(x[0][0][1][0]); + AS_UINT2(&hash[ 6]) = AS_UINT2(x[0][0][1][1]); + AS_UINT2(&hash[ 8]) = AS_UINT2(x[0][1][0][0]); + AS_UINT2(&hash[10]) = AS_UINT2(x[0][1][0][1]); + AS_UINT2(&hash[12]) = AS_UINT2(x[0][1][1][0]); + AS_UINT2(&hash[14]) = AS_UINT2(x[0][1][1][1]); +} + +#define Init(x) \ + AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_IV_512[ 0]); \ + AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_IV_512[ 2]); \ + AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_IV_512[ 4]); \ + AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_IV_512[ 6]); \ + AS_UINT2(x[0][1][0][0]) = AS_UINT2(&c_IV_512[ 8]); \ + AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_IV_512[10]); \ + AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_IV_512[12]); \ + AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_IV_512[14]); \ + AS_UINT2(x[1][0][0][0]) = AS_UINT2(&c_IV_512[16]); \ + AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_IV_512[18]); \ + AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_IV_512[20]); \ + AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_IV_512[22]); \ + AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_IV_512[24]); \ + AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_IV_512[26]); \ + AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_IV_512[28]); \ + AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_IV_512[30]); + +__device__ __forceinline__ +static void Update32(uint32_t x[2][2][2][2][2], uint32_t* const data) +{ + /* "xor the block into the first b bytes of the state" */ + block_tox(data, x); + /* "and then transform the state invertibly through r identical rounds" */ + rrounds(x); +} + +__device__ __forceinline__ +static void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval) +{ + /* "the integer 1 is xored into the last state word x_11111" */ + x[1][1][1][1][1] ^= 1; + + /* "the state is then transformed invertibly through 10r identical rounds" */ + #pragma unroll 10 + for (int i = 0; i < 10; i++) rrounds(x); + + /* "output the first h/8 bytes of the state" */ + hash_fromx(hashval, x); +} + +__host__ void phi2_cubehash512_cpu_init(int thr_id, uint32_t threads) { } + +/***************************************************/ + +/** + * Timetravel and x16 CUBEHASH-80 CUDA implementation + * by tpruvot@github - Jan 2017 / May 2018 + */ + +__constant__ static uint32_t c_midstate128[32]; +__constant__ static uint32_t c_PaddedMessage_144[36]; + +#undef SPH_C32 +#undef SPH_C64 +#undef SPH_T32 +#undef SPH_T64 +#include "sph/sph_cubehash.h" + +__host__ +void cubehash512_setBlock_144(int thr_id, uint32_t* endiandata) +{ + sph_cubehash512_context ctx_cubehash; + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, (void*)endiandata, 64); +#ifndef NO_MIDSTATE + cudaMemcpyToSymbol(c_midstate128, ctx_cubehash.state, 128, 0, cudaMemcpyHostToDevice); +#endif + cudaMemcpyToSymbol(c_PaddedMessage_144, endiandata, sizeof(c_PaddedMessage_144), 0, cudaMemcpyHostToDevice); +} + +__global__ +void cubehash512_gpu_hash_144(const uint32_t threads, const uint32_t startNounce, uint64_t *g_outhash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t nonce = startNounce + thread; + uint32_t message[8]; + uint32_t x[2][2][2][2][2]; +#ifdef NO_MIDSTATE + Init(x); + + // first 32 bytes + AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[0]); + AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[4]); + Update32(x, message); + + // second 32 bytes + AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[8]); + AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[12]); + Update32(x, message); +#else + 
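/* midstate path: the first 64 bytes of the 144-byte phi2 header do not
   depend on the nonce, so cubehash512_setBlock_144() hashed those two
   32-byte blocks once on the CPU (sph_cubehash512) and uploaded the
   resulting 32-word state; each thread resumes from it, skipping Init()
   and two Update32() calls: */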
AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_midstate128[ 0]); + AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_midstate128[ 2]); + AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_midstate128[ 4]); + AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_midstate128[ 6]); + AS_UINT2(x[0][1][0][0]) = AS_UINT2(&c_midstate128[ 8]); + AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_midstate128[10]); + AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_midstate128[12]); + AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_midstate128[14]); + + AS_UINT2(x[1][0][0][0]) = AS_UINT2(&c_midstate128[16]); + AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_midstate128[18]); + AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_midstate128[20]); + AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_midstate128[22]); + AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_midstate128[24]); + AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_midstate128[26]); + AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_midstate128[28]); + AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_midstate128[30]); +#endif + // nonce + state root + AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[16]); + message[3] = cuda_swab32(nonce); + AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[20]); // state + Update32(x, message); + + AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[24]); // state + AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[28]); // utxo + Update32(x, message); + + AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[32]); // utxo + message[4] = 0x80; + message[5] = 0; + message[6] = 0; + message[7] = 0; + Update32(x, message); + + uint32_t* output = (uint32_t*) (&g_outhash[(size_t)8 * thread]); + Final(x, output); + } +} + +__host__ +void cubehash512_cuda_hash_144(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash) +{ + const uint32_t threadsperblock = 256; + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + cubehash512_gpu_hash_144 <<<grid, block>>> (threads, startNounce, (uint64_t*) d_hash); +} + diff --git a/phi/phi2.cu b/phi/phi2.cu index 537217f..fbdb9c4 100644 --- a/phi/phi2.cu +++ b/phi/phi2.cu @@ -1,5 +1,5 @@ // -// PHI2 algo +// PHI2 algo (with smart contracts header) // CubeHash + Lyra2 x2 + JH + Gost or Echo + Skein // // Implemented by tpruvot in May 2018 @@ -24,6 +24,9 @@ extern "C" { extern void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata); extern void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); +extern void cubehash512_setBlock_144(int thr_id, uint32_t* endiandata); +extern void cubehash512_cuda_hash_144(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); + extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); extern void lyra2_cuda_hash_64(int thr_id, const uint32_t threads, uint64_t* d_hash_256, uint32_t* d_hash_512, bool gtx750ti); @@ -41,11 +44,13 @@ static uint64_t* d_hash_256[MAX_GPUS]; static uint32_t* d_hash_br2[MAX_GPUS]; static uint32_t* d_nonce_br[MAX_GPUS]; +static bool has_roots; + extern "C" void phi2_hash(void *output, const void *input) { - unsigned char _ALIGN(128) hash[128] = { 0 }; - unsigned char _ALIGN(128) hashA[64] = { 0 }; - unsigned char _ALIGN(128) hashB[64] = { 0 }; + unsigned char _ALIGN(128) hash[64]; + unsigned char _ALIGN(128) hashA[64]; + unsigned char _ALIGN(128) hashB[64]; sph_cubehash512_context ctx_cubehash; sph_jh512_context ctx_jh; @@ -54,7 +59,7 @@ extern "C" void phi2_hash(void *output, const void *input) sph_skein512_context ctx_skein; sph_cubehash512_init(&ctx_cubehash); - 
sph_cubehash512(&ctx_cubehash, input, 80); + sph_cubehash512(&ctx_cubehash, input, has_roots ? 144 : 80); sph_cubehash512_close(&ctx_cubehash, (void*)hashB); LYRA2(&hashA[ 0], 32, &hashB[ 0], 32, &hashB[ 0], 32, 1, 8, 8); @@ -137,7 +142,6 @@ extern "C" int scanhash_phi2(int thr_id, struct work* work, uint32_t max_nonce, CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_br2[thr_id], (size_t)64 * throughput), -1); } - x11_cubehash512_cpu_init(thr_id, throughput); lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]); quark_jh512_cpu_init(thr_id, throughput); quark_skein512_cpu_init(thr_id, throughput); @@ -147,17 +151,26 @@ extern "C" int scanhash_phi2(int thr_id, struct work* work, uint32_t max_nonce, init[thr_id] = true; } - uint32_t endiandata[20]; - for (int k = 0; k < 20; k++) + has_roots = false; + uint32_t endiandata[36]; + for (int k = 0; k < 36; k++) { be32enc(&endiandata[k], pdata[k]); + if (k >= 20 && pdata[k]) has_roots = true; + } cuda_check_cpu_setTarget(ptarget); - cubehash512_setBlock_80(thr_id, endiandata); + if (has_roots) + cubehash512_setBlock_144(thr_id, endiandata); + else + cubehash512_setBlock_80(thr_id, endiandata); do { int order = 0; - - cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash_512[thr_id]); order++; + if (has_roots) + cubehash512_cuda_hash_144(thr_id, throughput, pdata[19], d_hash_512[thr_id]); + else + cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash_512[thr_id]); + order++; TRACE("cube "); lyra2_cuda_hash_64(thr_id, throughput, d_hash_256[thr_id], d_hash_512[thr_id], gtx750ti); diff --git a/util.cpp b/util.cpp index ee1c1ee..49cd854 100644 --- a/util.cpp +++ b/util.cpp @@ -1442,7 +1442,7 @@ static uint32_t getblocheight(struct stratum_ctx *sctx) static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) { const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime; - const char *claim = NULL, *nreward = NULL; + const char *extradata = NULL, *nreward = NULL; size_t coinb1_size, coinb2_size; bool clean, ret = false; int merkle_count, i, p=0; @@ -1452,7 +1452,8 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) int ntime; char algo[64] = { 0 }; get_currentalgo(algo, sizeof(algo)); - bool has_claim = !strcasecmp(algo, "lbry"); + bool has_claim = !strcmp(algo, "lbry"); + bool has_roots = !strcmp(algo, "phi2") && json_array_size(params) == 10; if (sctx->is_equihash) { return equi_stratum_notify(sctx, params); @@ -1461,11 +1462,17 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) job_id = json_string_value(json_array_get(params, p++)); prevhash = json_string_value(json_array_get(params, p++)); if (has_claim) { - claim = json_string_value(json_array_get(params, p++)); - if (!claim || strlen(claim) != 64) { + extradata = json_string_value(json_array_get(params, p++)); + if (!extradata || strlen(extradata) != 64) { applog(LOG_ERR, "Stratum notify: invalid claim parameter"); goto out; } + } else if (has_roots) { + extradata = json_string_value(json_array_get(params, p++)); + if (!extradata || strlen(extradata) != 128) { + applog(LOG_ERR, "Stratum notify: invalid UTXO root parameter"); + goto out; + } } coinb1 = json_string_value(json_array_get(params, p++)); coinb2 = json_string_value(json_array_get(params, p++)); @@ -1529,7 +1536,8 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) free(sctx->job.job_id); sctx->job.job_id = strdup(job_id); hex2bin(sctx->job.prevhash, prevhash, 32); - if (has_claim) hex2bin(sctx->job.claim, claim, 32); + if (has_claim) 
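/* sctx->job.extra is shared by both features: lbry claims are 32 bytes (the
   64 hex chars checked above) while phi2 state/UTXO roots are 64 bytes (128
   hex chars), which is why the old claim[32] field was widened to extra[64]
   in miner.h. */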
hex2bin(sctx->job.extra, extradata, 32); + if (has_roots) hex2bin(sctx->job.extra, extradata, 64); sctx->job.height = getblocheight(sctx); From 968d2ba0499ac49cd67518a4d8e3d5e9d017c6bb Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Thu, 21 Jun 2018 10:11:22 +0200 Subject: [PATCH 12/24] phi2: fix the double endian swap on roots --- ccminer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ccminer.cpp b/ccminer.cpp index 7f01a80..00fe1cd 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -1653,7 +1653,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) work->data[17] = le32dec(sctx->job.ntime); work->data[18] = le32dec(sctx->job.nbits); for (i = 0; i < 16; i++) - work->data[20 + i] = be32dec((uint32_t*)sctx->job.extra + i); + work->data[20 + i] = ((uint32_t*)sctx->job.extra)[i]; } else if (opt_algo == ALGO_SIA) { uint32_t extra = 0; memcpy(&extra, &sctx->job.coinbase[32], 2); From 77c4b8724ea0d0d53c00f27beba66c603d9ff4d4 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 23 Jun 2018 13:29:22 +0200 Subject: [PATCH 13/24] handle new cryptonight variants, stellite, aeon special thanks for klausT changes and ystarnaud who helped me to adapt my kernel variants... Signed-off-by: Tanguy Pruvot --- algos.h | 31 +++++ ccminer.cpp | 41 ++++++- crypto/cn_aes.cuh | 1 + crypto/cn_blake.cuh | 2 +- crypto/cn_groestl.cuh | 9 +- crypto/cn_jh.cuh | 20 +-- crypto/cn_keccak.cuh | 2 +- crypto/cn_skein.cuh | 22 ++-- crypto/cryptolight-core.cu | 86 +++++++++++-- crypto/cryptolight-cpu.cpp | 39 ++++-- crypto/cryptolight.cu | 30 +++-- crypto/cryptolight.h | 13 +- crypto/cryptonight-core.cu | 236 ++++++++++++++++++++++++++++-------- crypto/cryptonight-cpu.cpp | 62 ++++++++-- crypto/cryptonight-extra.cu | 175 +++++++++++--------------- crypto/cryptonight.cu | 36 +++--- crypto/cryptonight.h | 13 +- crypto/xmr-rpc.cpp | 10 +- miner.h | 14 ++- util.cpp | 10 +- 20 files changed, 593 insertions(+), 259 deletions(-) diff --git a/algos.h b/algos.h index 229d8e9..c484bcc 100644 --- a/algos.h +++ b/algos.h @@ -72,6 +72,9 @@ enum sha_algos { ALGO_WHIRLPOOLX, ALGO_WILDKECCAK, ALGO_ZR5, + ALGO_MONERO, + ALGO_GRAFT, + ALGO_STELLITE, ALGO_AUTO, ALGO_COUNT }; @@ -146,6 +149,9 @@ static const char *algo_names[] = { "whirlpoolx", "wildkeccak", "zr5", + "monero", + "graft", + "stellite", "auto", /* reserved for multi algo */ "" }; @@ -206,4 +212,29 @@ static inline int algo_to_int(char* arg) return i; } +static inline int get_cryptonight_algo(int fork) +{ + int algo = ALGO_COUNT; + + switch (fork) { + case 8: + algo = ALGO_GRAFT; + break; + + case 7: + algo = ALGO_MONERO; + break; + + case 3: + algo = ALGO_STELLITE; + break; + + default: + algo = ALGO_CRYPTONIGHT; + break; + } + + return algo; +} + #endif diff --git a/ccminer.cpp b/ccminer.cpp index 00fe1cd..6521284 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -233,6 +233,8 @@ int opt_api_mcast_port = 4068; bool opt_stratum_stats = false; +int cryptonight_fork = 1; + static char const usage[] = "\ Usage: " PROGRAM_NAME " [OPTIONS]\n\ Options:\n\ @@ -245,7 +247,7 @@ Options:\n\ blakecoin Fast Blake 256 (8 rounds)\n\ bmw BMW 256\n\ cryptolight AEON cryptonight (MEM/2)\n\ - cryptonight XMR cryptonight\n\ + cryptonight XMR cryptonight v1 (old)\n\ c11/flax X11 variant\n\ decred Decred Blake256\n\ deep Deepcoin\n\ @@ -253,6 +255,7 @@ Options:\n\ dmd-gr Diamond-Groestl\n\ fresh Freshcoin (shavite 80)\n\ fugue256 Fuguecoin\n\ + graft Cryptonight v8\n\ groestl Groestlcoin\n" #ifdef WITH_HEAVY_ALGO " heavy Heavycoin\n" @@ -267,6 +270,7 @@ 
Options:\n\ lyra2v2 VertCoin\n\ lyra2z ZeroCoin (3rd impl)\n\ myr-gr Myriad-Groestl\n\ + monero XMR cryptonight (v7)\n\ neoscrypt FeatherCoin, Phoenix, UFO...\n\ nist5 NIST5 (TalkCoin)\n\ penta Pentablake hash (5x Blake 512)\n\ @@ -284,6 +288,7 @@ Options:\n\ skein Skein SHA2 (Skeincoin)\n\ skein2 Double Skein (Woodcoin)\n\ skunk Skein Cube Fugue Streebog\n\ + stellite Cryptonight v3\n\ s3 S3 (1Coin)\n\ timetravel Machinecoin permuted x8\n\ tribus Denarius\n\ @@ -573,7 +578,10 @@ static bool get_blocktemplate(CURL *curl, struct work *work); void get_currentalgo(char* buf, int sz) { - snprintf(buf, sz, "%s", algo_names[opt_algo]); + int algo = opt_algo; + if (algo == ALGO_CRYPTONIGHT) + algo = get_cryptonight_algo(cryptonight_fork); + snprintf(buf, sz, "%s", algo_names[algo]); } void format_hashrate(double hashrate, char *output) @@ -2372,11 +2380,16 @@ static void *miner_thread(void *userdata) rc = scanhash_c11(thr_id, &work, max_nonce, &hashes_done); break; case ALGO_CRYPTOLIGHT: - rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done); + rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done, 1); break; case ALGO_CRYPTONIGHT: - rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done); + { + int cn_variant = 0; + if (cryptonight_fork > 1 && ((unsigned char*)work.data)[0] >= cryptonight_fork) + cn_variant = ((unsigned char*)work.data)[0] - cryptonight_fork + 1; + rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done, cn_variant); break; + } case ALGO_DECRED: rc = scanhash_decred(thr_id, &work, max_nonce, &hashes_done); break; @@ -3138,6 +3151,26 @@ void parse_arg(int key, char *arg) case ALGO_SCRYPT_JANE: opt_nfactor = 14; break; } } + + // cryptonight variants + switch (opt_algo) { + case ALGO_MONERO: + opt_algo = ALGO_CRYPTONIGHT; + cryptonight_fork = 7; + break; + case ALGO_GRAFT: + opt_algo = ALGO_CRYPTONIGHT; + cryptonight_fork = 8; + break; + case ALGO_STELLITE: + opt_algo = ALGO_CRYPTONIGHT; + cryptonight_fork = 3; + break; + case ALGO_CRYPTONIGHT: + cryptonight_fork = 1; + break; + } + break; case 'b': p = strstr(arg, ":"); diff --git a/crypto/cn_aes.cuh b/crypto/cn_aes.cuh index df419b3..99ad212 100644 --- a/crypto/cn_aes.cuh +++ b/crypto/cn_aes.cuh @@ -138,6 +138,7 @@ static const __device__ __align__(16) uint32_t d_t_fn[1024] = { */ #define AS_U32(addr) *((uint32_t*)(addr)) +#define AS_U64(addr) *((uint64_t*)(addr)) #define AS_UINT2(addr) *((uint2*)(addr)) #define AS_UINT4(addr) *((uint4*)(addr)) #define AS_UL2(addr) *((ulonglong2*)(addr)) diff --git a/crypto/cn_blake.cuh b/crypto/cn_blake.cuh index 5c0d09f..bd2ba43 100644 --- a/crypto/cn_blake.cuh +++ b/crypto/cn_blake.cuh @@ -164,7 +164,7 @@ void cn_blake_final(blake_state * __restrict__ S, uint8_t * __restrict__ digest) } __device__ -void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint8_t * __restrict__ out) +void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint32_t * out) { blake_state bs; blake_state *S = (blake_state *)&bs; diff --git a/crypto/cn_groestl.cuh b/crypto/cn_groestl.cuh index 62530d4..425e062 100644 --- a/crypto/cn_groestl.cuh +++ b/crypto/cn_groestl.cuh @@ -274,13 +274,14 @@ void cn_groestl_final(groestlHashState* __restrict__ ctx, BitSequence* __restri for (i = GROESTL_SIZE512-hashbytelen; i < GROESTL_SIZE512; i++,j++) { output[j] = s[i]; } - +#if 0 for (i = 0; i < GROESTL_COLS512; i++) { ctx->chaining[i] = 0; } for (i = 0; i < GROESTL_SIZE512; i++) { ctx->buffer[i] = 0; } +#endif } __device__ @@ -336,12 +337,12 @@ void 
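/* Tying the ccminer.cpp hunks above together: the first blob byte is the
   cryptonote major block version, and --algo picks the fork base through
   cryptonight_fork (monero = 7, graft = 8, stellite = 3, plain = 1); once a
   blob reaches that version the kernel variant becomes version - fork + 1,
   so a version-8 graft blob runs the variant-1 kernels while older blobs
   keep the legacy variant-0 path. */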
cn_groestl_init(groestlHashState* ctx) } __device__ -void cn_groestl(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval) +void cn_groestl(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval) { DataLength databitlen = len << 3; groestlHashState context; cn_groestl_init(&context); - cn_groestl_update(&context, data, databitlen); - cn_groestl_final(&context, hashval); + cn_groestl_update(&context, (BitSequence*) data, databitlen); + cn_groestl_final(&context, (BitSequence*) hashval); } diff --git a/crypto/cn_jh.cuh b/crypto/cn_jh.cuh index c2df763..b05380d 100644 --- a/crypto/cn_jh.cuh +++ b/crypto/cn_jh.cuh @@ -198,8 +198,9 @@ void cn_jh_update(jhHashState * __restrict__ state, const uint8_t * __restrict__ databitlen = 0; } - if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) ) { - memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ) ; + if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) ) + { + memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ); index = 64-(state->datasize_in_buffer >> 3); databitlen = databitlen - (512 - state->datasize_in_buffer); cn_jh_F8(state); @@ -222,7 +223,7 @@ void cn_jh_update(jhHashState * __restrict__ state, const uint8_t * __restrict__ /* pad the message, process the padded block(s), truncate the hash value H to obtain the message digest */ __device__ -void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashval) +void cn_jh_final(jhHashState * __restrict__ state, uint8_t * __restrict__ hashval) { unsigned int i; //uint32_t *bufptr = (uint32_t *)state->buffer; @@ -244,7 +245,7 @@ void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashv } else { - /*set the rest of the bytes in the buffer to 0*/ + /* set the rest of the bytes in the buffer to 0 */ if ( (state->datasize_in_buffer & 7) == 0) { for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) state->buffer[i] = 0; } else { @@ -268,7 +269,8 @@ void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashv cn_jh_F8(state); } - MEMCPY4(hashval, ((unsigned char*)state->x) + 64 + 32, 8); + memcpy(hashval, ((unsigned char*)state->x) + 64 + 32, 32); + //MEMCPY4(hashval, ((unsigned char*)state->x) + 64 + 32, 8); } __device__ @@ -277,12 +279,12 @@ void cn_jh_init(jhHashState *state, int hashbitlen) state->databitlen = 0; state->datasize_in_buffer = 0; state->hashbitlen = hashbitlen; - //memcpy(state->x, d_JH256_H0, 128); - MEMCPY8(state->x, d_JH256_H0, 128 / 8); + memcpy(state->x, d_JH256_H0, 128); + //MEMCPY8(state->x, d_JH256_H0, 128 / 8); } __device__ -void cn_jh256(const uint8_t * __restrict__ data, DataLength len, uint32_t * __restrict__ hashval) +void cn_jh(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval) { const int hashbitlen = 256; DataLength databitlen = len << 3; @@ -290,5 +292,5 @@ void cn_jh256(const uint8_t * __restrict__ data, DataLength len, uint32_t * __re cn_jh_init(&state, hashbitlen); cn_jh_update(&state, data, databitlen); - cn_jh_final(&state, hashval); + cn_jh_final(&state, (uint8_t*) hashval); } diff --git a/crypto/cn_keccak.cuh b/crypto/cn_keccak.cuh index 3acef7a..c6f5908 100644 --- a/crypto/cn_keccak.cuh +++ b/crypto/cn_keccak.cuh @@ -195,7 +195,7 @@ void cn_keccakf(uint64_t *s) } __device__ __forceinline__ -void cn_keccak(const uint8_t * __restrict__ in, uint8_t * 
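/* The cn_* prototype cleanups in this patch converge blake/groestl/jh/skein
   on one (input bytes, length, 32-bit word output) shape, presumably to keep
   the cryptonight finalizer's choice of extra hash simple. Hypothetical
   dispatch sketch (illustrative only, not code from this patch):

     switch (((uint8_t*)state)[0] & 3) {
       case 0: cn_blake((uint8_t*)state, 200, hash); break;   // BLAKE-256
       case 1: cn_groestl((uint8_t*)state, 200, hash); break; // Groestl-256
       case 2: cn_jh((uint8_t*)state, 200, hash); break;      // JH-256
       case 3: cn_skein((uint8_t*)state, 200, hash); break;   // Skein-256
     }
*/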
__restrict__ md) +void cn_keccak(const uint32_t * __restrict__ in, uint64_t * __restrict__ md) { uint64_t st[25]; diff --git a/crypto/cn_skein.cuh b/crypto/cn_skein.cuh index 2096467..0e68143 100644 --- a/crypto/cn_skein.cuh +++ b/crypto/cn_skein.cuh @@ -4,19 +4,15 @@ typedef unsigned int uint_t; /* native unsigned integer */ #define SKEIN_256_STATE_WORDS ( 4) #define SKEIN_512_STATE_WORDS ( 8) -#define SKEIN1024_STATE_WORDS (16) #define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS) #define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS) -#define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS) #define SKEIN_256_STATE_BITS (64*SKEIN_256_STATE_WORDS) #define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS) -#define SKEIN1024_STATE_BITS (64*SKEIN1024_STATE_WORDS) #define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS) #define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS) -#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS) #define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32)) #define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22) @@ -119,7 +115,7 @@ typedef struct { } skeinHashState; __device__ -void cn_skein256_init(skeinHashState *state, size_t hashBitLen) +void cn_skein_init(skeinHashState *state, size_t hashBitLen) { const uint64_t SKEIN_512_IV_256[] = { @@ -258,14 +254,12 @@ void cn_skein_block(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restr } __device__ -void cn_skein256_update(skeinHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen) +void cn_skein_update(skeinHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen) { if ((databitlen & 7) == 0) { - cn_skein_block(&state->u.ctx_512, data, databitlen >> 3); } else { - size_t bCnt = (databitlen >> 3) + 1; uint8_t b,mask; @@ -280,7 +274,7 @@ void cn_skein256_update(skeinHashState * __restrict__ state, const uint8_t * __r } __device__ -void cn_skein256_final(skeinHashState * __restrict__ state, uint32_t * __restrict__ hashVal) +void cn_skein_final(skeinHashState * __restrict__ state, uint8_t * __restrict__ hashVal) { uint64_t X[SKEIN_512_STATE_WORDS]; Skein_512_Ctxt_t *ctx = (Skein_512_Ctxt_t *)&state->u.ctx_512; @@ -305,13 +299,13 @@ void cn_skein256_final(skeinHashState * __restrict__ state, uint32_t * __restric ((uint64_t *)ctx->b)[0] = (uint64_t)i; Skein_Start_New_Type(ctx, OUT_FINAL); cn_skein_processblock(ctx, ctx->b, 1, sizeof(uint64_t)); - memcpy(hashVal + (i*SKEIN_512_BLOCK_BYTES/sizeof(uint32_t)), ctx->X, n); + memcpy(hashVal + (i*SKEIN_512_BLOCK_BYTES), ctx->X, n); memcpy(ctx->X, X, sizeof(X)); // restore the counter mode key for next time } } __device__ -void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * __restrict__ hashval) +void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval) { int hashbitlen = 256; DataLength databitlen = len << 3; @@ -319,7 +313,7 @@ void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * __re state.statebits = 64*SKEIN_512_STATE_WORDS; - cn_skein256_init(&state, hashbitlen); - cn_skein256_update(&state, data, databitlen); - cn_skein256_final(&state, hashval); + cn_skein_init(&state, hashbitlen); + cn_skein_update(&state, data, databitlen); + cn_skein_final(&state, (uint8_t*) hashval); } diff --git a/crypto/cryptolight-core.cu b/crypto/cryptolight-core.cu index 3891768..8f0bb75 100644 --- a/crypto/cryptolight-core.cu +++ b/crypto/cryptolight-core.cu @@ -36,7 +36,7 @@ void cryptolight_core_gpu_phase1(int threads, 
uint32_t * long_state, uint32_t * if(thread < threads) { - const int oft = thread * 52 + sub + 16; // not aligned 16! + const int oft = thread * 50 + sub + 16; // not aligned 16! const int long_oft = (thread << LONG_SHL_IDX) + sub; uint32_t __align__(16) key[40]; uint32_t __align__(16) text[4]; @@ -57,8 +57,10 @@ void cryptolight_core_gpu_phase1(int threads, uint32_t * long_state, uint32_t * } } +// -------------------------------------------------------------------------------------------------------------- + __global__ -void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b) +void cryptolight_old_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b) { __shared__ uint32_t __align__(16) sharedMemory[1024]; @@ -209,6 +211,70 @@ void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int #endif // __CUDA_ARCH__ >= 300 } +__device__ __forceinline__ void store_variant1(uint32_t* long_state) +{ + uint4* Z = (uint4*) long_state; + const uint32_t tmp = (Z->z >> 24); // __byte_perm(src, 0, 0x7773); + const uint32_t index = (((tmp >> 3) & 6u) | (tmp & 1u)) << 1; + Z->z = (Z->z & 0x00ffffffu) | ((tmp ^ ((0x75310u >> index) & 0x30u)) << 24); +} + +#define MUL_SUM_XOR_DST_1(a,c,dst,tweak) { \ + uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], ((uint64_t *)dst)[0], &hi) + ((uint64_t *)c)[1]; \ + hi += ((uint64_t *)c)[0]; \ + ((uint64_t *)c)[0] = ((uint64_t *)dst)[0] ^ hi; \ + ((uint64_t *)c)[1] = ((uint64_t *)dst)[1] ^ lo; \ + ((uint64_t *)dst)[0] = hi; \ + ((uint64_t *)dst)[1] = lo ^ tweak; } + +__global__ +void cryptolight_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx, + uint32_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, + uint64_t * __restrict__ d_tweak) +{ + __shared__ __align__(16) uint32_t sharedMemory[1024]; + cn_aes_gpu_init(sharedMemory); + __syncthreads(); + + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + if (thread < threads) + { + const uint32_t batchsize = ITER >> (2 + bfactor); + const uint32_t start = partidx * batchsize; + const uint32_t end = start + batchsize; + const uint32_t longptr = thread << LONG_SHL_IDX; + uint32_t * long_state = &d_long_state[longptr]; + uint64_t tweak = d_tweak[thread]; + + void * ctx_a = (void*)(&d_ctx_a[thread << 2]); + void * ctx_b = (void*)(&d_ctx_b[thread << 2]); + uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4 + uint4 B = AS_UINT4(ctx_b); + uint32_t* a = (uint32_t*)&A; + uint32_t* b = (uint32_t*)&B; + + for (int i = start; i < end; i++) + { + uint32_t c[4]; + uint32_t j = (A.x >> 2) & E2I_MASK2; + cn_aes_single_round(sharedMemory, &long_state[j], c, a); + XOR_BLOCKS_DST(c, b, &long_state[j]); + store_variant1(&long_state[j]); + MUL_SUM_XOR_DST_1(c, a, &long_state[(c[0] >> 2) & E2I_MASK2], tweak); + + j = (A.x >> 2) & E2I_MASK2; + cn_aes_single_round(sharedMemory, &long_state[j], b, a); + XOR_BLOCKS_DST(b, c, &long_state[j]); + store_variant1(&long_state[j]); + MUL_SUM_XOR_DST_1(b, a, &long_state[(b[0] >> 2) & E2I_MASK2], tweak); + } + if (bfactor) { + AS_UINT4(ctx_a) = A; + AS_UINT4(ctx_b) = B; + } + } +} + __global__ void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint32_t * ctx_state, uint32_t * ctx_key2) { @@ -222,7 +288,7 @@ void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint3 if(thread < 
threads) { const int long_oft = (thread << LONG_SHL_IDX) + sub; - const int oft = thread * 52 + sub + 16; + const int oft = thread * 50 + sub + 16; uint32_t __align__(16) key[40]; uint32_t __align__(16) text[4]; @@ -251,8 +317,8 @@ void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint3 extern int device_bfactor[MAX_GPUS]; __host__ -void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint64_t *d_ctx_state, - uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2) +void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, + uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak) { dim3 grid(blocks); dim3 block(threads); @@ -265,17 +331,21 @@ void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_ int i, partcount = 1 << bfactor; int dev_id = device_map[thr_id]; - cryptolight_core_gpu_phase1 <<<grid, block8>>>(blocks*threads, d_long_state, (uint32_t*)d_ctx_state, d_ctx_key1); + cryptolight_core_gpu_phase1 <<<grid, block8>>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key1); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); if(partcount > 1) usleep(bsleep); for(i = 0; i < partcount; i++) { - cryptolight_core_gpu_phase2 <<<grid, (device_sm[dev_id] >= 300 ? block4 : block)>>>(blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b); + dim3 b = device_sm[dev_id] >= 300 ? block4 : block; + if (variant == 0) + cryptolight_old_gpu_phase2 <<<grid, b>>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b); + else + cryptolight_gpu_phase2 <<<grid, b>>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); if(partcount > 1) usleep(bsleep); } - cryptolight_core_gpu_phase3 <<<grid, block8>>>(blocks*threads, d_long_state, (uint32_t*)d_ctx_state, d_ctx_key2); + cryptolight_core_gpu_phase3 <<<grid, block8>>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key2); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); } diff --git a/crypto/cryptolight-cpu.cpp b/crypto/cryptolight-cpu.cpp index b0ee386..f995b4c 100644 --- a/crypto/cryptolight-cpu.cpp +++ b/crypto/cryptolight-cpu.cpp @@ -22,6 +22,16 @@ struct cryptonight_ctx { oaes_ctx* aes_ctx; }; + +static void cryptolight_store_variant(void* state, int variant) { + if (variant == 1) { + // use variant 1 like monero since june 2018 + const uint8_t tmp = ((const uint8_t*)(state))[11]; + const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30); + } +} + static void do_blake_hash(const void* input, int len, void* output) { uchar hash[32]; @@ -132,14 +142,14 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, ui ((uint64_t*) dst)[0] += ((uint64_t*) c)[0]; } -static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) { +static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak) { uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1]; hi += ((uint64_t*) c)[0]; ((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi; ((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo; ((uint64_t*) dst)[0] = hi; - ((uint64_t*) dst)[1] = lo; + ((uint64_t*) dst)[1] = variant ? lo ^ tweak : lo; }
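/*
 * Editor's note: a minimal standalone sketch (not part of the patch; names
 * are illustrative) of the two variant-1 building blocks used above. The
 * per-job tweak XORs blob bytes 35..42 with the last 64-bit word of the
 * 25-word Keccak state, and the stored-block substitution flips bits 4..5
 * of byte 11 of each 16-byte AES block via the packed nibble table 0x75310.
 */
#include <stdint.h>
#include <string.h>

static uint64_t variant1_tweak(const uint8_t *blob, const uint64_t keccak_w[25])
{
	uint64_t t;
	memcpy(&t, blob + 35, sizeof(t)); /* bytes 35..42 of the 76-byte blob */
	return t ^ keccak_w[24];          /* ctx->state.hs.w[24] in the code above */
}

static void variant1_sub_byte11(uint8_t block[16])
{
	const uint8_t tmp = block[11];
	/* even index in 0..14 selects one nibble of 0x75310 */
	const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
	block[11] = tmp ^ ((0x75310 >> index) & 0x30);
}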
static void copy_block(uint8_t* dst, const uint8_t* src) { @@ -157,13 +167,18 @@ static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) { ((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1]; } -static void cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx) +static int cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx, const int variant) { size_t i, j; + if (variant && len < 43) + return 0; + keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len); ctx->aes_ctx = (oaes_ctx*) oaes_alloc(); memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); + const uint64_t tweak = variant ? *((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0; + oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE); for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) { #undef RND @@ -186,14 +201,16 @@ static void cryptolight_hash_ctx(void* output, const void* input, const int len, j = e2i(ctx->a); aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a); xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]); + cryptolight_store_variant(&ctx->long_state[j], variant); - mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)]); + mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)], variant, tweak); j = e2i(ctx->a); aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a); xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]); + cryptolight_store_variant(&ctx->long_state[j], variant); - mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)]); + mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)], variant, tweak); } memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); @@ -219,11 +236,19 @@ static void cryptolight_hash_ctx(void* output, const void* input, const int len, if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo); oaes_free((OAES_CTX **) &ctx->aes_ctx); + return 1; } -void cryptolight_hash(void* output, const void* input, int len) +int cryptolight_hash_variant(void* output, const void* input, int len, int variant) { struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx)); - cryptolight_hash_ctx(output, input, len, ctx); + int rc = cryptolight_hash_ctx(output, input, len, ctx, variant); free(ctx); + return rc; } + +void cryptolight_hash(void* output, const void* input) +{ + cryptolight_hash_variant(output, input, 76, 1); +} + diff --git a/crypto/cryptolight.cu b/crypto/cryptolight.cu index c8ab8ea..c2a10e4 100644 --- a/crypto/cryptolight.cu +++ b/crypto/cryptolight.cu @@ -7,16 +7,17 @@ static __thread uint32_t cn_blocks = 32; static __thread uint32_t cn_threads = 16; static uint32_t *d_long_state[MAX_GPUS]; -static uint64_t *d_ctx_state[MAX_GPUS]; +static uint32_t *d_ctx_state[MAX_GPUS]; static uint32_t *d_ctx_key1[MAX_GPUS]; static uint32_t *d_ctx_key2[MAX_GPUS]; static uint32_t *d_ctx_text[MAX_GPUS]; +static uint64_t *d_ctx_tweak[MAX_GPUS]; static uint32_t *d_ctx_a[MAX_GPUS]; static uint32_t *d_ctx_b[MAX_GPUS]; static bool init[MAX_GPUS] = { 0 }; -extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant) { int res = 0; uint32_t throughput = 0; @@ -26,6 +27,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_ uint32_t *nonceptr = (uint32_t*) (((char*)pdata) + 39);
const uint32_t first_nonce = *nonceptr; uint32_t nonce = first_nonce; + int dev_id = device_map[thr_id]; if(opt_benchmark) { ptarget[7] = 0x00ff; @@ -33,6 +35,10 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_ if(!init[thr_id]) { + if (!device_config[thr_id] && strcmp(device_name[dev_id], "TITAN V") == 0) { + device_config[thr_id] = strdup("80x32"); + } + if (device_config[thr_id]) { sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads); throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads); @@ -63,11 +69,11 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_ } const size_t alloc = MEMORY * throughput; - cryptonight_extra_cpu_init(thr_id, throughput); + cryptonight_extra_init(thr_id); cudaMalloc(&d_long_state[thr_id], alloc); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); - cudaMalloc(&d_ctx_state[thr_id], 26 * sizeof(uint64_t) * throughput); + cudaMalloc(&d_ctx_state[thr_id], 25 * sizeof(uint64_t) * throughput); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); @@ -79,6 +85,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_ exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_tweak[thr_id], sizeof(uint64_t) * throughput); init[thr_id] = true; } @@ -90,10 +97,10 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_ const uint32_t Htarg = ptarget[7]; uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX }; - cryptonight_extra_cpu_setData(thr_id, pdata, ptarget); - cryptonight_extra_cpu_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]); - cryptolight_core_cpu_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]); - cryptonight_extra_cpu_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]); + cryptonight_extra_setData(thr_id, pdata, ptarget); + cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]); + cryptolight_core_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]); + cryptonight_extra_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]); *hashes_done = nonce - first_nonce + throughput; @@ -104,7 +111,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_ uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39); memcpy(tempdata, pdata, 76); *tempnonceptr = resNonces[0]; - cryptolight_hash(vhash, tempdata, 76); + cryptolight_hash_variant(vhash, tempdata, 76, variant); if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) { res = 1; @@ -114,7 +121,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_ if(resNonces[1] != UINT32_MAX) { *tempnonceptr = resNonces[1]; - cryptolight_hash(vhash, tempdata, 76); + cryptolight_hash_variant(vhash, tempdata, 76, variant); if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) { res++; work->nonces[1] = resNonces[1]; @@ -157,10 +164,11 @@ void 
free_cryptolight(int thr_id) cudaFree(d_ctx_key1[thr_id]); cudaFree(d_ctx_key2[thr_id]); cudaFree(d_ctx_text[thr_id]); + cudaFree(d_ctx_tweak[thr_id]); cudaFree(d_ctx_a[thr_id]); cudaFree(d_ctx_b[thr_id]); - cryptonight_extra_cpu_free(thr_id); + cryptonight_extra_free(thr_id); cudaDeviceSynchronize(); diff --git a/crypto/cryptolight.h b/crypto/cryptolight.h index 443cf5b..482d0f8 100644 --- a/crypto/cryptolight.h +++ b/crypto/cryptolight.h @@ -134,10 +134,11 @@ static inline void exit_if_cudaerror(int thr_id, const char *src, int line) exit(1); } } -void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2); -void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn); -void cryptonight_extra_cpu_init(int thr_id, uint32_t threads); -void cryptonight_extra_cpu_free(int thr_id); -void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2); -void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *nonce, uint64_t *d_ctx_state); +void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak); + +void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget); +void cryptonight_extra_init(int thr_id/*, uint32_t threads*/); +void cryptonight_extra_free(int thr_id); +void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak); +void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces, uint32_t *d_ctx_state); diff --git a/crypto/cryptonight-core.cu b/crypto/cryptonight-core.cu index 4780f37..90f024f 100644 --- a/crypto/cryptonight-core.cu +++ b/crypto/cryptonight-core.cu @@ -2,47 +2,55 @@ #include <stdio.h> #include <stdint.h> #include <string.h> +#ifndef _WIN32 #include <unistd.h> +#endif + +#include <cuda.h> +#include <cuda_runtime.h> + +#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300 +#undef __shfl +#define __shfl(var, srcLane, width) __shfl_sync(0xFFFFFFFFu, var, srcLane, width) +#endif #include "cryptonight.h" -#define LONG_SHL32 19 // 1<<19 +#define LONG_SHL32 19 // 1<<19 (uint32_t* index) #define LONG_SHL64 18 // 1<<18 (uint64_t* index) #define LONG_LOOPS32 0x80000U -#define LONG_LOOPS64 0x40000U #include "cn_aes.cuh" __global__ -//__launch_bounds__(128, 9) // 56 registers -void cryptonight_core_gpu_phase1(const uint32_t threads, uint64_t * long_state, uint64_t * const ctx_state, uint32_t * ctx_key1) +void cryptonight_gpu_phase1(const uint32_t threads, uint32_t * __restrict__ d_long_state, + uint32_t * __restrict__ ctx_state, uint32_t * __restrict__ ctx_key1) { - __shared__ __align__(16) uint32_t sharedMemory[1024]; - cn_aes_gpu_init(sharedMemory); - __syncthreads(); + __shared__ uint32_t sharedMemory[1024]; const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3; - const uint32_t sub = (threadIdx.x & 7) << 1; // 0 2 ..
14 - if(thread < threads) { - const uint32_t long_oft = (thread << LONG_SHL64) + sub; - - const uint32_t* ctx_key = &ctx_key1[thread * 40U]; - uint4 keys[10]; - #pragma unroll 10 // load 160 bytes - for (int i = 0; i < 10; i ++) - keys[i] = AS_UINT4(&ctx_key[i*4]); + cn_aes_gpu_init(sharedMemory); + __syncthreads(); - uint4 text = AS_UINT4(&ctx_state[thread * 26U + sub + 8U]); + const uint32_t sub = (threadIdx.x & 0x7U) << 2; + uint32_t *longstate = &d_long_state[(thread << LONG_SHL32) + sub]; + uint32_t __align__(8) key[40]; + MEMCPY8(key, &ctx_key1[thread * 40U], 20); + uint32_t __align__(8) text[4]; + MEMCPY8(text, &ctx_state[thread * 50U + sub + 16U], 2); - for (uint32_t i = 0; i < LONG_LOOPS64; i += 16U) { - cn_aes_pseudo_round_mut_uint4(sharedMemory, text, keys); - AS_UINT4(&long_state[long_oft + i]) = text; + for(int i = 0; i < LONG_LOOPS32; i += 32) + { + cn_aes_pseudo_round_mut(sharedMemory, text, key); + MEMCPY8(&longstate[i], text, 2); } } } +// -------------------------------------------------------------------------------------------------------------- + __device__ __forceinline__ ulonglong2 cuda_mul128(const uint64_t multiplier, const uint64_t multiplicand) { ulonglong2 product; @@ -59,8 +67,7 @@ static __forceinline__ __device__ ulonglong2 operator ^ (const ulonglong2 &a, co return make_ulonglong2(a.x ^ b.x, a.y ^ b.y); } -#undef MUL_SUM_XOR_DST -__device__ __forceinline__ void MUL_SUM_XOR_DST(const uint64_t m, uint4 &a, void* far_dst) +__device__ __forceinline__ void MUL_SUM_XOR_DST_0(const uint64_t m, uint4 &a, void* far_dst) { ulonglong2 d = AS_UL2(far_dst); ulonglong2 p = cuda_mul128(m, d.x); @@ -73,8 +80,8 @@ __global__ #if __CUDA_ARCH__ >= 500 //__launch_bounds__(128,12) /* force 40 regs to allow -l ...x32 */ #endif -void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor, const uint32_t partidx, - uint64_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b) +void cryptonight_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx, + uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b) { __shared__ __align__(16) uint32_t sharedMemory[1024]; cn_aes_gpu_init(sharedMemory); @@ -84,7 +91,7 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor, if (thread < threads) { - const uint32_t batchsize = ITER >> (2U + bfactor); + const uint32_t batchsize = ITER >> (2 + bfactor); const uint32_t start = partidx * batchsize; const uint32_t end = start + batchsize; @@ -101,12 +108,12 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor, uint32_t j = (A.x & E2I_MASK) >> 3; cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C); AS_UINT4(&long_state[j]) = C ^ B; // st.global.u32.v4 - MUL_SUM_XOR_DST((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3]); + MUL_SUM_XOR_DST_0((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3]); j = (A.x & E2I_MASK) >> 3; cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B); AS_UINT4(&long_state[j]) = C ^ B; - MUL_SUM_XOR_DST((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3]); + MUL_SUM_XOR_DST_0((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3]); } if (bfactor) { @@ -116,71 +123,194 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor, } } +// -------------------------------------------------------------------------------------------------------------- + +__device__ __forceinline__ void store_variant1(uint64_t* long_state, 
uint4 Z) +{ + const uint32_t tmp = (Z.z >> 24); // __byte_perm(src, 0, 0x7773); + const uint32_t index = (((tmp >> 3) & 6u) | (tmp & 1u)) << 1; + Z.z = (Z.z & 0x00ffffffu) | ((tmp ^ ((0x75310u >> index) & 0x30u)) << 24); + AS_UINT4(long_state) = Z; +} + +__device__ __forceinline__ void store_variant2(uint64_t* long_state, uint4 Z) +{ + const uint32_t tmp = (Z.z >> 24); // __byte_perm(src, 0, 0x7773); + const uint32_t index = (((tmp >> 4) & 6u) | (tmp & 1u)) << 1; + Z.z = (Z.z & 0x00ffffffu) | ((tmp ^ ((0x75312u >> index) & 0x30u)) << 24); + AS_UINT4(long_state) = Z; +} + +__device__ __forceinline__ void MUL_SUM_XOR_DST_1(const uint64_t m, uint4 &a, void* far_dst, uint64_t tweak) +{ + ulonglong2 d = AS_UL2(far_dst); + ulonglong2 p = cuda_mul128(m, d.x); + p += AS_UL2(&a); + AS_UL2(&a) = p ^ d; + p.y = p.y ^ tweak; + AS_UL2(far_dst) = p; +} + __global__ -void cryptonight_core_gpu_phase3(const uint32_t threads, const uint64_t * long_state, uint64_t * ctx_state, uint32_t * __restrict__ ctx_key2) +void monero_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx, + uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, + uint64_t * __restrict__ d_tweak) { __shared__ __align__(16) uint32_t sharedMemory[1024]; cn_aes_gpu_init(sharedMemory); __syncthreads(); - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3U; - const uint32_t sub = (threadIdx.x & 7U) << 1U; + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + if (thread < threads) + { + const uint32_t batchsize = ITER >> (2 + bfactor); + const uint32_t start = partidx * batchsize; + const uint32_t end = start + batchsize; + uint64_t tweak = d_tweak[thread]; + + void * ctx_a = (void*)(&d_ctx_a[thread << 2]); + void * ctx_b = (void*)(&d_ctx_b[thread << 2]); + uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4 + uint4 B = AS_UINT4(ctx_b); - if(thread < threads) + uint64_t * long_state = &d_long_state[thread << LONG_SHL64]; + for (int i = start; i < end; i++) // end = 262144 + { + uint4 C; + uint32_t j = (A.x & E2I_MASK) >> 3; + cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C); + store_variant1(&long_state[j], C ^ B); // st.global + MUL_SUM_XOR_DST_1((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3], tweak); + + j = (A.x & E2I_MASK) >> 3; + cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B); + store_variant1(&long_state[j], C ^ B); + MUL_SUM_XOR_DST_1((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3], tweak); + } + if (bfactor) { + AS_UINT4(ctx_a) = A; + AS_UINT4(ctx_b) = B; + } + } +} + +// -------------------------------------------------------------------------------------------------------------- + +__global__ +void stellite_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx, + uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, + uint64_t * __restrict__ d_tweak) +{ + __shared__ __align__(16) uint32_t sharedMemory[1024]; + cn_aes_gpu_init(sharedMemory); + __syncthreads(); + + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + if (thread < threads) { - const uint32_t long_oft = (thread << LONG_SHL64) + sub; - const uint32_t st_oft = (thread * 26U) + sub + 8U; + const uint32_t batchsize = ITER >> (2 + bfactor); + const uint32_t start = partidx * batchsize; + const uint32_t end = start + batchsize; + uint64_t tweak = d_tweak[thread]; + + void * ctx_a = (void*)(&d_ctx_a[thread << 2]); + void * ctx_b = 
(void*)(&d_ctx_b[thread << 2]); + uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4 + uint4 B = AS_UINT4(ctx_b); + + uint64_t * long_state = &d_long_state[thread << LONG_SHL64]; + for (int i = start; i < end; i++) // end = 262144 + { + uint4 C; + uint32_t j = (A.x & E2I_MASK) >> 3; + cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C); + store_variant2(&long_state[j], C ^ B); // st.global + MUL_SUM_XOR_DST_1((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3], tweak); + + j = (A.x & E2I_MASK) >> 3; + cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B); + store_variant2(&long_state[j], C ^ B); + MUL_SUM_XOR_DST_1((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3], tweak); + } + if (bfactor) { + AS_UINT4(ctx_a) = A; + AS_UINT4(ctx_b) = B; + } + } +} + +// -------------------------------------------------------------------------------------------------------------- - uint4 key[10]; - const uint32_t* ctx_key = &ctx_key2[thread * 40U]; - #pragma unroll 10 // 160 bytes - for (int i = 0; i < 10; i++) - key[i] = AS_UINT4(&ctx_key[i*4U]); +__global__ +void cryptonight_gpu_phase3(const uint32_t threads, const uint32_t * __restrict__ d_long_state, + uint32_t * __restrict__ d_ctx_state, const uint32_t * __restrict__ d_ctx_key2) +{ + __shared__ uint32_t sharedMemory[1024]; + + cn_aes_gpu_init(sharedMemory); + __syncthreads(); + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3; - uint4 text = AS_UINT4(&ctx_state[st_oft]); + if(thread < threads) + { + const int sub = (threadIdx.x & 7) << 2; + const uint32_t *longstate = &d_long_state[(thread << LONG_SHL32) + sub]; + uint32_t key[40], text[4]; + MEMCPY8(key, d_ctx_key2 + thread * 40, 20); + MEMCPY8(text, d_ctx_state + thread * 50 + sub + 16, 2); - for(uint32_t i = 0; i < LONG_LOOPS64; i += 16U) + for(int i = 0; i < LONG_LOOPS32; i += 32) { - uint4 st = AS_UINT4(&long_state[long_oft + i]); - text = text ^ st; - cn_aes_pseudo_round_mut_uint4(sharedMemory, text, key); + #pragma unroll + for(int j = 0; j < 4; ++j) + text[j] ^= longstate[i + j]; + + cn_aes_pseudo_round_mut(sharedMemory, text, key); } - AS_UINT4(&ctx_state[st_oft]) = text; + MEMCPY8(d_ctx_state + thread * 50 + sub + 16, text, 2); } } +// -------------------------------------------------------------------------------------------------------------- + extern int device_bfactor[MAX_GPUS]; __host__ -void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint64_t *d_long_state, uint64_t *d_ctx_state, - uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2) +void cryptonight_core_cuda(int thr_id, uint32_t blocks, uint32_t threads, uint64_t *d_long_state, uint32_t *d_ctx_state, + uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak) { dim3 grid(blocks); dim3 block(threads); - //dim3 block2(threads << 1); dim3 block4(threads << 2); dim3 block8(threads << 3); - const uint32_t bfactor = (uint32_t) device_bfactor[thr_id]; - const uint32_t partcount = 1 << bfactor; + const uint16_t bfactor = (uint16_t) device_bfactor[thr_id]; + const uint32_t partcount = 1U << bfactor; const uint32_t throughput = (uint32_t) (blocks*threads); const int bsleep = bfactor ? 
100 : 0; const int dev_id = device_map[thr_id]; - cryptonight_core_gpu_phase1 <<<grid, block8>>> (throughput, d_long_state, d_ctx_state, d_ctx_key1); + cryptonight_gpu_phase1 <<<grid, block8>>> (throughput, (uint32_t*) d_long_state, d_ctx_state, d_ctx_key1); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); if(partcount > 1) usleep(bsleep); for (uint32_t i = 0; i < partcount; i++) { dim3 b = device_sm[dev_id] >= 300 ? block4 : block; - cryptonight_core_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b); + if (variant == 0) + cryptonight_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b); + else if (variant == 1 || cryptonight_fork == 8) + monero_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak); + else if (variant == 2 && cryptonight_fork == 3) + stellite_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); if(partcount > 1) usleep(bsleep); } - - cryptonight_core_gpu_phase3 <<<grid, block8>>> (throughput, d_long_state, d_ctx_state, d_ctx_key2); + //cudaDeviceSynchronize(); + //exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cryptonight_gpu_phase3 <<<grid, block8>>> (throughput, (uint32_t*) d_long_state, d_ctx_state, d_ctx_key2); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); } diff --git a/crypto/cryptonight-cpu.cpp b/crypto/cryptonight-cpu.cpp index 66b3cf4..b60798f 100644 --- a/crypto/cryptonight-cpu.cpp +++ b/crypto/cryptonight-cpu.cpp @@ -12,6 +12,20 @@ extern "C" { #include "cpu/c_keccak.h" } +static void cryptonight_store_variant(void* state, int variant) { + if (variant == 1 || cryptonight_fork == 8) { + // monero, and graft ? + const uint8_t tmp = ((const uint8_t*)(state))[11]; + const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30); + } else if (variant == 2 && cryptonight_fork == 3) { + // stellite + const uint8_t tmp = ((const uint8_t*)(state))[11]; + const uint8_t index = (((tmp >> 4) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(state))[11] = tmp ^ ((0x75312 >> index) & 0x30); + } +} + struct cryptonight_ctx { uint8_t long_state[MEMORY]; union cn_slow_hash_state state; @@ -130,14 +144,14 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, ui ((uint64_t*) dst)[0] += ((uint64_t*) c)[0]; } -static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) { +static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak1_2) { uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1]; hi += ((uint64_t*) c)[0]; ((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi; ((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo; ((uint64_t*) dst)[0] = hi; - ((uint64_t*) dst)[1] = lo; + ((uint64_t*) dst)[1] = variant ? lo ^ tweak1_2 : lo; }
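/*
 * Editor's note: a portable sketch of the multiply-sum-xor step defined just
 * above (illustrative only, not part of the patch; assumes a compiler with
 * unsigned __int128). hi:lo is the 128-bit product a[0]*dst[0]; each half is
 * summed with c without carry propagation, swap-xored into c, and written
 * back to dst, variant 1 additionally folding tweak1_2 into the low half.
 */
#include <stdint.h>

static void mul_sum_xor_sketch(const uint64_t a[2], uint64_t c[2],
                               uint64_t dst[2], int variant, uint64_t tweak1_2)
{
	const unsigned __int128 p = (unsigned __int128) a[0] * dst[0];
	const uint64_t hi = (uint64_t)(p >> 64) + c[0];
	const uint64_t lo = (uint64_t) p + c[1];
	c[0] = dst[0] ^ hi;
	c[1] = dst[1] ^ lo;
	dst[0] = hi;
	dst[1] = variant ? (lo ^ tweak1_2) : lo;
}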
static void copy_block(uint8_t* dst, const uint8_t* src) { @@ -155,13 +169,18 @@ static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) { ((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1]; } -static void cryptonight_hash_ctx(void* output, const void* input, size_t len, struct cryptonight_ctx* ctx) +static int cryptonight_hash_ctx(void* output, const void* input, const size_t len, struct cryptonight_ctx* ctx, const int variant) { size_t i, j; + if (variant && len < 43) + return 0; + keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len); ctx->aes_ctx = (oaes_ctx*) oaes_alloc(); memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); + const uint64_t tweak1_2 = variant ? *((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0; + oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE); for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) { #undef RND @@ -184,14 +203,16 @@ static void cryptonight_hash_ctx(void* output, const void* input, size_t len, st j = e2i(ctx->a) * AES_BLOCK_SIZE; aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a); xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]); + cryptonight_store_variant(&ctx->long_state[j], variant); - mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE]); + mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE], variant, tweak1_2); j = e2i(ctx->a) * AES_BLOCK_SIZE; aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a); xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]); + cryptonight_store_variant(&ctx->long_state[j], variant); - mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE]); + mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE], variant, tweak1_2); } memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); @@ -217,11 +238,38 @@ static void cryptonight_hash_ctx(void* output, const void* input, size_t len, st if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo); oaes_free((OAES_CTX **) &ctx->aes_ctx); + return 1; } -void cryptonight_hash(void* output, const void* input, size_t len) +int cryptonight_hash_variant(void* output, const void* input, size_t len, int variant) { struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx)); - cryptonight_hash_ctx(output, input, len, ctx); + int rc = cryptonight_hash_ctx(output, input, len, ctx, variant); free(ctx); + return rc; +} + +void cryptonight_hash(void* output, const void* input) +{ + cryptonight_fork = 1; + cryptonight_hash_variant(output, input, 76, 0); +} + +void graft_hash(void* output, const void* input) +{ + cryptonight_fork = 8; + cryptonight_hash_variant(output, input, 76, 1); +} + +void monero_hash(void* output, const void* input) +{ + cryptonight_fork = 7; + cryptonight_hash_variant(output, input, 76, 1); } + +void stellite_hash(void* output, const void* input) +{ + cryptonight_fork = 3; + cryptonight_hash_variant(output, input, 76, 2); +} + diff --git a/crypto/cryptonight-extra.cu b/crypto/cryptonight-extra.cu index 6d3c131..c55c518 100644 --- a/crypto/cryptonight-extra.cu +++ b/crypto/cryptonight-extra.cu @@ -7,15 +7,15 @@ #include #include -#include "cryptonight.h" -typedef uint8_t BitSequence; -typedef uint64_t DataLength; +#include "cryptonight.h" -static uint32_t *d_input[MAX_GPUS] = { 0 }; +static uint32_t *d_input[MAX_GPUS]; static uint32_t *d_target[MAX_GPUS]; static uint32_t *d_result[MAX_GPUS]; +typedef uint8_t BitSequence; +typedef uint32_t
DataLength; #include "cn_keccak.cuh" #include "cn_blake.cuh" #include "cn_groestl.cuh" @@ -44,13 +44,11 @@ __constant__ uint8_t d_sub_byte[16][16] = { __device__ __forceinline__ void cryptonight_aes_set_key(uint32_t * __restrict__ key, const uint32_t * __restrict__ data) { - const uint32_t aes_gf[] = { + const uint32_t aes_gf[10] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 }; - MEMSET4(key, 0, 40); MEMCPY4(key, data, 8); - #pragma unroll for(int i = 8; i < 40; i++) { @@ -74,15 +72,14 @@ void cryptonight_aes_set_key(uint32_t * __restrict__ key, const uint32_t * __res } __global__ -void cryptonight_extra_gpu_prepare(const uint32_t threads, uint32_t * __restrict__ d_input, uint32_t startNonce, - uint64_t * d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, - uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2) +void cryptonight_extra_gpu_prepare(const uint32_t threads, const uint32_t * __restrict__ d_input, uint32_t startNonce, + uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b, + uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2, int variant, uint64_t * d_ctx_tweak) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if(thread < threads) { - uint32_t ctx_state[50]; + uint64_t ctx_state[25]; uint32_t ctx_a[4]; uint32_t ctx_b[4]; uint32_t ctx_key1[40]; @@ -90,92 +87,62 @@ void cryptonight_extra_gpu_prepare(const uint32_t threads, uint32_t * __restrict uint32_t input[19]; MEMCPY4(input, d_input, 19); - *((uint32_t *)(((char *)input) + 39)) = startNonce + thread; - - cn_keccak((uint8_t *)input, (uint8_t *)ctx_state); - cryptonight_aes_set_key(ctx_key1, ctx_state); - cryptonight_aes_set_key(ctx_key2, ctx_state + 8); - XOR_BLOCKS_DST(ctx_state, ctx_state + 8, ctx_a); - XOR_BLOCKS_DST(ctx_state + 4, ctx_state + 12, ctx_b); - - MEMCPY8(&d_ctx_state[thread * 26], ctx_state, 25); - MEMCPY4(d_ctx_a + thread * 4, ctx_a, 4); - MEMCPY4(d_ctx_b + thread * 4, ctx_b, 4); - MEMCPY4(d_ctx_key1 + thread * 40, ctx_key1, 40); - MEMCPY4(d_ctx_key2 + thread * 40, ctx_key2, 40); - } -} -__global__ -void cryptonight_extra_gpu_keccak(uint32_t threads, uint32_t * d_ctx_state) -{ - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if(thread < threads) - { - uint64_t* ctx_state = (uint64_t*) (&d_ctx_state[thread * 52U]); - uint64_t state[25]; - #pragma unroll - for(int i = 0; i < 25; i++) - state[i] = ctx_state[i]; - - cn_keccakf2(state); - - // to reduce the final kernel stack frame, cut algos in 2 kernels - // ps: these 2 final kernels are not important for the overall xmr hashrate (< 1%) - switch (((uint8_t*)state)[0] & 0x03) - { - case 0: { - uint32_t hash[8]; - cn_blake((uint8_t*)state, 200, (uint8_t*)hash); - ((uint32_t*)ctx_state)[0] = 0; - ((uint32_t*)ctx_state)[6] = hash[6]; - ((uint32_t*)ctx_state)[7] = hash[7]; - break; - } - case 1: { - uint32_t hash[8]; - cn_groestl((BitSequence*)state, 200, (BitSequence*)hash); - ((uint32_t*)ctx_state)[0] = 0; - ((uint32_t*)ctx_state)[6] = hash[6]; - ((uint32_t*)ctx_state)[7] = hash[7]; - break; - } - default: { - #pragma unroll - for(int i = 0; i < 25; i++) - ctx_state[i] = state[i]; - } + uint32_t nonce = startNonce + thread; + *(((uint8_t *)input) + 39) = nonce & 0xff; + *(((uint8_t *)input) + 40) = (nonce >> 8) & 0xff; + *(((uint8_t *)input) + 41) = (nonce >> 16) & 0xff; + *(((uint8_t *)input) + 42) = (nonce >> 24) & 0xff; + + cn_keccak(input, ctx_state); + MEMCPY4(&d_ctx_state[thread * 50U], 
ctx_state, 50); + + cryptonight_aes_set_key(ctx_key1, (uint32_t*)(&ctx_state[0])); + cryptonight_aes_set_key(ctx_key2, (uint32_t*)(&ctx_state[4])); + MEMCPY4(&d_ctx_key1[thread * 40U], ctx_key1, 40); + MEMCPY4(&d_ctx_key2[thread * 40U], ctx_key2, 40); + + XOR_BLOCKS_DST(&ctx_state[0], &ctx_state[4], ctx_a); + XOR_BLOCKS_DST(&ctx_state[2], &ctx_state[6], ctx_b); + MEMCPY4(&d_ctx_a[thread * 4U], ctx_a, 4); + MEMCPY4(&d_ctx_b[thread * 4U], ctx_b, 4); + + if (variant) { + uint2 tweak = AS_UINT2(&ctx_state[24]); + //tweak.x ^= (input[8] >> 24) | (input[9] << 8); + tweak.x ^= __byte_perm(input[8], input[ 9], 0x6543); + tweak.y ^= __byte_perm(input[9], input[10], 0x6543); + MEMCPY4(&d_ctx_tweak[thread], &tweak, 2); } } } __global__ -void cryptonight_extra_gpu_final(uint32_t threads, const uint32_t startNonce, uint64_t * __restrict__ d_ctx_state, - const uint32_t* d_target, uint32_t * resNonces) +void cryptonight_extra_gpu_final(const uint32_t threads, uint32_t startNonce, const uint32_t * __restrict__ d_target, + uint32_t * __restrict__ resNonces, uint32_t * __restrict__ d_ctx_state) { - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; if(thread < threads) { - uint64_t* const state = &d_ctx_state[thread * 26U]; - + uint32_t *ctx_state = &d_ctx_state[thread * 50U]; uint32_t hash[8]; - switch(((uint8_t *)state)[0] & 0x03) - { - case 0: { - uint32_t* h32 = (uint32_t*)state; - hash[6] = h32[6]; - hash[7] = h32[7]; - break; - } - case 2: { - cn_jh256((uint8_t*)state, 200, hash); - break; - } - case 3: { - cn_skein((uint8_t*)state, 200, hash); - break; - } - } + uint32_t state[50]; + + #pragma unroll 25 + for(int i = 0; i < 50; i+=2) + AS_UINT2(&state[i]) = AS_UINT2(&ctx_state[i]); + + cn_keccakf2((uint64_t *)state); + + int branch = ((uint8_t *)state)[0] & 0x03; + if(branch == 0) + cn_blake((const uint8_t *)state, 200, hash); + if(branch == 1) + cn_groestl((const uint8_t *)state, 200, hash); + if(branch == 2) + cn_jh((const uint8_t *)state, 200, hash); + if(branch == 3) + cn_skein((const uint8_t *)state, 200, hash); if(hash[7] <= d_target[1] && hash[6] <= d_target[0]) { @@ -188,55 +155,53 @@ void cryptonight_extra_gpu_final(uint32_t threads, const uint32_t startNonce, ui } __host__ -void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *ptarget) +void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget) { uint32_t *pTargetIn = (uint32_t*) ptarget; - cudaMemcpy(d_input[thr_id], data, 19 * sizeof(uint32_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_target[thr_id], &pTargetIn[6], 2*sizeof(uint32_t), cudaMemcpyHostToDevice); + cudaMemcpy(d_input[thr_id], data, 20 * sizeof(uint32_t), cudaMemcpyHostToDevice); + cudaMemcpy(d_target[thr_id], &pTargetIn[6], 2 * sizeof(uint32_t), cudaMemcpyHostToDevice); + cudaMemset(d_result[thr_id], 0xFF, 2 * sizeof(uint32_t)); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); } __host__ -void cryptonight_extra_cpu_init(int thr_id, uint32_t threads) +void cryptonight_extra_init(int thr_id) { - cudaMalloc(&d_input[thr_id], 19 * sizeof(uint32_t)); - cudaMalloc(&d_target[thr_id], 2*sizeof(uint32_t)); - cudaMalloc(&d_result[thr_id], 2*sizeof(uint32_t)); + cudaMalloc(&d_input[thr_id], 20 * sizeof(uint32_t)); + cudaMalloc(&d_target[thr_id], 2 * sizeof(uint32_t)); + cudaMalloc(&d_result[thr_id], 2 * sizeof(uint32_t)); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); } __host__ -void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t 
startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2) +void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak) { uint32_t threadsperblock = 128; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - cryptonight_extra_gpu_prepare <<<grid, block>>> (threads, d_input[thr_id], startNonce, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1, d_ctx_key2); + cryptonight_extra_gpu_prepare <<<grid, block>>> (threads, d_input[thr_id], startNonce, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1, d_ctx_key2, variant, d_ctx_tweak); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); } __host__ -void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resnonce, uint64_t *d_ctx_state) +void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces, uint32_t *d_ctx_state) { uint32_t threadsperblock = 128; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - cudaMemset(d_result[thr_id], 0xFF, 2*sizeof(uint32_t)); - exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); - cryptonight_extra_gpu_keccak <<<grid, block>>> (threads, (uint32_t*)d_ctx_state); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); - cryptonight_extra_gpu_final <<<grid, block>>> (threads, startNonce, d_ctx_state, d_target[thr_id], d_result[thr_id]); + cryptonight_extra_gpu_final <<<grid, block>>> (threads, startNonce, d_target[thr_id], d_result[thr_id], d_ctx_state); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); - cudaMemcpy(resnonce, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaMemcpy(resNonces, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); } __host__ -void cryptonight_extra_cpu_free(int thr_id) +void cryptonight_extra_free(int thr_id) { if (d_input[thr_id]) { cudaFree(d_input[thr_id]); @@ -244,4 +209,4 @@ void cryptonight_extra_cpu_free(int thr_id) cudaFree(d_result[thr_id]); d_input[thr_id] = NULL; } -} \ No newline at end of file +} diff --git a/crypto/cryptonight.cu b/crypto/cryptonight.cu index 0214ce4..5f92972 100644 --- a/crypto/cryptonight.cu +++ b/crypto/cryptonight.cu @@ -12,16 +12,17 @@ static __thread bool gpu_init_shown = false; gpulog(p, thr, fmt, ##__VA_ARGS__) static uint64_t *d_long_state[MAX_GPUS]; -static uint64_t *d_ctx_state[MAX_GPUS]; +static uint32_t *d_ctx_state[MAX_GPUS]; static uint32_t *d_ctx_key1[MAX_GPUS]; static uint32_t *d_ctx_key2[MAX_GPUS]; static uint32_t *d_ctx_text[MAX_GPUS]; +static uint64_t *d_ctx_tweak[MAX_GPUS]; static uint32_t *d_ctx_a[MAX_GPUS]; static uint32_t *d_ctx_b[MAX_GPUS]; static bool init[MAX_GPUS] = { 0 }; -extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant) { int res = 0; uint32_t throughput = 0; @@ -49,6 +50,10 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ gpulog_init(LOG_INFO, thr_id, "%s, %d MB available, %hd SMX", device_name[dev_id], mem, device_mpcount[dev_id]); + if (!device_config[thr_id] && strcmp(device_name[dev_id], "TITAN V") == 0) { + device_config[thr_id] = strdup("80x24"); + } + if (device_config[thr_id]) { int res = sscanf(device_config[thr_id],
"%ux%u", &cn_blocks, &cn_threads); throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads); @@ -70,7 +75,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ exit(1); } - cudaSetDevice(device_map[thr_id]); + cudaSetDevice(dev_id); if (opt_cudaschedule == -1 && gpu_threads == 1) { cudaDeviceReset(); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); @@ -79,11 +84,11 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ } const size_t alloc = MEMORY * throughput; - cryptonight_extra_cpu_init(thr_id, throughput); + cryptonight_extra_init(thr_id); cudaMalloc(&d_long_state[thr_id], alloc); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); - cudaMalloc(&d_ctx_state[thr_id], 208 * throughput); // 52*4 (200 is not aligned 16) + cudaMalloc(&d_ctx_state[thr_id], 50 * sizeof(uint32_t) * throughput); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); @@ -95,6 +100,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_tweak[thr_id], sizeof(uint64_t) * throughput); + exit_if_cudaerror(thr_id, __FILE__, __LINE__); gpu_init_shown = true; init[thr_id] = true; @@ -107,10 +114,10 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ const uint32_t Htarg = ptarget[7]; uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX }; - cryptonight_extra_cpu_setData(thr_id, pdata, ptarget); - cryptonight_extra_cpu_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]); - cryptonight_core_cuda(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]); - cryptonight_extra_cpu_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]); + cryptonight_extra_setData(thr_id, pdata, ptarget); + cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]); + cryptonight_core_cuda(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]); + cryptonight_extra_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]); *hashes_done = nonce - first_nonce + throughput; @@ -121,8 +128,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39); memcpy(tempdata, pdata, 76); *tempnonceptr = resNonces[0]; - cryptonight_hash(vhash, tempdata, 76); - if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) + const int rc = cryptonight_hash_variant(vhash, tempdata, 76, variant); + if(rc && (vhash[7] <= Htarg) && fulltest(vhash, ptarget)) { res = 1; work->nonces[0] = resNonces[0]; @@ -131,8 +138,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ if(resNonces[1] != UINT32_MAX) { *tempnonceptr = resNonces[1]; - cryptonight_hash(vhash, tempdata, 76); - if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + const int rc = cryptonight_hash_variant(vhash, tempdata, 76, variant); + if(rc && (vhash[7] 
<= Htarg) && fulltest(vhash, ptarget)) { res++; work->nonces[1] = resNonces[1]; } else { @@ -174,10 +181,11 @@ void free_cryptonight(int thr_id) cudaFree(d_ctx_key1[thr_id]); cudaFree(d_ctx_key2[thr_id]); cudaFree(d_ctx_text[thr_id]); + cudaFree(d_ctx_tweak[thr_id]); cudaFree(d_ctx_a[thr_id]); cudaFree(d_ctx_b[thr_id]); - cryptonight_extra_cpu_free(thr_id); + cryptonight_extra_free(thr_id); cudaDeviceSynchronize(); diff --git a/crypto/cryptonight.h b/crypto/cryptonight.h index 4a31832..00417b9 100644 --- a/crypto/cryptonight.h +++ b/crypto/cryptonight.h @@ -20,7 +20,6 @@ struct uint3 blockDim; #define __umul64hi(a,b) a*b #endif - #define MEMORY (1U << 21) // 2 MiB / 2097152 B #define ITER (1U << 20) // 1048576 #define E2I_MASK 0x1FFFF0u @@ -136,10 +135,10 @@ static inline void exit_if_cudaerror(int thr_id, const char *src, int line) exit(1); } } -void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint64_t *d_long_state, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2); +void cryptonight_core_cuda(int thr_id, uint32_t blocks, uint32_t threads, uint64_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak); -void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn); -void cryptonight_extra_cpu_init(int thr_id, uint32_t threads); -void cryptonight_extra_cpu_free(int thr_id); -void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2); -void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *nonce, uint64_t *d_ctx_state); +void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget); +void cryptonight_extra_init(int thr_id); +void cryptonight_extra_free(int thr_id); +void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak); +void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resnonce, uint32_t *d_ctx_state); diff --git a/crypto/xmr-rpc.cpp b/crypto/xmr-rpc.cpp index 82b7845..433caa7 100644 --- a/crypto/xmr-rpc.cpp +++ b/crypto/xmr-rpc.cpp @@ -550,18 +550,24 @@ bool rpc2_stratum_submit(struct pool_infos *pool, struct work *work) } else if (opt_algo == ALGO_CRYPTOLIGHT) { + int variant = 1; uint32_t nonce = work->nonces[idnonce]; noncestr = bin2hex((unsigned char*) &nonce, 4); last_found_nonce = nonce; - cryptolight_hash(hash, data, 76); + //if (cryptonight_fork > 1 && ((unsigned char*)work->data)[0] >= cryptonight_fork) + // variant = ((unsigned char*)work->data)[0] - cryptonight_fork + 1; + cryptolight_hash_variant(hash, data, 76, variant); work_set_target_ratio(work, (uint32_t*) hash); } else if (opt_algo == ALGO_CRYPTONIGHT) { + int variant = 0; uint32_t nonce = work->nonces[idnonce]; noncestr = bin2hex((unsigned char*) &nonce, 4); last_found_nonce = nonce; - cryptonight_hash(hash, data, 76); + if (cryptonight_fork > 1 && ((unsigned char*)work->data)[0] >= cryptonight_fork) + variant = ((unsigned char*)work->data)[0] - cryptonight_fork + 1; + cryptonight_hash_variant(hash, data, 76, variant); work_set_target_ratio(work, (uint32_t*) hash); } diff --git a/miner.h b/miner.h index 2853906..86088cb 100644 --- 
a/miner.h +++ b/miner.h @@ -279,8 +279,8 @@ extern int scanhash_blake256(int thr_id, struct work* work, uint32_t max_nonce, extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); -extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); -extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant); +extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant); extern int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_equihash(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); @@ -575,6 +575,8 @@ extern uint32_t device_plimit[MAX_GPUS]; extern uint32_t gpus_intensity[MAX_GPUS]; extern int opt_cudaschedule; +extern int cryptonight_fork; + // cuda.cpp int cuda_num_devices(); void cuda_devicenames(); @@ -898,8 +900,12 @@ void blake2b_hash(void *output, const void *input); void blake2s_hash(void *output, const void *input); void bmw_hash(void *state, const void *input); void c11hash(void *output, const void *input); -void cryptolight_hash(void* output, const void* input, int len); -void cryptonight_hash(void* output, const void* input, size_t len); +int cryptolight_hash_variant(void* output, const void* input, int len, int variant); +void cryptolight_hash(void* output, const void* input); +int cryptonight_hash_variant(void* output, const void* input, size_t len, int variant); +void cryptonight_hash(void* output, const void* input); +void monero_hash(void* output, const void* input); +void stellite_hash(void* output, const void* input); void decred_hash(void *state, const void *input); void deephash(void *state, const void *input); void luffa_hash(void *state, const void *input); diff --git a/util.cpp b/util.cpp index 49cd854..9c2194d 100644 --- a/util.cpp +++ b/util.cpp @@ -2193,10 +2193,10 @@ void print_hash_tests(void) c11hash(&hash[0], &buf[0]); printpfx("c11", hash); - cryptolight_hash(&hash[0], &buf[0], 76); + cryptolight_hash(&hash[0], &buf[0]); printpfx("cryptolight", hash); - cryptonight_hash(&hash[0], &buf[0], 76); + cryptonight_hash(&hash[0], &buf[0]); printpfx("cryptonight", hash); memset(buf, 0, 180); @@ -2246,6 +2246,9 @@ void print_hash_tests(void) lyra2Z_hash(&hash[0], &buf[0]); printpfx("lyra2z", hash); + monero_hash(&hash[0], &buf[0]); + printpfx("monero", hash); + myriadhash(&hash[0], &buf[0]); printpfx("myriad", hash); @@ -2297,6 +2300,9 @@ void print_hash_tests(void) skunk_hash(&hash[0], &buf[0]); printpfx("skunk", hash); + stellite_hash(&hash[0], &buf[0]); + printpfx("stellite", hash); + s3hash(&hash[0], &buf[0]); printpfx("S3", hash); From 6dc1bbdd47bbd4b85850ab43540c71875ef95e92 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 23 Jun 2018 13:39:38 +0200 Subject: [PATCH 14/24] prepare the new release --- README.txt | 13 ++++++++++--- compat/ccminer-config.h | 2 +- configure.ac | 2 +- res/ccminer.rc | 8 ++++---- 4 files
changed, 16 insertions(+), 9 deletions(-) diff --git a/README.txt b/README.txt index 0862870..148f089 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,5 @@ -ccminer 2.2.6 "phi2 and allium" +ccminer 2.3 "phi2 and cryptonight variants" --------------------------------------------------------------- *************************************************************** @@ -80,8 +80,8 @@ its command line interface and options. blakecoin use to mine Old Blake 256 blake2s use to mine Nevacoin (Blake2-S 256) bmw use to mine Midnight - cryptolight use to mine AEON cryptonight (MEM/2) - cryptonight use to mine XMR cryptonight, Bytecoin, Dash, DigitalNote, etc + cryptolight use to mine AEON cryptonight variant 1 (MEM/2) + cryptonight use to mine original cryptonight c11/flax use to mine Chaincoin and Flax decred use to mine Decred 180 bytes Blake256-14 deep use to mine Deepcoin @@ -99,10 +99,12 @@ its command line interface and options. lyra2 use to mine CryptoCoin lyra2v2 use to mine Vertcoin lyra2z use to mine Zerocoin (XZC) + monero use to mine Monero (XMR) myr-gr use to mine Myriad-Groestl neoscrypt use to mine FeatherCoin, Trezarcoin, Orbitcoin, etc nist5 use to mine TalkCoin penta use to mine Joincoin / Pentablake + phi1612 use to mine Seraph phi2 use to mine LUXCoin polytimos use to mine Polytimos quark use to mine Quarkcoin @@ -117,6 +119,7 @@ its command line interface and options. skein use to mine Skeincoin skein2 use to mine Woodcoin skunk use to mine Signatum + stellite use to mine Stellite (a cryptonight variant) timetravel use to mine MachineCoin tribus use to mine Denarius x11evo use to mine Revolver @@ -282,6 +285,10 @@ so we can more efficiently implement new algorithms using the latest hardware features. >>> RELEASE HISTORY <<< + June 23rd 2018 v2.3 + Handle phi2 header variation for smart contracts + Handle monero, stellite, graft and cryptolight variants + June 10th 2018 v2.2.6 New phi2 algo for LUX New allium algo for Garlic diff --git a/compat/ccminer-config.h b/compat/ccminer-config.h index 5b36078..030e89f 100644 --- a/compat/ccminer-config.h +++ b/compat/ccminer-config.h @@ -164,7 +164,7 @@ #define PACKAGE_URL "http://github.com/tpruvot/ccminer" /* Define to the version of this package.
*/ -#define PACKAGE_VERSION "2.2.6" +#define PACKAGE_VERSION "2.3" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be diff --git a/configure.ac b/configure.ac index 5489e9c..9030e7e 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2.2.7], [], [ccminer], [http://github.com/tpruvot/ccminer]) +AC_INIT([ccminer], [2.3], [], [ccminer], [http://github.com/tpruvot/ccminer]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/res/ccminer.rc b/res/ccminer.rc index 78be94c..18eb1d2 100644 --- a/res/ccminer.rc +++ b/res/ccminer.rc @@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico" // VS_VERSION_INFO VERSIONINFO - FILEVERSION 2,2,6,0 - PRODUCTVERSION 2,2,6,0 + FILEVERSION 2,3,0,0 + PRODUCTVERSION 2,3,0,0 FILEFLAGSMASK 0x3fL #ifdef _DEBUG FILEFLAGS 0x21L @@ -76,10 +76,10 @@ BEGIN BEGIN BLOCK "040904e4" BEGIN - VALUE "FileVersion", "2.2.6" + VALUE "FileVersion", "2.3" VALUE "LegalCopyright", "Copyright (C) 2018" VALUE "ProductName", "ccminer" - VALUE "ProductVersion", "2.2.6" + VALUE "ProductVersion", "2.3" END END BLOCK "VarFileInfo" From d9f242b8d1a1ef46e584f69666450fbc4431db15 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 23 Jun 2018 14:40:29 +0200 Subject: [PATCH 15/24] add sonoa algo, heavy x17 hashes seem to work, more or less correctly (a few validation errors) --- Makefile.am | 2 +- README.txt | 10 +- algos.h | 2 + bench.cpp | 2 + ccminer.cpp | 7 +- ccminer.vcxproj | 3 +- ccminer.vcxproj.filters | 3 + miner.h | 2 + x17/sonoa.cu | 632 ++++++++++++++++++++++++++++++++++++++++ 9 files changed, 657 insertions(+), 6 deletions(-) create mode 100644 x17/sonoa.cu diff --git a/Makefile.am b/Makefile.am index 80a80c8..ddfbec6 100644 --- a/Makefile.am +++ b/Makefile.am @@ -80,7 +80,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ x16/x16r.cu x16/x16s.cu x16/cuda_x16_echo512.cu x16/cuda_x16_fugue512.cu \ x16/cuda_x16_shabal512.cu x16/cuda_x16_simd512_80.cu \ x16/cuda_x16_echo512_64.cu \ - x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \ + x17/x17.cu x17/hmq17.cu x17/sonoa.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \ phi/phi.cu phi/phi2.cu phi/cuda_phi2.cu phi/cuda_phi2_cubehash512.cu x11/cuda_streebog_maxwell.cu \ x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu diff --git a/README.txt b/README.txt index 148f089..cb60fca 100644 --- a/README.txt +++ b/README.txt @@ -41,19 +41,21 @@ Keccak (Maxcoin) Pentablake (Blake 512 x5) 1Coin Triple S Neoscrypt (FeatherCoin) -Revolver (X11evo) +x11evo (Revolver) +phi2 (LUXCoin) Scrypt and Scrypt:N Scrypt-Jane (Chacha) -Sibcoin (sib) +sib (Sibcoin) Skein (Skein + SHA) Signatum (Skein cubehash fugue Streebog) +SonoA (Sono) Tribus (JH, keccak, simd) Woodcoin (Double Skein) Vanilla (Blake256 8-rounds - double sha256) Vertcoin Lyra2RE Ziftrcoin (ZR5) Boolberry (Wild Keccak) -Monero (Cryptonight) +Monero (Cryptonight v7 with -a monero) Aeon (Cryptonight-lite) where some of these coins have a VERY NOTABLE nVidia advantage @@ -119,6 +121,7 @@ its command line interface and options. skein use to mine Skeincoin skein2 use to mine Woodcoin skunk use to mine Signatum + sonoa use to mine Sono stellite use to mine Stellite (a cryptonight variant) timetravel use to mine MachineCoin tribus use to mine Denarius x11evo use to mine Revolver @@ -288,6 +291,7 @@ features.
June 23rd 2018 v2.3 Handle phi2 header variation for smart contracts Handle monero, stellite, graft and cryptolight variants + Handle SonoA algo June 10th 2018 v2.2.6 New phi2 algo for LUX diff --git a/algos.h b/algos.h index c484bcc..dfbf7d8 100644 --- a/algos.h +++ b/algos.h @@ -52,6 +52,7 @@ enum sha_algos { ALGO_SKEIN, ALGO_SKEIN2, ALGO_SKUNK, + ALGO_SONOA, ALGO_S3, ALGO_TIMETRAVEL, ALGO_TRIBUS, @@ -129,6 +130,7 @@ static const char *algo_names[] = { "skein", "skein2", "skunk", + "sonoa", "s3", "timetravel", "tribus", diff --git a/bench.cpp b/bench.cpp index 84f9bc5..894fd8a 100644 --- a/bench.cpp +++ b/bench.cpp @@ -82,6 +82,7 @@ void algo_free_all(int thr_id) free_nist5(thr_id); free_pentablake(thr_id); free_phi(thr_id); + free_phi2(thr_id); free_polytimos(thr_id); free_quark(thr_id); free_qubit(thr_id); @@ -92,6 +93,7 @@ free_sha256t(thr_id); free_sia(thr_id); free_sib(thr_id); + free_sonoa(thr_id); free_s3(thr_id); free_vanilla(thr_id); free_veltor(thr_id); diff --git a/ccminer.cpp b/ccminer.cpp index 6521284..c2b34f8 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -274,7 +274,7 @@ Options:\n\ neoscrypt FeatherCoin, Phoenix, UFO...\n\ nist5 NIST5 (TalkCoin)\n\ penta Pentablake hash (5x Blake 512)\n\ - phi LUX initial algo\n\ + phi1612 LUX initial algo, for Seraph\n\ phi2 LUX v2 with lyra2\n\ polytimos Politimos\n\ quark Quark\n\ @@ -288,6 +288,7 @@ Options:\n\ skein Skein SHA2 (Skeincoin)\n\ skein2 Double Skein (Woodcoin)\n\ skunk Skein Cube Fugue Streebog\n\ + sonoa 97 hashes based on X17 ones (Sono)\n\ stellite Cryptonight v3\n\ s3 S3 (1Coin)\n\ timetravel Machinecoin permuted x8\n\ @@ -2299,6 +2300,7 @@ static void *miner_thread(void *userdata) case ALGO_NEOSCRYPT: case ALGO_SIB: case ALGO_SCRYPT: + case ALGO_SONOA: case ALGO_VELTOR: minmax = 0x80000; break; @@ -2508,6 +2510,9 @@ static void *miner_thread(void *userdata) case ALGO_SIB: rc = scanhash_sib(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_SONOA: + rc = scanhash_sonoa(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_S3: rc = scanhash_s3(thr_id, &work, max_nonce, &hashes_done); break; diff --git a/ccminer.vcxproj b/ccminer.vcxproj index c0aa954..f3d3e28 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -591,7 +591,6 @@ - @@ -604,6 +603,8 @@ compute_50,sm_50;compute_52,sm_52 + + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 667331a..a1b9e86 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -778,6 +778,9 @@ Source Files\CUDA\x17 + + Source Files\CUDA\x17 + Source Files\CUDA\x17 diff --git a/miner.h b/miner.h index 86088cb..368b3cb 100644 --- a/miner.h +++ b/miner.h @@ -315,6 +315,7 @@ extern int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_nonce, extern int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_skunk(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_sonoa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_timetravel(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_tribus(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_bitcore(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); @@ -384,6 +385,7 @@
extern void free_skeincoin(int thr_id); extern void free_skein2(int thr_id); extern void free_skunk(int thr_id); extern void free_s3(int thr_id); +extern void free_sonoa(int thr_id); extern void free_timetravel(int thr_id); extern void free_tribus(int thr_id); extern void free_bitcore(int thr_id); diff --git a/x17/sonoa.cu b/x17/sonoa.cu new file mode 100644 index 0000000..153f787 --- /dev/null +++ b/x17/sonoa.cu @@ -0,0 +1,632 @@ +/** + * x97 SONO + **/ + +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_echo.h" +#include "sph/sph_hamsi.h" +#include "sph/sph_fugue.h" +#include "sph/sph_shabal.h" +#include "sph/sph_whirlpool.h" +#include "sph/sph_sha2.h" +#include "sph/sph_haval.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include "x11/cuda_x11.h" + +#define NBN 2 + +static uint32_t *d_hash[MAX_GPUS]; + +extern void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); + +extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); +extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_free(int thr_id); + +extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads); +extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int flag); +extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x15_whirlpool_cpu_free(int thr_id); + +extern void x17_sha512_cpu_init(int thr_id, uint32_t threads); +extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); + +extern void x17_haval256_cpu_init(int thr_id, uint32_t threads); +extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const int outlen); + +// CPU Hash Validation +extern "C" void sonoa_hash(void *output, const void *input) +{ + unsigned char _ALIGN(128) hash[64]; + + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + sph_luffa512_context ctx_luffa; + sph_cubehash512_context ctx_cubehash; + sph_shavite512_context ctx_shavite; + sph_simd512_context ctx_simd; + sph_echo512_context ctx_echo; + sph_hamsi512_context ctx_hamsi; + sph_fugue512_context ctx_fugue; + sph_shabal512_context ctx_shabal; + sph_whirlpool_context ctx_whirlpool; + sph_sha512_context ctx_sha512; + sph_haval256_5_context ctx_haval; + + + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, input, 80); + sph_blake512_close(&ctx_blake, (void*)hash); + + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, (const void*)hash, 64); + sph_bmw512_close(&ctx_bmw, (void*)hash); + + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, (const 
void*)hash, 64); + sph_groestl512_close(&ctx_groestl, (void*)hash); + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, (const void*)hash, 64); + sph_keccak512_close(&ctx_keccak, (void*)hash); + + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, (const void*)hash, 64); + sph_luffa512_close(&ctx_luffa, (void*)hash); + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_shavite512_init(&ctx_shavite); + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + sph_simd512_init(&ctx_simd); + sph_simd512(&ctx_simd, (const void*)hash, 64); + sph_simd512_close(&ctx_simd, (void*)hash); + + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + + sph_bmw512(&ctx_bmw, (const void*)hash, 64); + sph_bmw512_close(&ctx_bmw, (void*)hash); + + sph_groestl512(&ctx_groestl, (const void*)hash, 64); + sph_groestl512_close(&ctx_groestl, (void*)hash); + + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + sph_keccak512(&ctx_keccak, (const void*)hash, 64); + sph_keccak512_close(&ctx_keccak, (void*)hash); + + sph_luffa512(&ctx_luffa, (const void*)hash, 64); + sph_luffa512_close(&ctx_luffa, (void*)hash); + + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + sph_simd512(&ctx_simd, (const void*)hash, 64); + sph_simd512_close(&ctx_simd, (void*)hash); + + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_hamsi512_init(&ctx_hamsi); + sph_hamsi512(&ctx_hamsi, (const void*)hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*)hash); + + + sph_bmw512(&ctx_bmw, (const void*)hash, 64); + sph_bmw512_close(&ctx_bmw, (void*)hash); + + sph_groestl512(&ctx_groestl, (const void*)hash, 64); + sph_groestl512_close(&ctx_groestl, (void*)hash); + + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + sph_keccak512(&ctx_keccak, (const void*)hash, 64); + sph_keccak512_close(&ctx_keccak, (void*)hash); + + sph_luffa512(&ctx_luffa, (const void*)hash, 64); + sph_luffa512_close(&ctx_luffa, (void*)hash); + + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + sph_simd512(&ctx_simd, (const void*)hash, 64); + sph_simd512_close(&ctx_simd, (void*)hash); + + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_hamsi512(&ctx_hamsi, (const void*)hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*)hash); + + sph_fugue512_init(&ctx_fugue); + sph_fugue512(&ctx_fugue, (const void*)hash, 64); + sph_fugue512_close(&ctx_fugue, (void*)hash); + + + sph_bmw512(&ctx_bmw, (const 
void*)hash, 64); + sph_bmw512_close(&ctx_bmw, (void*)hash); + + sph_groestl512(&ctx_groestl, (const void*)hash, 64); + sph_groestl512_close(&ctx_groestl, (void*)hash); + + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + sph_keccak512(&ctx_keccak, (const void*)hash, 64); + sph_keccak512_close(&ctx_keccak, (void*)hash); + + sph_luffa512(&ctx_luffa, (const void*)hash, 64); + sph_luffa512_close(&ctx_luffa, (void*)hash); + + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + sph_simd512(&ctx_simd, (const void*)hash, 64); + sph_simd512_close(&ctx_simd, (void*)hash); + + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_hamsi512(&ctx_hamsi, (const void*)hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*)hash); + + sph_fugue512(&ctx_fugue, (const void*)hash, 64); + sph_fugue512_close(&ctx_fugue, (void*)hash); + + sph_shabal512_init(&ctx_shabal); + sph_shabal512(&ctx_shabal, (const void*)hash, 64); + sph_shabal512_close(&ctx_shabal, (void*)hash); + + sph_hamsi512(&ctx_hamsi, (const void*)hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*)hash); + + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + + sph_bmw512(&ctx_bmw, (const void*)hash, 64); + sph_bmw512_close(&ctx_bmw, (void*)hash); + + sph_shabal512(&ctx_shabal, (const void*)hash, 64); + sph_shabal512_close(&ctx_shabal, (void*)hash); + + sph_groestl512(&ctx_groestl, (const void*)hash, 64); + sph_groestl512_close(&ctx_groestl, (void*)hash); + + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + sph_keccak512(&ctx_keccak, (const void*)hash, 64); + sph_keccak512_close(&ctx_keccak, (void*)hash); + + sph_luffa512(&ctx_luffa, (const void*)hash, 64); + sph_luffa512_close(&ctx_luffa, (void*)hash); + + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + sph_simd512(&ctx_simd, (const void*)hash, 64); + sph_simd512_close(&ctx_simd, (void*)hash); + + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_hamsi512(&ctx_hamsi, (const void*)hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*)hash); + + sph_fugue512(&ctx_fugue, (const void*)hash, 64); + sph_fugue512_close(&ctx_fugue, (void*)hash); + + sph_shabal512(&ctx_shabal, (const void*)hash, 64); + sph_shabal512_close(&ctx_shabal, (void*)hash); + + sph_whirlpool_init(&ctx_whirlpool); + sph_whirlpool(&ctx_whirlpool, (const void*)hash, 64); + sph_whirlpool_close(&ctx_whirlpool, (void*)hash); + + + sph_bmw512(&ctx_bmw, (const void*)hash, 64); + sph_bmw512_close(&ctx_bmw, (void*)hash); + + sph_groestl512(&ctx_groestl, (const void*)hash, 64); + sph_groestl512_close(&ctx_groestl, (void*)hash); + + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, 
(void*)hash); + + sph_keccak512(&ctx_keccak, (const void*)hash, 64); + sph_keccak512_close(&ctx_keccak, (void*)hash); + + sph_luffa512(&ctx_luffa, (const void*)hash, 64); + sph_luffa512_close(&ctx_luffa, (void*)hash); + + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + sph_simd512(&ctx_simd, (const void*)hash, 64); + sph_simd512_close(&ctx_simd, (void*)hash); + + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_hamsi512(&ctx_hamsi, (const void*)hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*)hash); + + sph_fugue512(&ctx_fugue, (const void*)hash, 64); + sph_fugue512_close(&ctx_fugue, (void*)hash); + + sph_shabal512(&ctx_shabal, (const void*)hash, 64); + sph_shabal512_close(&ctx_shabal, (void*)hash); + + sph_whirlpool(&ctx_whirlpool, (const void*)hash, 64); + sph_whirlpool_close(&ctx_whirlpool, (void*)hash); + + sph_sha512_init(&ctx_sha512); + sph_sha512(&ctx_sha512, (const void*)hash, 64); + sph_sha512_close(&ctx_sha512, (void*)hash); + + sph_whirlpool(&ctx_whirlpool, (const void*)hash, 64); + sph_whirlpool_close(&ctx_whirlpool, (void*)hash); + + + sph_bmw512(&ctx_bmw, (const void*)hash, 64); + sph_bmw512_close(&ctx_bmw, (void*)hash); + + sph_groestl512(&ctx_groestl, (const void*)hash, 64); + sph_groestl512_close(&ctx_groestl, (void*)hash); + + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + sph_keccak512(&ctx_keccak, (const void*)hash, 64); + sph_keccak512_close(&ctx_keccak, (void*)hash); + + sph_luffa512(&ctx_luffa, (const void*)hash, 64); + sph_luffa512_close(&ctx_luffa, (void*)hash); + + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_shavite512(&ctx_shavite, (const void*)hash, 64); + sph_shavite512_close(&ctx_shavite, (void*)hash); + + sph_simd512(&ctx_simd, (const void*)hash, 64); + sph_simd512_close(&ctx_simd, (void*)hash); + + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_hamsi512(&ctx_hamsi, (const void*)hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*)hash); + + sph_fugue512(&ctx_fugue, (const void*)hash, 64); + sph_fugue512_close(&ctx_fugue, (void*)hash); + + sph_shabal512(&ctx_shabal, (const void*)hash, 64); + sph_shabal512_close(&ctx_shabal, (void*)hash); + + sph_whirlpool(&ctx_whirlpool, (const void*)hash, 64); + sph_whirlpool_close(&ctx_whirlpool, (void*)hash); + + sph_sha512(&ctx_sha512, (const void*)hash, 64); + sph_sha512_close(&ctx_sha512, (void*)hash); + + sph_haval256_5_init(&ctx_haval); + sph_haval256_5(&ctx_haval, (const void*)hash, 64); + sph_haval256_5_close(&ctx_haval, (void*)hash); + + memcpy(output, hash, 32); +} + +#define x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash) \ + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash, order++); \ + if (use_compat_kernels[thr_id]) x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash, order++); \ + else x16_echo512_cpu_hash_64(thr_id, throughput, d_hash) + + +static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_sonoa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t 
*ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const int dev_id = device_map[thr_id]; + + uint32_t default_throughput = 1 << 18; + if (device_sm[dev_id] <= 500) default_throughput = 1 << 18; + else if (device_sm[dev_id] <= 520) default_throughput = 1 << 18; + else if (device_sm[dev_id] > 520) default_throughput = (1 << 19) + (1 << 18); + + uint32_t throughput = cuda_default_throughput(thr_id, default_throughput); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + throughput &= 0xFFFFFF00; + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x00ff; + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + } + gpulog(LOG_INFO,thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); + + quark_blake512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + x11_luffaCubehash512_cpu_init(thr_id, throughput); + x11_shavite512_cpu_init(thr_id, throughput); + x11_simd512_cpu_init(thr_id, throughput); + x13_hamsi512_cpu_init(thr_id, throughput); + x13_fugue512_cpu_init(thr_id, throughput); + x14_shabal512_cpu_init(thr_id, throughput); + x15_whirlpool_cpu_init(thr_id, throughput, 0); + x17_sha512_cpu_init(thr_id, throughput); + x17_haval256_cpu_init(thr_id, throughput); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 8 * sizeof(uint64_t) * throughput)); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + int warn = 0; + uint32_t _ALIGN(64) endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + cuda_check_cpu_setTarget(ptarget); + + do { + int order = 0; + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, 
d_hash[thr_id], order++); + x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + 
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], 256); order++; + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + sonoa_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) + { + work->valid_nonces = 1; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + sonoa_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!warn) { + warn++; + pdata[19] = work->nonces[0] + 1; + continue; + } else { + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + warn = 0; + } + } + } + + if ((uint64_t)throughput 
+ pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +extern "C" void free_sonoa(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaDeviceSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + x13_fugue512_cpu_free(thr_id); + x15_whirlpool_cpu_free(thr_id); + + cudaDeviceSynchronize(); + init[thr_id] = false; +}
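[Reviewer note, not part of the patches] Counting the sph calls in the sonoa_hash() CPU validator above, the chain makes seven passes over the x17 primitives with 11, 11, 12, 16, 15, 16 and 16 stages respectively, which is where the 97 of the "x97 SONO" header comment comes from. A trivial self-check of that count:

/* sonoa pass lengths, counted from the CPU validation code above */
#include <stdio.h>

int main(void)
{
	const int pass_len[7] = { 11, 11, 12, 16, 15, 16, 16 };
	int total = 0;
	for (int i = 0; i < 7; i++)
		total += pass_len[i];
	printf("sonoa stages: %d\n", total); /* prints 97 */
	return 0;
}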
From 654e8a10ec3d5924099a68f0d7ef3d928f126b8f Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 24 Jun 2018 11:54:49 +0200 Subject: [PATCH 16/24] fix g++ 7.3 warnings (ubuntu 18.04) --- api.cpp | 2 +- scrypt.cpp | 14 ++++++++++++-- scrypt/test_kernel.cu | 5 ++--- scrypt/titan_kernel.cu | 4 ++-- sia/sia-rpc.cpp | 8 ++++---- util.cpp | 4 ++-- 6 files changed, 23 insertions(+), 14 deletions(-) diff --git a/api.cpp b/api.cpp index 9014f3a..6edfd31 100644 --- a/api.cpp +++ b/api.cpp @@ -257,7 +257,7 @@ static char *getpoolnfo(char *params) static void gpuhwinfos(int gpu_id) { - char buf[256]; + char buf[512]; char pstate[8]; char* card; struct cgpu_info *cgpu = NULL; diff --git a/scrypt.cpp b/scrypt.cpp index a6b9b70..68e81e4 100644 --- a/scrypt.cpp +++ b/scrypt.cpp @@ -50,7 +50,17 @@ using namespace Concurrency; #if _MSC_VER > 1800 #undef _THROW1 +#if __cplusplus < 201103L #define _THROW1(x) throw(std::bad_alloc) +#else +#define _THROW1(x) noexcept(false) +#endif +#elif !defined(_MSC_VER) +#if __cplusplus < 201103L +#define _THROW1(x) throw(std::bad_alloc) +#else +#define _THROW1(x) noexcept(false) +#endif #endif // A thin wrapper around the builtin __m128i type @@ -63,9 +73,9 @@ public: void * operator new[](size_t size) _THROW1(_STD bad_alloc) { void *p; if ((p = _aligned_malloc(size, 16)) == 0) { static const std::bad_alloc nomem; _RAISE(nomem); } return (p); } void operator delete[](void *p) { _aligned_free(p); } #else - void * operator new(size_t size) throw(std::bad_alloc) { void *p; if (posix_memalign(&p, 16, size) < 0) { static const std::bad_alloc nomem; throw nomem; } return (p); } + void * operator new(size_t size) _THROW1(_STD bad_alloc) { void *p; if (posix_memalign(&p, 16, size) < 0) { static const std::bad_alloc nomem; throw nomem; } return (p); } void operator delete(void *p) { free(p); } - void * operator new[](size_t size) throw(std::bad_alloc) { void *p; if (posix_memalign(&p, 16, size) < 0) { static const std::bad_alloc nomem; throw nomem; } return (p); } + void * operator new[](size_t size) _THROW1(_STD bad_alloc) { void *p; if (posix_memalign(&p, 16, size) < 0) { static const std::bad_alloc nomem; throw nomem; } return (p); } void operator delete[](void *p) { free(p); } #endif uint32x4_t() { }; diff --git a/scrypt/test_kernel.cu b/scrypt/test_kernel.cu index e4467d1..ab5b03c 100644 --- a/scrypt/test_kernel.cu +++ b/scrypt/test_kernel.cu @@ -47,7 +47,7 @@ texture<uint4, 2, cudaReadModeElementType> texRef2D_4_V; template <int ALGO> __device__ __forceinline__ void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3); -static __host__ __device__ uint4& operator^=(uint4& left, const uint4& right) { +static __device__ uint4& operator^=(uint4& left, const uint4& right) { left.x ^= right.x; left.y ^= right.y; left.z ^= right.z; @@ -55,7 +55,7 @@ static __host__ __device__ uint4& operator^=(uint4& left, const uint4& right) { return left; } -static __host__ __device__ uint4& operator+=(uint4& left, const uint4& right) { +static __device__ uint4& operator+=(uint4& left, const uint4& right) { left.x += right.x; left.y += right.y; left.z += right.z; @@ -63,7 +63,6 @@ static __host__ __device__ uint4& operator+=(uint4& left, const uint4& right) { return left; } - /* write_keys writes the 8 keys being processed by a warp to the global * scratchpad. To effectively use memory bandwidth, it performs the writes * (and reads, for read_keys) 128 bytes at a time per memory location diff --git a/scrypt/titan_kernel.cu b/scrypt/titan_kernel.cu index 1758722..57672a2 100644 --- a/scrypt/titan_kernel.cu +++ b/scrypt/titan_kernel.cu @@ -50,7 +50,7 @@ __constant__ uint32_t c_SCRATCH_WU_PER_WARP_1; // (SCRATCH * WU_PER_WARP)-1 template <int ALGO> __device__ __forceinline__ void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3); -static __host__ __device__ uint4& operator ^= (uint4& left, const uint4& right) { +static __device__ uint4& operator ^= (uint4& left, const uint4& right) { left.x ^= right.x; left.y ^= right.y; left.z ^= right.z; @@ -58,7 +58,7 @@ static __host__ __device__ uint4& operator ^= (uint4& left, const uint4& right) return left; } -static __host__ __device__ uint4& operator += (uint4& left, const uint4& right) { +static __device__ uint4& operator += (uint4& left, const uint4& right) { left.x += right.x; left.y += right.y; left.z += right.z; diff --git a/sia/sia-rpc.cpp b/sia/sia-rpc.cpp index 5eafe9e..4770426 100644 --- a/sia/sia-rpc.cpp +++ b/sia/sia-rpc.cpp @@ -74,10 +74,10 @@ char* sia_getheader(CURL *curl, struct pool_infos *pool) struct data_buffer all_data = { 0 }; struct curl_slist *headers = NULL; char data[256] = { 0 }; - char url[512]; + char url[512*3]; // nanopool - snprintf(url, 512, "%s/miner/header?address=%s&worker=%s", //&longpoll + snprintf(url, sizeof(url), "%s/miner/header?address=%s&worker=%s", //&longpoll pool->url, pool->user, pool->pass); if (opt_protocol) @@ -148,7 +148,7 @@ bool sia_submit(CURL *curl, struct pool_infos *pool, struct work *work) struct data_buffer all_data = { 0 }; struct curl_slist *headers = NULL; char buf[256] = { 0 }; - char url[512]; + char url[512*3]; if (opt_protocol) applog_hex(work->data, 80); @@ -156,7 +156,7 @@ bool sia_submit(CURL *curl, struct pool_infos *pool, struct work *work) //applog_hex(&work->data[10], 4); // nanopool - snprintf(url, 512, "%s/miner/header?address=%s&worker=%s", + snprintf(url, sizeof(url), "%s/miner/header?address=%s&worker=%s", pool->url, pool->user, pool->pass); if (opt_protocol) diff --git a/util.cpp b/util.cpp index 9c2194d..66617af 100644 --- a/util.cpp +++ b/util.cpp @@ -616,7 +616,7 @@ err_out: json_t *json_rpc_call_pool(CURL *curl, struct pool_infos *pool, const char *req, bool longpoll_scan, bool longpoll, int *curl_err) { - char userpass[512]; + char userpass[768]; // todo, malloc and store that in pool array snprintf(userpass, sizeof(userpass), "%s%c%s", pool->user, strlen(pool->pass)?':':'\0', pool->pass); @@ -627,7 +627,7 @@ json_t *json_rpc_call_pool(CURL *curl, struct pool_infos *pool, const char *req, /* called only from longpoll thread, we have the lp_url */ json_t *json_rpc_longpoll(CURL *curl, char *lp_url, struct pool_infos *pool, const char *req, int *curl_err) { - char userpass[512]; + char userpass[768]; snprintf(userpass, sizeof(userpass), "%s%c%s", pool->user, strlen(pool->pass)?':':'\0', pool->pass); From 370684f7435d1256cbabef4410a57ed5bc705fdc Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 24 Jun 2018 12:25:42 +0200 Subject: [PATCH 17/24] cryptonight: some code finishing touches ---
crypto/cryptolight-cpu.cpp | 42 +++++++++++--------------- crypto/cryptonight-cpu.cpp | 60 +++++++++++++++++--------------------- crypto/cryptonight.cu | 8 ++--- miner.h | 4 +-- 4 files changed, 50 insertions(+), 64 deletions(-) diff --git a/crypto/cryptolight-cpu.cpp b/crypto/cryptolight-cpu.cpp index f995b4c..14cd3af 100644 --- a/crypto/cryptolight-cpu.cpp +++ b/crypto/cryptolight-cpu.cpp @@ -22,16 +22,6 @@ struct cryptonight_ctx { oaes_ctx* aes_ctx; }; - -static void cryptolight_store_variant(void* state, int variant) { - if (variant == 1) { - // use variant 1 like monero since june 2018 - const uint8_t tmp = ((const uint8_t*)(state))[11]; - const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30); - } -} - static void do_blake_hash(const void* input, int len, void* output) { uchar hash[32]; @@ -145,7 +135,6 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, ui static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak) { uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1]; hi += ((uint64_t*) c)[0]; - ((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi; ((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo; ((uint64_t*) dst)[0] = hi; @@ -167,11 +156,18 @@ static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) { ((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1]; } -static int cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx, const int variant) +static void cryptolight_store_variant(void* state, int variant) { + if (variant == 1) { + // use variant 1 like monero since june 2018 + const uint8_t tmp = ((const uint8_t*)(state))[11]; + const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30); + } +} + +static void cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx, const int variant) { size_t i, j; - if (variant && len < 43) - return 0; keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len); ctx->aes_ctx = (oaes_ctx*) oaes_alloc(); @@ -181,8 +177,8 @@ static int cryptolight_hash_ctx(void* output, const void* input, const int len, oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE); for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) { -#undef RND -#define RND(p) aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * p], ctx->aes_ctx->key->exp_data); + #undef RND + #define RND(p) aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * p], ctx->aes_ctx->key->exp_data); RND(0); RND(1); RND(2); @@ -202,23 +198,21 @@ static int cryptolight_hash_ctx(void* output, const void* input, const int len, aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a); xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]); cryptolight_store_variant(&ctx->long_state[j], variant); - mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)], variant, tweak); j = e2i(ctx->a); aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a); xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]); cryptolight_store_variant(&ctx->long_state[j], variant); - mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)], variant, tweak); } memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); oaes_key_import_data(ctx->aes_ctx, &ctx->state.hs.b[32], AES_KEY_SIZE); for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) { -#undef RND -#define RND(p) 
xor_blocks(&ctx->text[p * AES_BLOCK_SIZE], &ctx->long_state[i + p * AES_BLOCK_SIZE]); \ - aesb_pseudo_round_mut(&ctx->text[p * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data); + #undef RND + #define RND(p) xor_blocks(&ctx->text[p * AES_BLOCK_SIZE], &ctx->long_state[i + p * AES_BLOCK_SIZE]); \ + aesb_pseudo_round_mut(&ctx->text[p * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data); RND(0); RND(1); RND(2); @@ -236,15 +230,13 @@ static int cryptolight_hash_ctx(void* output, const void* input, const int len, if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo); oaes_free((OAES_CTX **) &ctx->aes_ctx); - return 1; } -int cryptolight_hash_variant(void* output, const void* input, int len, int variant) +void cryptolight_hash_variant(void* output, const void* input, int len, int variant) { struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx)); - int rc = cryptolight_hash_ctx(output, input, len, ctx, variant); + cryptolight_hash_ctx(output, input, len, ctx, variant); free(ctx); - return rc; } void cryptolight_hash(void* output, const void* input) diff --git a/crypto/cryptonight-cpu.cpp b/crypto/cryptonight-cpu.cpp index b60798f..582d096 100644 --- a/crypto/cryptonight-cpu.cpp +++ b/crypto/cryptonight-cpu.cpp @@ -12,20 +12,6 @@ extern "C" { #include "cpu/c_keccak.h" } -static void cryptonight_store_variant(void* state, int variant) { - if (variant == 1 || cryptonight_fork == 8) { - // monero, and graft ? - const uint8_t tmp = ((const uint8_t*)(state))[11]; - const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30); - } else if (variant == 2 && cryptonight_fork == 3) { - // stellite - const uint8_t tmp = ((const uint8_t*)(state))[11]; - const uint8_t index = (((tmp >> 4) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(state))[11] = tmp ^ ((0x75312 >> index) & 0x30); - } -} - struct cryptonight_ctx { uint8_t long_state[MEMORY]; union cn_slow_hash_state state; @@ -144,14 +130,14 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, ui ((uint64_t*) dst)[0] += ((uint64_t*) c)[0]; } -static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak1_2) { +static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak) { uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1]; hi += ((uint64_t*) c)[0]; ((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi; ((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo; ((uint64_t*) dst)[0] = hi; - ((uint64_t*) dst)[1] = variant ? lo ^ tweak1_2 : lo; + ((uint64_t*) dst)[1] = variant ? 
lo ^ tweak : lo; } static void copy_block(uint8_t* dst, const uint8_t* src) { @@ -169,22 +155,34 @@ static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) { ((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1]; } -static int cryptonight_hash_ctx(void* output, const void* input, const size_t len, struct cryptonight_ctx* ctx, const int variant) +static void cryptonight_store_variant(void* state, int variant) { + if (variant == 1 || cryptonight_fork == 8) { + // monero and graft + const uint8_t tmp = ((const uint8_t*)(state))[11]; + const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30); + } else if (variant == 2 && cryptonight_fork == 3) { + // stellite + const uint8_t tmp = ((const uint8_t*)(state))[11]; + const uint8_t index = (((tmp >> 4) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(state))[11] = tmp ^ ((0x75312 >> index) & 0x30); + } +} + +static void cryptonight_hash_ctx(void* output, const void* input, const size_t len, struct cryptonight_ctx* ctx, const int variant) { size_t i, j; - if (variant && len < 43) - return 0; keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len); ctx->aes_ctx = (oaes_ctx*) oaes_alloc(); memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); - const uint64_t tweak1_2 = variant ? *((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0; + const uint64_t tweak = variant ? *((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0; oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE); for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) { -#undef RND -#define RND(p) aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * p], ctx->aes_ctx->key->exp_data); + #undef RND + #define RND(p) aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * p], ctx->aes_ctx->key->exp_data); RND(0); RND(1); RND(2); @@ -204,23 +202,21 @@ static int cryptonight_hash_ctx(void* output, const void* input, const size_t le aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a); xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]); cryptonight_store_variant(&ctx->long_state[j], variant); - - mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE], variant, tweak1_2); + mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE], variant, tweak); j = e2i(ctx->a) * AES_BLOCK_SIZE; aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a); xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]); cryptonight_store_variant(&ctx->long_state[j], variant); - - mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE], variant, tweak1_2); + mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE], variant, tweak); } memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); oaes_key_import_data(ctx->aes_ctx, &ctx->state.hs.b[32], AES_KEY_SIZE); for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) { -#undef RND -#define RND(p) xor_blocks(&ctx->text[p * AES_BLOCK_SIZE], &ctx->long_state[i + p * AES_BLOCK_SIZE]); \ - aesb_pseudo_round_mut(&ctx->text[p * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data); + #undef RND + #define RND(p) xor_blocks(&ctx->text[p * AES_BLOCK_SIZE], &ctx->long_state[i + p * AES_BLOCK_SIZE]); \ + aesb_pseudo_round_mut(&ctx->text[p * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data); RND(0); RND(1); RND(2); @@ -238,15 +234,13 @@ static int cryptonight_hash_ctx(void* output, const void* input, const size_t le if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo); oaes_free((OAES_CTX **) 
&ctx->aes_ctx); - return 1; } -int cryptonight_hash_variant(void* output, const void* input, size_t len, int variant) +void cryptonight_hash_variant(void* output, const void* input, size_t len, int variant) { struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx)); - int rc = cryptonight_hash_ctx(output, input, len, ctx, variant); + cryptonight_hash_ctx(output, input, len, ctx, variant); free(ctx); - return rc; } void cryptonight_hash(void* output, const void* input) diff --git a/crypto/cryptonight.cu b/crypto/cryptonight.cu index 5f92972..52d0e97 100644 --- a/crypto/cryptonight.cu +++ b/crypto/cryptonight.cu @@ -128,8 +128,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39); memcpy(tempdata, pdata, 76); *tempnonceptr = resNonces[0]; - const int rc = cryptonight_hash_variant(vhash, tempdata, 76, variant); - if(rc && (vhash[7] <= Htarg) && fulltest(vhash, ptarget)) + cryptonight_hash_variant(vhash, tempdata, 76, variant); + if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) { res = 1; work->nonces[0] = resNonces[0]; @@ -138,8 +138,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ if(resNonces[1] != UINT32_MAX) { *tempnonceptr = resNonces[1]; - const int rc = cryptonight_hash_variant(vhash, tempdata, 76, variant); - if(rc && (vhash[7] <= Htarg) && fulltest(vhash, ptarget)) { + cryptonight_hash_variant(vhash, tempdata, 76, variant); + if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) { res++; work->nonces[1] = resNonces[1]; } else { diff --git a/miner.h b/miner.h index 368b3cb..f866cd9 100644 --- a/miner.h +++ b/miner.h @@ -902,9 +902,9 @@ void blake2b_hash(void *output, const void *input); void blake2s_hash(void *output, const void *input); void bmw_hash(void *state, const void *input); void c11hash(void *output, const void *input); -int cryptolight_hash_variant(void* output, const void* input, int len, int variant); +void cryptolight_hash_variant(void* output, const void* input, int len, int variant); void cryptolight_hash(void* output, const void* input); -int cryptonight_hash_variant(void* output, const void* input, size_t len, int variant); +void cryptonight_hash_variant(void* output, const void* input, size_t len, int variant); void cryptonight_hash(void* output, const void* input); void monero_hash(void* output, const void* input); void stellite_hash(void* output, const void* input); From 4a76ca5cb6e1f555621effec3880465124f2e386 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Fri, 3 Aug 2018 20:01:14 +0200 Subject: [PATCH 18/24] bench: handle cryptonight variants + V100 fix --- bench.cpp | 16 ++++++++++++++++ ccminer.cpp | 3 +++ crypto/cryptonight.cu | 9 ++++++--- equi/equihash.cpp | 3 +-- 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/bench.cpp b/bench.cpp index 894fd8a..e573a08 100644 --- a/bench.cpp +++ b/bench.cpp @@ -156,6 +156,22 @@ bool bench_algo_switch_next(int thr_id) if (algo == ALGO_SCRYPT) algo++; if (algo == ALGO_SCRYPT_JANE) algo++; + // Set cryptonight variant + switch (algo) { + case ALGO_MONERO: + cryptonight_fork = 7; + break; + case ALGO_GRAFT: + cryptonight_fork = 8; + break; + case ALGO_STELLITE: + cryptonight_fork = 3; + break; + case ALGO_CRYPTONIGHT: + cryptonight_fork = 1; + break; + } + // free current algo memory and track mem usage mused = cuda_available_memory(thr_id); algo_free_all(thr_id); diff --git a/ccminer.cpp b/ccminer.cpp index c2b34f8..f4c1039 100644 --- 
a/ccminer.cpp +++ b/ccminer.cpp @@ -2384,6 +2384,9 @@ static void *miner_thread(void *userdata) case ALGO_CRYPTOLIGHT: rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done, 1); break; + case ALGO_MONERO: + case ALGO_STELLITE: + case ALGO_GRAFT: case ALGO_CRYPTONIGHT: { int cn_variant = 0; diff --git a/crypto/cryptonight.cu b/crypto/cryptonight.cu index 52d0e97..2c3a6cd 100644 --- a/crypto/cryptonight.cu +++ b/crypto/cryptonight.cu @@ -50,8 +50,11 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ gpulog_init(LOG_INFO, thr_id, "%s, %d MB available, %hd SMX", device_name[dev_id], mem, device_mpcount[dev_id]); - if (!device_config[thr_id] && strcmp(device_name[dev_id], "TITAN V") == 0) { - device_config[thr_id] = strdup("80x24"); + if (!device_config[thr_id]) { + if(strcmp(device_name[dev_id], "TITAN V") == 0) + device_config[thr_id] = strdup("80x24"); + if(strstr(device_name[dev_id], "V100")) + device_config[thr_id] = strdup("80x24"); } if (device_config[thr_id]) { @@ -83,7 +86,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ CUDA_LOG_ERROR(); } - const size_t alloc = MEMORY * throughput; + const size_t alloc = MEMORY * size_t(throughput); cryptonight_extra_init(thr_id); cudaMalloc(&d_long_state[thr_id], alloc); diff --git a/equi/equihash.cpp b/equi/equihash.cpp index c9ac1fc..3209546 100644 --- a/equi/equihash.cpp +++ b/equi/equihash.cpp @@ -183,8 +183,7 @@ extern "C" int scanhash_equihash(int thr_id, struct work *work, uint32_t max_non return -1; } size_t memSz = solvers[thr_id]->equi_mem_sz / (1024*1024); - gpus_intensity[thr_id] = (uint32_t) solvers[thr_id]->throughput; - api_set_throughput(thr_id, gpus_intensity[thr_id]); + api_set_throughput(thr_id, (uint32_t) solvers[thr_id]->throughput); gpulog(LOG_DEBUG, thr_id, "Allocated %u MB of context memory", (u32) memSz); cuda_get_arch(thr_id); init[thr_id] = true;
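[Reviewer note, not part of the patches] The size_t cast introduced above ("MEMORY * size_t(throughput)") matters because the old product was computed in 32 bits. Assuming the usual 2 MiB cryptonight scratchpad (MEMORY = 1 << 21; the define lives outside this diff, so this value is an assumption), a throughput of 2048 wraps the 32-bit product to zero while the widened one is the intended 4 GiB:

/* why the size_t cast: 32-bit wraparound demo (MEMORY value assumed) */
#include <stdint.h>
#include <stdio.h>

#define MEMORY (1 << 21) /* 2 MiB scratchpad, assumed from cryptonight */

int main(void)
{
	uint32_t throughput = 2048;
	uint32_t wrapped = MEMORY * throughput;       /* 2^32 wraps to 0 */
	size_t widened = MEMORY * (size_t)throughput; /* 4294967296 on 64-bit */
	printf("wrapped=%u widened=%zu\n", wrapped, widened);
	return 0;
}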
From 1f5efa7d3622f7c9efdf0d67e00200483a42e891 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Thu, 20 Sep 2018 20:13:31 +0200 Subject: [PATCH 19/24] makefile: add new cuda arch and remove sm5.0 by default --- Makefile.am | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile.am b/Makefile.am index ddfbec6..1d13556 100644 --- a/Makefile.am +++ b/Makefile.am @@ -116,9 +116,11 @@ endif ccminer_LDADD += -lcuda nvcc_ARCH := -#nvcc_ARCH += -gencode=arch=compute_61,code=\"sm_61,compute_61\" +#nvcc_ARCH += -gencode=arch=compute_75,code=\"sm_75,compute_75\" # CUDA 10 req. +#nvcc_ARCH += -gencode=arch=compute_70,code=\"sm_70,compute_70\" # CUDA 9.1 +#nvcc_ARCH += -gencode=arch=compute_61,code=\"sm_61,compute_61\" # CUDA 8 nvcc_ARCH += -gencode=arch=compute_52,code=\"sm_52,compute_52\" -nvcc_ARCH += -gencode=arch=compute_50,code=\"sm_50,compute_50\" +#nvcc_ARCH += -gencode=arch=compute_50,code=\"sm_50,compute_50\" #nvcc_ARCH += -gencode=arch=compute_35,code=\"sm_35,compute_35\" #nvcc_ARCH += -gencode=arch=compute_30,code=\"sm_30,compute_30\" From b36d174554591ca3572529846abd1d84df4cb41f Mon Sep 17 00:00:00 2001 From: opensourcerulez Date: Mon, 22 Oct 2018 22:11:33 +0300 Subject: [PATCH 20/24] Add exosis algo (#69) --- Makefile.am | 2 +- README.txt | 1 + algos.h | 2 + bench.cpp | 1 + ccminer.cpp | 6 + ccminer.vcxproj | 1 + ccminer.vcxproj.filters | 3 + miner.h | 3 + util.cpp | 3 + x11/exosis.cu | 497 ++++++++++++++++++++++++++++++++++++++++ 10 files changed, 518 insertions(+), 1 deletion(-) create mode 100644 x11/exosis.cu diff --git a/Makefile.am b/Makefile.am index 1d13556..d34ac78 100644 --- a/Makefile.am +++ b/Makefile.am @@ -71,7 +71,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \ tribus/tribus.cu tribus/cuda_echo512_final.cu \ x11/x11.cu x12/x12.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \ - x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \ + x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu x11/exosis.cu \ x11/cuda_x11_luffa512_Cubehash.cu x11/x11evo.cu x11/timetravel.cu x11/bitcore.cu \ x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \ x13/hsr.cu x13/cuda_hsr_sm3.cu x13/sm3.c \ diff --git a/README.txt b/README.txt index cb60fca..285b04d 100644 --- a/README.txt +++ b/README.txt @@ -89,6 +89,7 @@ its command line interface and options.
deep use to mine Deepcoin dmd-gr use to mine Diamond-Groestl equihash use to mine ZEC, HUSH and KMD + exosis use to mine EXO fresh use to mine Freshcoin fugue256 use to mine Fuguecoin groestl use to mine Groestlcoin diff --git a/algos.h b/algos.h index dfbf7d8..2d2da2d 100644 --- a/algos.h +++ b/algos.h @@ -18,6 +18,7 @@ enum sha_algos { ALGO_DECRED, ALGO_DMD_GR, ALGO_EQUIHASH, + ALGO_EXOSIS, ALGO_FRESH, ALGO_FUGUE256, /* Fugue256 */ ALGO_GROESTL, @@ -96,6 +97,7 @@ static const char *algo_names[] = { "decred", "dmd-gr", "equihash", + "exosis", "fresh", "fugue256", "groestl", diff --git a/bench.cpp b/bench.cpp index e573a08..be53bbc 100644 --- a/bench.cpp +++ b/bench.cpp @@ -61,6 +61,7 @@ void algo_free_all(int thr_id) free_decred(thr_id); free_deep(thr_id); free_equihash(thr_id); + free_exosis(thr_id); free_keccak256(thr_id); free_fresh(thr_id); free_fugue256(thr_id); diff --git a/ccminer.cpp b/ccminer.cpp index f4c1039..bf5399c 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -252,6 +252,7 @@ Options:\n\ decred Decred Blake256\n\ deep Deepcoin\n\ equihash Zcash Equihash\n\ + exosis Exosis timetravel\n\ dmd-gr Diamond-Groestl\n\ fresh Freshcoin (shavite 80)\n\ fugue256 Fuguecoin\n\ @@ -1742,6 +1743,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) case ALGO_PHI2: case ALGO_TIMETRAVEL: case ALGO_BITCORE: + case ALGO_EXOSIS: case ALGO_X16R: case ALGO_X16S: work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty)); @@ -2283,6 +2285,7 @@ static void *miner_thread(void *userdata) case ALGO_SKUNK: case ALGO_TIMETRAVEL: case ALGO_BITCORE: + case ALGO_EXOSIS: case ALGO_X11EVO: case ALGO_X11: case ALGO_X12: @@ -2544,6 +2547,9 @@ static void *miner_thread(void *userdata) case ALGO_BITCORE: rc = scanhash_bitcore(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_EXOSIS: + rc = scanhash_exosis(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_X11EVO: rc = scanhash_x11evo(thr_id, &work, max_nonce, &hashes_done); break; diff --git a/ccminer.vcxproj b/ccminer.vcxproj index f3d3e28..01a598f 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -576,6 +576,7 @@ + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index a1b9e86..88252ec 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -841,6 +841,9 @@ Source Files\CUDA\x11 + + Source Files\CUDA\x11 + Source Files\CUDA\x11 diff --git a/miner.h b/miner.h index f866cd9..cbc766b 100644 --- a/miner.h +++ b/miner.h @@ -319,6 +319,7 @@ extern int scanhash_sonoa(int thr_id, struct work* work, uint32_t max_nonce, uns extern int scanhash_timetravel(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_tribus(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_bitcore(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_exosis(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_vanilla(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int8_t blake_rounds); extern int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); @@ -354,6 +355,7 @@ extern void free_cryptonight(int thr_id); extern void free_decred(int thr_id); extern void free_deep(int thr_id); extern void free_equihash(int thr_id); +extern void free_exosis(int 
thr_id);
 extern void free_keccak256(int thr_id);
 extern void free_fresh(int thr_id);
 extern void free_fugue256(int thr_id);
@@ -944,6 +946,7 @@ void skunk_hash(void *state, const void *input);
 void s3hash(void *output, const void *input);
 void timetravel_hash(void *output, const void *input);
 void bitcore_hash(void *output, const void *input);
+void exosis_hash(void *output, const void *input);
 void tribus_hash(void *output, const void *input);
 void veltorhash(void *output, const void *input);
 void wcoinhash(void *state, const void *input);
diff --git a/util.cpp b/util.cpp
index 66617af..7a67ea6 100644
--- a/util.cpp
+++ b/util.cpp
@@ -2311,6 +2311,9 @@ void print_hash_tests(void)
 	bitcore_hash(&hash[0], &buf[0]);
 	printpfx("bitcore", hash);
+
+	exosis_hash(&hash[0], &buf[0]);
+	printpfx("exosis", hash);
 
 	blake256hash(&hash[0], &buf[0], 8);
 	printpfx("vanilla", hash);
diff --git a/x11/exosis.cu b/x11/exosis.cu
new file mode 100644
index 0000000..e4dcfe5
--- /dev/null
+++ b/x11/exosis.cu
@@ -0,0 +1,497 @@
+/**
+ * Timetravel (exosis) CUDA implementation
+ * by tpruvot@github, exosis
+ */
+
+#include <stdio.h>
+#include <memory.h>
+#include <unistd.h>
+
+#define HASH_FUNC_BASE_TIMESTAMP 1538556426U
+#define HASH_FUNC_COUNT 8
+#define HASH_FUNC_COUNT_PERMUTATIONS 40320U
+
+extern "C" {
+#include "sph/sph_blake.h"
+#include "sph/sph_bmw.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_skein.h"
+#include "sph/sph_jh.h"
+#include "sph/sph_keccak.h"
+#include "sph/sph_luffa.h"
+#include "sph/sph_cubehash.h"
+}
+
+#include "miner.h"
+#include "cuda_helper.h"
+#include "cuda_x11.h"
+
+static uint32_t *d_hash[MAX_GPUS];
+
+enum Algo {
+	BLAKE = 0,
+	BMW,
+	GROESTL,
+	SKEIN,
+	JH,
+	KECCAK,
+	LUFFA,
+	CUBEHASH,
+	MAX_ALGOS_COUNT
+};
+
+static const char* algo_strings[] = {
+	"blake",
+	"bmw512",
+	"groestl",
+	"skein",
+	"jh512",
+	"keccak",
+	"luffa",
+	"cube",
+	NULL
+};
+
+inline void swap8(uint8_t *a, uint8_t *b)
+{
+	uint8_t t = *a;
+	*a = *b;
+	*b = t;
+}
+
+inline void initPerm(uint8_t n[], int count)
+{
+	for (int i = 0; i < count; i++)
+		n[i] = i;
+}
+
+static int nextPerm(uint8_t n[], int count)
+{
+	int tail, i, j;
+
+	if (count <= 1)
+		return 0;
+
+	for (i = count - 1; i>0 && n[i - 1] >= n[i]; i--);
+	tail = i;
+
+	if (tail > 0) {
+		for (j = count - 1; j>tail && n[j] <= n[tail - 1]; j--);
+		swap8(&n[tail - 1], &n[j]);
+	}
+
+	for (i = tail, j = count - 1; i < j; i++, j--)
+		swap8(&n[i], &n[j]);
+
+	return (tail != 0);
+}
+
+static void getAlgoString(char *str, uint32_t count)
+{
+	uint8_t algoList[HASH_FUNC_COUNT];
+	char *sptr;
+
+	initPerm(algoList, HASH_FUNC_COUNT);
+
+	for (uint32_t k = 0; k < count; k++)
+		nextPerm(algoList, HASH_FUNC_COUNT);
+
+	sptr = str;
+	for (uint8_t j = 0; j < HASH_FUNC_COUNT; j++) {
+		if (algoList[j] >= 10)
+			sprintf(sptr, "%c", 'A' + (algoList[j] - 10));
+		else
+			sprintf(sptr, "%u", (uint32_t) algoList[j]);
+		sptr++;
+	}
+	*sptr = '\0';
+}
+
+static __thread uint32_t s_ntime = 0;
+static uint32_t s_sequence = UINT32_MAX;
+static uint8_t s_firstalgo = 0xFF;
+static char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
+
+#define INITIAL_DATE HASH_FUNC_BASE_TIMESTAMP
+static inline uint32_t getCurrentAlgoSeq(uint32_t ntime)
+{
+	// unlike x11evo, the permutation changes often (with ntime)
+	return (uint32_t) (ntime - INITIAL_DATE) % HASH_FUNC_COUNT_PERMUTATIONS;
+}
+
+// To finish...
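+// Worked example (illustrative, not from the upstream file): with
+// HASH_FUNC_COUNT = 8 there are 8! = 40320 orderings, so the schedule wraps
+// roughly every 11.2 hours of ntime. On the host side:
+//   char order[HASH_FUNC_COUNT + 1];
+//   getAlgoString(order, getCurrentAlgoSeq(1538646426U));
+//   // seq = (1538646426 - 1538556426) % 40320 = 9360: nextPerm() is applied
+//   // 9360 times to the identity order "01234567"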
+static void get_travel_order(uint32_t ntime, char *permstr) +{ + uint32_t seq = getCurrentAlgoSeq(ntime); + if (s_sequence != seq) { + getAlgoString(permstr, seq); + s_sequence = seq; + } +} + +// CPU Hash +extern "C" void exosis_hash(void *output, const void *input) +{ + uint32_t _ALIGN(64) hash[64/4] = { 0 }; + + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_skein512_context ctx_skein; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_luffa512_context ctx_luffa1; + sph_cubehash512_context ctx_cubehash1; + + if (s_sequence == UINT32_MAX) { + uint32_t *data = (uint32_t*) input; + const uint32_t ntime = (opt_benchmark || !data[17]) ? (uint32_t) time(NULL) : data[17]; + get_travel_order(ntime, hashOrder); + } + + void *in = (void*) input; + int size = 80; + + const int hashes = (int) strlen(hashOrder); + + for (int i = 0; i < hashes; i++) + { + const char elem = hashOrder[i]; + uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + + switch (algo) { + case BLAKE: + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, in, size); + sph_blake512_close(&ctx_blake, hash); + break; + case BMW: + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, in, size); + sph_bmw512_close(&ctx_bmw, hash); + break; + case GROESTL: + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, in, size); + sph_groestl512_close(&ctx_groestl, hash); + break; + case SKEIN: + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, in, size); + sph_skein512_close(&ctx_skein, hash); + break; + case JH: + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, in, size); + sph_jh512_close(&ctx_jh, hash); + break; + case KECCAK: + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, in, size); + sph_keccak512_close(&ctx_keccak, hash); + break; + case LUFFA: + sph_luffa512_init(&ctx_luffa1); + sph_luffa512(&ctx_luffa1, in, size); + sph_luffa512_close(&ctx_luffa1, hash); + break; + case CUBEHASH: + sph_cubehash512_init(&ctx_cubehash1); + sph_cubehash512(&ctx_cubehash1, in, size); + sph_cubehash512_close(&ctx_cubehash1, hash); + break; + } + + in = (void*) hash; + size = 64; + } + + memcpy(output, hash, 32); +} + +static uint32_t get_next_time(uint32_t ntime, char* curOrder) +{ + char nextOrder[HASH_FUNC_COUNT + 1] = { 0 }; + uint32_t secs = 15; + do { + uint32_t nseq = getCurrentAlgoSeq(ntime+secs); + getAlgoString(nextOrder, nseq); + secs += 15; + } while (curOrder[0] == nextOrder[0]); + return secs; +} + +//#define _DEBUG +#define _DEBUG_PREFIX "tt-" +#include "cuda_debug.cuh" + +void quark_bmw512_cpu_setBlock_80(void *pdata); +void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); + +void groestl512_setBlock_80(int thr_id, uint32_t *endiandata); +void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); + +void skein512_cpu_setBlock_80(void *pdata); +void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap); + +void qubit_luffa512_cpu_init(int thr_id, uint32_t threads); +void qubit_luffa512_cpu_setBlock_80(void *pdata); +void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); + +void jh512_setBlock_80(int thr_id, uint32_t *endiandata); +void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); + +void keccak512_setBlock_80(int thr_id, uint32_t 
*endiandata); +void keccak512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); + +void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata); +void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); + +void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order); + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_exosis(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 20 : 19; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8; + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + // if (opt_benchmark) pdata[17] = swab32(0x5886a4be); // TO DEBUG GROESTL 80 + + if (opt_debug || s_ntime != pdata[17] || s_sequence == UINT32_MAX) { + uint32_t ntime = swab32(work->data[17]); + get_travel_order(ntime, hashOrder); + s_ntime = pdata[17]; + if (opt_debug && !thr_id) { + applog(LOG_DEBUG, "exosis hash order %s (%08x)", hashOrder, ntime); + } + } + + if (opt_benchmark) + ptarget[7] = 0x5; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + quark_blake512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + qubit_luffa512_cpu_init(thr_id, throughput); // only constants (480 bytes) + x11_luffa512_cpu_init(thr_id, throughput); + x11_cubehash512_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), -1); + CUDA_CALL_OR_RET_X(cudaMemset(d_hash[thr_id], 0, (size_t) 64 * throughput), -1); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + cuda_check_cpu_setTarget(ptarget); + + const int hashes = (int) strlen(hashOrder); + const char first = hashOrder[0]; + const uint8_t algo80 = first >= 'A' ? 
first - 'A' + 10 : first - '0'; + if (algo80 != s_firstalgo) { + s_firstalgo = algo80; + applog(LOG_INFO, "Exosis first algo is now %s", algo_strings[algo80 % HASH_FUNC_COUNT]); + } + + switch (algo80) { + case BLAKE: + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + break; + case BMW: + quark_bmw512_cpu_setBlock_80(endiandata); + break; + case GROESTL: + groestl512_setBlock_80(thr_id, endiandata); + break; + case SKEIN: + skein512_cpu_setBlock_80((void*)endiandata); + break; + case JH: + jh512_setBlock_80(thr_id, endiandata); + break; + case KECCAK: + keccak512_setBlock_80(thr_id, endiandata); + break; + case LUFFA: + qubit_luffa512_cpu_setBlock_80((void*)endiandata); + break; + case CUBEHASH: + cubehash512_setBlock_80(thr_id, endiandata); + break; + default: { + uint32_t next = get_next_time(swab32(s_ntime), hashOrder); + if (!thr_id) + applog(LOG_WARNING, "kernel %c unimplemented, next in %u mn", first, next/60); + sleep(next > 30 ? 60 : 10); + return -1; + } + } + + do { + int order = 0; + + // Hash with CUDA + + switch (algo80) { + case BLAKE: + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("blake80:"); + break; + case BMW: + quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("bmw80 :"); + break; + case GROESTL: + groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("grstl80:"); + break; + case SKEIN: + skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++; + TRACE("skein80:"); + break; + case JH: + jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("jh51280:"); + break; + case KECCAK: + keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("kecck80:"); + break; + case LUFFA: + qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("luffa80:"); + break; + case CUBEHASH: + cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("cube 80:"); + break; + } + + for (int i = 1; i < hashes; i++) + { + const char elem = hashOrder[i]; + const uint8_t algo64 = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + + switch (algo64) { + case BLAKE: + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("blake :"); + break; + case BMW: + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("bmw :"); + break; + case GROESTL: + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("groestl:"); + break; + case SKEIN: + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("skein :"); + break; + case JH: + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("jh512 :"); + break; + case KECCAK: + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("keccak :"); + break; + case LUFFA: + x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("luffa :"); + break; + case CUBEHASH: + x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("cube :"); + break; + } + } + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash[8]; + const uint32_t Htarg = ptarget[7]; + be32enc(&endiandata[19], work->nonces[0]); + exosis_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + pdata[19] = work->nonces[0]; + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + exosis_hash(vhash, endiandata); + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + } + pdata[19] = max(pdata[19], work->nonces[1]) + 1; + } + return work->valid_nonces; + } else if (vhash[7] > Htarg) { + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_exosis(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} From 01e632cf05d51fdc898838976297339594db6769 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 25 Nov 2018 04:00:26 +0100 Subject: [PATCH 21/24] handle standard blake2b stratum algo no weird protocol or reversed endian like sia... 
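
A minimal sketch of the difference, reusing the helpers renamed in this
commit (illustrative only, names as declared below): standard blake2b can
use the usual tail-word test, while sia keeps checking the byte-swapped
head word against its reversed target.

    uint32_t hash[8];
    blake2b_hash(hash, endiandata);      // standard: 80-byte header in, 32-byte hash out
    bool ok = hash[7] <= ptarget[7] && fulltest(hash, ptarget);

    sia_blake2b_hash(hash, inputdata);   // sia variant
    bool sia_ok = swab32(hash[0]) <= ptarget[7];  // sia target is reversed (start of hash)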
---
 Makefile.am             |   1 +
 algos.h                 |   2 +
 bench.cpp               |   1 +
 blake2b.cu              | 273 ++++++++++++++++++++++++++++++++++++++++
 ccminer.cpp             |   5 +
 ccminer.vcxproj         |   1 +
 ccminer.vcxproj.filters |   3 +
 miner.h                 |   3 +
 sia/sia.cu              |  18 +-
 util.cpp                |   5 +-
 10 files changed, 302 insertions(+), 10 deletions(-)
 create mode 100644 blake2b.cu

diff --git a/Makefile.am b/Makefile.am
index d34ac78..6a15836 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -46,6 +46,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
 	Algo256/blake256.cu Algo256/decred.cu Algo256/vanilla.cu Algo256/keccak256.cu \
 	Algo256/blake2s.cu sph/blake2s.c \
 	Algo256/bmw.cu Algo256/cuda_bmw.cu \
+	blake2b.cu \
 	crypto/xmr-rpc.cpp crypto/wildkeccak-cpu.cpp crypto/wildkeccak.cu \
 	crypto/cryptolight.cu crypto/cryptolight-core.cu crypto/cryptolight-cpu.cpp \
 	crypto/cryptonight.cu crypto/cryptonight-core.cu crypto/cryptonight-extra.cu \
diff --git a/algos.h b/algos.h
index 2d2da2d..b084eeb 100644
--- a/algos.h
+++ b/algos.h
@@ -7,6 +7,7 @@ enum sha_algos {
 	ALGO_BLAKECOIN = 0,
 	ALGO_BLAKE,
+	ALGO_BLAKE2B,
 	ALGO_BLAKE2S,
 	ALGO_ALLIUM,
 	ALGO_BMW,
@@ -86,6 +87,7 @@ extern volatile enum sha_algos opt_algo;
 static const char *algo_names[] = {
 	"blakecoin",
 	"blake",
+	"blake2b",
 	"blake2s",
 	"allium",
 	"bmw",
diff --git a/bench.cpp b/bench.cpp
index be53bbc..e2c26be 100644
--- a/bench.cpp
+++ b/bench.cpp
@@ -53,6 +53,7 @@ void algo_free_all(int thr_id)
 	free_bastion(thr_id);
 	free_bitcore(thr_id);
 	free_blake256(thr_id);
+	free_blake2b(thr_id);
 	free_blake2s(thr_id);
 	free_bmw(thr_id);
 	free_c11(thr_id);
diff --git a/blake2b.cu b/blake2b.cu
new file mode 100644
index 0000000..2be74f8
--- /dev/null
+++ b/blake2b.cu
@@ -0,0 +1,273 @@
+/**
+ * Blake2-B CUDA Implementation
+ *
+ * tpruvot@github July 2016
+ *
+ */
+
+#include <stdio.h>
+
+#include <string.h>
+#include <stdint.h>
+
+#include <sph/blake2b.h>
+
+#include <miner.h>
+#include <cuda_helper.h>
+
+#define TPB 512
+#define NBN 2
+
+static uint32_t *d_resNonces[MAX_GPUS];
+
+__device__ uint64_t d_data[10];
+
+static __constant__ const int8_t blake2b_sigma[12][16] = {
+	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
+	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 } ,
+	{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 } ,
+	{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 } ,
+	{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 } ,
+	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 } ,
+	{ 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 } ,
+	{ 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 } ,
+	{  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 } ,
+	{ 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 } ,
+	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
+	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 }
+};
+
+// host mem align
+#define A 64
+
+extern "C" void blake2b_hash(void *output, const void *input)
+{
+	uint8_t _ALIGN(A) hash[32];
+	blake2b_ctx ctx;
+
+	blake2b_init(&ctx, 32, NULL, 0);
+	blake2b_update(&ctx, input, 80);
+	blake2b_final(&ctx, hash);
+
+	memcpy(output, hash, 32);
+}
+
+// ----------------------------------------------------------------
+
+__device__ __forceinline__
+static void G(const int r, const int i, uint64_t &a, uint64_t &b, uint64_t &c, uint64_t &d, uint64_t const m[16])
+{
+	a = a + b + m[ blake2b_sigma[r][2*i] ];
+	((uint2*)&d)[0] = SWAPUINT2( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
+	c = c + d;
+	((uint2*)&b)[0] = ROR24( ((uint2*)&b)[0] ^ ((uint2*)&c)[0] );
+	a = a + b + m[ blake2b_sigma[r][2*i+1] ];
+	((uint2*)&d)[0] = ROR16( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
+	c = c + d;
+	((uint2*)&b)[0] =
ROR2( ((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U); +} + +#define ROUND(r) \ + G(r, 0, v[0], v[4], v[ 8], v[12], m); \ + G(r, 1, v[1], v[5], v[ 9], v[13], m); \ + G(r, 2, v[2], v[6], v[10], v[14], m); \ + G(r, 3, v[3], v[7], v[11], v[15], m); \ + G(r, 4, v[0], v[5], v[10], v[15], m); \ + G(r, 5, v[1], v[6], v[11], v[12], m); \ + G(r, 6, v[2], v[7], v[ 8], v[13], m); \ + G(r, 7, v[3], v[4], v[ 9], v[14], m); + +__global__ +//__launch_bounds__(128, 8) /* to force 64 regs */ +void blake2b_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint2 target2) +{ + const uint32_t nonce = (blockDim.x * blockIdx.x + threadIdx.x) + startNonce; + + uint64_t m[16]; + + m[0] = d_data[0]; + m[1] = d_data[1]; + m[2] = d_data[2]; + m[3] = d_data[3]; + m[4] = d_data[4]; + m[5] = d_data[5]; + m[6] = d_data[6]; + m[7] = d_data[7]; + m[8] = d_data[8]; + ((uint32_t*)m)[18] = AS_U32(&d_data[9]); + ((uint32_t*)m)[19] = nonce; + + m[10] = m[11] = 0; + m[12] = m[13] = 0; + m[14] = m[15] = 0; + + uint64_t v[16] = { + 0x6a09e667f2bdc928, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade68281, 0x9b05688c2b3e6c1f, 0xe07c265404be4294, 0x5be0cd19137e2179 + }; + + ROUND( 0); + ROUND( 1); + ROUND( 2); + ROUND( 3); + ROUND( 4); + ROUND( 5); + ROUND( 6); + ROUND( 7); + ROUND( 8); + ROUND( 9); + ROUND(10); + ROUND(11); + + uint2 last = vectorize(v[3] ^ v[11] ^ 0xa54ff53a5f1d36f1); + if (last.y <= target2.y && last.x <= target2.x) { + resNonce[1] = resNonce[0]; + resNonce[0] = nonce; + } +} + +__host__ +uint32_t blake2b_hash_cuda(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2, uint32_t &secNonce) +{ + uint32_t resNonces[NBN] = { UINT32_MAX, UINT32_MAX }; + uint32_t result = UINT32_MAX; + + dim3 grid((threads + TPB-1)/TPB); + dim3 block(TPB); + + /* Check error on Ctrl+C or kill to prevent segfaults on exit */ + if (cudaMemset(d_resNonces[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess) + return result; + + blake2b_gpu_hash <<>> (threads, startNonce, d_resNonces[thr_id], target2); + cudaThreadSynchronize(); + + if (cudaSuccess == cudaMemcpy(resNonces, d_resNonces[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { + result = resNonces[0]; + secNonce = resNonces[1]; + if (secNonce == result) secNonce = UINT32_MAX; + } + return result; +} + +__host__ +void blake2b_setBlock(uint32_t *data) +{ + CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_data, data, 80, 0, cudaMemcpyHostToDevice)); +} + +static bool init[MAX_GPUS] = { 0 }; + +int scanhash_blake2b(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(A) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + + const uint32_t first_nonce = pdata[19]; + + int dev_id = device_map[thr_id]; + int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 
28 : 25; + if (device_sm[dev_id] >= 520 && is_windows()) intensity = 26; + if (device_sm[dev_id] < 350) intensity = 22; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage (linux) + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonces[thr_id], NBN * sizeof(uint32_t)), -1); + init[thr_id] = true; + } + + for (int i=0; i < 20; i++) + be32enc(&endiandata[i], pdata[i]); + + const uint2 target = make_uint2(ptarget[6], ptarget[7]); + blake2b_setBlock(endiandata); + + do { + work->nonces[0] = blake2b_hash_cuda(thr_id, throughput, pdata[19], target, work->nonces[1]); + + *hashes_done = pdata[19] - first_nonce + throughput; + + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(A) vhash[8]; + work->valid_nonces = 0; + endiandata[19] = work->nonces[0]; + blake2b_hash(vhash, endiandata); + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work_set_target_ratio(work, vhash); + work->valid_nonces++; + pdata[19] = work->nonces[0] + 1; + } else { + gpu_increment_reject(thr_id); + } + + if (work->nonces[1] != UINT32_MAX) { + endiandata[19] = work->nonces[1]; + blake2b_hash(vhash, endiandata); + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) { + work->sharediff[1] = work->sharediff[0]; + work->shareratio[1] = work->shareratio[0]; + xchg(work->nonces[1], work->nonces[0]); + work_set_target_ratio(work, vhash); + } else { + bn_set_target_ratio(work, vhash, 1); + } + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; // next scan start + } else { + gpu_increment_reject(thr_id); + } + } + + if (work->valid_nonces) { + work->nonces[0] = cuda_swab32(work->nonces[0]); + work->nonces[1] = cuda_swab32(work->nonces[1]); + return work->valid_nonces; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + + return 0; +} + +// cleanup +extern "C" void free_blake2b(int thr_id) +{ + if (!init[thr_id]) + return; + + //cudaThreadSynchronize(); + + cudaFree(d_resNonces[thr_id]); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/ccminer.cpp b/ccminer.cpp index bf5399c..46d9fac 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -243,6 +243,7 @@ Options:\n\ bastion Hefty bastion\n\ bitcore Timetravel-10\n\ blake Blake 256 (SFR)\n\ + blake2b Blake2-B 512 (BCX)\n\ blake2s Blake2-S 256 (NEVA)\n\ blakecoin Fast Blake 256 (8 rounds)\n\ bmw BMW 256\n\ @@ -2260,6 +2261,7 @@ static void *miner_thread(void *userdata) //case ALGO_WHIRLPOOLX: minmax = 0x40000000U; break; + case ALGO_BLAKE2B: case ALGO_KECCAK: case ALGO_KECCAKC: case ALGO_LBRY: @@ -2375,6 +2377,9 @@ static void *miner_thread(void *userdata) case ALGO_BLAKE: rc = scanhash_blake256(thr_id, &work, max_nonce, &hashes_done, 14); break; + case ALGO_BLAKE2B: + rc = scanhash_blake2b(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_BLAKE2S: rc = scanhash_blake2s(thr_id, &work, max_nonce, &hashes_done); break; diff --git 
a/ccminer.vcxproj b/ccminer.vcxproj index 01a598f..67820ad 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -460,6 +460,7 @@ + 64 --ptxas-options="-dlcm=cg" %(AdditionalOptions) diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 88252ec..c353d21 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -952,6 +952,9 @@ Source Files\CUDA\lyra2 + + Source Files\CUDA\ + Source Files\CUDA\Algo256 diff --git a/miner.h b/miner.h index cbc766b..bbd4c8c 100644 --- a/miner.h +++ b/miner.h @@ -276,6 +276,7 @@ struct work; extern int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_bastion(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_blake256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int8_t blakerounds); +extern int scanhash_blake2b(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); @@ -347,6 +348,7 @@ extern void free_allium(int thr_id); extern void free_bastion(int thr_id); extern void free_bitcore(int thr_id); extern void free_blake256(int thr_id); +extern void free_blake2b(int thr_id); extern void free_blake2s(int thr_id); extern void free_bmw(int thr_id); extern void free_c11(int thr_id); @@ -939,6 +941,7 @@ void scrypthash(void* output, const void* input); void scryptjane_hash(void* output, const void* input); void sha256d_hash(void *output, const void *input); void sha256t_hash(void *output, const void *input); +void sia_blake2b_hash(void *output, const void *input); void sibhash(void *output, const void *input); void skeincoinhash(void *output, const void *input); void skein2hash(void *output, const void *input); diff --git a/sia/sia.cu b/sia/sia.cu index 8e4f483..4ffdccb 100644 --- a/sia/sia.cu +++ b/sia/sia.cu @@ -40,7 +40,7 @@ static __constant__ const int8_t blake2b_sigma[12][16] = { // host mem align #define A 64 -extern "C" void blake2b_hash(void *output, const void *input) +extern "C" void sia_blake2b_hash(void *output, const void *input) { uint8_t _ALIGN(A) hash[32]; blake2b_ctx ctx; @@ -102,7 +102,7 @@ static void H(const int r, const int i, uint64_t &a, uint64_t &b, uint64_t &c, u __global__ //__launch_bounds__(128, 8) /* to force 64 regs */ -void blake2b_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint2 target2) +void sia_blake2b_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint2 target2) { const uint32_t nonce = (blockDim.x * blockIdx.x + threadIdx.x) + startNonce; __shared__ uint64_t s_target; @@ -154,7 +154,7 @@ void blake2b_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_ } __host__ -uint32_t blake2b_hash_cuda(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2, uint32_t &secNonce) +uint32_t sia_blake2b_hash_cuda(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2, uint32_t &secNonce) { uint32_t resNonces[NBN] = { UINT32_MAX, UINT32_MAX }; uint32_t result = UINT32_MAX; @@ -166,7 +166,7 @@ uint32_t blake2b_hash_cuda(const int thr_id, const uint32_t threads, const uint3 if 
(cudaMemset(d_resNonces[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess) return result; - blake2b_gpu_hash <<>> (threads, startNonce, d_resNonces[thr_id], target2); + sia_blake2b_gpu_hash <<>> (threads, startNonce, d_resNonces[thr_id], target2); cudaThreadSynchronize(); if (cudaSuccess == cudaMemcpy(resNonces, d_resNonces[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) { @@ -178,7 +178,7 @@ uint32_t blake2b_hash_cuda(const int thr_id, const uint32_t threads, const uint3 } __host__ -void blake2b_setBlock(uint32_t *data) +void sia_blake2b_setBlock(uint32_t *data) { CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_data, data, 80, 0, cudaMemcpyHostToDevice)); } @@ -224,10 +224,10 @@ int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned lon const uint2 target = make_uint2(ptarget[6], ptarget[7]); - blake2b_setBlock(inputdata); + sia_blake2b_setBlock(inputdata); do { - work->nonces[0] = blake2b_hash_cuda(thr_id, throughput, pdata[8], target, work->nonces[1]); + work->nonces[0] = sia_blake2b_hash_cuda(thr_id, throughput, pdata[8], target, work->nonces[1]); *hashes_done = pdata[8] - first_nonce + throughput; @@ -235,7 +235,7 @@ int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned lon { work->valid_nonces = 0; inputdata[8] = work->nonces[0]; - blake2b_hash(hash, inputdata); + sia_blake2b_hash(hash, inputdata); if (swab32(hash[0]) <= Htarg) { // sia hash target is reversed (start of hash) swab256(vhashcpu, hash); @@ -250,7 +250,7 @@ int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned lon if (work->nonces[1] != UINT32_MAX) { inputdata[8] = work->nonces[1]; - blake2b_hash(hash, inputdata); + sia_blake2b_hash(hash, inputdata); if (swab32(hash[0]) <= Htarg) { swab256(vhashcpu, hash); if (fulltest(vhashcpu, ptarget)) { diff --git a/util.cpp b/util.cpp index 7a67ea6..f661d52 100644 --- a/util.cpp +++ b/util.cpp @@ -2184,6 +2184,9 @@ void print_hash_tests(void) blake256hash(&hash[0], &buf[0], 14); printpfx("blake", hash); + blake2b_hash(&hash[0], &buf[0]); + printpfx("blake2b", hash); + blake2s_hash(&hash[0], &buf[0]); printpfx("blake2s", hash); @@ -2285,7 +2288,7 @@ void print_hash_tests(void) sha256t_hash(&hash[0], &buf[0]); printpfx("sha256t", hash); - blake2b_hash(&hash[0], &buf[0]); + sia_blake2b_hash(&hash[0], &buf[0]); printpfx("sia", hash); sibhash(&hash[0], &buf[0]); From c59bc2438a6b0404c3199972fecd45123480f792 Mon Sep 17 00:00:00 2001 From: pyritepirate <44350183+pyritepirate@users.noreply.github.com> Date: Sun, 27 Jan 2019 08:24:53 +0100 Subject: [PATCH 22/24] sha256q (#70) --- Makefile.am | 2 +- README.txt | 1 + algos.h | 2 + bench.cpp | 1 + ccminer.cpp | 6 + ccminer.vcxproj | 2 + ccminer.vcxproj.filters | 6 + miner.h | 3 + sha256/cuda_sha256q.cu | 507 ++++++++++++++++++++++++++++++++++++++++ sha256/sha256q.cu | 136 +++++++++++ util.cpp | 3 + 11 files changed, 668 insertions(+), 1 deletion(-) create mode 100644 sha256/cuda_sha256q.cu create mode 100644 sha256/sha256q.cu diff --git a/Makefile.am b/Makefile.am index 6a15836..ecc8e30 100644 --- a/Makefile.am +++ b/Makefile.am @@ -60,7 +60,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ neoscrypt/neoscrypt.cpp neoscrypt/neoscrypt-cpu.c neoscrypt/cuda_neoscrypt.cu \ pentablake.cu skein.cu cuda_skeincoin.cu skein2.cpp zr5.cu \ skunk/skunk.cu skunk/cuda_skunk.cu skunk/cuda_skunk_streebog.cu \ - sha256/sha256d.cu sha256/cuda_sha256d.cu sha256/sha256t.cu sha256/cuda_sha256t.cu \ + sha256/sha256d.cu sha256/cuda_sha256d.cu sha256/sha256t.cu sha256/cuda_sha256t.cu sha256/sha256q.cu 
sha256/cuda_sha256q.cu \ sia/sia.cu sia/sia-rpc.cpp sph/blake2b.c \ sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \ sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \ diff --git a/README.txt b/README.txt index 285b04d..321bfb4 100644 --- a/README.txt +++ b/README.txt @@ -117,6 +117,7 @@ its command line interface and options. scrypt-jane use to mine Chacha coins like Cache and Ultracoin s3 use to mine 1coin (ONE) sha256t use to mine OneCoin (OC) + sha256q use to mine Pyrite sia use to mine SIA sib use to mine Sibcoin skein use to mine Skeincoin diff --git a/algos.h b/algos.h index b084eeb..aa03ecd 100644 --- a/algos.h +++ b/algos.h @@ -49,6 +49,7 @@ enum sha_algos { ALGO_SCRYPT_JANE, ALGO_SHA256D, ALGO_SHA256T, + ALGO_SHA256Q, ALGO_SIA, ALGO_SIB, ALGO_SKEIN, @@ -129,6 +130,7 @@ static const char *algo_names[] = { "scrypt-jane", "sha256d", "sha256t", + "sha256q", "sia", "sib", "skein", diff --git a/bench.cpp b/bench.cpp index e2c26be..f674f77 100644 --- a/bench.cpp +++ b/bench.cpp @@ -93,6 +93,7 @@ void algo_free_all(int thr_id) free_skunk(thr_id); free_sha256d(thr_id); free_sha256t(thr_id); + free_sha256q(thr_id); free_sia(thr_id); free_sib(thr_id); free_sonoa(thr_id); diff --git a/ccminer.cpp b/ccminer.cpp index 46d9fac..596a924 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -283,6 +283,7 @@ Options:\n\ qubit Qubit\n\ sha256d SHA256d (bitcoin)\n\ sha256t SHA256 x3\n\ + sha256q SHA256 x4\n\ sia SIA (Blake2B)\n\ sib Sibcoin (X11+Streebog)\n\ scrypt Scrypt\n\ @@ -977,6 +978,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work) case ALGO_BMW: case ALGO_SHA256D: case ALGO_SHA256T: + case ALGO_SHA256Q: case ALGO_VANILLA: // fast algos require that... (todo: regen hash) check_dups = true; @@ -2258,6 +2260,7 @@ static void *miner_thread(void *userdata) case ALGO_DECRED: case ALGO_SHA256D: case ALGO_SHA256T: + case ALGO_SHA256Q: //case ALGO_WHIRLPOOLX: minmax = 0x40000000U; break; @@ -2515,6 +2518,9 @@ static void *miner_thread(void *userdata) case ALGO_SHA256T: rc = scanhash_sha256t(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_SHA256Q: + rc = scanhash_sha256q(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_SIA: rc = scanhash_sia(thr_id, &work, max_nonce, &hashes_done); break; diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 67820ad..5ef6551 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -435,6 +435,8 @@ + + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index c353d21..8ed886a 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -982,6 +982,12 @@ Source Files\CUDA\sha256 + + Source Files\CUDA\sha256 + + + Source Files\CUDA\sha256 + Source Files\sia diff --git a/miner.h b/miner.h index bbd4c8c..7f52d55 100644 --- a/miner.h +++ b/miner.h @@ -310,6 +310,7 @@ extern int scanhash_quark(int thr_id, struct work *work, uint32_t max_nonce, uns extern int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_sha256d(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_sha256t(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_sha256q(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); 
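 // Illustrative note, inferred from the call sites rather than stated in this
 // header: each scanhash_*() scans nonces from pdata[19] up to max_nonce,
 // returns the number of valid nonces found (0 when the range is exhausted,
 // negative on error) and reports its progress through *hashes_done.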
extern int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@@ -383,6 +384,7 @@ extern void free_quark(int thr_id);
 extern void free_qubit(int thr_id);
 extern void free_sha256d(int thr_id);
 extern void free_sha256t(int thr_id);
+extern void free_sha256q(int thr_id);
 extern void free_sia(int thr_id);
 extern void free_sib(int thr_id);
 extern void free_skeincoin(int thr_id);
@@ -941,6 +943,7 @@ void scrypthash(void* output, const void* input);
 void scryptjane_hash(void* output, const void* input);
 void sha256d_hash(void *output, const void *input);
 void sha256t_hash(void *output, const void *input);
+void sha256q_hash(void *output, const void *input);
 void sia_blake2b_hash(void *output, const void *input);
 void sibhash(void *output, const void *input);
 void skeincoinhash(void *output, const void *input);
diff --git a/sha256/cuda_sha256q.cu b/sha256/cuda_sha256q.cu
new file mode 100644
index 0000000..80733ac
--- /dev/null
+++ b/sha256/cuda_sha256q.cu
@@ -0,0 +1,507 @@
+/*
+ * sha256(-q) CUDA implementation.
+ * pyritepirate 2018
+ * tpruvot 2017
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <memory.h>
+
+#include <cuda_helper.h>
+#include <miner.h>
+
+__constant__ static uint32_t __align__(8) c_midstate76[8];
+__constant__ static uint32_t __align__(8) c_dataEnd80[4];
+
+const __constant__ uint32_t __align__(8) c_H256[8] = {
+	0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU,
+	0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U
+};
+__constant__ static uint32_t __align__(8) c_K[64];
+__constant__ static uint32_t __align__(8) c_target[2];
+__device__ uint64_t d_target[1];
+
+static uint32_t* d_resNonces[MAX_GPUS] = { 0 };
+
+// ------------------------------------------------------------------------------------------------
+
+static const uint32_t cpu_H256[8] = {
+	0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU,
+	0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U
+};
+
+static const uint32_t cpu_K[64] = {
+	0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
+	0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
+	0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
+	0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
+	0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
+	0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
+	0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
+	0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
+};
+
+#define ROTR ROTR32
+
+__host__
+static void sha256_step1_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d,
+	uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
+	uint32_t in, const uint32_t Kshared)
+{
+	uint32_t t1,t2;
+	uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
+	uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
+	uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a);
+	uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);
+
+	t1 = h + bsg21 + vxandx + Kshared + in;
+	t2 = bsg20 + andorv;
+	d = d + t1;
+	h = t1 + t2;
+}
+
+__host__
+static void sha256_step2_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d,
+	uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
+	uint32_t* in, uint32_t pc, const uint32_t Kshared)
+{
+	uint32_t
t1,t2; + + int pcidx1 = (pc-2) & 0xF; + int pcidx2 = (pc-7) & 0xF; + int pcidx3 = (pc-15) & 0xF; + + uint32_t inx0 = in[pc]; + uint32_t inx1 = in[pcidx1]; + uint32_t inx2 = in[pcidx2]; + uint32_t inx3 = in[pcidx3]; + + uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1); + uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3); + uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); + uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e); + uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a); + uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c); + + in[pc] = ssg21 + inx2 + ssg20 + inx0; + + t1 = h + bsg21 + vxandx + Kshared + in[pc]; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__host__ +static void sha256_round_body_host(uint32_t* in, uint32_t* state, const uint32_t* Kshared) +{ + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + uint32_t e = state[4]; + uint32_t f = state[5]; + uint32_t g = state[6]; + uint32_t h = state[7]; + + sha256_step1_host(a,b,c,d,e,f,g,h,in[ 0], Kshared[ 0]); + sha256_step1_host(h,a,b,c,d,e,f,g,in[ 1], Kshared[ 1]); + sha256_step1_host(g,h,a,b,c,d,e,f,in[ 2], Kshared[ 2]); + sha256_step1_host(f,g,h,a,b,c,d,e,in[ 3], Kshared[ 3]); + sha256_step1_host(e,f,g,h,a,b,c,d,in[ 4], Kshared[ 4]); + sha256_step1_host(d,e,f,g,h,a,b,c,in[ 5], Kshared[ 5]); + sha256_step1_host(c,d,e,f,g,h,a,b,in[ 6], Kshared[ 6]); + sha256_step1_host(b,c,d,e,f,g,h,a,in[ 7], Kshared[ 7]); + sha256_step1_host(a,b,c,d,e,f,g,h,in[ 8], Kshared[ 8]); + sha256_step1_host(h,a,b,c,d,e,f,g,in[ 9], Kshared[ 9]); + sha256_step1_host(g,h,a,b,c,d,e,f,in[10], Kshared[10]); + sha256_step1_host(f,g,h,a,b,c,d,e,in[11], Kshared[11]); + sha256_step1_host(e,f,g,h,a,b,c,d,in[12], Kshared[12]); + sha256_step1_host(d,e,f,g,h,a,b,c,in[13], Kshared[13]); + sha256_step1_host(c,d,e,f,g,h,a,b,in[14], Kshared[14]); + sha256_step1_host(b,c,d,e,f,g,h,a,in[15], Kshared[15]); + + for (int i=0; i<3; i++) + { + sha256_step2_host(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]); + sha256_step2_host(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]); + sha256_step2_host(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]); + sha256_step2_host(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]); + sha256_step2_host(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]); + sha256_step2_host(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]); + sha256_step2_host(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]); + sha256_step2_host(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]); + sha256_step2_host(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]); + sha256_step2_host(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]); + sha256_step2_host(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]); + sha256_step2_host(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]); + sha256_step2_host(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]); + sha256_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); + sha256_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); + sha256_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); + } + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; +} + +#define xor3b(a,b,c) (a ^ b ^ c) + +__device__ __forceinline__ uint32_t bsg2_0(const uint32_t x) +{ + return xor3b(ROTR32(x,2),ROTR32(x,13),ROTR32(x,22)); +} + +__device__ __forceinline__ uint32_t bsg2_1(const uint32_t x) +{ + return xor3b(ROTR32(x,6),ROTR32(x,11),ROTR32(x,25)); +} + +__device__ __forceinline__ uint32_t 
ssg2_0(const uint32_t x) +{ + return xor3b(ROTR32(x,7),ROTR32(x,18),(x>>3)); +} + +__device__ __forceinline__ uint32_t ssg2_1(const uint32_t x) +{ + return xor3b(ROTR32(x,17),ROTR32(x,19),(x>>10)); +} + +__device__ __forceinline__ uint32_t andor32(const uint32_t a, const uint32_t b, const uint32_t c) +{ + uint32_t result; + asm("{\n\t" + ".reg .u32 m,n,o;\n\t" + "and.b32 m, %1, %2;\n\t" + " or.b32 n, %1, %2;\n\t" + "and.b32 o, n, %3;\n\t" + " or.b32 %0, m, o ;\n\t" + "}\n\t" : "=r"(result) : "r"(a), "r"(b), "r"(c) + ); + return result; +} + +__device__ __forceinline__ uint2 vectorizeswap(uint64_t v) { + uint2 result; + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(result.y), "=r"(result.x) : "l"(v)); + return result; +} + +__device__ +static void sha2_step1(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h, + uint32_t in, const uint32_t Kshared) +{ + uint32_t t1,t2; + uint32_t vxandx = xandx(e, f, g); + uint32_t bsg21 = bsg2_1(e); + uint32_t bsg20 = bsg2_0(a); + uint32_t andorv = andor32(a,b,c); + + t1 = h + bsg21 + vxandx + Kshared + in; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__device__ +static void sha2_step2(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h, + uint32_t* in, uint32_t pc, const uint32_t Kshared) +{ + uint32_t t1,t2; + + int pcidx1 = (pc-2) & 0xF; + int pcidx2 = (pc-7) & 0xF; + int pcidx3 = (pc-15) & 0xF; + + uint32_t inx0 = in[pc]; + uint32_t inx1 = in[pcidx1]; + uint32_t inx2 = in[pcidx2]; + uint32_t inx3 = in[pcidx3]; + + uint32_t ssg21 = ssg2_1(inx1); + uint32_t ssg20 = ssg2_0(inx3); + uint32_t vxandx = xandx(e, f, g); + uint32_t bsg21 = bsg2_1(e); + uint32_t bsg20 = bsg2_0(a); + uint32_t andorv = andor32(a,b,c); + + in[pc] = ssg21 + inx2 + ssg20 + inx0; + + t1 = h + bsg21 + vxandx + Kshared + in[pc]; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__device__ +static void sha256_round_body(uint32_t* in, uint32_t* state, uint32_t* const Kshared) +{ + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + uint32_t e = state[4]; + uint32_t f = state[5]; + uint32_t g = state[6]; + uint32_t h = state[7]; + + sha2_step1(a,b,c,d,e,f,g,h,in[ 0], Kshared[ 0]); + sha2_step1(h,a,b,c,d,e,f,g,in[ 1], Kshared[ 1]); + sha2_step1(g,h,a,b,c,d,e,f,in[ 2], Kshared[ 2]); + sha2_step1(f,g,h,a,b,c,d,e,in[ 3], Kshared[ 3]); + sha2_step1(e,f,g,h,a,b,c,d,in[ 4], Kshared[ 4]); + sha2_step1(d,e,f,g,h,a,b,c,in[ 5], Kshared[ 5]); + sha2_step1(c,d,e,f,g,h,a,b,in[ 6], Kshared[ 6]); + sha2_step1(b,c,d,e,f,g,h,a,in[ 7], Kshared[ 7]); + sha2_step1(a,b,c,d,e,f,g,h,in[ 8], Kshared[ 8]); + sha2_step1(h,a,b,c,d,e,f,g,in[ 9], Kshared[ 9]); + sha2_step1(g,h,a,b,c,d,e,f,in[10], Kshared[10]); + sha2_step1(f,g,h,a,b,c,d,e,in[11], Kshared[11]); + sha2_step1(e,f,g,h,a,b,c,d,in[12], Kshared[12]); + sha2_step1(d,e,f,g,h,a,b,c,in[13], Kshared[13]); + sha2_step1(c,d,e,f,g,h,a,b,in[14], Kshared[14]); + sha2_step1(b,c,d,e,f,g,h,a,in[15], Kshared[15]); + + #pragma unroll + for (int i=0; i<3; i++) + { + sha2_step2(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]); + sha2_step2(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]); + sha2_step2(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]); + sha2_step2(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]); + sha2_step2(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]); + sha2_step2(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]); + sha2_step2(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]); + sha2_step2(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]); + sha2_step2(a,b,c,d,e,f,g,h,in,8, 
Kshared[24+16*i]); + sha2_step2(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]); + sha2_step2(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]); + sha2_step2(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]); + sha2_step2(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]); + sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); + sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); + sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); + } + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; +} + +__device__ +static void sha256_round_last(uint32_t* in, uint32_t* state, uint32_t* const Kshared) +{ + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + uint32_t e = state[4]; + uint32_t f = state[5]; + uint32_t g = state[6]; + uint32_t h = state[7]; + + sha2_step1(a,b,c,d, e,f,g,h, in[ 0], Kshared[ 0]); + sha2_step1(h,a,b,c, d,e,f,g, in[ 1], Kshared[ 1]); + sha2_step1(g,h,a,b, c,d,e,f, in[ 2], Kshared[ 2]); + sha2_step1(f,g,h,a, b,c,d,e, in[ 3], Kshared[ 3]); + sha2_step1(e,f,g,h, a,b,c,d, in[ 4], Kshared[ 4]); + sha2_step1(d,e,f,g, h,a,b,c, in[ 5], Kshared[ 5]); + sha2_step1(c,d,e,f, g,h,a,b, in[ 6], Kshared[ 6]); + sha2_step1(b,c,d,e, f,g,h,a, in[ 7], Kshared[ 7]); + sha2_step1(a,b,c,d, e,f,g,h, in[ 8], Kshared[ 8]); + sha2_step1(h,a,b,c, d,e,f,g, in[ 9], Kshared[ 9]); + sha2_step1(g,h,a,b, c,d,e,f, in[10], Kshared[10]); + sha2_step1(f,g,h,a, b,c,d,e, in[11], Kshared[11]); + sha2_step1(e,f,g,h, a,b,c,d, in[12], Kshared[12]); + sha2_step1(d,e,f,g, h,a,b,c, in[13], Kshared[13]); + sha2_step1(c,d,e,f, g,h,a,b, in[14], Kshared[14]); + sha2_step1(b,c,d,e, f,g,h,a, in[15], Kshared[15]); + + #pragma unroll + for (int i=0; i<2; i++) + { + sha2_step2(a,b,c,d, e,f,g,h, in, 0, Kshared[16+16*i]); + sha2_step2(h,a,b,c, d,e,f,g, in, 1, Kshared[17+16*i]); + sha2_step2(g,h,a,b, c,d,e,f, in, 2, Kshared[18+16*i]); + sha2_step2(f,g,h,a, b,c,d,e, in, 3, Kshared[19+16*i]); + sha2_step2(e,f,g,h, a,b,c,d, in, 4, Kshared[20+16*i]); + sha2_step2(d,e,f,g, h,a,b,c, in, 5, Kshared[21+16*i]); + sha2_step2(c,d,e,f, g,h,a,b, in, 6, Kshared[22+16*i]); + sha2_step2(b,c,d,e, f,g,h,a, in, 7, Kshared[23+16*i]); + sha2_step2(a,b,c,d, e,f,g,h, in, 8, Kshared[24+16*i]); + sha2_step2(h,a,b,c, d,e,f,g, in, 9, Kshared[25+16*i]); + sha2_step2(g,h,a,b, c,d,e,f, in,10, Kshared[26+16*i]); + sha2_step2(f,g,h,a, b,c,d,e, in,11, Kshared[27+16*i]); + sha2_step2(e,f,g,h, a,b,c,d, in,12, Kshared[28+16*i]); + sha2_step2(d,e,f,g, h,a,b,c, in,13, Kshared[29+16*i]); + sha2_step2(c,d,e,f, g,h,a,b, in,14, Kshared[30+16*i]); + sha2_step2(b,c,d,e, f,g,h,a, in,15, Kshared[31+16*i]); + } + + sha2_step2(a,b,c,d, e,f,g,h, in, 0, Kshared[16+16*2]); + sha2_step2(h,a,b,c, d,e,f,g, in, 1, Kshared[17+16*2]); + sha2_step2(g,h,a,b, c,d,e,f, in, 2, Kshared[18+16*2]); + sha2_step2(f,g,h,a, b,c,d,e, in, 3, Kshared[19+16*2]); + sha2_step2(e,f,g,h, a,b,c,d, in, 4, Kshared[20+16*2]); + sha2_step2(d,e,f,g, h,a,b,c, in, 5, Kshared[21+16*2]); + sha2_step2(c,d,e,f, g,h,a,b, in, 6, Kshared[22+16*2]); + sha2_step2(b,c,d,e, f,g,h,a, in, 7, Kshared[23+16*2]); + sha2_step2(a,b,c,d, e,f,g,h, in, 8, Kshared[24+16*2]); + sha2_step2(h,a,b,c, d,e,f,g, in, 9, Kshared[25+16*2]); + sha2_step2(g,h,a,b, c,d,e,f, in,10, Kshared[26+16*2]); + sha2_step2(f,g,h,a, b,c,d,e, in,11, Kshared[27+16*2]); + sha2_step2(e,f,g,h, a,b,c,d, in,12, Kshared[28+16*2]); + sha2_step2(d,e,f,g, h,a,b,c, in,13, Kshared[29+16*2]); + + state[6] += g; + state[7] += h; +} + +__device__ __forceinline__ +uint64_t 
cuda_swab32ll(uint64_t x) {
+	return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x)));
+}
+
+__global__
+/*__launch_bounds__(256,3)*/
+void sha256q_gpu_hash_shared(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonces)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+
+	__shared__ uint32_t s_K[64*4];
+	//s_K[thread & 63] = c_K[thread & 63];
+	if (threadIdx.x < 64U) s_K[threadIdx.x] = c_K[threadIdx.x];
+
+	if (thread < threads)
+	{
+		const uint32_t nonce = startNonce + thread;
+
+		uint32_t dat[16];
+		AS_UINT2(dat) = AS_UINT2(c_dataEnd80);
+		dat[ 2] = c_dataEnd80[2];
+		dat[ 3] = nonce;
+		dat[ 4] = 0x80000000;
+		dat[15] = 0x280;
+		#pragma unroll
+		for (int i=5; i<15; i++) dat[i] = 0;
+
+		uint32_t buf[8];
+		#pragma unroll
+		for (int i=0; i<8; i+=2) AS_UINT2(&buf[i]) = AS_UINT2(&c_midstate76[i]);
+		//for (int i=0; i<8; i++) buf[i] = c_midstate76[i];
+
+		sha256_round_body(dat, buf, s_K);
+
+		// second sha256
+
+		#pragma unroll
+		for (int i=0; i<8; i++) dat[i] = buf[i];
+		dat[8] = 0x80000000;
+		#pragma unroll
+		for (int i=9; i<15; i++) dat[i] = 0;
+		dat[15] = 0x100;
+
+		#pragma unroll
+		for (int i=0; i<8; i++) buf[i] = c_H256[i];
+
+		sha256_round_body(dat, buf, s_K);
+
+		// third sha256
+
+		#pragma unroll
+		for (int i=0; i<8; i++) dat[i] = buf[i];
+		dat[8] = 0x80000000;
+		#pragma unroll
+		for (int i=9; i<15; i++) dat[i] = 0;
+		dat[15] = 0x100;
+
+		#pragma unroll
+		for (int i=0; i<8; i++) buf[i] = c_H256[i];
+
+		sha256_round_body(dat, buf, s_K);
+
+		// last sha256
+
+		#pragma unroll
+		for (int i=0; i<8; i++) dat[i] = buf[i];
+		dat[8] = 0x80000000;
+		#pragma unroll
+		for (int i=9; i<15; i++) dat[i] = 0;
+		dat[15] = 0x100;
+
+		#pragma unroll
+		for (int i=0; i<8; i++) buf[i] = c_H256[i];
+
+		sha256_round_last(dat, buf, s_K);
+
+
+		// valid nonces
+		uint64_t high = cuda_swab32ll(((uint64_t*)buf)[3]);
+		if (high <= c_target[0]) {
+			//printf("%08x %08x - %016llx %016llx - %08x %08x\n", buf[7], buf[6], high, d_target[0], c_target[1], c_target[0]);
+			resNonces[1] = atomicExch(resNonces, nonce);
+			//d_target[0] = high;
+		}
+	}
+}
+
+__host__
+void sha256q_init(int thr_id)
+{
+	cuda_get_arch(thr_id);
+	cudaMemcpyToSymbol(c_K, cpu_K, sizeof(cpu_K), 0, cudaMemcpyHostToDevice);
+	CUDA_SAFE_CALL(cudaMalloc(&d_resNonces[thr_id], 2*sizeof(uint32_t)));
+}
+
+__host__
+void sha256q_free(int thr_id)
+{
+	if (d_resNonces[thr_id]) cudaFree(d_resNonces[thr_id]);
+	d_resNonces[thr_id] = NULL;
+}
+
+__host__
+void sha256q_setBlock_80(uint32_t *pdata, uint32_t *ptarget)
+{
+	uint32_t _ALIGN(64) in[16], buf[8], end[4];
+	for (int i=0;i<16;i++) in[i] = cuda_swab32(pdata[i]);
+	for (int i=0;i<8;i++) buf[i] = cpu_H256[i];
+	for (int i=0;i<4;i++) end[i] = cuda_swab32(pdata[16+i]);
+	sha256_round_body_host(in, buf, cpu_K);
+
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_midstate76, buf, 32, 0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_dataEnd80, end, sizeof(end), 0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_target, &ptarget[6], 8, 0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &ptarget[6], 8, 0, cudaMemcpyHostToDevice));
+}
+
+__host__
+void sha256q_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces)
+{
+	const uint32_t threadsperblock = 128;
+
+	dim3 grid(threads/threadsperblock);
+	dim3 block(threadsperblock);
+
+	CUDA_SAFE_CALL(cudaMemset(d_resNonces[thr_id], 0xFF, 2 * sizeof(uint32_t)));
+	cudaThreadSynchronize();
+	sha256q_gpu_hash_shared <<<grid, block>>> (threads, startNonce, d_resNonces[thr_id]);
+	cudaThreadSynchronize();
+
+	CUDA_SAFE_CALL(cudaMemcpy(resNonces, d_resNonces[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
+	if (resNonces[0] == resNonces[1]) {
+		resNonces[1] = UINT32_MAX;
+	}
+}
diff --git a/sha256/sha256q.cu b/sha256/sha256q.cu
new file mode 100644
index 0000000..d3efa40
--- /dev/null
+++ b/sha256/sha256q.cu
@@ -0,0 +1,136 @@
+/**
+ * SHA256 4x
+ * by pyritepirate - 2018
+ * by tpruvot@github - 2017
+ */
+
+#include <miner.h>
+#include <cuda_helper.h>
+#include <openssl/sha.h>
+
+// CPU Check
+extern "C" void sha256q_hash(void *output, const void *input)
+{
+	unsigned char _ALIGN(64) hash[64];
+	SHA256_CTX sha256;
+
+	SHA256_Init(&sha256);
+	SHA256_Update(&sha256, (unsigned char *)input, 80);
+	SHA256_Final(hash, &sha256);
+
+	SHA256_Init(&sha256);
+	SHA256_Update(&sha256, hash, 32);
+	SHA256_Final(hash, &sha256);
+
+	SHA256_Init(&sha256);
+	SHA256_Update(&sha256, hash, 32);
+	SHA256_Final(hash, &sha256);
+
+	SHA256_Init(&sha256);
+	SHA256_Update(&sha256, hash, 32);
+	SHA256_Final((unsigned char *)output, &sha256);
+}
+
+static bool init[MAX_GPUS] = { 0 };
+extern void sha256q_init(int thr_id);
+extern void sha256q_free(int thr_id);
+extern void sha256q_setBlock_80(uint32_t *pdata, uint32_t *ptarget);
+extern void sha256q_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces);
+
+extern "C" int scanhash_sha256q(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+{
+	uint32_t _ALIGN(64) endiandata[20];
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	const uint32_t first_nonce = pdata[19];
+	uint32_t throughput = cuda_default_throughput(thr_id, 1U << 23);
+	if (init[thr_id]) throughput = min(throughput, (max_nonce - first_nonce));
+
+	if (opt_benchmark)
+		((uint32_t*)ptarget)[7] = 0x03;
+
+	if (!init[thr_id])
+	{
+		cudaSetDevice(device_map[thr_id]);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
+
+		sha256q_init(thr_id);
+
+		init[thr_id] = true;
+	}
+
+	for (int k=0; k < 19; k++)
+		be32enc(&endiandata[k], pdata[k]);
+
+	sha256q_setBlock_80(endiandata, ptarget);
+
+	do {
+		// Hash with CUDA
+		*hashes_done = pdata[19] - first_nonce + throughput;
+
+		sha256q_hash_80(thr_id, throughput, pdata[19], work->nonces);
+		if (work->nonces[0] != UINT32_MAX)
+		{
+			uint32_t _ALIGN(64) vhash[8];
+
+			endiandata[19] = swab32(work->nonces[0]);
+			sha256q_hash(vhash, endiandata);
+			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work_set_target_ratio(work, vhash);
+				if (work->nonces[1] != UINT32_MAX) {
+					endiandata[19] = swab32(work->nonces[1]);
+					sha256q_hash(vhash, endiandata);
+					if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
+						work->valid_nonces++;
+						bn_set_target_ratio(work, vhash, 1);
+					}
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1;
+				}
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > ptarget[7]) {
+				gpu_increment_reject(thr_id);
+				if (!opt_quiet)
+					gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
+				continue;
+			}
+		}
+
+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+
+		pdata[19] += throughput;
+
+	} while (!work_restart[thr_id].restart);
+
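+	// Note (illustrative): falling out of the loop means the nonce range was
+	// exhausted or a restart was requested; *hashes_done below only counts
+	// scanned nonces, while valid shares are returned through work->valid_nonces
+	// (up to two per launch via resNonces[0]/resNonces[1] in sha256q_hash_80).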
*hashes_done = pdata[19] - first_nonce; + + return 0; +} + +// cleanup +extern "C" void free_sha256q(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + sha256q_free(thr_id); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/util.cpp b/util.cpp index f661d52..79799b0 100644 --- a/util.cpp +++ b/util.cpp @@ -2288,6 +2288,9 @@ void print_hash_tests(void) sha256t_hash(&hash[0], &buf[0]); printpfx("sha256t", hash); + sha256q_hash(&hash[0], &buf[0]); + printpfx("sha256q", hash); + sia_blake2b_hash(&hash[0], &buf[0]); printpfx("sia", hash); From 9a1f20d455d27c44bedcb80c30f1d2e2a50c0913 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Wed, 30 Jan 2019 14:28:23 +0100 Subject: [PATCH 23/24] Handle lyra2v3 algo, for VTC fork mostly imported from opensourced vertcoin-miner with a few fixes --- Makefile.am | 1 + README.txt | 7 +- algos.h | 4 + bench.cpp | 1 + ccminer.cpp | 8 +- ccminer.vcxproj | 3 + ccminer.vcxproj.filters | 9 + compat/ccminer-config.h | 2 +- lyra2/Lyra2.c | 173 +++++++++++++ lyra2/Lyra2.h | 1 + lyra2/cuda_lyra2v3.cu | 481 +++++++++++++++++++++++++++++++++++++ lyra2/cuda_lyra2v3_sm3.cuh | 348 +++++++++++++++++++++++++++ lyra2/lyra2REv3.cu | 182 ++++++++++++++ miner.h | 3 + util.cpp | 3 + 15 files changed, 1221 insertions(+), 5 deletions(-) create mode 100644 lyra2/cuda_lyra2v3.cu create mode 100644 lyra2/cuda_lyra2v3_sm3.cuh create mode 100644 lyra2/lyra2REv3.cu diff --git a/Makefile.am b/Makefile.am index ecc8e30..4749f57 100644 --- a/Makefile.am +++ b/Makefile.am @@ -38,6 +38,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ lyra2/Lyra2.c lyra2/Sponge.c \ lyra2/lyra2RE.cu lyra2/cuda_lyra2.cu \ lyra2/lyra2REv2.cu lyra2/cuda_lyra2v2.cu \ + lyra2/lyra2REv3.cu lyra2/cuda_lyra2v3.cu \ lyra2/Lyra2Z.c lyra2/lyra2Z.cu lyra2/cuda_lyra2Z.cu \ lyra2/allium.cu \ Algo256/cuda_bmw256.cu Algo256/cuda_cubehash256.cu \ diff --git a/README.txt b/README.txt index 321bfb4..0ee3313 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,5 @@ -ccminer 2.3 "phi2 and cryptonight variants" +ccminer 2.3.1 "lyra2v3, exosis and sha256q" --------------------------------------------------------------- *************************************************************** @@ -100,7 +100,8 @@ its command line interface and options. lbry use to mine LBRY Credits luffa use to mine Joincoin lyra2 use to mine CryptoCoin - lyra2v2 use to mine Vertcoin + lyra2v2 use to mine Monacoin + lyra2v3 use to mine Vertcoin lyra2z use to mine Zerocoin (XZC) monero use to mine Monero (XMR) myr-gr use to mine Myriad-Groest @@ -117,7 +118,7 @@ its command line interface and options. 
scrypt-jane use to mine Chacha coins like Cache and Ultracoin s3 use to mine 1coin (ONE) sha256t use to mine OneCoin (OC) - sha256q use to mine Pyrite + sha256q use to mine Pyrite sia use to mine SIA sib use to mine Sibcoin skein use to mine Skeincoin diff --git a/algos.h b/algos.h index aa03ecd..e33d182 100644 --- a/algos.h +++ b/algos.h @@ -34,6 +34,7 @@ enum sha_algos { ALGO_LUFFA, ALGO_LYRA2, ALGO_LYRA2v2, + ALGO_LYRA2v3, ALGO_LYRA2Z, ALGO_MJOLLNIR, /* Hefty hash */ ALGO_MYR_GR, @@ -115,6 +116,7 @@ static const char *algo_names[] = { "luffa", "lyra2", "lyra2v2", + "lyra2v3", "lyra2z", "mjollnir", "myr-gr", @@ -199,6 +201,8 @@ static inline int algo_to_int(char* arg) i = ALGO_LYRA2; else if (!strcasecmp("lyra2rev2", arg)) i = ALGO_LYRA2v2; + else if (!strcasecmp("lyra2rev3", arg)) + i = ALGO_LYRA2v3; else if (!strcasecmp("phi1612", arg)) i = ALGO_PHI; else if (!strcasecmp("bitcoin", arg)) diff --git a/bench.cpp b/bench.cpp index f674f77..d3c7701 100644 --- a/bench.cpp +++ b/bench.cpp @@ -78,6 +78,7 @@ void algo_free_all(int thr_id) free_luffa(thr_id); free_lyra2(thr_id); free_lyra2v2(thr_id); + free_lyra2v3(thr_id); free_lyra2Z(thr_id); free_myriad(thr_id); free_neoscrypt(thr_id); diff --git a/ccminer.cpp b/ccminer.cpp index 596a924..2695074 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -269,7 +269,8 @@ Options:\n\ lbry LBRY Credits (Sha/Ripemd)\n\ luffa Joincoin\n\ lyra2 CryptoCoin\n\ - lyra2v2 VertCoin\n\ + lyra2v2 MonaCoin\n\ + lyra2v3 Vertcoin\n\ lyra2z ZeroCoin (3rd impl)\n\ myr-gr Myriad-Groestl\n\ monero XMR cryptonight (v7)\n\ @@ -1742,6 +1743,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) case ALGO_KECCAKC: case ALGO_LBRY: case ALGO_LYRA2v2: + case ALGO_LYRA2v3: case ALGO_LYRA2Z: case ALGO_PHI2: case ALGO_TIMETRAVEL: @@ -2283,6 +2285,7 @@ static void *miner_thread(void *userdata) case ALGO_JHA: case ALGO_HSR: case ALGO_LYRA2v2: + case ALGO_LYRA2v3: case ALGO_PHI: case ALGO_PHI2: case ALGO_POLYTIMOS: @@ -2474,6 +2477,9 @@ static void *miner_thread(void *userdata) case ALGO_LYRA2v2: rc = scanhash_lyra2v2(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_LYRA2v3: + rc = scanhash_lyra2v3(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_LYRA2Z: rc = scanhash_lyra2Z(thr_id, &work, max_nonce, &hashes_done); break; diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 5ef6551..26c9cd1 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -530,6 +530,9 @@ + + + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 8ed886a..3df7871 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -946,6 +946,15 @@ Source Files\CUDA\lyra2 + + Source Files\CUDA\lyra2 + + + Source Files\CUDA\lyra2 + + + Source Files\CUDA\lyra2 + Source Files\CUDA\lyra2 diff --git a/compat/ccminer-config.h b/compat/ccminer-config.h index 030e89f..d110201 100644 --- a/compat/ccminer-config.h +++ b/compat/ccminer-config.h @@ -164,7 +164,7 @@ #define PACKAGE_URL "http://github.com/tpruvot/ccminer" /* Define to the version of this package. 
*/ -#define PACKAGE_VERSION "2.3" +#define PACKAGE_VERSION "2.3.1" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be diff --git a/lyra2/Lyra2.c b/lyra2/Lyra2.c index 1f0a953..256af78 100644 --- a/lyra2/Lyra2.c +++ b/lyra2/Lyra2.c @@ -212,3 +212,176 @@ int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *sa return 0; } + +int LYRA2_3(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols) +{ + //============================= Basic variables ============================// + int64_t row = 2; //index of row to be processed + int64_t prev = 1; //index of prev (last row ever computed/modified) + int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering) + int64_t tau; //Time Loop iterator + int64_t step = 1; //Visitation step (used during Setup and Wandering phases) + int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) + int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 + int64_t i; //auxiliary iteration counter + int64_t v64; // 64bit var for memcpy + uint64_t instance = 0; + //==========================================================================/ + + //========== Initializing the Memory Matrix and pointers to it =============// + //Tries to allocate enough space for the whole memory matrix + + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; + const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + // for Lyra2REv2, nCols = 4, v1 was using 8 + const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64 : BLOCK_LEN_BLAKE2_SAFE_BYTES; + + size_t sz = (size_t)ROW_LEN_BYTES * nRows; + uint64_t *wholeMatrix = malloc(sz); + if (wholeMatrix == NULL) { + return -1; + } + memset(wholeMatrix, 0, sz); + + //Allocates pointers to each row of the matrix + uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows); + if (memMatrix == NULL) { + return -1; + } + //Places the pointers in the correct positions + uint64_t *ptrWord = wholeMatrix; + for (i = 0; i < nRows; i++) { + memMatrix[i] = ptrWord; + ptrWord += ROW_LEN_INT64; + } + //==========================================================================/ + + //============= Getting the password + salt + basil padded with 10*1 ===============// + //OBS.:The memory matrix will temporarily hold the password: not for saving memory, + //but this ensures that the password copied locally will be overwritten as soon as possible + + //First, we clean enough blocks for the password, salt, basil and padding + int64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof(uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1; + + byte *ptrByte = (byte*) wholeMatrix; + + //Prepends the password + memcpy(ptrByte, pwd, pwdlen); + ptrByte += pwdlen; + + //Concatenates the salt + memcpy(ptrByte, salt, saltlen); + ptrByte += saltlen; + + memset(ptrByte, 0, (size_t) (nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - (saltlen + pwdlen))); + + //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface + memcpy(ptrByte, &kLen, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = pwdlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = saltlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = timeCost; + memcpy(ptrByte, &v64, 
sizeof(int64_t));
+	ptrByte += sizeof(uint64_t);
+	v64 = nRows;
+	memcpy(ptrByte, &v64, sizeof(int64_t));
+	ptrByte += sizeof(uint64_t);
+	v64 = nCols;
+	memcpy(ptrByte, &v64, sizeof(int64_t));
+	ptrByte += sizeof(uint64_t);
+
+	//Now comes the padding
+	*ptrByte = 0x80; //first byte of padding: right after the password
+	ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
+	ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
+	*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
+	//==========================================================================/
+
+	//======================= Initializing the Sponge State ====================//
+	//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
+	uint64_t state[16];
+	initState(state);
+	//==========================================================================/
+
+	//================================ Setup Phase =============================//
+	//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
+	ptrWord = wholeMatrix;
+	for (i = 0; i < nBlocksInput; i++) {
+		absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
+		ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
+	}
+
+	//Initializes M[0] and M[1]
+	reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here
+
+	reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols);
+
+	do {
+		//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+
+		reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
+
+		//updates the value of row* (deterministically picked during Setup)
+		rowa = (rowa + step) & (window - 1);
+		//update prev: it now points to the last row ever computed
+		prev = row;
+		//updates row: goes to the next row to be computed
+		row++;
+
+		//Checks if all rows in the window were visited.
+		if (rowa == 0) {
+			step = window + gap; //changes the step: approximately doubles its value
+			window *= 2; //doubles the size of the re-visitation window
+			gap = -gap; //inverts the modifier to the step
+		}
+
+	} while (row < nRows);
+	//==========================================================================/
+
+	//============================ Wandering Phase =============================//
+	row = 0; //Resets the visitation to the first row of the memory matrix
+	for (tau = 1; tau <= timeCost; tau++) {
+		//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
+		step = ((tau & 1) == 0) ? 
-1 : (nRows >> 1) - 1;
+		do {
+			//Selects a pseudorandom index row* (the only change in REv3)
+			//------------------------------------------------------------------------------------------
+			instance = state[instance & 0xF];
+			rowa = state[instance & 0xF] & (unsigned int)(nRows-1);
+
+			//rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+			//rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+			//------------------------------------------------------------------------------------------
+
+			//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
+			reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
+
+			//update prev: it now points to the last row ever computed
+			prev = row;
+
+			//updates row: goes to the next row to be computed
+			//------------------------------------------------------------------------------------------
+			row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+			//row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+			//------------------------------------------------------------------------------------------
+
+		} while (row != 0);
+	}
+
+	//============================ Wrap-up Phase ===============================//
+	//Absorbs the last block of the memory matrix
+	absorbBlock(state, memMatrix[rowa]);
+
+	//Squeezes the key
+	squeeze(state, K, (unsigned int) kLen);
+
+	//========================= Freeing the memory =============================//
+	free(memMatrix);
+	free(wholeMatrix);
+
+	return 0;
+}
diff --git a/lyra2/Lyra2.h b/lyra2/Lyra2.h
index edf9179..f866462 100644
--- a/lyra2/Lyra2.h
+++ b/lyra2/Lyra2.h
@@ -38,5 +38,6 @@ typedef unsigned char byte;
 #endif
 
 int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
+int LYRA2_3(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
 
 #endif /* LYRA2_H_ */
diff --git a/lyra2/cuda_lyra2v3.cu b/lyra2/cuda_lyra2v3.cu
new file mode 100644
index 0000000..0278cab
--- /dev/null
+++ b/lyra2/cuda_lyra2v3.cu
@@ -0,0 +1,481 @@
+/**
+ * Lyra2 (v3) CUDA Implementation
+ *
+ * Based on VTC sources
+ */
+#include <stdio.h>
+#include <stdint.h>
+#include <memory.h>
+#include "cuda_helper.h"
+
+#include "cuda_lyra2v3_sm3.cuh"
+
+
+
+#ifdef __INTELLISENSE__
+/* just for vstudio code colors */
+#define __CUDA_ARCH__ 500
+#endif
+
+#define TPB 32
+
+#if __CUDA_ARCH__ >= 500
+
+#include "cuda_lyra2_vectors.h"
+
+#define Nrow 4
+#define Ncol 4
+#define memshift 3
+
+
+__device__ uint2x4 *DMatrix;
+
+__device__ __forceinline__ uint2 LD4S(const int index)
+{
+	extern __shared__ uint2 shared_mem[];
+	return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x];
+}
+
+__device__ __forceinline__ void ST4S(const int index, const uint2 data)
+{
+	extern __shared__ uint2 shared_mem[];
+	shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data;
+}
+
+__device__ __forceinline__ uint2 shuffle2(uint2 a, uint32_t b, uint32_t c)
+{
+	return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c));
+}
+
+__device__ __forceinline__
+void Gfunc_v5(uint2 &a, uint2 &b, uint2 &c, uint2 &d)
+{
+	a += b; d ^= a; d = SWAPUINT2(d);
+	c += d; b ^= c; b = ROR2(b, 24);
+	a += b; d ^= a; d = ROR2(d, 16);
+	c += d; b ^= c; b = ROR2(b, 63);
+}
+
+__device__ __forceinline__
+void round_lyra_v5(uint2x4 s[4])
+{
+	
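+	// note: one reduced Lyra2 round below is BLAKE2b's G function applied to
+	// the four columns and then the four diagonals of a 4x4 matrix of 64-bit
+	// words; each uint2x4 register holds one matrix row, so for instance the
+	// second diagonal call is
+	//
+	//   Gfunc_v5(s[0].y, s[1].z, s[2].w, s[3].x); // wraps around the matrix
+	//
+	// i.e. the BLAKE2b round minus the message-word injection.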
Gfunc_v5(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc_v5(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc_v5(s[0].z, s[1].z, s[2].z, s[3].z); + Gfunc_v5(s[0].w, s[1].w, s[2].w, s[3].w); + + Gfunc_v5(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc_v5(s[0].y, s[1].z, s[2].w, s[3].x); + Gfunc_v5(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc_v5(s[0].w, s[1].x, s[2].y, s[3].z); +} + +__device__ __forceinline__ +void round_lyra_v5(uint2 s[4]) +{ + Gfunc_v5(s[0], s[1], s[2], s[3]); + s[1] = shuffle2(s[1], threadIdx.x + 1, 4); + s[2] = shuffle2(s[2], threadIdx.x + 2, 4); + s[3] = shuffle2(s[3], threadIdx.x + 3, 4); + Gfunc_v5(s[0], s[1], s[2], s[3]); + s[1] = shuffle2(s[1], threadIdx.x + 3, 4); + s[2] = shuffle2(s[2], threadIdx.x + 2, 4); + s[3] = shuffle2(s[3], threadIdx.x + 1, 4); +} + +__device__ __forceinline__ +void reduceDuplexRowSetup2(uint2 state[4]) +{ + uint2 state1[Ncol][3], state0[Ncol][3], state2[3]; + int i, j; + + #pragma unroll + for (int i = 0; i < Ncol; i++) + { + #pragma unroll + for (j = 0; j < 3; j++) + state0[Ncol - i - 1][j] = state[j]; + round_lyra_v5(state); + } + + //#pragma unroll 4 + for (i = 0; i < Ncol; i++) + { + #pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= state0[i][j]; + + round_lyra_v5(state); + + #pragma unroll + for (j = 0; j < 3; j++) + state1[Ncol - i - 1][j] = state0[i][j]; + + #pragma unroll + for (j = 0; j < 3; j++) + state1[Ncol - i - 1][j] ^= state[j]; + } + + for (i = 0; i < Ncol; i++) + { + const uint32_t s0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s2 = memshift * Ncol * 2 + memshift * (Ncol - 1) - i*memshift; + + #pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= state1[i][j] + state0[i][j]; + + round_lyra_v5(state); + + #pragma unroll + for (j = 0; j < 3; j++) + state2[j] = state1[i][j]; + + #pragma unroll + for (j = 0; j < 3; j++) + state2[j] ^= state[j]; + + #pragma unroll + for (j = 0; j < 3; j++) + ST4S(s2 + j, state2[j]); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + if (threadIdx.x == 0) { + state0[i][0] ^= Data2; + state0[i][1] ^= Data0; + state0[i][2] ^= Data1; + } else { + state0[i][0] ^= Data0; + state0[i][1] ^= Data1; + state0[i][2] ^= Data2; + } + + #pragma unroll + for (j = 0; j < 3; j++) + ST4S(s0 + j, state0[i][j]); + + #pragma unroll + for (j = 0; j < 3; j++) + state0[i][j] = state2[j]; + + } + + for (i = 0; i < Ncol; i++) + { + const uint32_t s1 = memshift * Ncol * 1 + i*memshift; + const uint32_t s3 = memshift * Ncol * 3 + memshift * (Ncol - 1) - i*memshift; + + #pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= state1[i][j] + state0[Ncol - i - 1][j]; + + round_lyra_v5(state); + + #pragma unroll + for (j = 0; j < 3; j++) + state0[Ncol - i - 1][j] ^= state[j]; + + #pragma unroll + for (j = 0; j < 3; j++) + ST4S(s3 + j, state0[Ncol - i - 1][j]); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + if (threadIdx.x == 0) { + state1[i][0] ^= Data2; + state1[i][1] ^= Data0; + state1[i][2] ^= Data1; + } else { + state1[i][0] ^= Data0; + state1[i][1] ^= Data1; + state1[i][2] ^= Data2; + } + + #pragma unroll + for (j = 0; j < 3; j++) + ST4S(s1 + j, state1[i][j]); + } +} + +__device__ +void reduceDuplexRowt2(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4]) +{ + uint2 state1[3], state2[3]; + const uint32_t ps1 = memshift * Ncol * rowIn; + const uint32_t ps2 = memshift * 
Ncol * rowInOut; + const uint32_t ps3 = memshift * Ncol * rowOut; + + for (int i = 0; i < Ncol; i++) + { + const uint32_t s1 = ps1 + i*memshift; + const uint32_t s2 = ps2 + i*memshift; + const uint32_t s3 = ps3 + i*memshift; + + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = LD4S(s1 + j); + + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = LD4S(s2 + j); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra_v5(state); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + if (threadIdx.x == 0) { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + ST4S(s2 + j, state2[j]); + + #pragma unroll + for (int j = 0; j < 3; j++) + ST4S(s3 + j, LD4S(s3 + j) ^ state[j]); + } +} + +__device__ +void reduceDuplexRowt2x4(const int rowInOut, uint2 state[4]) +{ + const int rowIn = 2; + const int rowOut = 3; + + int i, j; + uint2 last[3]; + const uint32_t ps1 = memshift * Ncol * rowIn; + const uint32_t ps2 = memshift * Ncol * rowInOut; + + #pragma unroll + for (int j = 0; j < 3; j++) + last[j] = LD4S(ps2 + j); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= LD4S(ps1 + j) + last[j]; + + round_lyra_v5(state); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + if (threadIdx.x == 0) { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } else { + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; + } + + if (rowInOut == rowOut) + { + #pragma unroll + for (j = 0; j < 3; j++) + last[j] ^= state[j]; + } + + for (i = 1; i < Ncol; i++) + { + const uint32_t s1 = ps1 + i*memshift; + const uint32_t s2 = ps2 + i*memshift; + + #pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= LD4S(s1 + j) + LD4S(s2 + j); + + round_lyra_v5(state); + } + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= last[j]; +} + +__global__ +__launch_bounds__(TPB, 1) +void lyra2v3_gpu_hash_32_1(uint32_t threads, uint2 *inputHash) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + + const uint2x4 blake2b_IV[2] = { + 0xf3bcc908UL, 0x6a09e667UL, 0x84caa73bUL, 0xbb67ae85UL, + 0xfe94f82bUL, 0x3c6ef372UL, 0x5f1d36f1UL, 0xa54ff53aUL, + 0xade682d1UL, 0x510e527fUL, 0x2b3e6c1fUL, 0x9b05688cUL, + 0xfb41bd6bUL, 0x1f83d9abUL, 0x137e2179UL, 0x5be0cd19UL + }; + + const uint2x4 Mask[2] = { + 0x00000020UL, 0x00000000UL, 0x00000020UL, 0x00000000UL, + 0x00000020UL, 0x00000000UL, 0x00000001UL, 0x00000000UL, + 0x00000004UL, 0x00000000UL, 0x00000004UL, 0x00000000UL, + 0x00000080UL, 0x00000000UL, 0x00000000UL, 0x01000000UL + }; + + uint2x4 state[4]; + + if (thread < threads) + { + state[0].x = state[1].x = __ldg(&inputHash[thread + threads * 0]); + state[0].y = state[1].y = __ldg(&inputHash[thread + threads * 1]); + state[0].z = state[1].z = __ldg(&inputHash[thread + threads * 2]); + state[0].w = state[1].w = __ldg(&inputHash[thread + threads * 3]); + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + + for (int i = 0; i<12; i++) + round_lyra_v5(state); + + state[0] ^= Mask[0]; + state[1] ^= Mask[1]; + + for (int i = 0; i<12; i++) + round_lyra_v5(state); + + DMatrix[blockDim.x * gridDim.x * 0 + thread] = state[0]; + DMatrix[blockDim.x * gridDim.x * 1 + 
thread] = state[1]; + DMatrix[blockDim.x * gridDim.x * 2 + thread] = state[2]; + DMatrix[blockDim.x * gridDim.x * 3 + thread] = state[3]; + } +} + +__global__ +__launch_bounds__(TPB, 1) +void lyra2v3_gpu_hash_32_2(uint32_t threads) +{ + const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; + + if (thread < threads) + { + uint2 state[4]; + state[0] = ((uint2*)DMatrix)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[1] = ((uint2*)DMatrix)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[2] = ((uint2*)DMatrix)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[3] = ((uint2*)DMatrix)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + + reduceDuplexRowSetup2(state); + + uint32_t rowa; + int prev = 3; + unsigned int instance = 0; + for (int i = 0; i < 3; i++) + { + instance = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4); + rowa = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4) & 0x3; + + //rowa = __shfl(state[0].x, 0, 4) & 3; + reduceDuplexRowt2(prev, rowa, i, state); + prev = i; + } + + instance = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4); + rowa = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4) & 0x3; + + //rowa = __shfl(state[0].x, 0, 4) & 3; + reduceDuplexRowt2x4(rowa, state); + + ((uint2*)DMatrix)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[0]; + ((uint2*)DMatrix)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[1]; + ((uint2*)DMatrix)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[2]; + ((uint2*)DMatrix)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[3]; + } +} + +__global__ +__launch_bounds__(TPB, 1) +void lyra2v3_gpu_hash_32_3(uint32_t threads, uint2 *outputHash) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + + uint2x4 state[4]; + + if (thread < threads) + { + state[0] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 0 + thread]); + state[1] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 1 + thread]); + state[2] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 2 + thread]); + state[3] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 3 + thread]); + + for (int i = 0; i < 12; i++) + round_lyra_v5(state); + + outputHash[thread + threads * 0] = state[0].x; + outputHash[thread + threads * 1] = state[0].y; + outputHash[thread + threads * 2] = state[0].z; + outputHash[thread + threads * 3] = state[0].w; + } +} + +#else +#include "cuda_helper.h" +#if __CUDA_ARCH__ < 200 +__device__ void* DMatrix; +#endif +__global__ void lyra2v3_gpu_hash_32_1(uint32_t threads, uint2 *inputHash) {} +__global__ void lyra2v3_gpu_hash_32_2(uint32_t threads) {} +__global__ void lyra2v3_gpu_hash_32_3(uint32_t threads, uint2 *outputHash) {} +#endif + + +__host__ +void lyra2v3_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix) +{ + cuda_get_arch(thr_id); + // just assign the device pointer allocated in main loop + cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice); +} + +__host__ +void lyra2v3_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, int order) +{ + int dev_id = device_map[thr_id % MAX_GPUS]; + + if (device_sm[dev_id] >= 500) { + + const uint32_t tpb = TPB; + + dim3 grid2((threads + tpb - 1) / tpb); + dim3 block2(tpb); + dim3 grid4((threads * 4 + tpb - 1) / tpb); + dim3 block4(4, tpb / 4); + + lyra2v3_gpu_hash_32_1 <<< grid2, block2 >>> (threads, (uint2*)g_hash); + 
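+	// note: phase 2 runs 4 cooperating x-lanes per hash (block4 = 4 x tpb/4)
+	// and keeps the whole 4x4 matrix of 12-word blocks in shared memory; each
+	// lane stores memshift (3) uint2 per block, i.e. Nrow * Ncol * memshift =
+	// 48 uint2 per lane and 48 * tpb entries per thread block, hence the
+	// dynamic shared memory size of 48 * sizeof(uint2) * tpb bytes below.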
lyra2v3_gpu_hash_32_2 <<< grid4, block4, 48 * sizeof(uint2) * tpb >>> (threads); + lyra2v3_gpu_hash_32_3 <<< grid2, block2 >>> (threads, (uint2*)g_hash); + + } else { + + uint32_t tpb = 16; + if (cuda_arch[dev_id] >= 350) tpb = TPB35; + else if (cuda_arch[dev_id] >= 300) tpb = TPB30; + else if (cuda_arch[dev_id] >= 200) tpb = TPB20; + + dim3 grid((threads + tpb - 1) / tpb); + dim3 block(tpb); + lyra2v3_gpu_hash_32_v3 <<< grid, block >>> (threads, startNounce, (uint2*)g_hash); + + } +} + + diff --git a/lyra2/cuda_lyra2v3_sm3.cuh b/lyra2/cuda_lyra2v3_sm3.cuh new file mode 100644 index 0000000..f84521c --- /dev/null +++ b/lyra2/cuda_lyra2v3_sm3.cuh @@ -0,0 +1,348 @@ +/* SM 2/3/3.5 Variant for lyra2REv2 */ + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors, only uncomment that temporary, dont commit it */ +//#undef __CUDA_ARCH__ +//#define __CUDA_ARCH__ 500 +#endif + +#define TPB20 64 +#define TPB30 64 +#define TPB35 64 + +#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ < 500 + +#include "cuda_lyra2_vectors.h" + +#define Nrow 4 +#define Ncol 4 + +#define vectype ulonglong4 +#define memshift 4 + +__device__ vectype *DMatrix; + +static __device__ __forceinline__ +void Gfunc_v35(unsigned long long &a, unsigned long long &b, unsigned long long &c, unsigned long long &d) +{ + a += b; d ^= a; d = ROTR64(d, 32); + c += d; b ^= c; b = ROTR64(b, 24); + a += b; d ^= a; d = ROTR64(d, 16); + c += d; b ^= c; b = ROTR64(b, 63); +} + +static __device__ __forceinline__ +void round_lyra_v35(vectype* s) +{ + Gfunc_v35(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc_v35(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc_v35(s[0].z, s[1].z, s[2].z, s[3].z); + Gfunc_v35(s[0].w, s[1].w, s[2].w, s[3].w); + + Gfunc_v35(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc_v35(s[0].y, s[1].z, s[2].w, s[3].x); + Gfunc_v35(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc_v35(s[0].w, s[1].x, s[2].y, s[3].z); +} + +static __device__ __forceinline__ +void reduceDuplexV3(vectype state[4], uint32_t thread) +{ + vectype state1[3]; + uint32_t ps1 = (Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * (Ncol - 1) * Nrow + memshift * 1 + Nrow * Ncol * memshift * thread); + + #pragma unroll 4 + for (int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow * i *memshift; + uint32_t s2 = ps2 - Nrow * i *memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1)[j]); + + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + round_lyra_v35(state); + + for (int j = 0; j < 3; j++) + state1[j] ^= state[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state1[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexRowSetupV3(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread) +{ + vectype state2[3], state1[3]; + + uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); + uint32_t ps3 = (Nrow * memshift * (Ncol - 1) + memshift * rowOut + Nrow * Ncol * memshift * thread); + + for (int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow*i*memshift; + uint32_t s2 = ps2 + Nrow*i*memshift; + uint32_t s3 = ps3 - Nrow*i*memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1 )[j]); + for (int j = 0; j < 3; j++) + state2[j] = __ldg4(&(DMatrix + s2 )[j]); + for (int j = 0; j < 3; j++) { + vectype tmp = state1[j] + state2[j]; + state[j] ^= tmp; + } + + round_lyra_v35(state); + + for (int j = 0; j < 3; j++) { + state1[j] ^= state[j]; + (DMatrix + s3)[j] = state1[j]; + } + + 
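+		// note: the lines below implement the spec's M[row*] ^= rotW(rand),
+		// rotating the 12-word duplex output by one 64-bit word before the
+		// XOR into the rowInOut block, equivalent to:
+		//
+		//   state2[w] ^= state[(w + 11) % 12]; // w = 0..11, words as uint2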
((uint2*)state2)[0] ^= ((uint2*)state)[11]; + for (int j = 0; j < 11; j++) + ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexRowtV3(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread) +{ + vectype state1[3], state2[3]; + uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); + uint32_t ps3 = (memshift * rowOut + Nrow * Ncol * memshift * thread); + + #pragma nounroll + for (int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow * i*memshift; + uint32_t s2 = ps2 + Nrow * i*memshift; + uint32_t s3 = ps3 + Nrow * i*memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1)[j]); + + for (int j = 0; j < 3; j++) + state2[j] = __ldg4(&(DMatrix + s2)[j]); + + for (int j = 0; j < 3; j++) + state1[j] += state2[j]; + + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + + round_lyra_v35(state); + + ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + + for (int j = 0; j < 11; j++) + ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; + + if (rowInOut != rowOut) { + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s3)[j] ^= state[j]; + + } else { + + for (int j = 0; j < 3; j++) + state2[j] ^= state[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + } + } +} + +#if __CUDA_ARCH__ >= 300 +__global__ __launch_bounds__(TPB35, 1) +void lyra2v3_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + vectype state[4]; + vectype blake2b_IV[2]; + vectype padding[2]; + + if (threadIdx.x == 0) { + + ((uint16*)blake2b_IV)[0] = make_uint16( + 0xf3bcc908, 0x6a09e667 , 0x84caa73b, 0xbb67ae85, + 0xfe94f82b, 0x3c6ef372 , 0x5f1d36f1, 0xa54ff53a, + 0xade682d1, 0x510e527f , 0x2b3e6c1f, 0x9b05688c, + 0xfb41bd6b, 0x1f83d9ab , 0x137e2179, 0x5be0cd19 + ); + ((uint16*)padding)[0] = make_uint16( + 0x20, 0x0 , 0x20, 0x0 , 0x20, 0x0 , 0x01, 0x0, + 0x04, 0x0 , 0x04, 0x0 , 0x80, 0x0 , 0x0, 0x01000000 + ); + } + + if (thread < threads) + { + ((uint2*)state)[0] = __ldg(&outputHash[thread]); + ((uint2*)state)[1] = __ldg(&outputHash[thread + threads]); + ((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]); + ((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]); + + state[1] = state[0]; + state[2] = shuffle4(((vectype*)blake2b_IV)[0], 0); + state[3] = shuffle4(((vectype*)blake2b_IV)[1], 0); + + for (int i = 0; i<12; i++) + round_lyra_v35(state); + + state[0] ^= shuffle4(((vectype*)padding)[0], 0); + state[1] ^= shuffle4(((vectype*)padding)[1], 0); + + for (int i = 0; i<12; i++) + round_lyra_v35(state); + + uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread); + + //#pragma unroll 4 + for (int i = 0; i < 4; i++) + { + uint32_t s1 = ps1 - 4 * memshift * i; + for (int j = 0; j < 3; j++) + (DMatrix + s1)[j] = (state)[j]; + + round_lyra_v35(state); + } + + reduceDuplexV3(state, thread); + reduceDuplexRowSetupV3(1, 0, 2, state, thread); + reduceDuplexRowSetupV3(2, 1, 3, state, thread); + + unsigned int instance = 0; + uint32_t rowa; + int prev = 3; + for (int i = 0; i < 4; i++) + { + //rowa = ((uint2*)state)[0].x & 3; + + instance = ((uint2*)state)[instance & 0xf].x; + rowa = ((uint2*)state)[instance & 0xf].x & 0x3; + reduceDuplexRowtV3(prev, rowa, i, state, thread); + prev = i; + } + + uint32_t 
shift = (memshift * rowa + 16 * memshift * thread); + + for (int j = 0; j < 3; j++) + state[j] ^= __ldg4(&(DMatrix + shift)[j]); + + for (int i = 0; i < 12; i++) + round_lyra_v35(state); + + outputHash[thread] = ((uint2*)state)[0]; + outputHash[thread + threads] = ((uint2*)state)[1]; + outputHash[thread + 2 * threads] = ((uint2*)state)[2]; + outputHash[thread + 3 * threads] = ((uint2*)state)[3]; + + } //thread +} +#elif __CUDA_ARCH__ >= 200 +__global__ __launch_bounds__(TPB20, 1) +void lyra2v3_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + vectype state[4]; + vectype blake2b_IV[2]; + vectype padding[2]; + + ((uint16*)blake2b_IV)[0] = make_uint16( + 0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85, + 0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a, + 0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c, + 0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19 + ); + ((uint16*)padding)[0] = make_uint16( + 0x20, 0x0, 0x20, 0x0, 0x20, 0x0, 0x01, 0x0, + 0x04, 0x0, 0x04, 0x0, 0x80, 0x0, 0x0, 0x01000000 + ); + + if (thread < threads) + { + + ((uint2*)state)[0] = outputHash[thread]; + ((uint2*)state)[1] = outputHash[thread + threads]; + ((uint2*)state)[2] = outputHash[thread + 2 * threads]; + ((uint2*)state)[3] = outputHash[thread + 3 * threads]; + + state[1] = state[0]; + state[2] = ((vectype*)blake2b_IV)[0]; + state[3] = ((vectype*)blake2b_IV)[1]; + + for (int i = 0; i<12; i++) + round_lyra_v35(state); + + state[0] ^= ((vectype*)padding)[0]; + state[1] ^= ((vectype*)padding)[1]; + + for (int i = 0; i<12; i++) + round_lyra_v35(state); + + uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread); + + //#pragma unroll 4 + for (int i = 0; i < 4; i++) + { + uint32_t s1 = ps1 - 4 * memshift * i; + for (int j = 0; j < 3; j++) + (DMatrix + s1)[j] = (state)[j]; + + round_lyra_v35(state); + } + + reduceDuplexV3(state, thread); + reduceDuplexRowSetupV3(1, 0, 2, state, thread); + reduceDuplexRowSetupV3(2, 1, 3, state, thread); + + uint instance = 0; + uint32_t rowa; + int prev = 3; + for (int i = 0; i < 4; i++) + { + // rowa = ((uint2*)state)[0].x & 3; + + instance = ((uint2*)state)[instance & 0xf]; + rowa = ((uint2*)state)[instance & 0xf] & 0x3; + reduceDuplexRowtV3(prev, rowa, i, state, thread); + prev = i; + } + + uint32_t shift = (memshift * rowa + 16 * memshift * thread); + + for (int j = 0; j < 3; j++) + state[j] ^= __ldg4(&(DMatrix + shift)[j]); + + for (int i = 0; i < 12; i++) + round_lyra_v35(state); + + outputHash[thread] = ((uint2*)state)[0]; + outputHash[thread + threads] = ((uint2*)state)[1]; + outputHash[thread + 2 * threads] = ((uint2*)state)[2]; + outputHash[thread + 3 * threads] = ((uint2*)state)[3]; + + } //thread +} +#endif + +#else +/* host & sm5+ */ +__global__ void lyra2v3_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) {} +#endif diff --git a/lyra2/lyra2REv3.cu b/lyra2/lyra2REv3.cu new file mode 100644 index 0000000..21ad3cb --- /dev/null +++ b/lyra2/lyra2REv3.cu @@ -0,0 +1,182 @@ +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_cubehash.h" +#include "lyra2/Lyra2.h" +} + +#include +#include + +static uint64_t *d_hash[MAX_GPUS]; +static uint64_t* d_matrix[MAX_GPUS]; + +extern void blake256_cpu_init(int thr_id, uint32_t threads); +extern void blake256_cpu_setBlock_80(uint32_t *pdata); +extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); + +extern void 
cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order); + +extern void lyra2v3_setTarget(const void *pTargetIn); +extern void lyra2v3_cpu_init(int thr_id, uint32_t threads, uint64_t* d_matrix); +extern void lyra2v3_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); + +extern void lyra2v3_cpu_hash_32_targ(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces); + +extern void bmw256_setTarget(const void *ptarget); +extern void bmw256_cpu_init(int thr_id, uint32_t threads); +extern void bmw256_cpu_free(int thr_id); +extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces); + +extern "C" void lyra2v3_hash(void *state, const void *input) +{ + uint32_t hashA[8], hashB[8]; + + sph_blake256_context ctx_blake; + sph_cubehash256_context ctx_cube; + sph_bmw256_context ctx_bmw; + + sph_blake256_set_rounds(14); + + sph_blake256_init(&ctx_blake); + sph_blake256(&ctx_blake, input, 80); + sph_blake256_close(&ctx_blake, hashA); + + LYRA2_3(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4); + + sph_cubehash256_init(&ctx_cube); + sph_cubehash256(&ctx_cube, hashB, 32); + sph_cubehash256_close(&ctx_cube, hashA); + + LYRA2_3(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4); + + sph_bmw256_init(&ctx_bmw); + sph_bmw256(&ctx_bmw, hashB, 32); + sph_bmw256_close(&ctx_bmw, hashA); + + memcpy(state, hashA, 32); +} + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_lyra2v3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + int dev_id = device_map[thr_id]; + int intensity = (device_sm[dev_id] < 500) ? 18 : is_windows() ? 
19 : 20; + if (strstr(device_name[dev_id], "GTX 10")) intensity = 20; + uint32_t throughput = cuda_default_throughput(dev_id, 1UL << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0x000f; + + + if (!init[thr_id]) + { + size_t matrix_sz = 16 * sizeof(uint64_t) * 4 * 3; + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + blake256_cpu_init(thr_id, throughput); + bmw256_cpu_init(thr_id, throughput); + + cuda_get_arch(thr_id); // cuda_arch[] also used in cubehash256 + + // SM 3 implentation requires a bit more memory + if (device_sm[dev_id] < 500 || cuda_arch[dev_id] < 500) + matrix_sz = 16 * sizeof(uint64_t) * 4 * 4; + + CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput)); + lyra2v3_cpu_init(thr_id, throughput, d_matrix[thr_id]); + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput)); + + api_set_throughput(thr_id, throughput); + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + blake256_cpu_setBlock_80(pdata); + bmw256_setTarget(ptarget); + + do { + int order = 0; + + blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + lyra2v3_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + lyra2v3_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + memset(work->nonces, 0, sizeof(work->nonces)); + bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], work->nonces); + + *hashes_done = pdata[19] - first_nonce + throughput; + + if (work->nonces[0] != 0) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + lyra2v3_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + lyra2v3_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart && !abort_flag); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_lyra2v3(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + cudaFree(d_matrix[thr_id]); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/miner.h b/miner.h index 7f52d55..1d75855 100644 --- a/miner.h +++ b/miner.h @@ -298,6 +298,7 @@ extern int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsi extern int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long 
*hashes_done); extern int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_lyra2v2(int thr_id,struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_lyra2v3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_lyra2Z(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_myriad(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_neoscrypt(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); @@ -372,6 +373,7 @@ extern void free_lbry(int thr_id); extern void free_luffa(int thr_id); extern void free_lyra2(int thr_id); extern void free_lyra2v2(int thr_id); +extern void free_lyra2v3(int thr_id); extern void free_lyra2Z(int thr_id); extern void free_myriad(int thr_id); extern void free_neoscrypt(int thr_id); @@ -929,6 +931,7 @@ void jha_hash(void *output, const void *input); void lbry_hash(void *output, const void *input); void lyra2re_hash(void *state, const void *input); void lyra2v2_hash(void *state, const void *input); +void lyra2v3_hash(void *state, const void *input); void lyra2Z_hash(void *state, const void *input); void myriadhash(void *state, const void *input); void neoscrypt(uchar *output, const uchar *input, uint32_t profile); diff --git a/util.cpp b/util.cpp index 79799b0..fca1b5c 100644 --- a/util.cpp +++ b/util.cpp @@ -2246,6 +2246,9 @@ void print_hash_tests(void) lyra2v2_hash(&hash[0], &buf[0]); printpfx("lyra2v2", hash); + lyra2v3_hash(&hash[0], &buf[0]); + printpfx("lyra2v3", hash); + lyra2Z_hash(&hash[0], &buf[0]); printpfx("lyra2z", hash); From 6ff4e50987e59a70056324a94ed8667cc0bf598d Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Wed, 30 Jan 2019 16:01:24 +0100 Subject: [PATCH 24/24] v2.3.1 release --- README.txt | 9 +++++++-- configure.ac | 2 +- lyra2/lyra2REv3.cu | 3 ++- res/ccminer.rc | 10 +++++----- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/README.txt b/README.txt index 0ee3313..c2470bd 100644 --- a/README.txt +++ b/README.txt @@ -52,8 +52,7 @@ SonoA (Sono) Tribus (JH, keccak, simd) Woodcoin (Double Skein) Vanilla (Blake256 8-rounds - double sha256) -Vertcoin Lyra2RE -Ziftrcoin (ZR5) +Vertcoin Lyra2REv3 Boolberry (Wild Keccak) Monero (Cryptonight v7 with -a monero) Aeon (Cryptonight-lite) @@ -291,6 +290,12 @@ so we can more efficiently implement new algorithms using the latest hardware features. >>> RELEASE HISTORY <<< + Jan. 30th 2019 v2.3.1 + Handle Lyra2v3 algo + Handle sha256q algo + Handle exosis algo + Handle blake2b standard algo + June 23th 2018 v2.3 Handle phi2 header variation for smart contracts Handle monero, stellite, graft and cryptolight variants diff --git a/configure.ac b/configure.ac index 9030e7e..6bb2209 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2.3], [], [ccminer], [http://github.com/tpruvot/ccminer]) +AC_INIT([ccminer], [2.3.1], [], [ccminer], [http://github.com/tpruvot/ccminer]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/lyra2/lyra2REv3.cu b/lyra2/lyra2REv3.cu index 21ad3cb..7e1b4a7 100644 --- a/lyra2/lyra2REv3.cu +++ b/lyra2/lyra2REv3.cu @@ -66,7 +66,8 @@ extern "C" int scanhash_lyra2v3(int thr_id, struct work* work, uint32_t max_nonc const uint32_t first_nonce = pdata[19]; int dev_id = device_map[thr_id]; int intensity = (device_sm[dev_id] < 500) ? 18 : is_windows() ? 
19 : 20; - if (strstr(device_name[dev_id], "GTX 10")) intensity = 20; + if (strstr(device_name[dev_id], "GTX 1")) intensity = 20; + if (strstr(device_name[dev_id], "RTX 20")) intensity = 20; uint32_t throughput = cuda_default_throughput(dev_id, 1UL << intensity); if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); diff --git a/res/ccminer.rc b/res/ccminer.rc index 18eb1d2..bc285bf 100644 --- a/res/ccminer.rc +++ b/res/ccminer.rc @@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico" // VS_VERSION_INFO VERSIONINFO - FILEVERSION 2,3,0,0 - PRODUCTVERSION 2,3,0,0 + FILEVERSION 2,3,1,0 + PRODUCTVERSION 2,3,1,0 FILEFLAGSMASK 0x3fL #ifdef _DEBUG FILEFLAGS 0x21L @@ -76,10 +76,10 @@ BEGIN BEGIN BLOCK "040904e4" BEGIN - VALUE "FileVersion", "2.3" - VALUE "LegalCopyright", "Copyright (C) 2018" + VALUE "FileVersion", "2.3.1" + VALUE "LegalCopyright", "Copyright (C) 2019" VALUE "ProductName", "ccminer" - VALUE "ProductVersion", "2.3" + VALUE "ProductVersion", "2.3.1" END END BLOCK "VarFileInfo"