From 3d70026ae65dc6d3899fee3a9c25dd419c13d571 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Tue, 7 Mar 2017 16:26:27 +0100 Subject: [PATCH] hmq1725 algo Signed-off-by: Tanguy Pruvot --- Makefile.am | 2 +- algos.h | 4 + bench.cpp | 1 + ccminer.cpp | 7 + ccminer.vcxproj | 1 + ccminer.vcxproj.filters | 3 + miner.h | 3 + quark/cuda_bmw512.cu | 8 +- quark/cuda_bmw512_sm3.cuh | 4 - util.cpp | 5 +- x17/cuda_x17_haval256.cu | 30 +-- x17/cuda_x17_sha512.cu | 11 +- x17/hmq17.cu | 542 ++++++++++++++++++++++++++++++++++++++ x17/x17.cu | 8 +- 14 files changed, 589 insertions(+), 40 deletions(-) create mode 100644 x17/hmq17.cu diff --git a/Makefile.am b/Makefile.am index 9b65f18..60a3cd8 100644 --- a/Makefile.am +++ b/Makefile.am @@ -65,7 +65,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \ x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \ x15/whirlpool.cu \ - x17/x17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \ + x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \ x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu # scrypt diff --git a/algos.h b/algos.h index 687132c..e009403 100644 --- a/algos.h +++ b/algos.h @@ -19,6 +19,7 @@ enum sha_algos { ALGO_FUGUE256, /* Fugue256 */ ALGO_GROESTL, ALGO_HEAVY, /* Heavycoin hash */ + ALGO_HMQ1725, ALGO_KECCAK, ALGO_JACKPOT, ALGO_LBRY, @@ -73,6 +74,7 @@ static const char *algo_names[] = { "fugue256", "groestl", "heavy", + "hmq1725", "keccak", "jackpot", "lbry", @@ -135,6 +137,8 @@ static inline int algo_to_int(char* arg) i = ALGO_DMD_GR; else if (!strcasecmp("doom", arg)) i = ALGO_LUFFA; + else if (!strcasecmp("hmq17", arg)) + i = ALGO_HMQ1725; else if (!strcasecmp("lyra2re", arg)) i = ALGO_LYRA2; else if (!strcasecmp("lyra2rev2", arg)) diff --git a/bench.cpp b/bench.cpp index 4c5203d..3eb88ff 100644 --- a/bench.cpp +++ b/bench.cpp @@ -57,6 +57,7 @@ void algo_free_all(int thr_id) free_fugue256(thr_id); free_groestlcoin(thr_id); free_heavy(thr_id); + free_hmq17(thr_id); free_jackpot(thr_id); free_lbry(thr_id); free_luffa(thr_id); diff --git a/ccminer.cpp b/ccminer.cpp index 95d9cd2..4330e3e 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -237,6 +237,7 @@ Options:\n\ fugue256 Fuguecoin\n\ groestl Groestlcoin\n\ heavy Heavycoin\n\ + hmq1725 Doubloons / Espers\n\ jackpot Jackpot\n\ keccak Keccak-256 (Maxcoin)\n\ lbry LBRY Credits (Sha/Ripemd)\n\ @@ -1600,6 +1601,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) opt_difficulty = 1.; switch (opt_algo) { + case ALGO_HMQ1725: // should be 256 but... suprnova... case ALGO_JACKPOT: case ALGO_NEOSCRYPT: case ALGO_SCRYPT: @@ -2232,12 +2234,17 @@ static void *miner_thread(void *userdata) rc = scanhash_myriad(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_HMQ1725: + rc = scanhash_hmq17(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_HEAVY: rc = scanhash_heavy(thr_id, &work, max_nonce, &hashes_done, work.maxvote, HEAVYCOIN_BLKHDR_SZ); break; case ALGO_MJOLLNIR: rc = scanhash_heavy(thr_id, &work, max_nonce, &hashes_done, 0, MNR_BLKHDR_SZ); break; + case ALGO_KECCAK: rc = scanhash_keccak256(thr_id, &work, max_nonce, &hashes_done); break; diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 82684e8..84d9463 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -544,6 +544,7 @@ + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index d394b01..5514700 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -703,6 +703,9 @@ Source Files\CUDA\x17 + + Source Files\CUDA\x17 + Source Files\CUDA\x17 diff --git a/miner.h b/miner.h index c3253e2..80c5034 100644 --- a/miner.h +++ b/miner.h @@ -285,6 +285,7 @@ extern int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_nonce, extern int scanhash_fresh(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_groestlcoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_hmq17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_heavy(int thr_id,struct work *work, uint32_t max_nonce, unsigned long *hashes_done, uint32_t maxvote, int blocklen); extern int scanhash_jackpot(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); @@ -336,6 +337,7 @@ extern void free_fresh(int thr_id); extern void free_fugue256(int thr_id); extern void free_groestlcoin(int thr_id); extern void free_heavy(int thr_id); +extern void free_hmq17(int thr_id); extern void free_jackpot(int thr_id); extern void free_lbry(int thr_id); extern void free_luffa(int thr_id); @@ -852,6 +854,7 @@ void luffa_hash(void *state, const void *input); void fresh_hash(void *state, const void *input); void fugue256_hash(unsigned char* output, const unsigned char* input, int len); void heavycoin_hash(unsigned char* output, const unsigned char* input, int len); +void hmq17hash(void *output, const void *input); void keccak256_hash(void *state, const void *input); unsigned int jackpothash(void *state, const void *input); void groestlhash(void *state, const void *input); diff --git a/quark/cuda_bmw512.cu b/quark/cuda_bmw512.cu index 8910ee8..6011beb 100644 --- a/quark/cuda_bmw512.cu +++ b/quark/cuda_bmw512.cu @@ -1,13 +1,11 @@ #include #include -#undef WANT_BMW512_80 +#define WANT_BMW512_80 #include "cuda_helper.h" -#ifdef WANT_BMW512_80 __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) -#endif #include "cuda_bmw512_sm3.cuh" @@ -390,8 +388,6 @@ void quark_bmw512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t * } } -#ifdef WANT_BMW512_80 - __global__ __launch_bounds__(256, 2) void quark_bmw512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) { @@ -474,8 +470,6 @@ void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce quark_bmw512_gpu_hash_80_30<<>>(threads, startNounce, (uint64_t*)d_hash); } -#endif - __host__ void quark_bmw512_cpu_init(int thr_id, uint32_t threads) { diff --git a/quark/cuda_bmw512_sm3.cuh b/quark/cuda_bmw512_sm3.cuh index c0f4694..1298b13 100644 --- a/quark/cuda_bmw512_sm3.cuh +++ b/quark/cuda_bmw512_sm3.cuh @@ -226,8 +226,6 @@ void quark_bmw512_gpu_hash_64_30(uint32_t threads, uint32_t startNounce, uint64_ } } -#ifdef WANT_BMW512_80 - __global__ void quark_bmw512_gpu_hash_80_30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) { @@ -266,8 +264,6 @@ void quark_bmw512_gpu_hash_80_30(uint32_t threads, uint32_t startNounce, uint64_ } } -#endif - #else /* stripped stubs for other archs */ __global__ void quark_bmw512_gpu_hash_64_30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) {} __global__ void quark_bmw512_gpu_hash_80_30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {} diff --git a/util.cpp b/util.cpp index 1ee3aa3..825f0c8 100644 --- a/util.cpp +++ b/util.cpp @@ -2116,7 +2116,7 @@ void do_gpu_tests(void) memset(work.data, 0, sizeof(work.data)); work.data[0] = 0; - scanhash_lbry(0, &work, 1, &done); + scanhash_hmq17(0, &work, 1, &done); free(work_restart); work_restart = NULL; @@ -2179,6 +2179,9 @@ void print_hash_tests(void) heavycoin_hash(&hash[0], &buf[0], 32); printpfx("heavy", hash); + hmq17hash(&hash[0], &buf[0]); + printpfx("hmq1725", hash); + jackpothash(&hash[0], &buf[0]); printpfx("jackpot", hash); diff --git a/x17/cuda_x17_haval256.cu b/x17/cuda_x17_haval256.cu index a7b9463..74b311e 100644 --- a/x17/cuda_x17_haval256.cu +++ b/x17/cuda_x17_haval256.cu @@ -259,14 +259,13 @@ } __global__ /* __launch_bounds__(256, 6) */ -void x17_haval256_gpu_hash_64(const uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +void x17_haval256_gpu_hash_64(const uint32_t threads, uint64_t *g_hash, const int outlen) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - uint64_t hashPosition = nounce - startNounce; - uint64_t *pHash = &g_hash[hashPosition*8U]; + const uint64_t hashPosition = thread*8U; + uint64_t *pHash = &g_hash[hashPosition]; uint32_t s0, s1, s2, s3, s4, s5, s6, s7; const uint32_t u0 = s0 = 0x243F6A88; @@ -288,7 +287,7 @@ void x17_haval256_gpu_hash_64(const uint32_t threads, uint32_t startNounce, uint hash.h8[i] = pHash[i]; } -///////// input big ///////////////////// + ///////// input big ///////////////////// uint32_t buf[32]; @@ -325,12 +324,13 @@ void x17_haval256_gpu_hash_64(const uint32_t threads, uint32_t startNounce, uint pHash[1] = hash.h8[1]; pHash[2] = hash.h8[2]; pHash[3] = hash.h8[3]; -#ifdef NEED_HASH_512 - pHash[4] = hash.h8[4]; - pHash[5] = hash.h8[5]; - pHash[6] = hash.h8[6]; - pHash[7] = hash.h8[7]; -#endif + + if (outlen == 512) { + pHash[4] = 0; //hash.h8[4]; + pHash[5] = 0; //hash.h8[5]; + pHash[6] = 0; //hash.h8[6]; + pHash[7] = 0; //hash.h8[7]; + } } } @@ -340,14 +340,12 @@ void x17_haval256_cpu_init(int thr_id, uint32_t threads) } __host__ -void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const int outlen) { const uint32_t threadsperblock = 256; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x17_haval256_gpu_hash_64 <<>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector); - - //MyStreamSynchronize(NULL, order, thr_id); + x17_haval256_gpu_hash_64 <<>> (threads, (uint64_t*)d_hash, outlen); } diff --git a/x17/cuda_x17_sha512.cu b/x17/cuda_x17_sha512.cu index 6c97405..bebf17d 100644 --- a/x17/cuda_x17_sha512.cu +++ b/x17/cuda_x17_sha512.cu @@ -89,13 +89,12 @@ uint64_t Tone(uint64_t* K, uint64_t* r, uint64_t* W, const int a, const int i) __global__ /*__launch_bounds__(256, 4)*/ -void x17_sha512_gpu_hash_64(const uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +void x17_sha512_gpu_hash_64(const uint32_t threads, uint64_t *g_hash) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - uint64_t hashPosition = nounce - startNounce; + const uint64_t hashPosition = thread; uint64_t *pHash = &g_hash[hashPosition*8U]; uint64_t W[80]; @@ -161,14 +160,12 @@ void x17_sha512_cpu_init(int thr_id, uint32_t threads) } __host__ -void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash) { const uint32_t threadsperblock = 256; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - x17_sha512_gpu_hash_64 <<>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector); - - //MyStreamSynchronize(NULL, order, thr_id); + x17_sha512_gpu_hash_64 <<>> (threads, (uint64_t*)d_hash); } diff --git a/x17/hmq17.cu b/x17/hmq17.cu new file mode 100644 index 0000000..8fdbcdf --- /dev/null +++ b/x17/hmq17.cu @@ -0,0 +1,542 @@ +/** + * HMQ1725 algorithm + * @author tpruvot@github 02-2017 + */ + +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_echo.h" +#include "sph/sph_hamsi.h" +#include "sph/sph_fugue.h" +#include "sph/sph_shabal.h" +#include "sph/sph_whirlpool.h" +#include "sph/sph_sha2.h" +#include "sph/sph_haval.h" +} + +#include +#include + +#include "x11/cuda_x11.h" + +static uint32_t *d_hash[MAX_GPUS]; +static uint32_t *d_hash_br2[MAX_GPUS]; +static uint32_t *d_tempBranch[MAX_GPUS]; + +extern void quark_bmw512_cpu_setBlock_80(void *pdata); +extern void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); + +extern void x11_luffa512_cpu_init(int thr_id, uint32_t threads); +extern void x11_luffa512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); +extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_free(int thr_id); + +extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads); +extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int flag); +extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x15_whirlpool_cpu_free(int thr_id); + +extern void x17_sha512_cpu_init(int thr_id, uint32_t threads); +extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); + +extern void x17_haval256_cpu_init(int thr_id, uint32_t threads); +extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const int outlen); + +struct hmq_contexts +{ + sph_blake512_context blake1, blake2; + sph_bmw512_context bmw1, bmw2, bmw3; + sph_groestl512_context groestl1, groestl2; + sph_skein512_context skein1, skein2; + sph_jh512_context jh1, jh2; + sph_keccak512_context keccak1, keccak2; + sph_luffa512_context luffa1, luffa2; + sph_cubehash512_context cubehash; + sph_shavite512_context shavite1, shavite2; + sph_simd512_context simd1, simd2; + sph_echo512_context echo1, echo2; + sph_hamsi512_context hamsi; + sph_fugue512_context fugue1, fugue2; + sph_shabal512_context shabal; + sph_whirlpool_context whirlpool1, whirlpool2, whirlpool3, whirlpool4; + sph_sha512_context sha1, sha2; + sph_haval256_5_context haval1, haval2; +}; + +static __thread hmq_contexts base_contexts; +static __thread bool hmq_context_init = false; + +static void init_contexts(hmq_contexts *ctx) +{ + sph_bmw512_init(&ctx->bmw1); + sph_bmw512_init(&ctx->bmw2); + sph_bmw512_init(&ctx->bmw2); + sph_bmw512_init(&ctx->bmw3); + sph_whirlpool_init(&ctx->whirlpool1); + sph_whirlpool_init(&ctx->whirlpool2); + sph_whirlpool_init(&ctx->whirlpool3); + sph_whirlpool_init(&ctx->whirlpool4); + sph_groestl512_init(&ctx->groestl1); + sph_groestl512_init(&ctx->groestl2); + sph_skein512_init(&ctx->skein1); + sph_skein512_init(&ctx->skein2); + sph_jh512_init(&ctx->jh1); + sph_jh512_init(&ctx->jh2); + sph_keccak512_init(&ctx->keccak1); + sph_keccak512_init(&ctx->keccak2); + sph_blake512_init(&ctx->blake1); + sph_blake512_init(&ctx->blake2); + sph_luffa512_init(&ctx->luffa1); + sph_luffa512_init(&ctx->luffa2); + sph_cubehash512_init(&ctx->cubehash); + sph_shavite512_init(&ctx->shavite1); + sph_shavite512_init(&ctx->shavite2); + sph_simd512_init(&ctx->simd1); + sph_simd512_init(&ctx->simd2); + sph_echo512_init(&ctx->echo1); + sph_echo512_init(&ctx->echo2); + sph_hamsi512_init(&ctx->hamsi); + sph_fugue512_init(&ctx->fugue1); + sph_fugue512_init(&ctx->fugue2); + sph_shabal512_init(&ctx->shabal); + sph_sha512_init(&ctx->sha1); + sph_sha512_init(&ctx->sha2); + sph_haval256_5_init(&ctx->haval1); + sph_haval256_5_init(&ctx->haval2); +} + +// CPU Check +extern "C" void hmq17hash(void *output, const void *input) +{ + uint32_t _ALIGN(64) hash[32]; + + const uint32_t mask = 24; + + hmq_contexts ctx; + if (!hmq_context_init) { + init_contexts(&base_contexts); + hmq_context_init = true; + } + memcpy(&ctx, &base_contexts, sizeof(hmq_contexts)); + + sph_bmw512(&ctx.bmw1, input, 80); + sph_bmw512_close(&ctx.bmw1, hash); + + sph_whirlpool(&ctx.whirlpool1, hash, 64); + sph_whirlpool_close(&ctx.whirlpool1, hash); + + if (hash[0] & mask) { + sph_groestl512(&ctx.groestl1, hash, 64); + sph_groestl512_close(&ctx.groestl1, hash); + } else { + sph_skein512(&ctx.skein1, hash, 64); + sph_skein512_close(&ctx.skein1, hash); + } + + sph_jh512(&ctx.jh1, hash, 64); + sph_jh512_close(&ctx.jh1, hash); + sph_keccak512(&ctx.keccak1, hash, 64); + sph_keccak512_close(&ctx.keccak1, hash); + + if (hash[0] & mask) { + sph_blake512(&ctx.blake1, hash, 64); + sph_blake512_close(&ctx.blake1, hash); + } else { + sph_bmw512(&ctx.bmw2, hash, 64); + sph_bmw512_close(&ctx.bmw2, hash); + } + + sph_luffa512(&ctx.luffa1, hash, 64); + sph_luffa512_close(&ctx.luffa1, hash); + + sph_cubehash512(&ctx.cubehash, hash, 64); + sph_cubehash512_close(&ctx.cubehash, hash); + + if (hash[0] & mask) { + sph_keccak512(&ctx.keccak2, hash, 64); + sph_keccak512_close(&ctx.keccak2, hash); + } else { + sph_jh512(&ctx.jh2, hash, 64); + sph_jh512_close(&ctx.jh2, hash); + } + + sph_shavite512(&ctx.shavite1, hash, 64); + sph_shavite512_close(&ctx.shavite1, hash); + + sph_simd512(&ctx.simd1, hash, 64); + sph_simd512_close(&ctx.simd1, hash); + //applog_hash(hash); + + if (hash[0] & mask) { + sph_whirlpool(&ctx.whirlpool2, hash, 64); + sph_whirlpool_close(&ctx.whirlpool2, hash); + } else { + sph_haval256_5(&ctx.haval1, hash, 64); + sph_haval256_5_close(&ctx.haval1, hash); + memset(&hash[8], 0, 32); + } + + sph_echo512(&ctx.echo1, hash, 64); + sph_echo512_close(&ctx.echo1, hash); + + sph_blake512(&ctx.blake2, hash, 64); + sph_blake512_close(&ctx.blake2, hash); + //applog_hash(hash); + + if (hash[0] & mask) { + sph_shavite512(&ctx.shavite2, hash, 64); + sph_shavite512_close(&ctx.shavite2, hash); + } else { + sph_luffa512(&ctx.luffa2, hash, 64); + sph_luffa512_close(&ctx.luffa2, hash); + } + + sph_hamsi512(&ctx.hamsi, hash, 64); + sph_hamsi512_close(&ctx.hamsi, hash); + + sph_fugue512(&ctx.fugue1, hash, 64); + sph_fugue512_close(&ctx.fugue1, hash); + //applog_hash(hash); + + if (hash[0] & mask) { + sph_echo512(&ctx.echo2, hash, 64); + sph_echo512_close(&ctx.echo2, hash); + } else { + sph_simd512(&ctx.simd2, hash, 64); + sph_simd512_close(&ctx.simd2, hash); + } + + sph_shabal512(&ctx.shabal, hash, 64); + sph_shabal512_close(&ctx.shabal, hash); + + sph_whirlpool(&ctx.whirlpool3, hash, 64); + sph_whirlpool_close(&ctx.whirlpool3, hash); + //applog_hash(hash); + + if (hash[0] & mask) { + sph_fugue512(&ctx.fugue2, hash, 64); + sph_fugue512_close(&ctx.fugue2, hash); + } else { + sph_sha512(&ctx.sha1, hash, 64); + sph_sha512_close(&ctx.sha1, hash); + } + + sph_groestl512(&ctx.groestl2, hash, 64); + sph_groestl512_close(&ctx.groestl2, hash); + + sph_sha512(&ctx.sha2, hash, 64); + sph_sha512_close(&ctx.sha2, hash); + //applog_hash(hash); + + if (hash[0] & mask) { + sph_haval256_5(&ctx.haval2, hash, 64); + sph_haval256_5_close(&ctx.haval2, hash); + memset(&hash[8], 0, 32); + } else { + sph_whirlpool(&ctx.whirlpool4, hash, 64); + sph_whirlpool_close(&ctx.whirlpool4, hash); + } + //applog_hash(hash); + + sph_bmw512(&ctx.bmw3, hash, 64); + sph_bmw512_close(&ctx.bmw3, hash); + + memcpy(output, hash, 32); +} + +__global__ __launch_bounds__(128, 8) +void hmq_filter_gpu(const uint32_t threads, const uint32_t* d_hash, uint32_t* d_branch2, uint32_t* d_NonceBranch) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t offset = thread * 16U; // 64U / sizeof(uint32_t); + uint4 *psrc = (uint4*) (&d_hash[offset]); + d_NonceBranch[thread] = ((uint8_t*)psrc)[0] & 24U; + if (d_NonceBranch[thread]) return; + // uint4 = 4x uint32_t = 16 bytes + uint4 *pdst = (uint4*) (&d_branch2[offset]); + pdst[0] = psrc[0]; + pdst[1] = psrc[1]; + pdst[2] = psrc[2]; + pdst[3] = psrc[3]; + } +} + +__global__ __launch_bounds__(128, 8) +void hmq_merge_gpu(const uint32_t threads, uint32_t* d_hash, uint32_t* d_branch2, uint32_t* const d_NonceBranch) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads && !d_NonceBranch[thread]) + { + const uint32_t offset = thread * 16U; + uint4 *pdst = (uint4*) (&d_hash[offset]); + uint4 *psrc = (uint4*) (&d_branch2[offset]); + pdst[0] = psrc[0]; + pdst[1] = psrc[1]; + pdst[2] = psrc[2]; + pdst[3] = psrc[3]; + } +} + +__host__ +uint32_t hmq_filter_cpu(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_branch2) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + // extract algo permution hashes to a second branch buffer + hmq_filter_gpu <<>> (threads, inpHashes, d_branch2, d_tempBranch[thr_id]); + return threads; +} + +__host__ +void hmq_merge_cpu(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_branch2) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + // put back second branch hashes to the common buffer d_hash + hmq_merge_gpu <<>> (threads, outpHashes, d_branch2, d_tempBranch[thr_id]); +} + +static bool init[MAX_GPUS] = { 0 }; + +//#define _DEBUG +#define _DEBUG_PREFIX "hmq-" +#include "cuda_debug.cuh" + +extern "C" int scanhash_hmq17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 19=256*256*8; + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x00ff; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", + throughput2intensity(throughput), throughput); + + quark_bmw512_cpu_init(thr_id, throughput); + x15_whirlpool_cpu_init(thr_id, throughput, 0); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + quark_blake512_cpu_init(thr_id, throughput); + x11_luffaCubehash512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + x11_simd512_cpu_init(thr_id, throughput); + x17_haval256_cpu_init(thr_id, throughput); + x11_echo512_cpu_init(thr_id, throughput); + x11_shavite512_cpu_init(thr_id, throughput); + x11_luffa512_cpu_init(thr_id, throughput); + x13_hamsi512_cpu_init(thr_id, throughput); + x13_fugue512_cpu_init(thr_id, throughput); + x14_shabal512_cpu_init(thr_id, throughput); + x17_sha512_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_br2[thr_id], (size_t) 64 * throughput), 0); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_tempBranch[thr_id], sizeof(uint32_t) * throughput), 0); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + int warn = 0; + uint32_t endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + quark_bmw512_cpu_setBlock_80(endiandata); + cuda_check_cpu_setTarget(ptarget); + + do { + int order = 0; + + // Hash with CUDA + quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("bmw512 "); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("whirl "); + + hmq_filter_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++); + hmq_merge_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("keccak "); + + hmq_filter_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++); + hmq_merge_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + TRACE("cube "); + + hmq_filter_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++); + hmq_merge_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("simd "); + + hmq_filter_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], d_hash_br2[thr_id], 512); order++; + hmq_merge_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("blake "); + + hmq_filter_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++); + hmq_merge_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("fugue "); + + hmq_filter_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++); + hmq_merge_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("whirl "); + + hmq_filter_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash_br2[thr_id]); order++; + hmq_merge_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("sha512 "); + + hmq_filter_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], 512); order++; + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++); + hmq_merge_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); + TRACE("hav/wh "); + + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("bmw512 => "); + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + hmq17hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0 && work->nonces[1] != work->nonces[0]) { + be32enc(&endiandata[19], work->nonces[1]); + hmq17hash(vhash, endiandata); + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + } else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + } + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + // x11+ coins could do some random error, but not on retry + gpu_increment_reject(thr_id); + if (!warn) { + warn++; + pdata[19] = work->nonces[0] + 1; + continue; + } else { + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + warn = 0; + } + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_hmq17(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + cudaFree(d_hash_br2[thr_id]); + cudaFree(d_tempBranch[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + x13_fugue512_cpu_free(thr_id); + x15_whirlpool_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + + cudaDeviceSynchronize(); + init[thr_id] = false; +} diff --git a/x17/x17.cu b/x17/x17.cu index b329b65..816e5e0 100644 --- a/x17/x17.cu +++ b/x17/x17.cu @@ -47,10 +47,10 @@ extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t sta extern void x15_whirlpool_cpu_free(int thr_id); extern void x17_sha512_cpu_init(int thr_id, uint32_t threads); -extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void x17_haval256_cpu_init(int thr_id, uint32_t threads); -extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const int outlen); // X17 CPU Hash (Validation) @@ -225,8 +225,8 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], 256); order++; *hashes_done = pdata[19] - first_nonce + throughput;