From b31fb5316b26dcbbba167e47eb68d235cff60b61 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot
Date: Fri, 29 Jul 2016 16:09:33 +0200
Subject: [PATCH] sia: cuda impl. and suprnova getwork over stratum

Signed-off-by: Tanguy Pruvot
---
 Makefile.am             |   1 +
 algos.h                 |   2 +
 bench.cpp               |   2 +
 ccminer.cpp             |  88 ++++++++++--
 ccminer.vcxproj         |   6 +-
 ccminer.vcxproj.filters |  18 ++-
 miner.h                 |   3 +
 sia.cu                  | 301 ++++++++++++++++++++++++++++++++++++++++
 sph/blake2b.c           | 196 ++++++++++++++++++++++++++
 sph/blake2b.h           |  41 ++++++
 10 files changed, 642 insertions(+), 16 deletions(-)
 create mode 100644 sia.cu
 create mode 100644 sph/blake2b.c
 create mode 100644 sph/blake2b.h

diff --git a/Makefile.am b/Makefile.am
index 8de4412..fd38065 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -46,6 +46,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
     quark/quarkcoin.cu quark/cuda_quark_compactionTest.cu \
     neoscrypt/neoscrypt.cpp neoscrypt/neoscrypt-cpu.c neoscrypt/cuda_neoscrypt.cu \
     cuda_nist5.cu pentablake.cu skein.cu cuda_skeincoin.cu skein2.cpp zr5.cu \
+    sia.cu sph/blake2b.c \
     sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \
     sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
     sph/hamsi.c sph/hamsi_helper.c sph/streebog.c \
diff --git a/algos.h b/algos.h
index 5184268..1e176f9 100644
--- a/algos.h
+++ b/algos.h
@@ -32,6 +32,7 @@ enum sha_algos {
     ALGO_QUBIT,
     ALGO_SCRYPT,
     ALGO_SCRYPT_JANE,
+    ALGO_SIA,
     ALGO_SIB,
     ALGO_SKEIN,
     ALGO_SKEIN2,
@@ -81,6 +82,7 @@ static const char *algo_names[] = {
     "qubit",
     "scrypt",
     "scrypt-jane",
+    "sia",
     "sib",
     "skein",
     "skein2",
diff --git a/bench.cpp b/bench.cpp
index 2e6a1dc..f596a85 100644
--- a/bench.cpp
+++ b/bench.cpp
@@ -56,6 +56,7 @@ void algo_free_all(int thr_id)
     free_groestlcoin(thr_id);
     free_heavy(thr_id);
     free_jackpot(thr_id);
+    free_lbry(thr_id);
     free_luffa(thr_id);
     free_lyra2(thr_id);
     free_lyra2v2(thr_id);
@@ -67,6 +68,7 @@ void algo_free_all(int thr_id)
     free_qubit(thr_id);
     free_skeincoin(thr_id);
     free_skein2(thr_id);
+    free_sia(thr_id);
     free_sib(thr_id);
     free_s3(thr_id);
     free_vanilla(thr_id);
diff --git a/ccminer.cpp b/ccminer.cpp
index 2f4bbef..a69eb9b 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -570,6 +570,8 @@ static void calc_network_diff(struct work *work)
     uint32_t nbits = have_longpoll ? work->data[18] : swab32(work->data[18]);
     if (opt_algo == ALGO_LBRY) nbits = swab32(work->data[26]);
     if (opt_algo == ALGO_DECRED) nbits = work->data[29];
+    if (opt_algo == ALGO_SIA) nbits = work->data[11]; // unsure if correct
+
     uint32_t bits = (nbits & 0xffffff);
     int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
@@ -647,7 +649,7 @@ static bool work_decode(const json_t *val, struct work *work)
     stratum_diff = work->targetdiff;
 
     work->tx_count = use_pok = 0;
-    if (work->data[0] & POK_BOOL_MASK) {
+    if (opt_algo == ALGO_ZR5 && work->data[0] & POK_BOOL_MASK) {
         use_pok = 1;
         json_t *txs = json_object_get(val, "txs");
         if (txs && json_is_array(txs)) {
@@ -844,6 +846,10 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
         le32enc(&ntime, work->data[25]);
         le32enc(&nonce, work->data[27]);
         break;
+    case ALGO_SIA:
+        be32enc(&ntime, work->data[10]);
+        be32enc(&nonce, work->data[8]);
+        break;
     case ALGO_ZR5:
         check_dups = true;
         be32enc(&ntime, work->data[17]);
@@ -874,6 +880,9 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 
         if (opt_algo == ALGO_DECRED) {
             xnonce2str = bin2hex((const uchar*)&work->data[36], stratum.xnonce1_size);
+        } else if (opt_algo == ALGO_SIA) {
+            uint16_t high_nonce = swab32(work->data[9]) >> 16;
+            xnonce2str = bin2hex((unsigned char*)(&high_nonce), 2);
         } else {
             xnonce2str = bin2hex(work->xnonce2, work->xnonce2_len);
         }
@@ -1394,6 +1403,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
     /* Generate merkle root */
     switch (opt_algo) {
         case ALGO_DECRED:
+        case ALGO_SIA:
             // getwork over stratum, no merkle to generate
             break;
         case ALGO_HEAVY:
@@ -1446,7 +1456,6 @@
             sctx->xnonce1_size = sizeof(work->data)-(36*4);
         }
         memcpy(&work->data[36], sctx->xnonce1, sctx->xnonce1_size);
-        // work->data[36] = swab32(vote); // alt vote submission method
         work->data[37] = (rand()*4) << 8; // random work data
         sctx->job.height = work->data[32];
         //applog_hex(work->data, 180);
@@ -1458,6 +1467,18 @@
         work->data[25] = le32dec(sctx->job.ntime);
         work->data[26] = le32dec(sctx->job.nbits);
         work->data[28] = 0x80000000;
+    } else if (opt_algo == ALGO_SIA) {
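+        // getwork over stratum: build the 80-byte sia header in place,
+        // there is no merkle branch to fold in. Word layout: [0..7]
+        // reversed prevhash, [8..9] 64-bit nonce (low word scanned on the
+        // GPU, high word seeded from the pool extranonce), [10] ntime,
+        // [11] nbits, [12..19] merkle root.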
+        uint32_t extra = 0;
+        memcpy(&extra, &sctx->job.coinbase[32], 2);
+        for (i = 0; i < 8; i++) // reversed hash
+            work->data[i] = ((uint32_t*)sctx->job.prevhash)[7-i];
+        work->data[8] = 0; // nonce
+        work->data[9] = swab32(extra) | ((rand() << 8) & 0xffff);
+        work->data[10] = be32dec(sctx->job.ntime);
+        work->data[11] = be32dec(sctx->job.nbits);
+        memcpy(&work->data[12], sctx->job.coinbase, 32); // merkle_root
+        work->data[20] = 0x80000000;
+        if (opt_debug) applog_hex(work->data, 80);
     } else {
         for (i = 0; i < 8; i++)
             work->data[9 + i] = be32dec((uint32_t *)merkle_root + i);
@@ -1490,7 +1511,7 @@
 
     pthread_mutex_unlock(&stratum_work_lock);
 
-    if (opt_debug && opt_algo != ALGO_DECRED) {
+    if (opt_debug && opt_algo != ALGO_DECRED && opt_algo != ALGO_SIA) {
         uint32_t utm = work->data[17];
         if (opt_algo != ALGO_ZR5) utm = swab32(utm);
         char *tm = atime2str(utm - sctx->srvtime_diff);
@@ -1673,11 +1694,19 @@ static void *miner_thread(void *userdata)
     uint32_t start_nonce;
     uint32_t scan_time = have_longpoll ? LP_SCANTIME : opt_scantime;
     uint64_t max64, minmax = 0x100000;
+    int nodata_check_oft = 0;
+    bool regen = false;
 
     // &work.data[19]
     int wcmplen = (opt_algo == ALGO_DECRED) ? 140 : 76;
-    if (opt_algo == ALGO_LBRY) wcmplen = 108;
     int wcmpoft = 0;
+
+    if (opt_algo == ALGO_LBRY) wcmplen = 108;
+    else if (opt_algo == ALGO_SIA) {
+        wcmpoft = (32+16)/4;
+        wcmplen = 32;
+    }
+
     uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
 
     if (have_stratum) {
@@ -1698,7 +1727,14 @@
             nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
             pthread_mutex_lock(&g_work_lock);
             extrajob |= work_done;
-            if (nonceptr[0] >= end_nonce || extrajob) {
+
+            regen = (nonceptr[0] >= end_nonce);
+            if (opt_algo == ALGO_SIA) {
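+                // sia resets nonceptr[0] and end_nonce on every pass, so the
+                // usual end-of-range test never fires; watch the extranonce
+                // word instead and regenerate before its byte range overflows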
+                regen = ((nonceptr[1] & 0xFF00) >= 0xF000);
+            }
+            regen = regen || extrajob;
+
+            if (regen) {
                 work_done = false;
                 extrajob = false;
                 if (stratum_gen_work(&stratum, &g_work))
@@ -1774,6 +1810,20 @@
             // and make an unique work (extradata)
             nonceptr[1] += 1;
             nonceptr[2] |= thr_id;
+
+        } else if (opt_algo == ALGO_SIA) {
+            // suprnova job_id check without data/target/height change...
+            check_stratum_jobs = true;
+            if (check_stratum_jobs && strcmp(work.job_id, g_work.job_id)) {
+                pthread_mutex_unlock(&g_work_lock);
+                work_done = true;
+                continue;
+            }
+            nonceptr[1] += opt_n_threads;
+            nonceptr[1] |= thr_id;
+            // range max
+            nonceptr[0] = 0;
+            end_nonce = UINT32_MAX;
         } else if (opt_benchmark) {
             // randomize work
             nonceptr[-1] += 1;
@@ -1796,11 +1846,14 @@
         }
         loopcnt++;
 
-        /* prevent gpu scans before a job is received */
-        //if (opt_algo != ALGO_DECRED) // uncomment to allow testnet
-        if (have_stratum && work.data[0] == 0 && !opt_benchmark) {
+        // prevent gpu scans before a job is received
+        if (opt_algo == ALGO_SIA) nodata_check_oft = 7; // no stratum version
+        else if (opt_algo == ALGO_DECRED) nodata_check_oft = 4; // testnet ver is 0
+        else nodata_check_oft = 0;
+        if (have_stratum && work.data[nodata_check_oft] == 0 && !opt_benchmark) {
             sleep(1);
             if (!thr_id) pools[cur_pooln].wait_time += 1;
+            gpulog(LOG_DEBUG, thr_id, "no data");
             continue;
         }
@@ -1931,6 +1984,7 @@
         case ALGO_KECCAK:
         case ALGO_LBRY:
         case ALGO_LUFFA:
+        case ALGO_SIA:
         case ALGO_SKEIN:
         case ALGO_SKEIN2:
             minmax = 0x1000000;
@@ -2096,6 +2150,9 @@
         case ALGO_SKEIN2:
             rc = scanhash_skein2(thr_id, &work, max_nonce, &hashes_done);
             break;
+        case ALGO_SIA:
+            rc = scanhash_sia(thr_id, &work, max_nonce, &hashes_done);
+            break;
         case ALGO_SIB:
             rc = scanhash_sib(thr_id, &work, max_nonce, &hashes_done);
             break;
@@ -2152,8 +2209,9 @@
         gettimeofday(&tv_end, NULL);
 
         // todo: update all algos to use work->nonces
-        work.nonces[0] = nonceptr[0];
-        if (opt_algo != ALGO_DECRED && opt_algo != ALGO_BLAKE2S && opt_algo != ALGO_LBRY) {
+        if (opt_algo != ALGO_SIA) // reversed endian
+            work.nonces[0] = nonceptr[0];
+        if (opt_algo != ALGO_DECRED && opt_algo != ALGO_BLAKE2S && opt_algo != ALGO_LBRY && opt_algo != ALGO_SIA) {
             work.nonces[1] = nonceptr[2];
         }
@@ -2203,7 +2261,7 @@
             nonceptr[0] = UINT32_MAX;
         }
 
-        if (check_dups && opt_algo != ALGO_DECRED)
+        if (check_dups && opt_algo != ALGO_DECRED && opt_algo != ALGO_SIA)
             hashlog_remember_scan_range(&work);
 
         /* output */
@@ -2236,10 +2294,15 @@
 
         /* if nonce found, submit work */
         if (rc > 0 && !opt_benchmark) {
+            uint32_t curnonce = nonceptr[0]; // current scan position
+
             if (opt_led_mode == LED_MODE_SHARES)
                 gpu_led_percent(dev_id, 50);
+
+            nonceptr[0] = work.nonces[0];
             if (!submit_work(mythr, &work)) break;
+            nonceptr[0] = curnonce; // prevent stale work in solo
 
             // we can't submit twice a block!
@@ -2261,6 +2324,7 @@
             }
             if (!submit_work(mythr, &work)) break;
+            nonceptr[0] = curnonce;
         }
     }
@@ -3431,7 +3495,7 @@ int main(int argc, char *argv[])
     cur_pooln = pool_get_first_valid(0);
     pool_switch(-1, cur_pooln);
 
-    if (opt_algo == ALGO_DECRED) {
+    if (opt_algo == ALGO_DECRED || opt_algo == ALGO_SIA) {
         allow_gbt = false;
         allow_mininginfo = false;
     }
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index ce4c130..ce12aa7 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -115,7 +115,7 @@
     80
     true
     true
-    compute_61,sm_61;compute_52,sm_52
+    compute_52,sm_52
     $(NVTOOLSEXT_PATH)\include;..\..\..\Common\C99
     64
@@ -266,6 +266,7 @@
+    <ClCompile Include="sph\blake2b.c" />
@@ -322,6 +323,8 @@
+    <ClInclude Include="sph\blake2b.h" />
+
@@ -469,6 +472,7 @@
+    <CudaCompile Include="sia.cu">
+
     64
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index e0d0351..6492674 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -129,6 +129,12 @@
       <Filter>Source Files\sph</Filter>
+    <ClCompile Include="sph\blake2b.c">
+      <Filter>Source Files\sph</Filter>
+    </ClCompile>
+
+      <Filter>Source Files\sph</Filter>
+
       <Filter>Source Files\sph</Filter>
@@ -264,9 +270,6 @@
       <Filter>Source Files</Filter>
-
-      <Filter>Source Files\sph</Filter>
-
       <Filter>Source Files</Filter>
@@ -314,6 +317,12 @@
       <Filter>Header Files\CUDA</Filter>
+    <ClInclude Include="sph\blake2b.h">
+      <Filter>Header Files\sph</Filter>
+    </ClInclude>
+
+      <Filter>Header Files\sph</Filter>
+
       <Filter>Header Files\sph</Filter>
@@ -664,6 +673,9 @@
       <Filter>Source Files\CUDA</Filter>
+    <CudaCompile Include="sia.cu">
+      <Filter>Source Files\CUDA</Filter>
+    </CudaCompile>
       <Filter>Source Files\CUDA</Filter>
diff --git a/miner.h b/miner.h
index fa8335a..d32fc16 100644
--- a/miner.h
+++ b/miner.h
@@ -285,6 +285,7 @@ extern int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce, uns
 extern int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_quark(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
+extern int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@@ -320,6 +321,7 @@ extern void free_fugue256(int thr_id);
 extern void free_groestlcoin(int thr_id);
 extern void free_heavy(int thr_id);
 extern void free_jackpot(int thr_id);
+extern void free_lbry(int thr_id);
 extern void free_luffa(int thr_id);
 extern void free_lyra2(int thr_id);
 extern void free_lyra2v2(int thr_id);
@@ -329,6 +331,7 @@ extern void free_nist5(int thr_id);
 extern void free_pentablake(int thr_id);
 extern void free_quark(int thr_id);
 extern void free_qubit(int thr_id);
+extern void free_sia(int thr_id);
 extern void free_sib(int thr_id);
 extern void free_skeincoin(int thr_id);
 extern void free_skein2(int thr_id);
diff --git a/sia.cu b/sia.cu
new file mode 100644
index 0000000..1a5b681
--- /dev/null
+++ b/sia.cu
@@ -0,0 +1,301 @@
+/**
+ * Blake2-B CUDA Implementation
+ *
+ * tpruvot@github July 2016
+ *
+ */
+
+#include <stdint.h>
+
+#include <string.h>
+#include <stdlib.h>
+
+#include <miner.h>
+
+#include <cuda_helper.h>
+#include <sph/blake2b.h>
+
+#define TPB 512
+#define NBN 2
+
+static uint32_t *d_resNonces[MAX_GPUS];
+
+__device__ uint64_t d_data[10];
+
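+// BLAKE2b message schedule: one permutation of the 16 message words per
+// round; rows 10 and 11 repeat rows 0 and 1 to cover the 12 rounds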
+static __constant__ const int8_t blake2b_sigma[12][16] = {
+    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
+    { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
+    {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
+    {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
+    {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
+    { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
+    { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
+    {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
+    { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
+    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 }
+};
+
+// host mem align
+#define A 64
+
+extern "C" void blake2b_hash(void *output, const void *input)
+{
+    uint8_t _ALIGN(A) hash[32];
+    blake2b_ctx ctx;
+
+    blake2b_init(&ctx, 32, NULL, 0);
+    blake2b_update(&ctx, input, 80);
+    blake2b_final(&ctx, hash);
+
+    memcpy(output, hash, 32);
+}
+
+// ----------------------------------------------------------------
+
+__device__ __forceinline__
+static void G(const int r, const int i, uint64_t &a, uint64_t &b, uint64_t &c, uint64_t &d, uint64_t const m[16])
+{
+    a = a + b + m[ blake2b_sigma[r][2*i] ];
+    ((uint2*)&d)[0] = SWAPUINT2( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
+    c = c + d;
+    ((uint2*)&b)[0] = ROR24( ((uint2*)&b)[0] ^ ((uint2*)&c)[0] );
+    a = a + b + m[ blake2b_sigma[r][2*i+1] ];
+    ((uint2*)&d)[0] = ROR16( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
+    c = c + d;
+    ((uint2*)&b)[0] = ROR2( ((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U);
+}
+
+#define ROUND(r) \
+    G(r, 0, v[0], v[4], v[ 8], v[12], m); \
+    G(r, 1, v[1], v[5], v[ 9], v[13], m); \
+    G(r, 2, v[2], v[6], v[10], v[14], m); \
+    G(r, 3, v[3], v[7], v[11], v[15], m); \
+    G(r, 4, v[0], v[5], v[10], v[15], m); \
+    G(r, 5, v[1], v[6], v[11], v[12], m); \
+    G(r, 6, v[2], v[7], v[ 8], v[13], m); \
+    G(r, 7, v[3], v[4], v[ 9], v[14], m);
+
+// simplified for the last round
+__device__ __forceinline__
+static void H(const int r, const int i, uint64_t &a, uint64_t &b, uint64_t &c, uint64_t &d, uint64_t const m[16])
+{
+    a = a + b + m[ blake2b_sigma[r][2*i] ];
+    ((uint2*)&d)[0] = SWAPUINT2( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
+    c = c + d;
+    ((uint2*)&b)[0] = ROR24( ((uint2*)&b)[0] ^ ((uint2*)&c)[0] );
+    a = a + b + m[ blake2b_sigma[r][2*i+1] ];
+    ((uint2*)&d)[0] = ROR16( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
+    c = c + d;
+}
+
+// we only check v[0] and v[8]
+#define ROUND_F(r) \
+    G(r, 0, v[0], v[4], v[ 8], v[12], m); \
+    G(r, 1, v[1], v[5], v[ 9], v[13], m); \
+    G(r, 2, v[2], v[6], v[10], v[14], m); \
+    G(r, 3, v[3], v[7], v[11], v[15], m); \
+    G(r, 4, v[0], v[5], v[10], v[15], m); \
+    G(r, 5, v[1], v[6], v[11], v[12], m); \
+    H(r, 6, v[2], v[7], v[ 8], v[13], m);
+
+__global__
+//__launch_bounds__(128, 8) /* to force 64 regs */
+void blake2b_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint2 target2)
+{
+    const uint32_t nonce = (blockDim.x * blockIdx.x + threadIdx.x) + startNonce;
+    __shared__ uint64_t s_target;
+    if (!threadIdx.x) s_target = devectorize(target2);
+
+    uint64_t m[16];
+
+    m[0] = d_data[0];
+    m[1] = d_data[1];
+    m[2] = d_data[2];
+    m[3] = d_data[3];
+    m[4] = d_data[4] | nonce;
+    m[5] = d_data[5];
+    m[6] = d_data[6];
+    m[7] = d_data[7];
+    m[8] = d_data[8];
+    m[9] = d_data[9];
+
+    m[10] = m[11] = 0;
+    m[12] = m[13] = m[14] = m[15] = 0;
+
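+    // state pre-computed for an 80-byte input and a 32-byte digest:
+    //   v[0]  = IV0 ^ 0x01010020 (param block: digest_length=32, fanout=depth=1)
+    //   v[12] = IV4 ^ 80         (byte count of the single message block)
+    //   v[14] = ~IV6             (final block flag)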
+    uint64_t v[16] = {
+        0x6a09e667f2bdc928, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
+        0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
+        0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
+        0x510e527fade68281, 0x9b05688c2b3e6c1f, 0xe07c265404be4294, 0x5be0cd19137e2179
+    };
+
+    ROUND( 0 );
+    ROUND( 1 );
+    ROUND( 2 );
+    ROUND( 3 );
+    ROUND( 4 );
+    ROUND( 5 );
+    ROUND( 6 );
+    ROUND( 7 );
+    ROUND( 8 );
+    ROUND( 9 );
+    ROUND( 10 );
+    ROUND_F( 11 );
+
+    uint64_t h64 = cuda_swab64(0x6a09e667f2bdc928 ^ v[0] ^ v[8]);
+    if (h64 <= s_target) {
+        resNonce[1] = resNonce[0];
+        resNonce[0] = nonce;
+        s_target = h64;
+    }
+    // if (!nonce) printf("%016lx ", s_target);
+}
+
+__host__
+uint32_t blake2b_hash_cuda(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2, uint32_t &secNonce)
+{
+    uint32_t resNonces[NBN] = { UINT32_MAX, UINT32_MAX };
+    uint32_t result = UINT32_MAX;
+
+    dim3 grid((threads + TPB-1)/TPB);
+    dim3 block(TPB);
+
+    /* Check error on Ctrl+C or kill to prevent segfaults on exit */
+    if (cudaMemset(d_resNonces[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess)
+        return result;
+
+    blake2b_gpu_hash <<<grid, block>>> (threads, startNonce, d_resNonces[thr_id], target2);
+    cudaThreadSynchronize();
+
+    if (cudaSuccess == cudaMemcpy(resNonces, d_resNonces[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
+        result = resNonces[0];
+        secNonce = resNonces[1];
+        if (secNonce == result) secNonce = UINT32_MAX;
+    }
+    return result;
+}
+
+__host__
+void blake2b_setBlock(uint32_t *data)
+{
+    CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_data, data, 80, 0, cudaMemcpyHostToDevice));
+}
+
+static bool init[MAX_GPUS] = { 0 };
+
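+// the 8-byte sia nonce lives at header offset 32: pdata[8] is the low word
+// scanned on the GPU, pdata[9] the high word seeded from the pool extranonce
+// in stratum_gen_work()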
+int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done)
+{
+    uint32_t _ALIGN(A) hash[8];
+    uint32_t _ALIGN(A) vhashcpu[8];
+    uint32_t _ALIGN(A) inputdata[20];
+    uint32_t *pdata = work->data;
+    uint32_t *ptarget = work->target;
+
+    const uint32_t Htarg = ptarget[7];
+    const uint32_t first_nonce = pdata[8];
+
+    int dev_id = device_map[thr_id];
+    int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 28 : 25;
+    if (device_sm[dev_id] >= 520 && is_windows()) intensity = 26;
+    if (device_sm[dev_id] < 350) intensity = 22;
+
+    uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
+    if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+
+    if (!init[thr_id])
+    {
+        cudaSetDevice(dev_id);
+        if (opt_cudaschedule == -1 && gpu_threads == 1) {
+            cudaDeviceReset();
+            // reduce cpu usage (linux)
+            cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+            //cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
+            CUDA_LOG_ERROR();
+        }
+
+        CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonces[thr_id], NBN * sizeof(uint32_t)), -1);
+        init[thr_id] = true;
+    }
+
+    memcpy(inputdata, pdata, 80);
+    inputdata[11] = 0; // nbits
+
+    const uint2 target = make_uint2(ptarget[6], ptarget[7]);
+
+    blake2b_setBlock(inputdata);
+
+    do {
+        uint32_t secNonce = UINT32_MAX;
+        uint32_t foundNonce = blake2b_hash_cuda(thr_id, throughput, pdata[8], target, secNonce);
+
+        *hashes_done = pdata[8] - first_nonce + throughput;
+
+        if (foundNonce != UINT32_MAX)
+        {
+            int res = 0;
+            inputdata[8] = foundNonce;
+            blake2b_hash(hash, inputdata);
+            if (swab32(hash[0]) <= Htarg) {
+                // sia hash target is reversed (start of hash)
+                swab256(vhashcpu, hash);
+                // applog_hex(vhashcpu, 32);
+                if (fulltest(vhashcpu, ptarget)) {
+                    work_set_target_ratio(work, vhashcpu);
+                    work->nonces[0] = foundNonce;
+                    res++;
+                }
+            }
+
+            if (secNonce != UINT32_MAX) {
+                inputdata[8] = secNonce;
+                blake2b_hash(hash, inputdata);
+                if (swab32(hash[0]) <= Htarg) {
+                    if (opt_debug)
+                        gpulog(LOG_BLUE, thr_id, "found second nonce %08x", secNonce);
+                    swab256(vhashcpu, hash);
+                    if (fulltest(vhashcpu, ptarget)) {
+                        work->nonces[1] = secNonce;
+                        if (bn_hash_target_ratio(vhashcpu, ptarget) > work->shareratio) {
+                            work_set_target_ratio(work, vhashcpu);
+                            xchg(work->nonces[0], work->nonces[1]);
+                        }
+                        res++;
+                    }
+                }
+            }
+            if (res) {
+                pdata[8] = max_nonce;
+                return res;
+            }
+        }
+
+        if ((uint64_t) throughput + pdata[8] >= max_nonce) {
+            pdata[8] = max_nonce;
+            break;
+        }
+
+        pdata[8] += throughput;
+
+    } while (!work_restart[thr_id].restart);
+
+    *hashes_done = pdata[8] - first_nonce;
+
+    return 0;
+}
+
+// cleanup
+extern "C" void free_sia(int thr_id)
+{
+    if (!init[thr_id])
+        return;
+
+    cudaThreadSynchronize();
+
+    cudaFree(d_resNonces[thr_id]);
+
+    init[thr_id] = false;
+
+    cudaDeviceSynchronize();
+}
diff --git a/sph/blake2b.c b/sph/blake2b.c
new file mode 100644
index 0000000..f85c977
--- /dev/null
+++ b/sph/blake2b.c
@@ -0,0 +1,196 @@
+/*
+ * Copyright 2009 Colin Percival, 2014 savale
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "sph_types.h"
+#include "blake2b.h"
+
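+// Compact, portable BLAKE2b in the style of the RFC 7693 reference code;
+// used on the CPU side to re-check the nonces returned by the GPU.
+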
+// Cyclic right rotation.
+
+#ifndef ROTR64
+#define ROTR64(x, y)  (((x) >> (y)) ^ ((x) << (64 - (y))))
+#endif
+
+// Little-endian byte access.
+
+#define B2B_GET64(p)                            \
+    (((uint64_t) ((uint8_t *) (p))[0]) ^        \
+    (((uint64_t) ((uint8_t *) (p))[1]) << 8) ^  \
+    (((uint64_t) ((uint8_t *) (p))[2]) << 16) ^ \
+    (((uint64_t) ((uint8_t *) (p))[3]) << 24) ^ \
+    (((uint64_t) ((uint8_t *) (p))[4]) << 32) ^ \
+    (((uint64_t) ((uint8_t *) (p))[5]) << 40) ^ \
+    (((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \
+    (((uint64_t) ((uint8_t *) (p))[7]) << 56))
+
+// G Mixing function.
+
+#define B2B_G(a, b, c, d, x, y) {   \
+    v[a] = v[a] + v[b] + x;         \
+    v[d] = ROTR64(v[d] ^ v[a], 32); \
+    v[c] = v[c] + v[d];             \
+    v[b] = ROTR64(v[b] ^ v[c], 24); \
+    v[a] = v[a] + v[b] + y;         \
+    v[d] = ROTR64(v[d] ^ v[a], 16); \
+    v[c] = v[c] + v[d];             \
+    v[b] = ROTR64(v[b] ^ v[c], 63); }
+
+// Initialization Vector.
+
+static const uint64_t blake2b_iv[8] = {
+    0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
+    0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
+    0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
+    0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
+};
+
+// Compression function. "last" flag indicates last block.
+
+static void blake2b_compress(blake2b_ctx *ctx, int last)
+{
+    const uint8_t sigma[12][16] = {
+        {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+        { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
+        { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
+        {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
+        {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
+        {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
+        { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
+        { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
+        {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
+        { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
+        {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+        { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 }
+    };
+    int i;
+    uint64_t v[16], m[16];
+
+    for (i = 0; i < 8; i++) { // init work variables
+        v[i] = ctx->h[i];
+        v[i + 8] = blake2b_iv[i];
+    }
+
+    v[12] ^= ctx->t[0]; // low 64 bits of offset
+    v[13] ^= ctx->t[1]; // high 64 bits
+    if (last)           // last block flag set ?
+        v[14] = ~v[14];
+
+    for (i = 0; i < 16; i++) // get little-endian words
+        m[i] = B2B_GET64(&ctx->b[8 * i]);
+
+    for (i = 0; i < 12; i++) { // twelve rounds
+        B2B_G( 0, 4,  8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
+        B2B_G( 1, 5,  9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
+        B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
+        B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
+        B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
+        B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
+        B2B_G( 2, 7,  8, 13, m[sigma[i][12]], m[sigma[i][13]]);
+        B2B_G( 3, 4,  9, 14, m[sigma[i][14]], m[sigma[i][15]]);
+    }
+
+    for (i = 0; i < 8; ++i)
+        ctx->h[i] ^= v[i] ^ v[i + 8];
+}
+
+// Initialize the hashing context "ctx" with optional key "key".
+// 1 <= outlen <= 64 gives the digest size in bytes.
+// Secret key (also <= 64 bytes) is optional (keylen = 0).
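+// The XOR below folds the parameter block into h[0]: byte 0 is the digest
+// length, byte 1 the key length, bytes 2..3 fanout and depth (both 1 here).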
+
+int blake2b_init(blake2b_ctx *ctx, size_t outlen,
+    const void *key, size_t keylen) // (keylen=0: no key)
+{
+    size_t i;
+
+    if (outlen == 0 || outlen > 64 || keylen > 64)
+        return -1; // illegal parameters
+
+    for (i = 0; i < 8; i++) // state, "param block"
+        ctx->h[i] = blake2b_iv[i];
+    ctx->h[0] ^= 0x01010000 ^ (keylen << 8) ^ outlen;
+
+    ctx->t[0] = 0;  // input count low word
+    ctx->t[1] = 0;  // input count high word
+    ctx->c = 0;     // pointer within buffer
+    ctx->outlen = outlen;
+
+    for (i = keylen; i < 128; i++) // zero input block
+        ctx->b[i] = 0;
+    if (keylen > 0) {
+        blake2b_update(ctx, key, keylen);
+        ctx->c = 128; // at the end
+    }
+
+    return 0;
+}
+
+// Add "inlen" bytes from "in" into the hash.
+
+void blake2b_update(blake2b_ctx *ctx,
+    const void *in, size_t inlen) // data bytes
+{
+    size_t i;
+
+    for (i = 0; i < inlen; i++) {
+        if (ctx->c == 128) {          // buffer full ?
+            ctx->t[0] += ctx->c;      // add counters
+            if (ctx->t[0] < ctx->c)   // carry overflow ?
+                ctx->t[1]++;          // high word
+            blake2b_compress(ctx, 0); // compress (not last)
+            ctx->c = 0;               // counter to zero
+        }
+        ctx->b[ctx->c++] = ((const uint8_t *) in)[i];
+    }
+}
+
+// Generate the message digest (size given in init).
+// Result placed in "out".
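+// (on a little-endian machine the byte-extraction loop below is equivalent
+// to copying the first outlen bytes of ctx->h)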
+
+void blake2b_final(blake2b_ctx *ctx, void *out)
+{
+    size_t i;
+
+    ctx->t[0] += ctx->c;    // mark last block offset
+    if (ctx->t[0] < ctx->c) // carry overflow
+        ctx->t[1]++;        // high word
+
+    while (ctx->c < 128) // fill up with zeros
+        ctx->b[ctx->c++] = 0;
+    blake2b_compress(ctx, 1); // final block flag = 1
+
+    // little endian convert and store
+    for (i = 0; i < ctx->outlen; i++) {
+        ((uint8_t *) out)[i] =
+            (ctx->h[i >> 3] >> (8 * (i & 7))) & 0xFF;
+    }
+}
diff --git a/sph/blake2b.h b/sph/blake2b.h
new file mode 100644
index 0000000..f8652c1
--- /dev/null
+++ b/sph/blake2b.h
@@ -0,0 +1,41 @@
+#pragma once
+#ifndef __BLAKE2B_H__
+#define __BLAKE2B_H__
+
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(_MSC_VER)
+#include <inttypes.h>
+#define inline __inline
+#define ALIGN(x) __declspec(align(x))
+#else
+#define ALIGN(x) __attribute__((aligned(x)))
+#endif
+
+#if defined(_MSC_VER) || defined(__x86_64__) || defined(__x86__)
+#define NATIVE_LITTLE_ENDIAN
+#endif
+
+// state context
+ALIGN(64) typedef struct {
+    uint8_t b[128]; // input buffer
+    uint64_t h[8];  // chained state
+    uint64_t t[2];  // total number of bytes
+    size_t c;       // pointer for b[]
+    size_t outlen;  // digest size
+} blake2b_ctx;
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int blake2b_init(blake2b_ctx *ctx, size_t outlen, const void *key, size_t keylen);
+void blake2b_update(blake2b_ctx *ctx, const void *in, size_t inlen);
+void blake2b_final(blake2b_ctx *ctx, void *out);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif