diff --git a/Makefile.am b/Makefile.am index 307d86e..683e21d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -21,6 +21,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \ crc32.c hefty1.c \ ccminer.cpp pools.cpp util.cpp bench.cpp bignum.cpp \ api.cpp hashlog.cpp nvml.cpp stats.cpp sysinfos.cpp cuda.cpp \ + equi/equi-stratum.cpp equi/equi.cpp equi/blake2/blake2bx.cpp \ + equi/equihash.cpp equi/cuda_equi.cu \ heavy/heavy.cu \ heavy/cuda_blake512.cu heavy/cuda_blake512.h \ heavy/cuda_combine.cu heavy/cuda_combine.h \ @@ -97,6 +99,10 @@ ccminer_LDFLAGS += -L/usr/local/llvm/lib ccminer_LDADD += -lomp endif +#ccminer_CPPFLAGS += -DUSE_LIBSODIUM +#ccminer_LDFLAGS += -Lequi/lib +#ccminer_LDADD += -lsodium +ccminer_LDADD += -lcuda nvcc_ARCH = -gencode=arch=compute_50,code=\"sm_50,compute_50\" diff --git a/algos.h b/algos.h index e4dc4bb..163b193 100644 --- a/algos.h +++ b/algos.h @@ -16,6 +16,7 @@ enum sha_algos { ALGO_DEEP, ALGO_DECRED, ALGO_DMD_GR, + ALGO_EQUIHASH, ALGO_FRESH, ALGO_FUGUE256, /* Fugue256 */ ALGO_GROESTL, @@ -78,6 +79,7 @@ static const char *algo_names[] = { "deep", "decred", "dmd-gr", + "equihash", "fresh", "fugue256", "groestl", @@ -149,6 +151,8 @@ static inline int algo_to_int(char* arg) i = ALGO_C11; else if (!strcasecmp("diamond", arg)) i = ALGO_DMD_GR; + else if (!strcasecmp("equi", arg)) + i = ALGO_EQUIHASH; else if (!strcasecmp("doom", arg)) i = ALGO_LUFFA; else if (!strcasecmp("hmq17", arg)) diff --git a/api.cpp b/api.cpp index 0ec023f..8353127 100644 --- a/api.cpp +++ b/api.cpp @@ -1341,6 +1341,7 @@ void api_set_throughput(int thr_id, uint32_t throughput) if (thr_id < MAX_GPUS && thr_info) { struct cgpu_info *cgpu = &thr_info[thr_id].gpu; cgpu->intensity = throughput2intensity(throughput); + if (cgpu->throughput != throughput) cgpu->throughput = throughput; } // to display in bench results if (opt_benchmark) diff --git a/ccminer.cpp b/ccminer.cpp index 652a884..85ec68e 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -44,6 +44,7 @@ #include "algos.h" #include "sia/sia-rpc.h" #include "crypto/xmr-rpc.h" +#include "equi/equihash.h" #include @@ -243,6 +244,7 @@ Options:\n\ c11/flax X11 variant\n\ decred Decred Blake256\n\ deep Deepcoin\n\ + equihash Zcash Equihash\n\ dmd-gr Diamond-Groestl\n\ fresh Freshcoin (shavite 80)\n\ fugue256 Fuguecoin\n\ @@ -558,6 +560,14 @@ void get_currentalgo(char* buf, int sz) snprintf(buf, sz, "%s", algo_names[opt_algo]); } +void format_hashrate(double hashrate, char *output) +{ + if (opt_algo == ALGO_EQUIHASH) + format_hashrate_unit(hashrate, output, "Sol/s"); + else + format_hashrate_unit(hashrate, output, "H/s"); +} + /** * Exit app */ @@ -634,6 +644,10 @@ static void calc_network_diff(struct work *work) if (opt_algo == ALGO_LBRY) nbits = swab32(work->data[26]); if (opt_algo == ALGO_DECRED) nbits = work->data[29]; if (opt_algo == ALGO_SIA) nbits = work->data[11]; // unsure if correct + if (opt_algo == ALGO_EQUIHASH) { + net_diff = equi_network_diff(work); + return; + } uint32_t bits = (nbits & 0xffffff); int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28 @@ -856,6 +870,17 @@ static bool submit_upstream_work(CURL *curl, struct work *work) return true; } + if (pool->type & POOL_STRATUM && stratum.is_equihash) { + struct work submit_work; + memcpy(&submit_work, work, sizeof(struct work)); + //if (!hashlog_already_submittted(submit_work.job_id, submit_work.nonces[idnonce])) { + if (equi_stratum_submit(pool, &submit_work)) + hashlog_remember_submit(&submit_work, submit_work.nonces[idnonce]); + stratum.job.shares_count++; + //} + return true; + } + /* 
discard if a newer block was received */ stale_work = work->height && work->height < g_work.height; if (have_stratum && !stale_work && opt_algo != ALGO_ZR5 && opt_algo != ALGO_SCRYPT_JANE) { @@ -1508,6 +1533,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) /* Generate merkle root */ switch (opt_algo) { case ALGO_DECRED: + case ALGO_EQUIHASH: case ALGO_SIA: // getwork over stratum, no merkle to generate break; @@ -1566,6 +1592,13 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) memcpy(&work->data[44], &sctx->job.coinbase[sctx->job.coinbase_size-4], 4); sctx->job.height = work->data[32]; //applog_hex(work->data, 180); + } else if (opt_algo == ALGO_EQUIHASH) { + memcpy(&work->data[9], sctx->job.coinbase, 32+32); // merkle [9..16] + reserved + work->data[25] = le32dec(sctx->job.ntime); + work->data[26] = le32dec(sctx->job.nbits); + memcpy(&work->data[27], sctx->xnonce1, sctx->xnonce1_size & 0x1F); // pool extranonce + work->data[35] = 0x80; + //applog_hex(work->data, 140); } else if (opt_algo == ALGO_LBRY) { for (i = 0; i < 8; i++) work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); @@ -1618,7 +1651,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) pthread_mutex_unlock(&stratum_work_lock); - if (opt_debug && opt_algo != ALGO_DECRED && opt_algo != ALGO_SIA) { + if (opt_debug && opt_algo != ALGO_DECRED && opt_algo != ALGO_EQUIHASH && opt_algo != ALGO_SIA) { uint32_t utm = work->data[17]; if (opt_algo != ALGO_ZR5) utm = swab32(utm); char *tm = atime2str(utm - sctx->srvtime_diff); @@ -1656,6 +1689,9 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) case ALGO_LYRA2: work_set_target(work, sctx->job.diff / (128.0 * opt_difficulty)); break; + case ALGO_EQUIHASH: + equi_work_set_target(work, sctx->job.diff / opt_difficulty); + break; default: work_set_target(work, sctx->job.diff / opt_difficulty); } @@ -1830,6 +1866,9 @@ static void *miner_thread(void *userdata) } else if (opt_algo == ALGO_CRYPTOLIGHT || opt_algo == ALGO_CRYPTONIGHT) { nonceptr = (uint32_t*) (((char*)work.data) + 39); wcmplen = 39; + } else if (opt_algo == ALGO_EQUIHASH) { + nonceptr = &work.data[EQNONCE_OFFSET]; // 27 is pool extranonce (256bits nonce space) + wcmplen = 4+32+32; } if (have_stratum) { @@ -1962,6 +2001,10 @@ static void *miner_thread(void *userdata) nonceptr[1] += 1; nonceptr[2] |= thr_id; + } else if (opt_algo == ALGO_EQUIHASH) { + nonceptr[1]++; + nonceptr[1] |= thr_id << 24; + //applog_hex(&work.data[27], 32); } else if (opt_algo == ALGO_WILDKECCAK) { //nonceptr[1] += 1; } else if (opt_algo == ALGO_SIA) { @@ -2264,6 +2307,9 @@ static void *miner_thread(void *userdata) case ALGO_DEEP: rc = scanhash_deep(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_EQUIHASH: + rc = scanhash_equihash(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_FRESH: rc = scanhash_fresh(thr_id, &work, max_nonce, &hashes_done); break; @@ -2431,7 +2477,7 @@ static void *miner_thread(void *userdata) work.nonces[1] = nonceptr[2]; } - if (stratum.rpc2 && rc == -EBUSY || work_restart[thr_id].restart) { + if (stratum.rpc2 && (rc == -EBUSY || work_restart[thr_id].restart)) { // bbr scratchpad download or stale result sleep(1); if (!thr_id) pools[cur_pooln].wait_time += 1; @@ -2489,7 +2535,7 @@ static void *miner_thread(void *userdata) } // only required to debug purpose - if (opt_debug && check_dups && opt_algo != ALGO_DECRED && opt_algo != ALGO_SIA) + if (opt_debug && check_dups && opt_algo != ALGO_DECRED && 
opt_algo != ALGO_EQUIHASH && opt_algo != ALGO_SIA) hashlog_remember_scan_range(&work); /* output */ @@ -3854,6 +3900,10 @@ int main(int argc, char *argv[]) allow_mininginfo = false; } + if (opt_algo == ALGO_EQUIHASH) { + opt_extranonce = false; // disable subscribe + } + if (opt_algo == ALGO_CRYPTONIGHT || opt_algo == ALGO_CRYPTOLIGHT) { rpc2_init(); if (!opt_quiet) applog(LOG_INFO, "Using JSON-RPC 2.0"); diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 5aad65e..1316052 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -156,7 +156,7 @@ true true compute_50,sm_50;compute_52,sm_52;compute_30,sm_30;compute_20,sm_21 - --ptxas-options="-O2" %(AdditionalOptions) + --ptxas-options="-O2" --Wno-deprecated-gpu-targets %(AdditionalOptions) O2 @@ -199,9 +199,10 @@ true true compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30;compute_20,sm_21 - $(NVTOOLSEXT_PATH)\include;..\..\..\Common\C99 + $(NVTOOLSEXT_PATH)\include O3 64 + --Wno-deprecated-gpu-targets %(AdditionalOptions) O3 @@ -235,6 +236,12 @@ + + StreamingSIMDExtensions + + + + @@ -254,6 +261,8 @@ + + @@ -283,6 +292,11 @@ 128 + + compute_52,sm_52;compute_50,sm_50;compute_30,sm_30 + -Xptxas -dlcm=ca -Xptxas -dscm=cs %(AdditionalOptions) + 0 + 160 @@ -552,7 +566,7 @@ - + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index f37da23..a6b39c4 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -103,6 +103,9 @@ {0f9aec5e-5409-488f-992a-2c108590d1ac} + + {031afae7-2a78-4e32-9738-4b589b6f7ff3} + @@ -321,6 +324,18 @@ Source Files\crypto\bbr + + Source Files\equi + + + Source Files\equi + + + Source Files\equi + + + Source Files\equi + @@ -563,6 +578,12 @@ Source Files\CUDA\xmr + + Source Files\equi + + + Source Files\equi + @@ -895,6 +916,9 @@ Source Files\CUDA + + Source Files\equi + @@ -911,4 +935,4 @@ Ressources - + \ No newline at end of file diff --git a/compat/ccminer-config.h b/compat/ccminer-config.h index 26cfea4..dfd973c 100644 --- a/compat/ccminer-config.h +++ b/compat/ccminer-config.h @@ -164,7 +164,7 @@ #define PACKAGE_URL "http://github.com/tpruvot/ccminer" /* Define to the version of this package. */ -#define PACKAGE_VERSION "2.0" +#define PACKAGE_VERSION "2.1" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be diff --git a/configure.ac b/configure.ac index 1c76b70..c92a6fc 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2.0], [], [ccminer], [http://github.com/tpruvot/ccminer]) +AC_INIT([ccminer], [2.1], [], [ccminer], [http://github.com/tpruvot/ccminer]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/equi/blake2/blake2-config.h b/equi/blake2/blake2-config.h new file mode 100644 index 0000000..2a8f9ed --- /dev/null +++ b/equi/blake2/blake2-config.h @@ -0,0 +1,81 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . 
+*/ +#pragma once +#ifndef __BLAKE2_CONFIG_H__ +#define __BLAKE2_CONFIG_H__ + +// These don't work everywhere +#if (defined(__SSE2__) || defined(_M_AMD_64) || defined(_M_X64)) +#define HAVE_SSE2 +#endif + +#if defined(__SSSE3__) +#define HAVE_SSSE3 +#endif + +#if defined(__SSE4_1__) +#define HAVE_SSE41 +#endif + +#if defined(__AVX__) +#define HAVE_AVX +#endif + +#if defined(__XOP__) +#define HAVE_XOP +#endif + + +#ifdef HAVE_AVX2 +#ifndef HAVE_AVX +#define HAVE_AVX +#endif +#endif + +#ifdef HAVE_XOP +#ifndef HAVE_AVX +#define HAVE_AVX +#endif +#endif + +#ifdef HAVE_AVX +#ifndef HAVE_SSE41 +#define HAVE_SSE41 +#endif +#endif + +#ifdef HAVE_SSE41 +#ifndef HAVE_SSSE3 +#define HAVE_SSSE3 +#endif +#endif + +#ifdef HAVE_SSSE3 +#define HAVE_SSE2 +#endif + +#if !defined(HAVE_SSE2) + +#ifdef _MSC_VER +// enforce required stuff for now +#define HAVE_SSE2 +//#define HAVE_SSSE3 +#define HAVE_SSE41 +#else +# error "This code requires at least SSE 4.1" +#endif + +#endif + +#endif + diff --git a/equi/blake2/blake2-impl.h b/equi/blake2/blake2-impl.h new file mode 100644 index 0000000..16219db --- /dev/null +++ b/equi/blake2/blake2-impl.h @@ -0,0 +1,136 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . +*/ +#pragma once +#ifndef __BLAKE2_IMPL_H__ +#define __BLAKE2_IMPL_H__ + +#include + +static inline uint32_t load32( const void *src ) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + uint32_t w; + memcpy(&w, src, sizeof w); + return w; +#else + const uint8_t *p = ( const uint8_t * )src; + uint32_t w = *p++; + w |= ( uint32_t )( *p++ ) << 8; + w |= ( uint32_t )( *p++ ) << 16; + w |= ( uint32_t )( *p++ ) << 24; + return w; +#endif +} + +static inline uint64_t load64( const void *src ) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + uint64_t w; + memcpy(&w, src, sizeof w); + return w; +#else + const uint8_t *p = ( const uint8_t * )src; + uint64_t w = *p++; + w |= ( uint64_t )( *p++ ) << 8; + w |= ( uint64_t )( *p++ ) << 16; + w |= ( uint64_t )( *p++ ) << 24; + w |= ( uint64_t )( *p++ ) << 32; + w |= ( uint64_t )( *p++ ) << 40; + w |= ( uint64_t )( *p++ ) << 48; + w |= ( uint64_t )( *p++ ) << 56; + return w; +#endif +} + +static inline void store32( void *dst, uint32_t w ) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + memcpy(dst, &w, sizeof w); +#else + uint8_t *p = ( uint8_t * )dst; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; +#endif +} + +static inline void store64( void *dst, uint64_t w ) +{ +#if defined(NATIVE_LITTLE_ENDIAN) + memcpy(dst, &w, sizeof w); +#else + uint8_t *p = ( uint8_t * )dst; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; +#endif +} + +static inline uint64_t load48( const void *src ) +{ + const uint8_t *p = ( const uint8_t * )src; + uint64_t w = *p++; + w |= ( uint64_t )( *p++ ) << 8; + w |= ( uint64_t )( *p++ ) << 16; + w |= ( uint64_t )( *p++ ) << 24; + w |= ( uint64_t )( *p++ ) << 32; + w |= ( uint64_t )( *p++ ) << 40; + return w; +} + +static inline 
void store48( void *dst, uint64_t w ) +{ + uint8_t *p = ( uint8_t * )dst; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; w >>= 8; + *p++ = ( uint8_t )w; +} + +static inline uint32_t rotl32( const uint32_t w, const unsigned c ) +{ + return ( w << c ) | ( w >> ( 32 - c ) ); +} + +static inline uint64_t rotl64( const uint64_t w, const unsigned c ) +{ + return ( w << c ) | ( w >> ( 64 - c ) ); +} + +static inline uint32_t rotr32( const uint32_t w, const unsigned c ) +{ + return ( w >> c ) | ( w << ( 32 - c ) ); +} + +static inline uint64_t rotr64( const uint64_t w, const unsigned c ) +{ + return ( w >> c ) | ( w << ( 64 - c ) ); +} + +/* prevents compiler optimizing out memset() */ +static inline void secure_zero_memory( void *v, size_t n ) +{ + volatile uint8_t *p = ( volatile uint8_t * )v; + while( n-- ) *p++ = 0; +} + +#endif + diff --git a/equi/blake2/blake2-round.h b/equi/blake2/blake2-round.h new file mode 100644 index 0000000..400ed20 --- /dev/null +++ b/equi/blake2/blake2-round.h @@ -0,0 +1,85 @@ +#define _mm_roti_epi64(x, c) \ + (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \ + : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \ + : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \ + : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \ + : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))) + +#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + row1l = _mm_add_epi64(row1l, row2l); \ + row1h = _mm_add_epi64(row1h, row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -32); \ + row4h = _mm_roti_epi64(row4h, -32); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -24); \ + row2h = _mm_roti_epi64(row2h, -24); \ + +#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + row1l = _mm_add_epi64(row1l, row2l); \ + row1h = _mm_add_epi64(row1h, row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -16); \ + row4h = _mm_roti_epi64(row4h, -16); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -63); \ + row2h = _mm_roti_epi64(row2h, -63); \ + +#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2h, row2l, 8); \ + t1 = _mm_alignr_epi8(row2l, row2h, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4h, row4l, 8); \ + t1 = _mm_alignr_epi8(row4l, row4h, 8); \ + row4l = t1; \ + row4h = t0; + +#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2l, row2h, 8); \ + t1 = _mm_alignr_epi8(row2h, row2l, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4l, row4h, 8); \ + t1 = _mm_alignr_epi8(row4h, row4l, 8); \ + row4l = t1; \ + row4h = t0; + +#define BLAKE2_ROUND(row1l,row1h,row2l,row2h,row3l,row3h,row4l,row4h) \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + \ + 
DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + \ + UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); diff --git a/equi/blake2/blake2.h b/equi/blake2/blake2.h new file mode 100644 index 0000000..3206576 --- /dev/null +++ b/equi/blake2/blake2.h @@ -0,0 +1,85 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . +*/ +#pragma once +#ifndef __BLAKE2_H__ +#define __BLAKE2_H__ + +#include +#include + +#if defined(_MSC_VER) +#define ALIGN(x) __declspec(align(x)) +#else +#define ALIGN(x) __attribute__ ((__aligned__(x))) +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + + enum blake2b_constant + { + BLAKE2B_BLOCKBYTES = 128, + BLAKE2B_OUTBYTES = 64, + BLAKE2B_KEYBYTES = 64, + BLAKE2B_SALTBYTES = 16, + BLAKE2B_PERSONALBYTES = 16 + }; + +#pragma pack(push, 1) + typedef struct __blake2b_param + { + uint8_t digest_length; // 1 + uint8_t key_length; // 2 + uint8_t fanout; // 3 + uint8_t depth; // 4 + uint32_t leaf_length; // 8 + uint64_t node_offset; // 16 + uint8_t node_depth; // 17 + uint8_t inner_length; // 18 + uint8_t reserved[14]; // 32 + uint8_t salt[BLAKE2B_SALTBYTES]; // 48 + uint8_t personal[BLAKE2B_PERSONALBYTES]; // 64 + } blake2b_param; + + ALIGN( 64 ) typedef struct __blake2b_state + { + uint64_t h[8]; + uint8_t buf[BLAKE2B_BLOCKBYTES]; + uint16_t counter; + uint8_t buflen; + uint8_t lastblock; + } blake2b_state; + +#pragma pack(pop) + + int eq_blake2b_init( blake2b_state *S, const uint8_t outlen ); + int eq_blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen ); + int eq_blake2b_init_param( blake2b_state *S, const blake2b_param *P ); + int eq_blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen ); + int eq_blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen ); + + // Simple API + int eq_blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ); + + static inline int eq_blake2( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ) + { + return eq_blake2b( out, in, key, outlen, inlen, keylen ); + } + +#if defined(__cplusplus) +} +#endif + +#endif + diff --git a/equi/blake2/blake2b-load-sse2.h b/equi/blake2/blake2b-load-sse2.h new file mode 100644 index 0000000..1ba153c --- /dev/null +++ b/equi/blake2/blake2b-load-sse2.h @@ -0,0 +1,68 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . 
+*/ +#pragma once +#ifndef __BLAKE2B_LOAD_SSE2_H__ +#define __BLAKE2B_LOAD_SSE2_H__ + +#define LOAD_MSG_0_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4) +#define LOAD_MSG_0_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5) +#define LOAD_MSG_0_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12) +#define LOAD_MSG_0_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13) +#define LOAD_MSG_1_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9) +#define LOAD_MSG_1_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15) +#define LOAD_MSG_1_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11) +#define LOAD_MSG_1_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7) +#define LOAD_MSG_2_1(b0, b1) b0 = _mm_set_epi64x(m12, m11); b1 = _mm_set_epi64x(m15, m5) +#define LOAD_MSG_2_2(b0, b1) b0 = _mm_set_epi64x(m0, m8); b1 = _mm_set_epi64x(m13, m2) +#define LOAD_MSG_2_3(b0, b1) b0 = _mm_set_epi64x(m3, m10); b1 = _mm_set_epi64x(m9, m7) +#define LOAD_MSG_2_4(b0, b1) b0 = _mm_set_epi64x(m6, m14); b1 = _mm_set_epi64x(m4, m1) +#define LOAD_MSG_3_1(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m13) +#define LOAD_MSG_3_2(b0, b1) b0 = _mm_set_epi64x(m1, m9); b1 = _mm_set_epi64x(m14, m12) +#define LOAD_MSG_3_3(b0, b1) b0 = _mm_set_epi64x(m5, m2); b1 = _mm_set_epi64x(m15, m4) +#define LOAD_MSG_3_4(b0, b1) b0 = _mm_set_epi64x(m10, m6); b1 = _mm_set_epi64x(m8, m0) +#define LOAD_MSG_4_1(b0, b1) b0 = _mm_set_epi64x(m5, m9); b1 = _mm_set_epi64x(m10, m2) +#define LOAD_MSG_4_2(b0, b1) b0 = _mm_set_epi64x(m7, m0); b1 = _mm_set_epi64x(m15, m4) +#define LOAD_MSG_4_3(b0, b1) b0 = _mm_set_epi64x(m11, m14); b1 = _mm_set_epi64x(m3, m6) +#define LOAD_MSG_4_4(b0, b1) b0 = _mm_set_epi64x(m12, m1); b1 = _mm_set_epi64x(m13, m8) +#define LOAD_MSG_5_1(b0, b1) b0 = _mm_set_epi64x(m6, m2); b1 = _mm_set_epi64x(m8, m0) +#define LOAD_MSG_5_2(b0, b1) b0 = _mm_set_epi64x(m10, m12); b1 = _mm_set_epi64x(m3, m11) +#define LOAD_MSG_5_3(b0, b1) b0 = _mm_set_epi64x(m7, m4); b1 = _mm_set_epi64x(m1, m15) +#define LOAD_MSG_5_4(b0, b1) b0 = _mm_set_epi64x(m5, m13); b1 = _mm_set_epi64x(m9, m14) +#define LOAD_MSG_6_1(b0, b1) b0 = _mm_set_epi64x(m1, m12); b1 = _mm_set_epi64x(m4, m14) +#define LOAD_MSG_6_2(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m10, m13) +#define LOAD_MSG_6_3(b0, b1) b0 = _mm_set_epi64x(m6, m0); b1 = _mm_set_epi64x(m8, m9) +#define LOAD_MSG_6_4(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m2) +#define LOAD_MSG_7_1(b0, b1) b0 = _mm_set_epi64x(m7, m13); b1 = _mm_set_epi64x(m3, m12) +#define LOAD_MSG_7_2(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m9, m1) +#define LOAD_MSG_7_3(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m2, m8) +#define LOAD_MSG_7_4(b0, b1) b0 = _mm_set_epi64x(m4, m0); b1 = _mm_set_epi64x(m10, m6) +#define LOAD_MSG_8_1(b0, b1) b0 = _mm_set_epi64x(m14, m6); b1 = _mm_set_epi64x(m0, m11) +#define LOAD_MSG_8_2(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m8, m3) +#define LOAD_MSG_8_3(b0, b1) b0 = _mm_set_epi64x(m13, m12); b1 = _mm_set_epi64x(m10, m1) +#define LOAD_MSG_8_4(b0, b1) b0 = _mm_set_epi64x(m7, m2); b1 = _mm_set_epi64x(m5, m4) +#define LOAD_MSG_9_1(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m1, m7) +#define LOAD_MSG_9_2(b0, b1) b0 = _mm_set_epi64x(m4, m2); b1 = _mm_set_epi64x(m5, m6) +#define LOAD_MSG_9_3(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m13, m3) +#define 
LOAD_MSG_9_4(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m0, m12) +#define LOAD_MSG_10_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4) +#define LOAD_MSG_10_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5) +#define LOAD_MSG_10_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12) +#define LOAD_MSG_10_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13) +#define LOAD_MSG_11_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9) +#define LOAD_MSG_11_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15) +#define LOAD_MSG_11_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11) +#define LOAD_MSG_11_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7) + + +#endif + diff --git a/equi/blake2/blake2b-load-sse41.h b/equi/blake2/blake2b-load-sse41.h new file mode 100644 index 0000000..f6c1bc8 --- /dev/null +++ b/equi/blake2/blake2b-load-sse41.h @@ -0,0 +1,402 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . +*/ +#pragma once +#ifndef __BLAKE2B_LOAD_SSE41_H__ +#define __BLAKE2B_LOAD_SSE41_H__ + +#define LOAD_MSG_0_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m1); \ +b1 = _mm_unpacklo_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_0_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m0, m1); \ +b1 = _mm_unpackhi_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_0_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m4, m5); \ +b1 = _mm_unpacklo_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_0_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m5); \ +b1 = _mm_unpackhi_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_1_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m7, m2); \ +b1 = _mm_unpackhi_epi64(m4, m6); \ +} while(0) + + +#define LOAD_MSG_1_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m5, m4); \ +b1 = _mm_alignr_epi8(m3, m7, 8); \ +} while(0) + + +#define LOAD_MSG_1_3(b0, b1) \ +do \ +{ \ +b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ +b1 = _mm_unpackhi_epi64(m5, m2); \ +} while(0) + + +#define LOAD_MSG_1_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m6, m1); \ +b1 = _mm_unpackhi_epi64(m3, m1); \ +} while(0) + + +#define LOAD_MSG_2_1(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m6, m5, 8); \ +b1 = _mm_unpackhi_epi64(m2, m7); \ +} while(0) + + +#define LOAD_MSG_2_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m4, m0); \ +b1 = _mm_blend_epi16(m1, m6, 0xF0); \ +} while(0) + + +#define LOAD_MSG_2_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m5, m1, 0xF0); \ +b1 = _mm_unpackhi_epi64(m3, m4); \ +} while(0) + + +#define LOAD_MSG_2_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m7, m3); \ +b1 = _mm_alignr_epi8(m2, m0, 8); \ +} while(0) + + +#define LOAD_MSG_3_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m3, m1); \ +b1 = _mm_unpackhi_epi64(m6, m5); \ +} while(0) + + +#define LOAD_MSG_3_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m0); \ +b1 = _mm_unpacklo_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_3_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m1, m2, 0xF0); \ +b1 = _mm_blend_epi16(m2, m7, 0xF0); \ +} while(0) + + +#define 
LOAD_MSG_3_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m3, m5); \ +b1 = _mm_unpacklo_epi64(m0, m4); \ +} while(0) + + +#define LOAD_MSG_4_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m2); \ +b1 = _mm_unpacklo_epi64(m1, m5); \ +} while(0) + + +#define LOAD_MSG_4_2(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m0, m3, 0xF0); \ +b1 = _mm_blend_epi16(m2, m7, 0xF0); \ +} while(0) + + +#define LOAD_MSG_4_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m7, m5, 0xF0); \ +b1 = _mm_blend_epi16(m3, m1, 0xF0); \ +} while(0) + + +#define LOAD_MSG_4_4(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m6, m0, 8); \ +b1 = _mm_blend_epi16(m4, m6, 0xF0); \ +} while(0) + + +#define LOAD_MSG_5_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m1, m3); \ +b1 = _mm_unpacklo_epi64(m0, m4); \ +} while(0) + + +#define LOAD_MSG_5_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m6, m5); \ +b1 = _mm_unpackhi_epi64(m5, m1); \ +} while(0) + + +#define LOAD_MSG_5_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m2, m3, 0xF0); \ +b1 = _mm_unpackhi_epi64(m7, m0); \ +} while(0) + + +#define LOAD_MSG_5_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m6, m2); \ +b1 = _mm_blend_epi16(m7, m4, 0xF0); \ +} while(0) + + +#define LOAD_MSG_6_1(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m6, m0, 0xF0); \ +b1 = _mm_unpacklo_epi64(m7, m2); \ +} while(0) + + +#define LOAD_MSG_6_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m2, m7); \ +b1 = _mm_alignr_epi8(m5, m6, 8); \ +} while(0) + + +#define LOAD_MSG_6_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m3); \ +b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \ +} while(0) + + +#define LOAD_MSG_6_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m3, m1); \ +b1 = _mm_blend_epi16(m1, m5, 0xF0); \ +} while(0) + + +#define LOAD_MSG_7_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m6, m3); \ +b1 = _mm_blend_epi16(m6, m1, 0xF0); \ +} while(0) + + +#define LOAD_MSG_7_2(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m7, m5, 8); \ +b1 = _mm_unpackhi_epi64(m0, m4); \ +} while(0) + + +#define LOAD_MSG_7_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m2, m7); \ +b1 = _mm_unpacklo_epi64(m4, m1); \ +} while(0) + + +#define LOAD_MSG_7_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m2); \ +b1 = _mm_unpacklo_epi64(m3, m5); \ +} while(0) + + +#define LOAD_MSG_8_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m3, m7); \ +b1 = _mm_alignr_epi8(m0, m5, 8); \ +} while(0) + + +#define LOAD_MSG_8_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m7, m4); \ +b1 = _mm_alignr_epi8(m4, m1, 8); \ +} while(0) + + +#define LOAD_MSG_8_3(b0, b1) \ +do \ +{ \ +b0 = m6; \ +b1 = _mm_alignr_epi8(m5, m0, 8); \ +} while(0) + + +#define LOAD_MSG_8_4(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m1, m3, 0xF0); \ +b1 = m2; \ +} while(0) + + +#define LOAD_MSG_9_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m5, m4); \ +b1 = _mm_unpackhi_epi64(m3, m0); \ +} while(0) + + +#define LOAD_MSG_9_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m1, m2); \ +b1 = _mm_blend_epi16(m3, m2, 0xF0); \ +} while(0) + + +#define LOAD_MSG_9_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m7, m4); \ +b1 = _mm_unpackhi_epi64(m1, m6); \ +} while(0) + + +#define LOAD_MSG_9_4(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m7, m5, 8); \ +b1 = _mm_unpacklo_epi64(m6, m0); \ +} while(0) + + +#define LOAD_MSG_10_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m1); \ +b1 = _mm_unpacklo_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_10_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m0, m1); \ +b1 = _mm_unpackhi_epi64(m2, m3); \ +} while(0) + + +#define 
LOAD_MSG_10_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m4, m5); \ +b1 = _mm_unpacklo_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_10_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m5); \ +b1 = _mm_unpackhi_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_11_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m7, m2); \ +b1 = _mm_unpackhi_epi64(m4, m6); \ +} while(0) + + +#define LOAD_MSG_11_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m5, m4); \ +b1 = _mm_alignr_epi8(m3, m7, 8); \ +} while(0) + + +#define LOAD_MSG_11_3(b0, b1) \ +do \ +{ \ +b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ +b1 = _mm_unpackhi_epi64(m5, m2); \ +} while(0) + + +#define LOAD_MSG_11_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m6, m1); \ +b1 = _mm_unpackhi_epi64(m3, m1); \ +} while(0) + + +#endif + diff --git a/equi/blake2/blake2b-round.h b/equi/blake2/blake2b-round.h new file mode 100644 index 0000000..b39106b --- /dev/null +++ b/equi/blake2/blake2b-round.h @@ -0,0 +1,170 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . +*/ +#pragma once +#ifndef __BLAKE2B_ROUND_H__ +#define __BLAKE2B_ROUND_H__ + +#define LOAD(p) _mm_load_si128( (const __m128i *)(p) ) +#define STORE(p,r) _mm_store_si128((__m128i *)(p), r) + +#define LOADU(p) _mm_loadu_si128( (const __m128i *)(p) ) +#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r) + +#define TOF(reg) _mm_castsi128_ps((reg)) +#define TOI(reg) _mm_castps_si128((reg)) + +#define LIKELY(x) __builtin_expect((x),1) + + +/* Microarchitecture-specific macros */ +#ifndef HAVE_XOP +#ifdef HAVE_SSSE3 +#define _mm_roti_epi64(x, c) \ + (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \ + : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \ + : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \ + : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \ + : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))) +#else +#define _mm_roti_epi64(r, c) _mm_xor_si128(_mm_srli_epi64( (r), -(c) ),_mm_slli_epi64( (r), 64-(-c) )) +#endif +#else +/* ... 
*/ +#endif + + + +#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, (-32)); \ + row4h = _mm_roti_epi64(row4h, (-32)); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, (-24)); \ + row2h = _mm_roti_epi64(row2h, (-24)); \ + +#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, (-16)); \ + row4h = _mm_roti_epi64(row4h, (-16)); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, (-63)); \ + row2h = _mm_roti_epi64(row2h, (-63)); \ + +#if defined(HAVE_SSSE3) +#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2h, row2l, 8); \ + t1 = _mm_alignr_epi8(row2l, row2h, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4h, row4l, 8); \ + t1 = _mm_alignr_epi8(row4l, row4h, 8); \ + row4l = t1; \ + row4h = t0; + +#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2l, row2h, 8); \ + t1 = _mm_alignr_epi8(row2h, row2l, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4l, row4h, 8); \ + t1 = _mm_alignr_epi8(row4h, row4l, 8); \ + row4l = t1; \ + row4h = t0; +#else + +#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = row4l;\ + t1 = row2l;\ + row4l = row3l;\ + row3l = row3h;\ + row3h = row4l;\ + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \ + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \ + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \ + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)) + +#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = row3l;\ + row3l = row3h;\ + row3h = t0;\ + t0 = row2l;\ + t1 = row4l;\ + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \ + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \ + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \ + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)) + +#endif + +#if defined(HAVE_SSE41) +#include "blake2b-load-sse41.h" +#else +#include "blake2b-load-sse2.h" +#endif + +#define ROUND(r) \ + LOAD_MSG_ ##r ##_1(b0, b1); \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + LOAD_MSG_ ##r ##_2(b0, b1); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + LOAD_MSG_ ##r ##_3(b0, b1); \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + LOAD_MSG_ ##r ##_4(b0, b1); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); + 
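For readers comparing this header against the BLAKE2b specification: the G1/G2 macro pair above is the standard BLAKE2b mixing function G split into two halves (rotations by 32/24, then 16/63), each half applied to two column or diagonal pairs packed per __m128i register. A scalar sketch of the same step, for reference only (blake2b_g and ROTR64 are illustrative names, not part of the patch):

#include <stdint.h>

#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))

/* One BLAKE2b G mixing step on the 16-word working state v.
 * The SIMD G1 macro above is the first half (rotates by 32 and 24),
 * G2 the second half (rotates by 16 and 63). */
static void blake2b_g(uint64_t v[16], int a, int b, int c, int d,
                      uint64_t x, uint64_t y)
{
	v[a] = v[a] + v[b] + x;  v[d] = ROTR64(v[d] ^ v[a], 32);
	v[c] = v[c] + v[d];      v[b] = ROTR64(v[b] ^ v[c], 24);
	v[a] = v[a] + v[b] + y;  v[d] = ROTR64(v[d] ^ v[a], 16);
	v[c] = v[c] + v[d];      v[b] = ROTR64(v[b] ^ v[c], 63);
}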
+#endif + +#define BLAKE2_ROUND(row1l,row1h,row2l,row2h,row3l,row3h,row4l,row4h) \ + G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + \ + DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + \ + G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + \ + UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); diff --git a/equi/blake2/blake2bx.cpp b/equi/blake2/blake2bx.cpp new file mode 100644 index 0000000..6f5c041 --- /dev/null +++ b/equi/blake2/blake2bx.cpp @@ -0,0 +1,262 @@ +/* + BLAKE2 reference source code package - optimized C implementations + + Written in 2012 by Samuel Neves + + To the extent possible under law, the author(s) have dedicated all copyright + and related and neighboring rights to this software to the public domain + worldwide. This software is distributed without any warranty. + + You should have received a copy of the CC0 Public Domain Dedication along with + this software. If not, see . +*/ + +#include +#include +#include + + +#include "blake2.h" +#include "blake2-impl.h" + +#include "blake2-config.h" + +#ifdef WIN32 +#include +#endif + +#include +#if defined(HAVE_SSSE3) +#include +#endif +#if defined(HAVE_SSE41) +#include +#endif +#if defined(HAVE_AVX) +#include +#endif +#if defined(HAVE_XOP) +#include +#endif + +#include "blake2b-round.h" + +ALIGN(64) static const uint64_t blake2b_IV[8] = { + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +}; + +/* init xors IV with input parameter block */ +int eq_blake2b_init_param(blake2b_state *S, const blake2b_param *P) +{ + //blake2b_init0( S ); + const uint8_t * v = (const uint8_t *)(blake2b_IV); + const uint8_t * p = (const uint8_t *)(P); + uint8_t * h = (uint8_t *)(S->h); + /* IV XOR ParamBlock */ + memset(S, 0, sizeof(blake2b_state)); + + for (int i = 0; i < BLAKE2B_OUTBYTES; ++i) h[i] = v[i] ^ p[i]; + + return 0; +} + +/* Some sort of default parameter block initialization, for sequential blake2b */ +int eq_blake2b_init(blake2b_state *S, const uint8_t outlen) +{ + if ((!outlen) || (outlen > BLAKE2B_OUTBYTES)) return -1; + + const blake2b_param P = + { + outlen, + 0, + 1, + 1, + 0, + 0, + 0, + 0, + { 0 }, + { 0 }, + { 0 } + }; + return eq_blake2b_init_param(S, &P); +} + +int eq_blake2b_init_key(blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen) +{ + if ((!outlen) || (outlen > BLAKE2B_OUTBYTES)) return -1; + + if ((!keylen) || keylen > BLAKE2B_KEYBYTES) return -1; + + const blake2b_param P = + { + outlen, + keylen, + 1, + 1, + 0, + 0, + 0, + 0, + { 0 }, + { 0 }, + { 0 } + }; + + if (eq_blake2b_init_param(S, &P) < 0) + return 0; + + { + uint8_t block[BLAKE2B_BLOCKBYTES]; + memset(block, 0, BLAKE2B_BLOCKBYTES); + memcpy(block, key, keylen); + eq_blake2b_update(S, block, BLAKE2B_BLOCKBYTES); + secure_zero_memory(block, BLAKE2B_BLOCKBYTES); /* Burn the key from stack */ + } + return 0; +} + +static inline int blake2b_compress(blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES]) +{ + __m128i row1l, row1h; + __m128i row2l, row2h; + __m128i row3l, row3h; + __m128i row4l, row4h; + __m128i b0, b1; + __m128i t0, t1; +#if defined(HAVE_SSSE3) && !defined(HAVE_XOP) + const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); + const __m128i r24 = 
_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); +#endif +#if defined(HAVE_SSE41) + const __m128i m0 = LOADU(block + 00); + const __m128i m1 = LOADU(block + 16); + const __m128i m2 = LOADU(block + 32); + const __m128i m3 = LOADU(block + 48); + const __m128i m4 = LOADU(block + 64); + const __m128i m5 = LOADU(block + 80); + const __m128i m6 = LOADU(block + 96); + const __m128i m7 = LOADU(block + 112); +#else + const uint64_t m0 = ( ( uint64_t * )block )[ 0]; + const uint64_t m1 = ( ( uint64_t * )block )[ 1]; + const uint64_t m2 = ( ( uint64_t * )block )[ 2]; + const uint64_t m3 = ( ( uint64_t * )block )[ 3]; + const uint64_t m4 = ( ( uint64_t * )block )[ 4]; + const uint64_t m5 = ( ( uint64_t * )block )[ 5]; + const uint64_t m6 = ( ( uint64_t * )block )[ 6]; + const uint64_t m7 = ( ( uint64_t * )block )[ 7]; + const uint64_t m8 = ( ( uint64_t * )block )[ 8]; + const uint64_t m9 = ( ( uint64_t * )block )[ 9]; + const uint64_t m10 = ( ( uint64_t * )block )[10]; + const uint64_t m11 = ( ( uint64_t * )block )[11]; + const uint64_t m12 = ( ( uint64_t * )block )[12]; + const uint64_t m13 = ( ( uint64_t * )block )[13]; + const uint64_t m14 = ( ( uint64_t * )block )[14]; + const uint64_t m15 = ( ( uint64_t * )block )[15]; +#endif + row1l = LOADU(&S->h[0]); + row1h = LOADU(&S->h[2]); + row2l = LOADU(&S->h[4]); + row2h = LOADU(&S->h[6]); + row3l = LOADU(&blake2b_IV[0]); + row3h = LOADU(&blake2b_IV[2]); + row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), _mm_set_epi32(0, 0, 0, S->counter)); + row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), _mm_set_epi32(0, 0, 0L - S->lastblock, 0L - S->lastblock)); + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + ROUND(10); + ROUND(11); + row1l = _mm_xor_si128(row3l, row1l); + row1h = _mm_xor_si128(row3h, row1h); + STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l)); + STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h)); + row2l = _mm_xor_si128(row4l, row2l); + row2h = _mm_xor_si128(row4h, row2h); + STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l)); + STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h)); + return 0; +} + +int eq_blake2b_update(blake2b_state *S, const uint8_t *in, uint64_t inlen) +{ + while (inlen > 0) + { + size_t left = S->buflen; + size_t fill = BLAKE2B_BLOCKBYTES - left; + + if (inlen > fill) + { + memcpy(S->buf + left, in, fill); // Fill buffer + in += fill; + inlen -= fill; + S->counter += BLAKE2B_BLOCKBYTES; + blake2b_compress(S, S->buf); // Compress + S->buflen = 0; + } + else // inlen <= fill + { + memcpy(S->buf + left, in, inlen); + S->buflen += (uint8_t) inlen; // not enough to compress + in += inlen; + inlen = 0; + } + } + + return 0; +} + +int eq_blake2b_final(blake2b_state *S, uint8_t *out, uint8_t outlen) +{ + if (outlen > BLAKE2B_OUTBYTES) + return -1; + + if (S->buflen > BLAKE2B_BLOCKBYTES) + { + S->counter += BLAKE2B_BLOCKBYTES; + blake2b_compress(S, S->buf); + S->buflen -= BLAKE2B_BLOCKBYTES; + memcpy(S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen); + } + + S->counter += S->buflen; + S->lastblock = 1; + memset(S->buf + S->buflen, 0, BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */ + blake2b_compress(S, S->buf); + memcpy(out, &S->h[0], outlen); + S->lastblock = 0; + return 0; +} + +int eq_blake2b(uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen) +{ + blake2b_state S[1]; + + /* Verify parameters */ + if (!in || !out) return -1; + if (NULL == key) keylen = 0; + + if (keylen) + { + if 
(eq_blake2b_init_key(S, outlen, key, keylen) < 0) return -1; + } + else + { + if (eq_blake2b_init(S, outlen) < 0) return -1; + } + + eq_blake2b_update(S, (const uint8_t *)in, inlen); + eq_blake2b_final(S, out, outlen); + return 0; +} diff --git a/equi/cuda_equi.cu b/equi/cuda_equi.cu new file mode 100644 index 0000000..7a45ac8 --- /dev/null +++ b/equi/cuda_equi.cu @@ -0,0 +1,2117 @@ +/* + * Equihash solver created by djeZo (l33tsoftw@gmail.com) for NiceHash + * Adapted to be more compatible with older C++ compilers + * + * cuda_djezo solver was released by NiceHash (www.nicehash.com) under + * GPL 3.0 license. If you don't have a copy, you can obtain one from + * https://www.gnu.org/licenses/gpl-3.0.txt + * + * Based on CUDA solver by John Tromp released under MIT license. + * Some helper functions taken out of OpenCL solver by Marc Bevand + * released under MIT license. + * + * Copyright (c) 2016 John Tromp, Marc Bevand + * Copyright (c) 2017 djeZo, Tanguy Pruvot (GPL v3) + */ + +#ifdef WIN32 +#include +#endif + +#include +#include +//#include + +#include "equihash.h" +#include "eqcuda.hpp" // eq_cuda_context + +#include "blake2/blake2.h" + +//#define WN 200 +//#define WK 9 +#ifndef MAX_GPUS +#define MAX_GPUS 16 +#endif + +#define NDIGITS (WK+1) +#define DIGITBITS (WN/(NDIGITS)) +#define PROOFSIZE (1< +#include +#define __launch_bounds__(max_tpb, min_blocks) +#define __CUDA_ARCH__ 520 +uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z); +uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z); +uint32_t __shfl(uint32_t x, uint32_t y, uint32_t z); +uint32_t atomicExch(uint32_t *x, uint32_t y); +uint32_t atomicAdd(uint32_t *x, uint32_t y); +void __syncthreads(void); +void __threadfence(void); +void __threadfence_block(void); +uint32_t __ldg(const uint32_t* address); +uint64_t __ldg(const uint64_t* address); +uint4 __ldca(const uint4 *ptr); +u32 __ldca(const u32 *ptr); +u32 umin(const u32, const u32); +u32 umax(const u32, const u32); +#endif + +typedef u32 proof[PROOFSIZE]; + +struct __align__(32) slot { + u32 hash[8]; +}; + +struct __align__(16) slotsmall { + u32 hash[4]; +}; + +struct __align__(8) slottiny { + u32 hash[2]; +}; + +template +struct equi +{ + slot round0trees[4096][RB8_NSLOTS]; + slot trees[1][NBUCKETS][NSLOTS]; + struct { + slotsmall treessmall[NSLOTS]; + slottiny treestiny[NSLOTS]; + } round2trees[NBUCKETS]; + struct { + slotsmall treessmall[NSLOTS]; + slottiny treestiny[NSLOTS]; + } round3trees[NBUCKETS]; + slotsmall treessmall[4][NBUCKETS][NSLOTS]; + slottiny treestiny[1][4096][RB8_NSLOTS_LD]; + u32 round4bidandsids[NBUCKETS][NSLOTS]; + union { + u64 blake_h[8]; + u32 blake_h32[16]; + }; + struct { + u32 nslots8[4096]; + u32 nslots0[4096]; + u32 nslots[9][NBUCKETS]; + scontainerreal srealcont; + } edata; +}; + +// todo: use cuda_helper.h and/or cuda_vector.h +__device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) +{ + return make_uint2(a.x ^ b.x, a.y ^ b.y); +} + +__device__ __forceinline__ uint4 operator^ (uint4 a, uint4 b) +{ + return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} + +// for ROR 63 (or ROL 1); this func only support (32 <= offset < 64) +__device__ __forceinline__ uint2 ROR2(const uint2 a, const int offset) +{ + uint2 result; +#if __CUDA_ARCH__ > 300 + { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } +#else + result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset))); + result.x = ((a.y >> 
(offset - 32)) | (a.x << (64 - offset))); +#endif + return result; +} + + +__device__ __forceinline__ uint2 SWAPUINT2(uint2 value) +{ + return make_uint2(value.y, value.x); +} + +__device__ __forceinline__ uint2 ROR24(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x2107); + result.y = __byte_perm(a.y, a.x, 0x6543); + return result; +} + +__device__ __forceinline__ uint2 ROR16(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x1076); + result.y = __byte_perm(a.y, a.x, 0x5432); + return result; +} + +__device__ __forceinline__ void G2(u64 & a, u64 & b, u64 & c, u64 & d, u64 x, u64 y) +{ + a = a + b + x; + ((uint2*)&d)[0] = SWAPUINT2(((uint2*)&d)[0] ^ ((uint2*)&a)[0]); + c = c + d; + ((uint2*)&b)[0] = ROR24(((uint2*)&b)[0] ^ ((uint2*)&c)[0]); + a = a + b + y; + ((uint2*)&d)[0] = ROR16(((uint2*)&d)[0] ^ ((uint2*)&a)[0]); + c = c + d; + ((uint2*)&b)[0] = ROR2(((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U); +} + +// untested.. +struct packer_default +{ + __device__ __forceinline__ static u32 set_bucketid_and_slots(const u32 bucketid, const u32 s0, const u32 s1, const u32 RB, const u32 SM) + { + return (((bucketid << SLOTBITS) | s0) << SLOTBITS) | s1; + } + + __device__ __forceinline__ static u32 get_bucketid(const u32 bid, const u32 RB, const u32 SM) + { + // BUCKMASK-ed to prevent illegal memory accesses in case of memory errors + return (bid >> (2 * SLOTBITS)) & BUCKMASK; + } + + __device__ __forceinline__ static u32 get_slot0(const u32 bid, const u32 s1, const u32 RB, const u32 SM) + { + return bid & SLOTMASK; + } + + __device__ __forceinline__ static u32 get_slot1(const u32 bid, const u32 RB, const u32 SM) + { + return (bid >> SLOTBITS) & SLOTMASK; + } +}; + + +struct packer_cantor +{ + __device__ __forceinline__ static u32 cantor(const u32 s0, const u32 s1) + { + u32 a = umax(s0, s1); + u32 b = umin(s0, s1); + return a * (a + 1) / 2 + b; + } + + __device__ __forceinline__ static u32 set_bucketid_and_slots(const u32 bucketid, const u32 s0, const u32 s1, const u32 RB, const u32 SM) + { + return (bucketid << CANTORBITS) | cantor(s0, s1); + } + + __device__ __forceinline__ static u32 get_bucketid(const u32 bid, const u32 RB, const u32 SM) + { + return (bid >> CANTORBITS) & BUCKMASK; + } + + __device__ __forceinline__ static u32 get_slot0(const u32 bid, const u32 s1, const u32 RB, const u32 SM) + { + return ((bid & CANTORMASK) - cantor(0, s1)) & SLOTMASK; + } + + __device__ __forceinline__ static u32 get_slot1(const u32 bid, const u32 RB, const u32 SM) + { + u32 k, q, sqr = 8 * (bid & CANTORMASK) + 1; + // this k=sqrt(sqr) computing loop averages 3.4 iterations out of maximum 9 + for (k = CANTORMAXSQRT; (q = sqr / k) < k; k = (k + q) / 2); + return ((k - 1) / 2) & SLOTMASK; + } +}; + +__device__ __constant__ const u64 blake_iv[] = { + 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, + 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1, + 0x510e527fade682d1, 0x9b05688c2b3e6c1f, + 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179, +}; + +#if CUDART_VERSION < 8000 || !defined(__ldca) +#define __ldca(ptr) *(ptr) +#endif + +template +__global__ void digit_first(equi* eq, u32 nonce) +{ + const u32 block = blockIdx.x * blockDim.x + threadIdx.x; + __shared__ u64 hash_h[8]; + u32* hash_h32 = (u32*)hash_h; + + if (threadIdx.x < 16) + hash_h32[threadIdx.x] = __ldca(&eq->blake_h32[threadIdx.x]); + + __syncthreads(); + + u64 m = (u64)block << 32 | (u64)nonce; + + union + { + u64 v[16]; + u32 v32[32]; + uint4 v128[8]; + }; + + v[0] = hash_h[0]; + v[1] = hash_h[1]; + v[2] = hash_h[2]; + v[3] = hash_h[3]; + 
v[4] = hash_h[4]; + v[5] = hash_h[5]; + v[6] = hash_h[6]; + v[7] = hash_h[7]; + v[8] = blake_iv[0]; + v[9] = blake_iv[1]; + v[10] = blake_iv[2]; + v[11] = blake_iv[3]; + v[12] = blake_iv[4] ^ (128 + 16); + v[13] = blake_iv[5]; + v[14] = blake_iv[6] ^ 0xffffffffffffffff; + v[15] = blake_iv[7]; + + // mix 1 + G2(v[0], v[4], v[8], v[12], 0, m); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 2 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], m, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 3 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, m); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 4 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, m); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 5 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, m); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 6 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], m, 0); + + // mix 7 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], m, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 8 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, m); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 9 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], m, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 10 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], m, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 11 + G2(v[0], v[4], v[8], v[12], 0, m); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + 
G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], 0, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + // mix 12 + G2(v[0], v[4], v[8], v[12], 0, 0); + G2(v[1], v[5], v[9], v[13], 0, 0); + G2(v[2], v[6], v[10], v[14], 0, 0); + G2(v[3], v[7], v[11], v[15], 0, 0); + G2(v[0], v[5], v[10], v[15], m, 0); + G2(v[1], v[6], v[11], v[12], 0, 0); + G2(v[2], v[7], v[8], v[13], 0, 0); + G2(v[3], v[4], v[9], v[14], 0, 0); + + v[0] ^= hash_h[0] ^ v[8]; + v[1] ^= hash_h[1] ^ v[9]; + v[2] ^= hash_h[2] ^ v[10]; + v[3] ^= hash_h[3] ^ v[11]; + v[4] ^= hash_h[4] ^ v[12]; + v[5] ^= hash_h[5] ^ v[13]; + v32[12] ^= hash_h32[12] ^ v32[28]; + + u32 bexor = __byte_perm(v32[0], 0, 0x4012); // first 20 bits + u32 bucketid; + asm("bfe.u32 %0, %1, 12, 12;" : "=r"(bucketid) : "r"(bexor)); + u32 slotp = atomicAdd(&eq->edata.nslots0[bucketid], 1); + if (slotp < RB8_NSLOTS) + { + slot* s = &eq->round0trees[bucketid][slotp]; + + uint4 tt; + tt.x = __byte_perm(v32[0], v32[1], 0x1234); + tt.y = __byte_perm(v32[1], v32[2], 0x1234); + tt.z = __byte_perm(v32[2], v32[3], 0x1234); + tt.w = __byte_perm(v32[3], v32[4], 0x1234); + *(uint4*)(&s->hash[0]) = tt; + + tt.x = __byte_perm(v32[4], v32[5], 0x1234); + tt.y = __byte_perm(v32[5], v32[6], 0x1234); + tt.z = 0; + tt.w = block << 1; + *(uint4*)(&s->hash[4]) = tt; + } + + bexor = __byte_perm(v32[6], 0, 0x0123); + asm("bfe.u32 %0, %1, 12, 12;" : "=r"(bucketid) : "r"(bexor)); + slotp = atomicAdd(&eq->edata.nslots0[bucketid], 1); + if (slotp < RB8_NSLOTS) + { + slot* s = &eq->round0trees[bucketid][slotp]; + + uint4 tt; + tt.x = __byte_perm(v32[6], v32[7], 0x2345); + tt.y = __byte_perm(v32[7], v32[8], 0x2345); + tt.z = __byte_perm(v32[8], v32[9], 0x2345); + tt.w = __byte_perm(v32[9], v32[10], 0x2345); + *(uint4*)(&s->hash[0]) = tt; + + tt.x = __byte_perm(v32[10], v32[11], 0x2345); + tt.y = __byte_perm(v32[11], v32[12], 0x2345); + tt.z = 0; + tt.w = (block << 1) + 1; + *(uint4*)(&s->hash[4]) = tt; + } +} + +/* + Functions digit_1 to digit_8 works by the same principle; + Each thread does 2-3 slot loads (loads are coalesced). + Xorwork of slots is loaded into shared memory and is kept in registers (except for digit_1). + At the same time, restbits (8 or 9 bits) in xorwork are used for collisions. + Restbits determine position in ht. + Following next is pair creation. First one (or two) pairs' xorworks are put into global memory + as soon as possible, the rest pairs are saved in shared memory (one u32 per pair - 16 bit indices). + In most cases, all threads have one (or two) pairs so with this trick, we offload memory writes a bit in last step. + In last step we save xorwork of pairs in memory. 
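  Concretely, as the code below shows: each 20-bit digit of a slot's xorwork is split into
  a bucket index (BUCKBITS bits extracted with bfe at offset RB, selecting one of the
  NBUCKETS global-memory buckets) and the remaining 8 or 9 restbits, which select a row of
  the per-block shared-memory table ht[]. Slots whose restbits collide are paired; the pair's
  xorwork is computed, and its top bits pick the bucket and slot written for the next digit.
  Rows beyond SSM-1 entries and buckets beyond NSLOTS entries are simply dropped.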
+*/ +template +__global__ void digit_1(equi* eq) +{ + __shared__ u16 ht[256][SSM - 1]; + __shared__ uint2 lastword1[RB8_NSLOTS]; + __shared__ uint4 lastword2[RB8_NSLOTS]; + __shared__ int ht_len[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < 256) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + u32 bsize = umin(eq->edata.nslots0[bucketid], RB8_NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + uint2 ta[2]; + uint4 tb[2]; + + u32 si[2]; + + // enable this to make fully safe shared mem operations; + // disabled gains some speed, but can rarely cause a crash + //__syncthreads(); + + #pragma unroll + for (u32 i = 0; i != 2; ++i) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + const slot* pslot1 = eq->round0trees[bucketid] + si[i]; + + // get xhash + uint4 a1 = *(uint4*)(&pslot1->hash[0]); + uint2 a2 = *(uint2*)(&pslot1->hash[4]); + ta[i].x = a1.x; + ta[i].y = a1.y; + lastword1[si[i]] = ta[i]; + tb[i].x = a1.z; + tb[i].y = a1.w; + tb[i].z = a2.x; + tb[i].w = a2.y; + lastword2[si[i]] = tb[i]; + + asm("bfe.u32 %0, %1, 20, 8;" : "=r"(hr[i]) : "r"(ta[i].x)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + int* pairs = ht_len; + + u32 xors[6]; + u32 xorbucketid, xorslot; + + #pragma unroll + for (u32 i = 0; i != 2; ++i) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint2*)(&xors[0]) = ta[i] ^ lastword1[p]; + + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[1][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[2]) = lastword2[si[i]] ^ lastword2[p]; + + slot &xs = eq->trees[0][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + uint4 ttx; + ttx.x = xors[5]; + ttx.y = xors[0]; + ttx.z = packer_default::set_bucketid_and_slots(bucketid, si[i], p, 8, RB8_NSLOTS); + ttx.w = 0; + *(uint4*)(&xs.hash[4]) = ttx; + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + *(uint2*)(&xors[0]) = lastword1[i] ^ lastword1[k]; + + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[1][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[2]) = lastword2[i] ^ lastword2[k]; + + slot &xs = eq->trees[0][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + uint4 ttx; + ttx.x = xors[5]; + ttx.y = xors[0]; + ttx.z = packer_default::set_bucketid_and_slots(bucketid, i, k, 8, RB8_NSLOTS); + ttx.w = 0; + *(uint4*)(&xs.hash[4]) = ttx; + } + } +} + + +template +__global__ void digit_2(equi* eq) +{ + __shared__ u16 ht[NRESTS][SSM - 1]; + __shared__ u32 lastword1[NSLOTS]; + __shared__ uint4 lastword2[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + 
__shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + slot* buck = eq->trees[0][bucketid]; + u32 bsize = umin(eq->edata.nslots[1][bucketid], NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + u32 ta[2]; + uint4 tt[2]; + + u32 si[2]; + + // enable this to make fully safe shared mem operations; + // disabled gains some speed, but can rarely cause a crash + //__syncthreads(); + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + // get slot + const slot* pslot1 = buck + si[i]; + + uint4 ttx = *(uint4*)(&pslot1->hash[0]); + lastword1[si[i]] = ta[i] = ttx.x; + uint2 tty = *(uint2*)(&pslot1->hash[4]); + tt[i].x = ttx.y; + tt[i].y = ttx.z; + tt[i].z = ttx.w; + tt[i].w = tty.x; + lastword2[si[i]] = tt[i]; + + hr[i] = tty.y & RESTMASK; + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[5]; + u32 xorbucketid, xorslot; + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + xors[0] = ta[i] ^ lastword1[p]; + + xorbucketid = xors[0] >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[2][xorbucketid], 1); + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[1]) = tt[i] ^ lastword2[p]; + slotsmall &xs = eq->round2trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + slottiny &xst = eq->round2trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = xors[4]; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + xors[0] = lastword1[i] ^ lastword1[k]; + + xorbucketid = xors[0] >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[2][xorbucketid], 1); + if (xorslot < NSLOTS) + { + *(uint4*)(&xors[1]) = lastword2[i] ^ lastword2[k]; + slotsmall &xs = eq->round2trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + slottiny &xst = eq->round2trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = xors[4]; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + } +} + + +template +__global__ void digit_3(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint4 lastword1[NSLOTS]; + __shared__ u32 lastword2[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + u32 bsize = umin(eq->edata.nslots[2][bucketid], NSLOTS); + + u32 hr[2]; + int 
pos[2]; + pos[0] = pos[1] = SSM; + + u32 si[2]; + uint4 tt[2]; + u32 ta[2]; + + // enable this to make fully safe shared mem operations; + // disabled gains some speed, but can rarely cause a crash + //__syncthreads(); + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + slotsmall &xs = eq->round2trees[bucketid].treessmall[si[i]]; + slottiny &xst = eq->round2trees[bucketid].treestiny[si[i]]; + + tt[i] = *(uint4*)(&xs.hash[0]); + lastword1[si[i]] = tt[i]; + ta[i] = xst.hash[0]; + lastword2[si[i]] = ta[i]; + asm("bfe.u32 %0, %1, 12, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[5]; + u32 bexor, xorbucketid, xorslot; + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + xors[4] = ta[i] ^ lastword2[p]; + + if (xors[4] != 0) + { + *(uint4*)(&xors[0]) = tt[i] ^ lastword1[p]; + + bexor = __byte_perm(xors[0], xors[1], 0x2107); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[3][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->round3trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + slottiny &xst = eq->round3trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = bexor; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + xors[4] = lastword2[i] ^ lastword2[k]; + + if (xors[4] != 0) + { + *(uint4*)(&xors[0]) = lastword1[i] ^ lastword1[k]; + + bexor = __byte_perm(xors[0], xors[1], 0x2107); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[3][xorbucketid], 1); + + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->round3trees[xorbucketid].treessmall[xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[1]); + slottiny &xst = eq->round3trees[xorbucketid].treestiny[xorslot]; + uint2 ttx; + ttx.x = bexor; + ttx.y = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint2*)(&xst.hash[0]) = ttx; + } + } + } +} + + +template +__global__ void digit_4(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint4 lastword[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + u32 bsize = umin(eq->edata.nslots[3][bucketid], NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + u32 si[2]; + uint4 tt[2]; + + // enable this to make fully safe shared mem operations; + // disabled gains some speed, but can rarely 
cause a crash + //__syncthreads(); + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + slotsmall &xs = eq->round3trees[bucketid].treessmall[si[i]]; + slottiny &xst = eq->round3trees[bucketid].treestiny[si[i]]; + + // get xhash + tt[i] = *(uint4*)(&xs.hash[0]); + lastword[si[i]] = tt[i]; + hr[i] = xst.hash[0] & RESTMASK; + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + u32 xors[4]; + u32 xorbucketid, xorslot; + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint4*)(&xors[0]) = tt[i] ^ lastword[p]; + + if (xors[3] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(4 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[4][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[3][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + + eq->round4bidandsids[xorbucketid][xorslot] = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + } + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + *(uint4*)(&xors[0]) = lastword[i] ^ lastword[k]; + if (xors[3] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(4 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[4][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[3][xorbucketid][xorslot]; + *(uint4*)(&xs.hash[0]) = *(uint4*)(&xors[0]); + eq->round4bidandsids[xorbucketid][xorslot] = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + } + } + } +} + + +template +__global__ void digit_5(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint4 lastword[NSLOTS]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + if (threadid < NRESTS) + ht_len[threadid] = 0; + else if (threadid == (THREADS - 1)) + pairs_len = 0; + else if (threadid == (THREADS - 33)) + next_pair = 0; + + slotsmall* buck = eq->treessmall[3][bucketid]; + u32 bsize = umin(eq->edata.nslots[4][bucketid], NSLOTS); + + u32 hr[2]; + int pos[2]; + pos[0] = pos[1] = SSM; + + u32 si[2]; + uint4 tt[2]; + + // enable this to make fully safe shared mem operations; + // disabled gains some speed, but can rarely cause a crash + //__syncthreads(); + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + si[i] = i * THREADS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + tt[i] = *(uint4*)(&pslot1->hash[0]); + lastword[si[i]] = tt[i]; + asm("bfe.u32 %0, %1, 4, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + u32 xors[4]; + u32 bexor, xorbucketid, xorslot; + + #pragma unroll 2 + for (u32 i = 0; i < 2; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = 
ht[hr[i]][0]; + + *(uint4*)(&xors[0]) = tt[i] ^ lastword[p]; + + if (xors[3] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x1076); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[5][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[2][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = xors[3]; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + for (int k = 1; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + u32 i, k; + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + i = __byte_perm(pair, 0, 0x4510); + k = __byte_perm(pair, 0, 0x4532); + + *(uint4*)(&xors[0]) = lastword[i] ^ lastword[k]; + + if (xors[3] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x1076); + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(bexor), "r"(RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[5][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[2][xorbucketid][xorslot]; + uint4 tt; + tt.x = xors[1]; + tt.y = xors[2]; + tt.z = xors[3]; + tt.w = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint4*)(&xs.hash[0]) = tt; + } + } + } +} + + +template +__global__ void digit_6(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ uint2 lastword1[NSLOTS]; + __shared__ u32 lastword2[NSLOTS]; + __shared__ int ht_len[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 bsize_sh; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + ht_len[threadid] = 0; + if (threadid == (NRESTS - 1)) + { + pairs_len = 0; + next_pair = 0; + } + else if (threadid == (NRESTS - 33)) + bsize_sh = umin(eq->edata.nslots[5][bucketid], NSLOTS); + + slotsmall* buck = eq->treessmall[2][bucketid]; + + u32 hr[3]; + int pos[3]; + pos[0] = pos[1] = pos[2] = SSM; + + u32 si[3]; + uint4 tt[3]; + + __syncthreads(); + + u32 bsize = bsize_sh; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + si[i] = i * NRESTS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + tt[i] = *(uint4*)(&pslot1->hash[0]); + lastword1[si[i]] = *(uint2*)(&tt[i].x); + lastword2[si[i]] = tt[i].z; + asm("bfe.u32 %0, %1, 16, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + // doing this to save shared memory + int* pairs = ht_len; + __syncthreads(); + + u32 xors[3]; + u32 bexor, xorbucketid, xorslot; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + xors[2] = tt[i].z ^ lastword2[p]; + + if (xors[2] != 0) + { + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ lastword1[p]; + + bexor = __byte_perm(xors[0], xors[1], 0x1076); + xorbucketid = bexor >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[6][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[0][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = bexor; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, 
SM); + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + if (pos[i] > 1) + { + p = ht[hr[i]][1]; + + xors[2] = tt[i].z ^ lastword2[p]; + + if (xors[2] != 0) + { + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ lastword1[p]; + + bexor = __byte_perm(xors[0], xors[1], 0x1076); + xorbucketid = bexor >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[6][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[0][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = bexor; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + for (int k = 2; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + u32 pair = pairs[s]; + u32 i = __byte_perm(pair, 0, 0x4510); + u32 k = __byte_perm(pair, 0, 0x4532); + + xors[2] = lastword2[i] ^ lastword2[k]; + if (xors[2] == 0) + continue; + + *(uint2*)(&xors[0]) = lastword1[i] ^ lastword1[k]; + + bexor = __byte_perm(xors[0], xors[1], 0x1076); + xorbucketid = bexor >> (12 + RB); + xorslot = atomicAdd(&eq->edata.nslots[6][xorbucketid], 1); + if (xorslot >= NSLOTS) continue; + slotsmall &xs = eq->treessmall[0][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[1]; + ttx.y = xors[2]; + ttx.z = bexor; + ttx.w = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint4*)(&xs.hash[0]) = ttx; + } +} + + +template +__global__ void digit_7(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ u32 lastword[NSLOTS][2]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 bsize_sh; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + ht_len[threadid] = 0; + if (threadid == (NRESTS - 1)) + { + pairs_len = 0; + next_pair = 0; + } + else if (threadid == (NRESTS - 33)) + bsize_sh = umin(eq->edata.nslots[6][bucketid], NSLOTS); + + slotsmall* buck = eq->treessmall[0][bucketid]; + + u32 hr[3]; + int pos[3]; + pos[0] = pos[1] = pos[2] = SSM; + + u32 si[3]; + uint4 tt[3]; + + __syncthreads(); + + u32 bsize = bsize_sh; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + si[i] = i * NRESTS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + // get xhash + tt[i] = *(uint4*)(&pslot1->hash[0]); + *(uint2*)(&lastword[si[i]][0]) = *(uint2*)(&tt[i].x); + asm("bfe.u32 %0, %1, 12, %2;" : "=r"(hr[i]) : "r"(tt[i].z), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[2]; + u32 xorbucketid, xorslot; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[7][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[1][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[0]; + ttx.y = xors[1]; + ttx.z = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + ttx.w = 0; + 
*(uint4*)(&xs.hash[0]) = ttx; + } + } + + if (pos[i] > 1) + { + p = ht[hr[i]][1]; + + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[7][xorbucketid], 1); + if (xorslot < NSLOTS) + { + slotsmall &xs = eq->treessmall[1][xorbucketid][xorslot]; + uint4 ttx; + ttx.x = xors[0]; + ttx.y = xors[1]; + ttx.z = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + ttx.w = 0; + *(uint4*)(&xs.hash[0]) = ttx; + } + } + + for (int k = 2; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + u32 i = __byte_perm(pair, 0, 0x4510); + u32 k = __byte_perm(pair, 0, 0x4532); + + *(uint2*)(&xors[0]) = *(uint2*)(&lastword[i][0]) ^ *(uint2*)(&lastword[k][0]); + + if (xors[1] == 0) + continue; + + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(xorbucketid) : "r"(xors[0]), "r"(8 + RB), "r"(BUCKBITS)); + xorslot = atomicAdd(&eq->edata.nslots[7][xorbucketid], 1); + if (xorslot >= NSLOTS) continue; + slotsmall &xs = eq->treessmall[1][xorbucketid][xorslot]; + uint4 tt; + tt.x = xors[0]; + tt.y = xors[1]; + tt.z = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + tt.w = 0; + *(uint4*)(&xs.hash[0]) = tt; + } +} + + +template +__global__ void digit_8(equi* eq) +{ + __shared__ u16 ht[NRESTS][(SSM - 1)]; + __shared__ u32 lastword[NSLOTS][2]; + __shared__ int ht_len[NRESTS]; + __shared__ int pairs[MAXPAIRS]; + __shared__ u32 pairs_len; + __shared__ u32 bsize_sh; + __shared__ u32 next_pair; + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + ht_len[threadid] = 0; + if (threadid == (NRESTS - 1)) + { + next_pair = 0; + pairs_len = 0; + } + else if (threadid == (NRESTS - 33)) + bsize_sh = umin(eq->edata.nslots[7][bucketid], NSLOTS); + + slotsmall* buck = eq->treessmall[1][bucketid]; + + u32 hr[3]; + int pos[3]; + pos[0] = pos[1] = pos[2] = SSM; + + u32 si[3]; + uint2 tt[3]; + + __syncthreads(); + + u32 bsize = bsize_sh; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + si[i] = i * NRESTS + threadid; + if (si[i] >= bsize) break; + + const slotsmall* pslot1 = buck + si[i]; + + // get xhash + tt[i] = *(uint2*)(&pslot1->hash[0]); + *(uint2*)(&lastword[si[i]][0]) = *(uint2*)(&tt[i].x); + asm("bfe.u32 %0, %1, 8, %2;" : "=r"(hr[i]) : "r"(tt[i].x), "r"(RB)); + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) ht[hr[i]][pos[i]] = si[i]; + } + + __syncthreads(); + + u32 xors[2]; + u32 bexor, xorbucketid, xorslot; + + #pragma unroll 3 + for (u32 i = 0; i < 3; i++) + { + if (pos[i] >= SSM) continue; + + if (pos[i] > 0) + { + u16 p = ht[hr[i]][0]; + + *(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x0765); + xorbucketid = bexor >> (12 + 8); + xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1); + if (xorslot < RB8_NSLOTS_LD) + { + slottiny &xs = eq->treestiny[0][xorbucketid][xorslot]; + uint2 tt; + tt.x = xors[1]; + tt.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint2*)(&xs.hash[0]) = tt; + } + } + + if (pos[i] > 1) + { + p = ht[hr[i]][1]; + + 
*(uint2*)(&xors[0]) = *(uint2*)(&tt[i].x) ^ *(uint2*)(&lastword[p][0]); + + if (xors[1] != 0) + { + bexor = __byte_perm(xors[0], xors[1], 0x0765); + xorbucketid = bexor >> (12 + 8); + xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1); + if (xorslot < RB8_NSLOTS_LD) + { + slottiny &xs = eq->treestiny[0][xorbucketid][xorslot]; + uint2 tt; + tt.x = xors[1]; + tt.y = PACKER::set_bucketid_and_slots(bucketid, si[i], p, RB, SM); + *(uint2*)(&xs.hash[0]) = tt; + } + } + + for (int k = 2; k != pos[i]; ++k) + { + u32 pindex = atomicAdd(&pairs_len, 1); + if (pindex >= MAXPAIRS) break; + u16 prev = ht[hr[i]][k]; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + } + } + + __syncthreads(); + + // process pairs + u32 plen = umin(pairs_len, MAXPAIRS); + for (u32 s = atomicAdd(&next_pair, 1); s < plen; s = atomicAdd(&next_pair, 1)) + { + int pair = pairs[s]; + u32 i = __byte_perm(pair, 0, 0x4510); + u32 k = __byte_perm(pair, 0, 0x4532); + + *(uint2*)(&xors[0]) = *(uint2*)(&lastword[i][0]) ^ *(uint2*)(&lastword[k][0]); + + if (xors[1] == 0) + continue; + + bexor = __byte_perm(xors[0], xors[1], 0x0765); + xorbucketid = bexor >> (12 + 8); + xorslot = atomicAdd(&eq->edata.nslots8[xorbucketid], 1); + if (xorslot >= RB8_NSLOTS_LD) continue; + slottiny &xs = eq->treestiny[0][xorbucketid][xorslot]; + uint2 tt; + tt.x = xors[1]; + tt.y = PACKER::set_bucketid_and_slots(bucketid, i, k, RB, SM); + *(uint2*)(&xs.hash[0]) = tt; + } +} + +/* + Last round function is similar to previous ones but has different ending. + We use warps to process final candidates. Each warp process one candidate. + First two bidandsids (u32 of stored bucketid and two slotids) are retreived by + lane 0 and lane 16, next four bidandsids by lane 0, 8, 16 and 24, ... until + all lanes in warp have bidandsids from round 4. Next, each thread retreives + 16 indices. While doing so, indices are put into comparison using atomicExch + to determine if there are duplicates (tromp's method). At the end, if no + duplicates are found, candidate solution is saved (all indices). Note that this + dup check method is not exact so CPU dup checking is needed after. 
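+  In rough pseudo-code, the per-index test performed by the CHECK_DUP macro below is:
+    tag  = index >> DUPBITS;  slot = index & ((1 << DUPBITS) - 1);
+    if any lane in the warp sees atomicExch(&susp[slot], tag) == tag, the candidate is dropped.
+  A real duplicate can still slip through when another index overwrites the slot in between,
+  which is why the CPU-side duped() re-check is still required.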
+*/ +template +__global__ void digit_last_wdc(equi* eq) +{ + __shared__ u8 shared_data[8192]; + int* ht_len = (int*)(&shared_data[0]); + int* pairs = ht_len; + u32* lastword = (u32*)(&shared_data[256 * 4]); + u16* ht = (u16*)(&shared_data[256 * 4 + RB8_NSLOTS_LD * 4]); + u32* pairs_len = (u32*)(&shared_data[8188]); + + const u32 threadid = threadIdx.x; + const u32 bucketid = blockIdx.x; + + // reset hashtable len + #pragma unroll + for (u32 i = 0; i < FCT; i++) + ht_len[(i * (256 / FCT)) + threadid] = 0; + + if (threadid == ((256 / FCT) - 1)) + *pairs_len = 0; + + slottiny* buck = eq->treestiny[0][bucketid]; + u32 bsize = umin(eq->edata.nslots8[bucketid], RB8_NSLOTS_LD); + + u32 si[3 * FCT]; + u32 hr[3 * FCT]; + int pos[3 * FCT]; + u32 lw[3 * FCT]; + + #pragma unroll + for (u32 i = 0; i < (3 * FCT); i++) + pos[i] = SSM; + + __syncthreads(); + + #pragma unroll + for (u32 i = 0; i < (3 * FCT); i++) + { + si[i] = i * (256 / FCT) + threadid; + if (si[i] >= bsize) break; + + const slottiny* pslot1 = buck + si[i]; + + // get xhash + uint2 tt = *(uint2*)(&pslot1->hash[0]); + lw[i] = tt.x; + lastword[si[i]] = lw[i]; + + u32 a; + asm("bfe.u32 %0, %1, 20, 8;" : "=r"(a) : "r"(lw[i])); + hr[i] = a; + + pos[i] = atomicAdd(&ht_len[hr[i]], 1); + if (pos[i] < (SSM - 1)) + ht[hr[i] * (SSM - 1) + pos[i]] = si[i]; + } + + __syncthreads(); + + #pragma unroll + for (u32 i = 0; i < (3 * FCT); i++) + { + if (pos[i] >= SSM) continue; + + for (int k = 0; k != pos[i]; ++k) + { + u16 prev = ht[hr[i] * (SSM - 1) + k]; + if (lw[i] != lastword[prev]) continue; + u32 pindex = atomicAdd(pairs_len, 1); + if (pindex >= MAXPAIRS) break; + pairs[pindex] = __byte_perm(si[i], prev, 0x1054); + } + } + + __syncthreads(); + u32 plen = umin(*pairs_len, 64); + +#define CALC_LEVEL(a, b, c, d) { \ + u32 plvl = levels[b]; \ + u32* bucks = eq->round4bidandsids[PACKER::get_bucketid(plvl, RB, SM)]; \ + u32 slot1 = PACKER::get_slot1(plvl, RB, SM); \ + u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM); \ + levels[b] = bucks[slot1]; \ + levels[c] = bucks[slot0]; \ + } + +#define CALC_LEVEL_SMALL(a, b, c, d) { \ + u32 plvl = levels[b]; \ + slotsmall* bucks = eq->treessmall[a][PACKER::get_bucketid(plvl, RB, SM)]; \ + u32 slot1 = PACKER::get_slot1(plvl, RB, SM); \ + u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM); \ + levels[b] = bucks[slot1].hash[d]; \ + levels[c] = bucks[slot0].hash[d]; \ + } + + u32 lane = threadIdx.x & 0x1f; + u32 par = threadIdx.x >> 5; + + u32* levels = (u32*)&pairs[MAXPAIRS + (par << DUPBITS)]; + u32* susp = levels; + + while (par < plen) + { + int pair = pairs[par]; + par += W; + + if (lane % 16 == 0) + { + u32 plvl; + if (lane == 0) plvl = buck[__byte_perm(pair, 0, 0x4510)].hash[1]; + else plvl = buck[__byte_perm(pair, 0, 0x4532)].hash[1]; + slotsmall* bucks = eq->treessmall[1][PACKER::get_bucketid(plvl, RB, SM)]; + u32 slot1 = PACKER::get_slot1(plvl, RB, SM); + u32 slot0 = PACKER::get_slot0(plvl, slot1, RB, SM); + levels[lane] = bucks[slot1].hash[2]; + levels[lane + 8] = bucks[slot0].hash[2]; + } + + if (lane % 8 == 0) + CALC_LEVEL_SMALL(0, lane, lane + 4, 3); + + if (lane % 4 == 0) + CALC_LEVEL_SMALL(2, lane, lane + 2, 3); + + if (lane % 2 == 0) + CALC_LEVEL(0, lane, lane + 1, 4); + + u32 ind[16]; + + u32 f1 = levels[lane]; + const slottiny* buck_v4 = &eq->round3trees[PACKER::get_bucketid(f1, RB, SM)].treestiny[0]; + const u32 slot1_v4 = PACKER::get_slot1(f1, RB, SM); + const u32 slot0_v4 = PACKER::get_slot0(f1, slot1_v4, RB, SM); + + susp[lane] = 0xffffffff; + susp[32 + lane] = 0xffffffff; + +#define 
CHECK_DUP(a) \ + __any(atomicExch(&susp[(ind[a] & ((1 << DUPBITS) - 1))], (ind[a] >> DUPBITS)) == (ind[a] >> DUPBITS)) + + u32 f2 = buck_v4[slot1_v4].hash[1]; + const slottiny* buck_v3_1 = &eq->round2trees[PACKER::get_bucketid(f2, RB, SM)].treestiny[0]; + const u32 slot1_v3_1 = PACKER::get_slot1(f2, RB, SM); + const u32 slot0_v3_1 = PACKER::get_slot0(f2, slot1_v3_1, RB, SM); + + susp[64 + lane] = 0xffffffff; + susp[96 + lane] = 0xffffffff; + + u32 f0 = buck_v3_1[slot1_v3_1].hash[1]; + const slot* buck_v2_1 = eq->trees[0][PACKER::get_bucketid(f0, RB, SM)]; + const u32 slot1_v2_1 = PACKER::get_slot1(f0, RB, SM); + const u32 slot0_v2_1 = PACKER::get_slot0(f0, slot1_v2_1, RB, SM); + + susp[128 + lane] = 0xffffffff; + susp[160 + lane] = 0xffffffff; + + u32 f3 = buck_v2_1[slot1_v2_1].hash[6]; + const slot* buck_fin_1 = eq->round0trees[packer_default::get_bucketid(f3, 8, RB8_NSLOTS)]; + const u32 slot1_fin_1 = packer_default::get_slot1(f3, 8, RB8_NSLOTS); + const u32 slot0_fin_1 = packer_default::get_slot0(f3, slot1_fin_1, 8, RB8_NSLOTS); + + susp[192 + lane] = 0xffffffff; + susp[224 + lane] = 0xffffffff; + + ind[0] = buck_fin_1[slot1_fin_1].hash[7]; + if (CHECK_DUP(0)) continue; + ind[1] = buck_fin_1[slot0_fin_1].hash[7]; + if (CHECK_DUP(1)) continue; + + u32 f4 = buck_v2_1[slot0_v2_1].hash[6]; + const slot* buck_fin_2 = eq->round0trees[packer_default::get_bucketid(f4, 8, RB8_NSLOTS)]; + const u32 slot1_fin_2 = packer_default::get_slot1(f4, 8, RB8_NSLOTS); + const u32 slot0_fin_2 = packer_default::get_slot0(f4, slot1_fin_2, 8, RB8_NSLOTS); + + ind[2] = buck_fin_2[slot1_fin_2].hash[7]; + if (CHECK_DUP(2)) continue; + ind[3] = buck_fin_2[slot0_fin_2].hash[7]; + if (CHECK_DUP(3)) continue; + + u32 f5 = buck_v3_1[slot0_v3_1].hash[1]; + const slot* buck_v2_2 = eq->trees[0][PACKER::get_bucketid(f5, RB, SM)]; + const u32 slot1_v2_2 = PACKER::get_slot1(f5, RB, SM); + const u32 slot0_v2_2 = PACKER::get_slot0(f5, slot1_v2_2, RB, SM); + + u32 f6 = buck_v2_2[slot1_v2_2].hash[6]; + const slot* buck_fin_3 = eq->round0trees[packer_default::get_bucketid(f6, 8, RB8_NSLOTS)]; + const u32 slot1_fin_3 = packer_default::get_slot1(f6, 8, RB8_NSLOTS); + const u32 slot0_fin_3 = packer_default::get_slot0(f6, slot1_fin_3, 8, RB8_NSLOTS); + + ind[4] = buck_fin_3[slot1_fin_3].hash[7]; + if (CHECK_DUP(4)) continue; + ind[5] = buck_fin_3[slot0_fin_3].hash[7]; + if (CHECK_DUP(5)) continue; + + u32 f7 = buck_v2_2[slot0_v2_2].hash[6]; + const slot* buck_fin_4 = eq->round0trees[packer_default::get_bucketid(f7, 8, RB8_NSLOTS)]; + const u32 slot1_fin_4 = packer_default::get_slot1(f7, 8, RB8_NSLOTS); + const u32 slot0_fin_4 = packer_default::get_slot0(f7, slot1_fin_4, 8, RB8_NSLOTS); + + ind[6] = buck_fin_4[slot1_fin_4].hash[7]; + if (CHECK_DUP(6)) continue; + ind[7] = buck_fin_4[slot0_fin_4].hash[7]; + if (CHECK_DUP(7)) continue; + + u32 f8 = buck_v4[slot0_v4].hash[1]; + const slottiny* buck_v3_2 = &eq->round2trees[PACKER::get_bucketid(f8, RB, SM)].treestiny[0]; + const u32 slot1_v3_2 = PACKER::get_slot1(f8, RB, SM); + const u32 slot0_v3_2 = PACKER::get_slot0(f8, slot1_v3_2, RB, SM); + + u32 f9 = buck_v3_2[slot1_v3_2].hash[1]; + const slot* buck_v2_3 = eq->trees[0][PACKER::get_bucketid(f9, RB, SM)]; + const u32 slot1_v2_3 = PACKER::get_slot1(f9, RB, SM); + const u32 slot0_v2_3 = PACKER::get_slot0(f9, slot1_v2_3, RB, SM); + + u32 f10 = buck_v2_3[slot1_v2_3].hash[6]; + const slot* buck_fin_5 = eq->round0trees[packer_default::get_bucketid(f10, 8, RB8_NSLOTS)]; + const u32 slot1_fin_5 = packer_default::get_slot1(f10, 8, RB8_NSLOTS); + 
const u32 slot0_fin_5 = packer_default::get_slot0(f10, slot1_fin_5, 8, RB8_NSLOTS); + + ind[8] = buck_fin_5[slot1_fin_5].hash[7]; + if (CHECK_DUP(8)) continue; + ind[9] = buck_fin_5[slot0_fin_5].hash[7]; + if (CHECK_DUP(9)) continue; + + u32 f11 = buck_v2_3[slot0_v2_3].hash[6]; + const slot* buck_fin_6 = eq->round0trees[packer_default::get_bucketid(f11, 8, RB8_NSLOTS)]; + const u32 slot1_fin_6 = packer_default::get_slot1(f11, 8, RB8_NSLOTS); + const u32 slot0_fin_6 = packer_default::get_slot0(f11, slot1_fin_6, 8, RB8_NSLOTS); + + ind[10] = buck_fin_6[slot1_fin_6].hash[7]; + if (CHECK_DUP(10)) continue; + ind[11] = buck_fin_6[slot0_fin_6].hash[7]; + if (CHECK_DUP(11)) continue; + + u32 f12 = buck_v3_2[slot0_v3_2].hash[1]; + const slot* buck_v2_4 = eq->trees[0][PACKER::get_bucketid(f12, RB, SM)]; + const u32 slot1_v2_4 = PACKER::get_slot1(f12, RB, SM); + const u32 slot0_v2_4 = PACKER::get_slot0(f12, slot1_v2_4, RB, SM); + + u32 f13 = buck_v2_4[slot1_v2_4].hash[6]; + const slot* buck_fin_7 = eq->round0trees[packer_default::get_bucketid(f13, 8, RB8_NSLOTS)]; + const u32 slot1_fin_7 = packer_default::get_slot1(f13, 8, RB8_NSLOTS); + const u32 slot0_fin_7 = packer_default::get_slot0(f13, slot1_fin_7, 8, RB8_NSLOTS); + + ind[12] = buck_fin_7[slot1_fin_7].hash[7]; + if (CHECK_DUP(12)) continue; + ind[13] = buck_fin_7[slot0_fin_7].hash[7]; + if (CHECK_DUP(13)) continue; + + u32 f14 = buck_v2_4[slot0_v2_4].hash[6]; + const slot* buck_fin_8 = eq->round0trees[packer_default::get_bucketid(f14, 8, RB8_NSLOTS)]; + const u32 slot1_fin_8 = packer_default::get_slot1(f14, 8, RB8_NSLOTS); + const u32 slot0_fin_8 = packer_default::get_slot0(f14, slot1_fin_8, 8, RB8_NSLOTS); + + ind[14] = buck_fin_8[slot1_fin_8].hash[7]; + if (CHECK_DUP(14)) continue; + ind[15] = buck_fin_8[slot0_fin_8].hash[7]; + if (CHECK_DUP(15)) continue; + + u32 soli; + if (lane == 0) { + soli = atomicAdd(&eq->edata.srealcont.nsols, 1); + } +#if __CUDA_ARCH__ >= 300 + // useful ? 
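+		// lane 0 obtained soli from the atomicAdd above; broadcast it so that every lane
+		// in the warp writes its 16 of the 512 indices into the same sols[soli] entry below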
+		soli = __shfl(soli, 0);
+#else
+		__syncthreads();
+#endif
+		if (soli < MAXREALSOLS)
+		{
+			u32 pos = lane << 4;
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos ]) = *(uint4*)(&ind[ 0]);
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos + 4]) = *(uint4*)(&ind[ 4]);
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos + 8]) = *(uint4*)(&ind[ 8]);
+			*(uint4*)(&eq->edata.srealcont.sols[soli][pos + 12]) = *(uint4*)(&ind[12]);
+		}
+	}
+}
+
+//std::mutex dev_init;
+int dev_init_done[MAX_GPUS] = { 0 };
+
+__host__
+static int compu32(const void *pa, const void *pb)
+{
+	uint32_t a = *(uint32_t *)pa, b = *(uint32_t *)pb;
+	return a < b ? -1 : a == b ? 0 : +1;
+}
+
+__host__
+static bool duped(uint32_t* prf)
+{
+	uint32_t sortprf[512];
+	memcpy(sortprf, prf, sizeof(uint32_t) * 512);
+	qsort(sortprf, 512, sizeof(uint32_t), &compu32);
+	for (uint32_t i = 1; i < 512; i++)
+		if (sortprf[i] <= sortprf[i - 1])
+			return true;
+	return false;
+}
+
+__host__
+static void sort_pair(uint32_t *a, uint32_t len)
+{
+	uint32_t *b = a + len;
+	uint32_t tmp, need_sorting = 0;
+	for (uint32_t i = 0; i < len; i++)
+		if (need_sorting || a[i] > b[i])
+		{
+			need_sorting = 1;
+			tmp = a[i];
+			a[i] = b[i];
+			b[i] = tmp;
+		}
+		else if (a[i] < b[i])
+			return;
+}
+
+__host__
+static void setheader(blake2b_state *ctx, const char *header, const u32 headerLen, const char* nce, const u32 nonceLen)
+{
+	uint32_t le_N = WN;
+	uint32_t le_K = WK;
+	uchar personal[] = "ZcashPoW01230123";
+	memcpy(personal + 8, &le_N, 4);
+	memcpy(personal + 12, &le_K, 4);
+	blake2b_param P[1];
+	P->digest_length = HASHOUT;
+	P->key_length = 0;
+	P->fanout = 1;
+	P->depth = 1;
+	P->leaf_length = 0;
+	P->node_offset = 0;
+	P->node_depth = 0;
+	P->inner_length = 0;
+	memset(P->reserved, 0, sizeof(P->reserved));
+	memset(P->salt, 0, sizeof(P->salt));
+	memcpy(P->personal, (const uint8_t *)personal, 16);
+	eq_blake2b_init_param(ctx, P);
+	eq_blake2b_update(ctx, (const uchar *)header, headerLen);
+	if (nonceLen) eq_blake2b_update(ctx, (const uchar *)nce, nonceLen);
+}
+
+#ifdef WIN32
+typedef CUresult(CUDAAPI *dec_cuDeviceGet)(CUdevice*, int);
+typedef CUresult(CUDAAPI *dec_cuCtxCreate)(CUcontext*, unsigned int, CUdevice);
+typedef CUresult(CUDAAPI *dec_cuCtxPushCurrent)(CUcontext);
+typedef CUresult(CUDAAPI *dec_cuCtxDestroy)(CUcontext);
+
+dec_cuDeviceGet _cuDeviceGet = nullptr;
+dec_cuCtxCreate _cuCtxCreate = nullptr;
+dec_cuCtxPushCurrent _cuCtxPushCurrent = nullptr;
+dec_cuCtxDestroy _cuCtxDestroy = nullptr;
+#endif
+
+template <u32 RB, u32 SM, u32 SSM, u32 THREADS, typename PACKER>
+__host__ eq_cuda_context<RB, SM, SSM, THREADS, PACKER>::eq_cuda_context(int thr_id, int dev_id)
+{
+	thread_id = thr_id;
+	device_id = dev_id;
+	solutions = nullptr;
+	equi_mem_sz = sizeof(equi<RB, SM>);
+	throughput = NBLOCKS;
+	totalblocks = NBLOCKS/FD_THREADS;
+	threadsperblock = FD_THREADS;
+	threadsperblock_digits = THREADS;
+
+	//dev_init.lock();
+	if (!dev_init_done[device_id])
+	{
+		// only first thread shall init device
+		checkCudaErrors(cudaSetDevice(device_id));
+		checkCudaErrors(cudaDeviceReset());
+		checkCudaErrors(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
+
+		pctx = nullptr;
+	}
+	else
+	{
+		// create new context
+		CUdevice dev;
+
+#ifdef WIN32
+		if (_cuDeviceGet == nullptr)
+		{
+			HMODULE hmod = LoadLibraryA("nvcuda.dll");
+			if (hmod == NULL)
+				throw std::runtime_error("Failed to load nvcuda.dll");
+			_cuDeviceGet = (dec_cuDeviceGet)GetProcAddress(hmod, "cuDeviceGet");
+			if (_cuDeviceGet == nullptr)
+				throw std::runtime_error("Failed to get cuDeviceGet address");
+			_cuCtxCreate = (dec_cuCtxCreate)GetProcAddress(hmod, "cuCtxCreate_v2");
+			if (_cuCtxCreate == nullptr)
+				throw std::runtime_error("Failed to get cuCtxCreate address");
+			_cuCtxPushCurrent = (dec_cuCtxPushCurrent)GetProcAddress(hmod, "cuCtxPushCurrent_v2");
+			if (_cuCtxPushCurrent == nullptr)
+				throw std::runtime_error("Failed to get cuCtxPushCurrent address");
+			_cuCtxDestroy = (dec_cuCtxDestroy)GetProcAddress(hmod, "cuCtxDestroy_v2");
+			if (_cuCtxDestroy == nullptr)
+				throw std::runtime_error("Failed to get cuCtxDestroy address");
+		}
+
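+		// note: non-primary threads attach to the already initialized device by creating
+		// and pushing their own driver-API context; on Windows the driver entry points
+		// are resolved at run time from nvcuda.dll (loaded above)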
checkCudaDriverErrors(_cuDeviceGet(&dev, device_id)); + checkCudaDriverErrors(_cuCtxCreate(&pctx, CU_CTX_SCHED_BLOCKING_SYNC, dev)); + checkCudaDriverErrors(_cuCtxPushCurrent(pctx)); +#else + checkCudaDriverErrors(cuDeviceGet(&dev, device_id)); + checkCudaDriverErrors(cuCtxCreate(&pctx, CU_CTX_SCHED_BLOCKING_SYNC, dev)); + checkCudaDriverErrors(cuCtxPushCurrent(pctx)); +#endif + } + ++dev_init_done[device_id]; + //dev_init.unlock(); + + if (cudaMalloc((void**)&device_eq, equi_mem_sz) != cudaSuccess) + throw std::runtime_error("CUDA: failed to alloc memory"); + + solutions = (scontainerreal*) malloc(sizeof(scontainerreal)); + if (!solutions) + throw std::runtime_error("EOM: failed to alloc solutions memory"); +} + +template +__host__ void eq_cuda_context::solve(const char *tequihash_header, + unsigned int tequihash_header_len, + const char* nonce, + unsigned int nonce_len, + fn_cancel cancelf, + fn_solution solutionf, + fn_hashdone hashdonef) +{ + blake2b_state blake_ctx; + + int blocks = NBUCKETS; + + setheader(&blake_ctx, tequihash_header, tequihash_header_len, nonce, nonce_len); + + // todo: improve + // djezo solver allows last 4 bytes of nonce to be iterrated + // this can be used to create internal loop - calc initial blake hash only once, then load 8*8 bytes on device (blake state h) + // then just iterate nn++ + // less CPU load, 1 cudaMemcpy less -> faster + //u32 nn = *(u32*)&nonce[28]; + u32 nn = 0; + + checkCudaErrors(cudaMemcpy(&device_eq->blake_h, &blake_ctx.h, sizeof(u64) * 8, cudaMemcpyHostToDevice)); + + checkCudaErrors(cudaMemset(&device_eq->edata, 0, sizeof(device_eq->edata))); + + digit_first <<>>(device_eq, nn); + + digit_1 <<<4096, 512 >>>(device_eq); + digit_2 <<>>(device_eq); + digit_3 <<>>(device_eq); + + if (cancelf(thread_id)) return; + + digit_4 <<>>(device_eq); + digit_5 <<>>(device_eq); + + digit_6 <<>>(device_eq); + digit_7 <<>>(device_eq); + digit_8 <<>>(device_eq); + + digit_last_wdc <<<4096, 256 / 2 >>>(device_eq); + + checkCudaErrors(cudaMemcpy(solutions, &device_eq->edata.srealcont, (MAXREALSOLS * (512 * 4)) + 4, cudaMemcpyDeviceToHost)); + + //printf("T%d nsols: %u\n", thread_id, solutions->nsols); + //if (solutions->nsols > 9) + // printf("missing sol, total: %u\n", solutions->nsols); + + for (u32 s = 0; (s < solutions->nsols) && (s < MAXREALSOLS); s++) + { + // remove dups on CPU (dup removal on GPU is not fully exact and can pass on some invalid solutions) + if (duped(solutions->sols[s])) continue; + + // perform sort of pairs + for (uint32_t level = 0; level < 9; level++) + for (uint32_t i = 0; i < (1 << 9); i += (2 << level)) + sort_pair(&solutions->sols[s][i], 1 << level); + + std::vector index_vector(PROOFSIZE); + for (u32 i = 0; i < PROOFSIZE; i++) { + index_vector[i] = solutions->sols[s][i]; + } + + solutionf(thread_id, index_vector, DIGITBITS, nullptr); + } + + // ccminer: only use hashdonef if no solutions... 
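+	// every kept solution was already reported through solutionf() in the loop above,
+	// so this callback only signals a round that produced no solution at all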
+ if (!solutions->nsols) + hashdonef(thread_id); +} + +// destructor +template +__host__ +eq_cuda_context::~eq_cuda_context() +{ + if (solutions) + free(solutions); + + if (device_eq) { + cudaFree(device_eq); + device_eq = NULL; + } + + if (pctx) { + // non primary thread, destroy context +#ifdef WIN32 + checkCudaDriverErrors(_cuCtxDestroy(pctx)); +#else + checkCudaDriverErrors(cuCtxDestroy(pctx)); +#endif + } else { + checkCudaErrors(cudaDeviceReset()); + dev_init_done[device_id] = 0; + } +} + + +#ifdef CONFIG_MODE_1 +template class eq_cuda_context; +#endif + +#ifdef CONFIG_MODE_2 +template class eq_cuda_context; +#endif + +#ifdef CONFIG_MODE_3 +template class eq_cuda_context; +#endif diff --git a/equi/eqcuda.hpp b/equi/eqcuda.hpp new file mode 100644 index 0000000..68bdaf0 --- /dev/null +++ b/equi/eqcuda.hpp @@ -0,0 +1,136 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifdef WIN32 +#define _SNPRINTF _snprintf +#else +#define _SNPRINTF snprintf +#endif + +#ifndef nullptr +#define nullptr NULL +#endif + +#ifdef WIN32 +#define rt_error std::runtime_error +#else +class rt_error : public std::runtime_error +{ +public: + explicit rt_error(const std::string& str) : std::runtime_error(str) {} +}; +#endif + +#define checkCudaErrors(call) \ +do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + char errorBuff[512]; \ + _SNPRINTF(errorBuff, sizeof(errorBuff) - 1, \ + "CUDA error '%s' in func '%s' line %d", \ + cudaGetErrorString(err), __FUNCTION__, __LINE__); \ + throw rt_error(errorBuff); \ + } \ +} while (0) + +#define checkCudaDriverErrors(call) \ +do { \ + CUresult err = call; \ + if (CUDA_SUCCESS != err) { \ + char errorBuff[512]; \ + _SNPRINTF(errorBuff, sizeof(errorBuff) - 1, \ + "CUDA error DRIVER: '%d' in func '%s' line %d", \ + err, __FUNCTION__, __LINE__); \ + throw rt_error(errorBuff); \ + } \ +} while (0) + +typedef uint64_t u64; +typedef uint32_t u32; +typedef uint16_t u16; +typedef uint8_t u8; +typedef unsigned char uchar; + +struct packer_default; +struct packer_cantor; + +#define MAXREALSOLS 9 + +struct scontainerreal { + u32 sols[MAXREALSOLS][512]; + u32 nsols; +}; + +#if 0 +#include +#define fn_solution std::function&, size_t, const unsigned char*)> +#define fn_hashdone std::function +#define fn_cancel std::function +#else +typedef void (*fn_solution)(int thr_id, const std::vector&, size_t, const unsigned char*); +typedef void (*fn_hashdone)(int thr_id); +typedef bool (*fn_cancel)(int thr_id); +#endif + +template struct equi; + +// --------------------------------------------------------------------------------------------------- + +struct eq_cuda_context_interface +{ + virtual ~eq_cuda_context_interface(); + + virtual void solve(const char *tequihash_header, + unsigned int tequihash_header_len, + const char* nonce, + unsigned int nonce_len, + fn_cancel cancelf, + fn_solution solutionf, + fn_hashdone hashdonef); +public: + int thread_id; + int device_id; + int throughput; + int totalblocks; + int threadsperblock; + int threadsperblock_digits; + size_t equi_mem_sz; +}; + +// --------------------------------------------------------------------------------------------------- + +template +class eq_cuda_context : public eq_cuda_context_interface +{ + equi* device_eq; + scontainerreal* solutions; + CUcontext pctx; + + void solve(const char *tequihash_header, + unsigned int tequihash_header_len, + const char* nonce, + unsigned int nonce_len, + fn_cancel cancelf, + fn_solution solutionf, 
+ fn_hashdone hashdonef); + +public: + eq_cuda_context(int thr_id, int dev_id); + ~eq_cuda_context(); +}; + +// RB, SM, SSM, TPB, PACKER... but any change only here will fail.. +#define CONFIG_MODE_1 9, 1248, 12, 640, packer_cantor +//#define CONFIG_MODE_2 8, 640, 12, 512, packer_default diff --git a/equi/equi-stratum.cpp b/equi/equi-stratum.cpp new file mode 100644 index 0000000..c9f6587 --- /dev/null +++ b/equi/equi-stratum.cpp @@ -0,0 +1,241 @@ +/** + * Equihash specific stratum protocol + * tpruvot@github - 2017 - Part under GPLv3 Licence + */ + +#include +#include +#include +#include + +#include "equihash.h" + +extern struct stratum_ctx stratum; +extern pthread_mutex_t stratum_work_lock; + +// ZEC uses a different scale to compute diff... +// sample targets to diff (stored in the reverse byte order in work->target) +// 0007fff800000000000000000000000000000000000000000000000000000000 is stratum diff 32 +// 003fffc000000000000000000000000000000000000000000000000000000000 is stratum diff 4 +// 00ffff0000000000000000000000000000000000000000000000000000000000 is stratum diff 1 +double target_to_diff_equi(uint32_t* target) +{ + uchar* tgt = (uchar*) target; + uint64_t m = + (uint64_t)tgt[30] << 24 | + (uint64_t)tgt[29] << 16 | + (uint64_t)tgt[28] << 8 | + (uint64_t)tgt[27] << 0; + + if (!m) + return 0.; + else + return (double)0xffff0000UL/m; +} + +void diff_to_target_equi(uint32_t *target, double diff) +{ + uint64_t m; + int k; + + for (k = 6; k > 0 && diff > 1.0; k--) + diff /= 4294967296.0; + m = (uint64_t)(4294901760.0 / diff); + if (m == 0 && k == 6) + memset(target, 0xff, 32); + else { + memset(target, 0, 32); + target[k + 1] = (uint32_t)(m >> 8); + target[k + 2] = (uint32_t)(m >> 40); + //memset(target, 0xff, 6*sizeof(uint32_t)); + for (k = 0; k < 28 && ((uint8_t*)target)[k] == 0; k++) + ((uint8_t*)target)[k] = 0xff; + } +} + +/* compute nbits to get the network diff */ +double equi_network_diff(struct work *work) +{ + //KMD bits: "1e 015971", + //KMD target: "00 00 015971000000000000000000000000000000000000000000000000000000", + //KMD bits: "1d 686aaf", + //KMD target: "00 0000 686aaf0000000000000000000000000000000000000000000000000000", + uint32_t nbits = work->data[26]; + uint32_t bits = (nbits & 0xffffff); + int16_t shift = (swab32(nbits) & 0xff); + shift = (31 - shift) * 8; // 8 bits shift for 0x1e, 16 for 0x1d + uint64_t tgt64 = swab32(bits); + tgt64 = tgt64 << shift; + // applog_hex(&tgt64, 8); + uint8_t net_target[32] = { 0 }; + for (int b=0; b<8; b++) + net_target[31-b] = ((uint8_t*)&tgt64)[b]; + // applog_hex(net_target, 32); + double d = target_to_diff_equi((uint32_t*)net_target); + return d; +} + +void equi_work_set_target(struct work* work, double diff) +{ + // target is given as data by the equihash stratum + // memcpy(work->target, stratum.job.claim, 32); // claim field is only used for lbry + diff_to_target_equi(work->target, diff); + //applog(LOG_BLUE, "diff %f to target :", diff); + //applog_hex(work->target, 32); + work->targetdiff = diff; +} + +bool equi_stratum_set_target(struct stratum_ctx *sctx, json_t *params) +{ + uint8_t target_bin[32], target_be[32]; + + const char *target_hex = json_string_value(json_array_get(params, 0)); + if (!target_hex || strlen(target_hex) == 0) + return false; + + hex2bin(target_bin, target_hex, 32); + memset(target_be, 0xff, 32); + int filled = 0; + for (int i=0; i<32; i++) { + if (filled == 3) break; + target_be[31-i] = target_bin[i]; + if (target_bin[i]) filled++; + } + memcpy(sctx->job.claim, target_be, 32); // hack, unused 
struct field + + pthread_mutex_lock(&stratum_work_lock); + sctx->next_diff = target_to_diff_equi((uint32_t*) &target_be); + pthread_mutex_unlock(&stratum_work_lock); + + //applog(LOG_BLUE, "low diff %f", sctx->next_diff); + //applog_hex(target_be, 32); + + return true; +} + +bool equi_stratum_notify(struct stratum_ctx *sctx, json_t *params) +{ + const char *job_id, *version, *prevhash, *coinb1, *coinb2, *nbits, *stime; + size_t coinb1_size, coinb2_size; + bool clean, ret = false; + int ntime, i, p=0; + job_id = json_string_value(json_array_get(params, p++)); + version = json_string_value(json_array_get(params, p++)); + prevhash = json_string_value(json_array_get(params, p++)); + coinb1 = json_string_value(json_array_get(params, p++)); //merkle + coinb2 = json_string_value(json_array_get(params, p++)); //blank (reserved) + stime = json_string_value(json_array_get(params, p++)); + nbits = json_string_value(json_array_get(params, p++)); + clean = json_is_true(json_array_get(params, p)); p++; + + if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !stime || + strlen(prevhash) != 64 || strlen(version) != 8 || + strlen(coinb1) != 64 || strlen(coinb2) != 64 || + strlen(nbits) != 8 || strlen(stime) != 8) { + applog(LOG_ERR, "Stratum notify: invalid parameters"); + goto out; + } + + /* store stratum server time diff */ + hex2bin((uchar *)&ntime, stime, 4); + ntime = ntime - (int) time(0); + if (ntime > sctx->srvtime_diff) { + sctx->srvtime_diff = ntime; + if (opt_protocol && ntime > 20) + applog(LOG_DEBUG, "stratum time is at least %ds in the future", ntime); + } + + pthread_mutex_lock(&stratum_work_lock); + hex2bin(sctx->job.version, version, 4); + hex2bin(sctx->job.prevhash, prevhash, 32); + + coinb1_size = strlen(coinb1) / 2; + coinb2_size = strlen(coinb2) / 2; + sctx->job.coinbase_size = coinb1_size + coinb2_size + // merkle + reserved + sctx->xnonce1_size + sctx->xnonce2_size; // extranonce and... 
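+	// resulting "coinbase" layout for equihash stratum:
+	// [32-byte merkle root (coinb1)][32-byte reserved field (coinb2)][xnonce1 (pool prefix)][xnonce2]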
+ + sctx->job.coinbase = (uchar*) realloc(sctx->job.coinbase, sctx->job.coinbase_size); + hex2bin(sctx->job.coinbase, coinb1, coinb1_size); + hex2bin(sctx->job.coinbase + coinb1_size, coinb2, coinb2_size); + + sctx->job.xnonce2 = sctx->job.coinbase + coinb1_size + coinb2_size + sctx->xnonce1_size; + if (!sctx->job.job_id || strcmp(sctx->job.job_id, job_id)) + memset(sctx->job.xnonce2, 0, sctx->xnonce2_size); + memcpy(sctx->job.coinbase + coinb1_size + coinb2_size, sctx->xnonce1, sctx->xnonce1_size); + + for (i = 0; i < sctx->job.merkle_count; i++) + free(sctx->job.merkle[i]); + free(sctx->job.merkle); + sctx->job.merkle = NULL; + sctx->job.merkle_count = 0; + + free(sctx->job.job_id); + sctx->job.job_id = strdup(job_id); + + hex2bin(sctx->job.nbits, nbits, 4); + hex2bin(sctx->job.ntime, stime, 4); + sctx->job.clean = clean; + + sctx->job.diff = sctx->next_diff; + pthread_mutex_unlock(&stratum_work_lock); + + ret = true; + +out: + return ret; +} + +void equi_store_work_solution(struct work* work, uint32_t* hash, void* sol_data) +{ + int nonce = work->valid_nonces-1; + memcpy(work->extra, sol_data, 1347); + bn_store_hash_target_ratio(hash, work->target, work, nonce); + //work->sharediff[nonce] = target_to_diff_equi(hash); +} + +#define JSON_SUBMIT_BUF_LEN (4*1024) +// called by submit_upstream_work() +bool equi_stratum_submit(struct pool_infos *pool, struct work *work) +{ + char _ALIGN(64) s[JSON_SUBMIT_BUF_LEN]; + char _ALIGN(64) timehex[16] = { 0 }; + char *jobid, *noncestr, *solhex; + int idnonce = work->submit_nonce_id; + + // scanned nonce + work->data[EQNONCE_OFFSET] = work->nonces[idnonce]; + unsigned char * nonce = (unsigned char*) (&work->data[27]); + size_t nonce_len = 32 - stratum.xnonce1_size; + // long nonce without pool prefix (extranonce) + noncestr = bin2hex(&nonce[stratum.xnonce1_size], nonce_len); + + solhex = (char*) calloc(1, 1344*2 + 64); + if (!solhex || !noncestr) { + applog(LOG_ERR, "unable to alloc share memory"); + return false; + } + cbin2hex(solhex, (const char*) work->extra, 1347); + + jobid = work->job_id + 8; + sprintf(timehex, "%08x", swab32(work->data[25])); + + snprintf(s, sizeof(s), "{\"method\":\"mining.submit\",\"params\":" + "[\"%s\",\"%s\",\"%s\",\"%s\",\"%s\"], \"id\":%u}", + pool->user, jobid, timehex, noncestr, solhex, + stratum.job.shares_count + 10); + + free(solhex); + free(noncestr); + + gettimeofday(&stratum.tv_submit, NULL); + + if(!stratum_send_line(&stratum, s)) { + applog(LOG_ERR, "%s stratum_send_line failed", __func__); + return false; + } + + stratum.sharediff = work->sharediff[idnonce]; + stratum.job.shares_count++; + + return true; +} diff --git a/equi/equi.cpp b/equi/equi.cpp new file mode 100644 index 0000000..0a1d919 --- /dev/null +++ b/equi/equi.cpp @@ -0,0 +1,171 @@ +/* + * Port to Generic C of C++ implementation of the Equihash Proof-of-Work + * algorithm from zcashd. + * + * Copyright (c) 2016 abc at openwall dot com + * Copyright (c) 2016 Jack Grigg + * Copyright (c) 2016 The Zcash developers + * Copyright (c) 2017 tpruvot + * + * Distributed under the MIT software license, see the accompanying + * file COPYING or http://www.opensource.org/licenses/mit-license.php. 
+ */ + +#include +#include +#include +#include + +#include "equihash.h" + +//#define USE_LIBSODIUM + +#ifdef USE_LIBSODIUM +#include "sodium.h" +#define blake2b_state crypto_generichash_blake2b_state +#else +#include "blake2/blake2.h" +#define be32toh(x) swab32(x) +#define htole32(x) (x) +#define HASHOUT 50 +#endif + +#include + +static void digestInit(blake2b_state *S, const uint32_t n, const uint32_t k) +{ + uint32_t le_N = htole32(n); + uint32_t le_K = htole32(k); +#ifdef USE_LIBSODIUM + uint8_t personalization[crypto_generichash_blake2b_PERSONALBYTES] = { 0 }; + + memcpy(personalization, "ZcashPoW", 8); + memcpy(personalization + 8, &le_N, 4); + memcpy(personalization + 12, &le_K, 4); + + crypto_generichash_blake2b_init_salt_personal(S, + NULL, 0, (512 / n) * n / 8, NULL, personalization); +#else + unsigned char personal[] = "ZcashPoW01230123"; + memcpy(personal + 8, &le_N, 4); + memcpy(personal + 12, &le_K, 4); + blake2b_param P[1]; + P->digest_length = HASHOUT; + P->key_length = 0; + P->fanout = 1; + P->depth = 1; + P->leaf_length = 0; + P->node_offset = 0; + P->node_depth = 0; + P->inner_length = 0; + memset(P->reserved, 0, sizeof(P->reserved)); + memset(P->salt, 0, sizeof(P->salt)); + memcpy(P->personal, (const uint8_t *)personal, 16); + eq_blake2b_init_param(S, P); +#endif +} + +static void expandArray(const unsigned char *in, const uint32_t in_len, + unsigned char *out, const uint32_t out_len, + const uint32_t bit_len, const uint32_t byte_pad) +{ + assert(bit_len >= 8); + assert(8 * sizeof(uint32_t) >= 7 + bit_len); + + const uint32_t out_width = (bit_len + 7) / 8 + byte_pad; + assert(out_len == 8 * out_width * in_len / bit_len); + + const uint32_t bit_len_mask = ((uint32_t)1 << bit_len) - 1; + + // The acc_bits least-significant bits of acc_value represent a bit sequence + // in big-endian order. + uint32_t acc_bits = 0; + uint32_t acc_value = 0; + uint32_t j = 0; + + for (uint32_t i = 0; i < in_len; i++) + { + acc_value = (acc_value << 8) | in[i]; + acc_bits += 8; + + // When we have bit_len or more bits in the accumulator, write the next + // output element. + if (acc_bits >= bit_len) { + acc_bits -= bit_len; + for (uint32_t x = 0; x < byte_pad; x++) { + out[j + x] = 0; + } + for (uint32_t x = byte_pad; x < out_width; x++) { + out[j + x] = ( + // Big-endian + acc_value >> (acc_bits + (8 * (out_width - x - 1))) + ) & ( + // Apply bit_len_mask across byte boundaries + (bit_len_mask >> (8 * (out_width - x - 1))) & 0xFF + ); + } + j += out_width; + } + } +} + +static void generateHash(blake2b_state *S, const uint32_t g, uint8_t *hash, const size_t hashLen) +{ + const uint32_t le_g = htole32(g); + blake2b_state digest = *S; /* copy */ +#ifdef USE_LIBSODIUM + crypto_generichash_blake2b_update(&digest, (uint8_t *)&le_g, sizeof(le_g)); + crypto_generichash_blake2b_final(&digest, hash, hashLen); +#else + eq_blake2b_update(&digest, (const uint8_t*) &le_g, sizeof(le_g)); + eq_blake2b_final(&digest, hash, (uint8_t) (hashLen & 0xFF)); +#endif +} + +static int isZero(const uint8_t *hash, size_t len) +{ + // This doesn't need to be constant time. 
+ for (size_t i = 0; i < len; i++) { + if (hash[i] != 0) return 0; + } + return 1; +} + +// hdr -> header including nonce (140 bytes) +// soln -> equihash solution (excluding 3 bytes with size, so 1344 bytes long) +bool equi_verify(uint8_t* const hdr, uint8_t* const soln) +{ + const uint32_t n = WN; // 200 + const uint32_t k = WK; // 9 + const uint32_t collisionBitLength = n / (k + 1); + const uint32_t collisionByteLength = (collisionBitLength + 7) / 8; + const uint32_t hashLength = (k + 1) * collisionByteLength; + const uint32_t indicesPerHashOutput = 512 / n; + const uint32_t hashOutput = indicesPerHashOutput * n / 8; + const uint32_t equihashSolutionSize = (1 << k) * (n / (k + 1) + 1) / 8; + const uint32_t solnr = 1 << k; + + uint32_t indices[512] = { 0 }; + uint8_t vHash[hashLength] = { 0 }; + + blake2b_state state; + digestInit(&state, n, k); +#ifdef USE_LIBSODIUM + crypto_generichash_blake2b_update(&state, hdr, 140); +#else + eq_blake2b_update(&state, hdr, 140); +#endif + + expandArray(soln, equihashSolutionSize, (uint8_t*) &indices, sizeof(indices), collisionBitLength + 1, 1); + + for (uint32_t j = 0; j < solnr; j++) { + uint8_t tmpHash[hashOutput]; + uint8_t hash[hashLength]; + uint32_t i = be32toh(indices[j]); + generateHash(&state, i / indicesPerHashOutput, tmpHash, hashOutput); + expandArray(tmpHash + (i % indicesPerHashOutput * n / 8), n / 8, hash, hashLength, collisionBitLength, 0); + for (uint32_t k = 0; k < hashLength; k++) + vHash[k] ^= hash[k]; + } + return isZero(vHash, sizeof(vHash)); +} diff --git a/equi/equihash.cpp b/equi/equihash.cpp new file mode 100644 index 0000000..2a6e514 --- /dev/null +++ b/equi/equihash.cpp @@ -0,0 +1,294 @@ +/** + * Equihash solver interface for ccminer (compatible with linux and windows) + * Solver taken from nheqminer, by djeZo (and NiceHash) + * tpruvot - 2017 (GPL v3) + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <vector> +#include <stdexcept> + +#include <miner.h> + +#include "eqcuda.hpp" +#include "equihash.h" // equi_verify() + +#include <sph/sph_sha2.h> + +// All solutions (BLOCK_HEADER_LEN + SOLSIZE_LEN + SOL_LEN) sha256d should be under the target +extern "C" void equi_hash(const void* input, void* output, int len) +{ + uint8_t _ALIGN(64) hash0[32], hash1[32]; + + sph_sha256_context ctx_sha256; + + sph_sha256_init(&ctx_sha256); + sph_sha256(&ctx_sha256, input, len); + sph_sha256_close(&ctx_sha256, hash0); + sph_sha256(&ctx_sha256, hash0, 32); + sph_sha256_close(&ctx_sha256, hash1); + + memcpy(output, hash1, 32); +} + +// input here is 140 for the header and 1344 for the solution (equi.cpp) +extern "C" int equi_verify_sol(void * const hdr, void * const sol) +{ + bool res = equi_verify((uint8_t*) hdr, (uint8_t*) sol); + + //applog_hex((void*)hdr, 140); + //applog_hex((void*)sol, 1344); + + return res ?
1 : 0; +} + +#include <assert.h> + +//#define EQNONCE_OFFSET 30 /* 27:34 */ +#define NONCE_OFT EQNONCE_OFFSET + +static bool init[MAX_GPUS] = { 0 }; +static int valid_sols[MAX_GPUS] = { 0 }; +static uint8_t _ALIGN(64) data_sols[MAX_GPUS][MAXREALSOLS][1536] = { 0 }; // 140+3+1344 required +static eq_cuda_context_interface* solvers[MAX_GPUS] = { NULL }; + +static void CompressArray(const unsigned char* in, size_t in_len, + unsigned char* out, size_t out_len, size_t bit_len, size_t byte_pad) +{ + assert(bit_len >= 8); + assert(8 * sizeof(uint32_t) >= 7 + bit_len); + + size_t in_width = (bit_len + 7) / 8 + byte_pad; + assert(out_len == bit_len*in_len / (8 * in_width)); + + uint32_t bit_len_mask = (1UL << bit_len) - 1; + + // The acc_bits least-significant bits of acc_value represent a bit sequence + // in big-endian order. + size_t acc_bits = 0; + uint32_t acc_value = 0; + + size_t j = 0; + for (size_t i = 0; i < out_len; i++) { + // When we have fewer than 8 bits left in the accumulator, read the next + // input element. + if (acc_bits < 8) { + acc_value = acc_value << bit_len; + for (size_t x = byte_pad; x < in_width; x++) { + acc_value = acc_value | ( + ( + // Apply bit_len_mask across byte boundaries + in[j + x] & ((bit_len_mask >> (8 * (in_width - x - 1))) & 0xFF) + ) << (8 * (in_width - x - 1))); // Big-endian + } + j += in_width; + acc_bits += bit_len; + } + + acc_bits -= 8; + out[i] = (acc_value >> acc_bits) & 0xFF; + } +} + +#ifndef htobe32 +#define htobe32(x) swab32(x) +#endif + +static void EhIndexToArray(const u32 i, unsigned char* arr) +{ + u32 bei = htobe32(i); + memcpy(arr, &bei, sizeof(u32)); +} + +static std::vector<unsigned char> GetMinimalFromIndices(std::vector<uint32_t> indices, size_t cBitLen) +{ + assert(((cBitLen + 1) + 7) / 8 <= sizeof(u32)); + size_t lenIndices = indices.size()*sizeof(u32); + size_t minLen = (cBitLen + 1)*lenIndices / (8 * sizeof(u32)); + size_t bytePad = sizeof(u32) - ((cBitLen + 1) + 7) / 8; + std::vector<unsigned char> array(lenIndices); + for (size_t i = 0; i < indices.size(); i++) { + EhIndexToArray(indices[i], array.data() + (i*sizeof(u32))); + } + std::vector<unsigned char> ret(minLen); + CompressArray(array.data(), lenIndices, ret.data(), minLen, cBitLen + 1, bytePad); + return ret; +} + +// solver callbacks +static void cb_solution(int thr_id, const std::vector<uint32_t>& solutions, size_t cbitlen, const unsigned char *compressed_sol) +{ + std::vector<unsigned char> nSolution; + if (!compressed_sol) { + nSolution = GetMinimalFromIndices(solutions, cbitlen); + } else { + gpulog(LOG_INFO, thr_id, "compressed_sol"); + nSolution = std::vector<unsigned char>(1344); + for (size_t i = 0; i < cbitlen; i++) + nSolution[i] = compressed_sol[i]; + } + int nsol = valid_sols[thr_id]; + if (nsol < 0) nsol = 0; + if(nSolution.size() == 1344) { + // todo, only store solution data here...
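+ // data_sols slot layout: 140-byte block header, 3-byte CompactSize length (fd 40 05 = 1344), then the 1344-byte solution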
+ le32enc(&data_sols[thr_id][nsol][140], 0x000540fd); // sol sz header + memcpy(&data_sols[thr_id][nsol][143], nSolution.data(), 1344); + valid_sols[thr_id] = nsol + 1; + } +} +static void cb_hashdone(int thr_id) { + if (!valid_sols[thr_id]) valid_sols[thr_id] = -1; +} +static bool cb_cancel(int thr_id) { + if (work_restart[thr_id].restart) + valid_sols[thr_id] = -1; + return work_restart[thr_id].restart; +} + +extern "C" int scanhash_equihash(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) endiandata[35]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[NONCE_OFT]; + uint32_t nonce_increment = rand() & 0xFF; // nonce randomizer + struct timeval tv_start, tv_end, diff; + double secs, solps; + uint32_t soluce_count = 0; + + if (opt_benchmark) + ptarget[7] = 0xfffff; + + if (!init[thr_id]) { + try { + int mode = 1; + switch (mode) { + case 1: + solvers[thr_id] = new eq_cuda_context<CONFIG_MODE_1>(thr_id, device_map[thr_id]); + break; +#ifdef CONFIG_MODE_2 + case 2: + solvers[thr_id] = new eq_cuda_context<CONFIG_MODE_2>(thr_id, device_map[thr_id]); + break; +#endif +#ifdef CONFIG_MODE_3 + case 3: + solvers[thr_id] = new eq_cuda_context<CONFIG_MODE_3>(thr_id, device_map[thr_id]); + break; +#endif + default: + proper_exit(EXIT_CODE_SW_INIT_ERROR); + return -1; + } + size_t memSz = solvers[thr_id]->equi_mem_sz / (1024*1024); + gpus_intensity[thr_id] = (uint32_t) solvers[thr_id]->throughput; + api_set_throughput(thr_id, gpus_intensity[thr_id]); + gpulog(LOG_DEBUG, thr_id, "Allocated %u MB of context memory", (u32) memSz); + cuda_get_arch(thr_id); + init[thr_id] = true; + } catch (const std::exception & e) { + CUDA_LOG_ERROR(); + gpulog(LOG_ERR, thr_id, "init: %s", e.what()); + proper_exit(EXIT_CODE_CUDA_ERROR); + } + } + + gettimeofday(&tv_start, NULL); + memcpy(endiandata, pdata, 140); + work->valid_nonces = 0; + + do { + + try { + + valid_sols[thr_id] = 0; + solvers[thr_id]->solve( + (const char *) endiandata, (unsigned int) (140 - 32), + (const char *) &endiandata[27], (unsigned int) 32, + &cb_cancel, &cb_solution, &cb_hashdone + ); + + *hashes_done = soluce_count; + + } catch (const std::exception & e) { + gpulog(LOG_WARNING, thr_id, "solver: %s", e.what()); + free_equihash(thr_id); + sleep(1); + return -1; + } + + if (valid_sols[thr_id] > 0) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + uint8_t _ALIGN(64) full_data[140+3+1344] = { 0 }; + uint8_t* sol_data = &full_data[140]; + + soluce_count += valid_sols[thr_id]; + + for (int nsol=0; nsol < valid_sols[thr_id]; nsol++) + { + memcpy(full_data, endiandata, 140); + memcpy(sol_data, &data_sols[thr_id][nsol][140], 1347); + equi_hash(full_data, vhash, 140+3+1344); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) + { + bool valid = equi_verify_sol(endiandata, &sol_data[3]); + if (valid && work->valid_nonces < MAX_NONCES) { + work->valid_nonces++; + memcpy(work->data, endiandata, 140); + equi_store_work_solution(work, vhash, sol_data); + work->nonces[work->valid_nonces-1] = endiandata[NONCE_OFT]; + pdata[NONCE_OFT] = endiandata[NONCE_OFT] + 1; + //applog_hex(vhash, 32); + //applog_hex(&work->data[27], 32); + goto out; // second solution storage not handled..
+ } + } + if (work->valid_nonces == MAX_NONCES) goto out; + } + if (work->valid_nonces) + goto out; + + valid_sols[thr_id] = 0; + } + + endiandata[NONCE_OFT] += nonce_increment; + + } while (!work_restart[thr_id].restart); + +out: + gettimeofday(&tv_end, NULL); + timeval_subtract(&diff, &tv_end, &tv_start); + secs = (1.0 * diff.tv_sec) + (0.000001 * diff.tv_usec); + solps = (double)soluce_count / secs; + gpulog(LOG_DEBUG, thr_id, "%d solutions in %.2f s (%.2f Sol/s)", soluce_count, secs, solps); + + // report the solution count as hashes done (displayed as Sol/s) + *hashes_done = soluce_count; + + pdata[NONCE_OFT] = endiandata[NONCE_OFT] + 1; + + return work->valid_nonces; +} + +// cleanup +void free_equihash(int thr_id) +{ + if (!init[thr_id]) + return; + + delete(solvers[thr_id]); + solvers[thr_id] = NULL; + + init[thr_id] = false; +} + +// default no-op implementations for the abstract solver interface +void eq_cuda_context_interface::solve(const char *tequihash_header, unsigned int tequihash_header_len, + const char* nonce, unsigned int nonce_len, + fn_cancel cancelf, fn_solution solutionf, fn_hashdone hashdonef) { } +eq_cuda_context_interface::~eq_cuda_context_interface() { } diff --git a/equi/equihash.h b/equi/equihash.h new file mode 100644 index 0000000..cdf47fa --- /dev/null +++ b/equi/equihash.h @@ -0,0 +1,19 @@ +#ifndef EQUIHASH_H +#define EQUIHASH_H + +#include <stdint.h> + +// miner nonce "cursor" unique for each thread +#define EQNONCE_OFFSET 30 /* 27:34 */ + +#define WK 9 +#define WN 200 +//#define CONFIG_MODE_1 9, 1248, 12, 640, packer_cantor /* eqcuda.hpp */ + +extern "C" { + void equi_hash(const void* input, void* output, int len); + int equi_verify_sol(void* const hdr, void* const soln); + bool equi_verify(uint8_t* const hdr, uint8_t* const soln); +} + +#endif diff --git a/miner.h b/miner.h index f720bc7..307ff7d 100644 --- a/miner.h +++ b/miner.h @@ -282,6 +282,7 @@ extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonc extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_equihash(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_fresh(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); @@ -340,6 +341,7 @@ extern void free_cryptolight(int thr_id); extern void free_cryptonight(int thr_id); extern void free_decred(int thr_id); extern void free_deep(int thr_id); +extern void free_equihash(int thr_id); extern void free_keccak256(int thr_id); extern void free_fresh(int thr_id); extern void free_fugue256(int thr_id); @@ -604,6 +606,7 @@ void cuda_clear_lasterror(); #define CL_WHT "\x1B[01;37m" /* white */ extern void format_hashrate(double hashrate, char *output); +extern void format_hashrate_unit(double hashrate, char *output, const char* unit); extern void applog(int prio, const char *fmt, ...); extern void gpulog(int prio, int thr_id, const char *fmt, ...); @@ -679,6 +682,7 @@ struct stratum_ctx { time_t tm_connected; int rpc2; + int is_equihash; int srvtime_diff; }; @@ -722,6 +726,8 @@ struct work { /* pok getwork txs */ uint32_t tx_count; struct tx txs[POK_MAX_TXS]; + // zec
solution + uint8_t extra[1388]; }; #define POK_BOOL_MASK 0x00008000 @@ -804,6 +810,14 @@ void stratum_free_job(struct stratum_ctx *sctx); bool rpc2_stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass); +bool equi_stratum_notify(struct stratum_ctx *sctx, json_t *params); +bool equi_stratum_set_target(struct stratum_ctx *sctx, json_t *params); +bool equi_stratum_submit(struct pool_infos *pool, struct work *work); +void equi_work_set_target(struct work* work, double diff); +void equi_store_work_solution(struct work* work, uint32_t* hash, void* sol_data); +int equi_verify_sol(void * const hdr, void * const sol); +double equi_network_diff(struct work *work); + void hashlog_remember_submit(struct work* work, uint32_t nonce); void hashlog_remember_scan_range(struct work* work); double hashlog_get_sharediff(char* jobid, int idnonce, double defvalue); diff --git a/util.cpp b/util.cpp index 7c2878b..e15fbf6 100644 --- a/util.cpp +++ b/util.cpp @@ -211,35 +211,31 @@ void get_defconfig_path(char *out, size_t bufsize, char *argv0) #endif } -void format_hashrate(double hashrate, char *output) +void format_hashrate_unit(double hashrate, char *output, const char *unit) { - char prefix = '\0'; + char prefix[2] = { 0, 0 }; if (hashrate < 10000) { // nop } else if (hashrate < 1e7) { - prefix = 'k'; + prefix[0] = 'k'; hashrate *= 1e-3; } else if (hashrate < 1e10) { - prefix = 'M'; + prefix[0] = 'M'; hashrate *= 1e-6; } else if (hashrate < 1e13) { - prefix = 'G'; + prefix[0] = 'G'; hashrate *= 1e-9; } else { - prefix = 'T'; + prefix[0] = 'T'; hashrate *= 1e-12; } - sprintf( - output, - prefix ? "%.2f %cH/s" : "%.2f H/s%c", - hashrate, prefix - ); + sprintf(output, "%.2f %s%s", hashrate, prefix, unit); } static void databuf_free(struct data_buffer *db) @@ -1179,14 +1175,27 @@ static bool stratum_parse_extranonce(struct stratum_ctx *sctx, json_t *params, i } xn2_size = (int) json_integer_value(json_array_get(params, pndx+1)); if (!xn2_size) { - applog(LOG_ERR, "Failed to get extranonce2_size"); - goto out; + char algo[64] = { 0 }; + get_currentalgo(algo, sizeof(algo)); + if (strcmp(algo, "equihash") == 0) { + int xn1_size = (int)strlen(xnonce1) / 2; + xn2_size = 32 - xn1_size; + if (xn1_size < 4 || xn1_size > 12) { + // This miner iterates the nonces at data32[30] + applog(LOG_ERR, "Unsupported extranonce size of %d (12 maxi)", xn1_size); + goto out; + } + goto skip_n2; + } else { + applog(LOG_ERR, "Failed to get extranonce2_size"); + goto out; + } } if (xn2_size < 2 || xn2_size > 16) { - applog(LOG_INFO, "Failed to get valid n2size in parse_extranonce"); + applog(LOG_ERR, "Failed to get valid n2size in parse_extranonce (%d)", xn2_size); goto out; } - +skip_n2: pthread_mutex_lock(&stratum_work_lock); if (sctx->xnonce1) free(sctx->xnonce1); @@ -1441,6 +1450,10 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) get_currentalgo(algo, sizeof(algo)); bool has_claim = !strcasecmp(algo, "lbry"); + if (sctx->is_equihash) { + return equi_stratum_notify(sctx, params); + } + job_id = json_string_value(json_array_get(params, p++)); prevhash = json_string_value(json_array_get(params, p++)); if (has_claim) { @@ -1842,6 +1855,11 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s) ret = stratum_set_difficulty(sctx, params); goto out; } + if (!strcasecmp(method, "mining.set_target")) { + sctx->is_equihash = true; + ret = equi_stratum_set_target(sctx, params); + goto out; + } if (!strcasecmp(method, "mining.set_extranonce")) { ret = stratum_parse_extranonce(sctx, 
params, 0); goto out;