From 1b7c2fc296184ac577f0a238c613b1f42f3fc339 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Tue, 7 Mar 2017 20:05:03 +0100 Subject: [PATCH] lyra2z algo (temporary algo) based on djm34 version, cleaned up and adapted to ccminer 2.0 --- Makefile.am | 1 + algos.h | 2 + bench.cpp | 1 + ccminer.cpp | 6 + ccminer.vcxproj | 20 +- ccminer.vcxproj.filters | 15 + lyra2/Lyra2Z.c | 215 +++++++++ lyra2/Lyra2Z.h | 42 ++ lyra2/cuda_lyra2Z.cu | 966 +++++++++++++++++++++++++++++++++++++ lyra2/cuda_lyra2Z_sm5.cuh | 819 +++++++++++++++++++++++++++++++ lyra2/cuda_lyra2_vectors.h | 8 +- lyra2/lyra2Z.cu | 164 +++++++ miner.h | 3 + util.cpp | 3 + 14 files changed, 2252 insertions(+), 13 deletions(-) create mode 100644 lyra2/Lyra2Z.c create mode 100644 lyra2/Lyra2Z.h create mode 100644 lyra2/cuda_lyra2Z.cu create mode 100644 lyra2/cuda_lyra2Z_sm5.cuh create mode 100644 lyra2/lyra2Z.cu diff --git a/Makefile.am b/Makefile.am index 19dfc0c..e4186b0 100644 --- a/Makefile.am +++ b/Makefile.am @@ -34,6 +34,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ lyra2/Lyra2.c lyra2/Sponge.c \ lyra2/lyra2RE.cu lyra2/cuda_lyra2.cu \ lyra2/lyra2REv2.cu lyra2/cuda_lyra2v2.cu \ + lyra2/Lyra2Z.c lyra2/lyra2Z.cu lyra2/cuda_lyra2Z.cu \ Algo256/cuda_bmw256.cu Algo256/cuda_cubehash256.cu \ Algo256/cuda_blake256.cu Algo256/cuda_groestl256.cu Algo256/cuda_keccak256.cu Algo256/cuda_skein256.cu \ Algo256/blake256.cu Algo256/decred.cu Algo256/vanilla.cu Algo256/keccak256.cu \ diff --git a/algos.h b/algos.h index e16067a..a93219c 100644 --- a/algos.h +++ b/algos.h @@ -26,6 +26,7 @@ enum sha_algos { ALGO_LUFFA, ALGO_LYRA2, ALGO_LYRA2v2, + ALGO_LYRA2Z, ALGO_MJOLLNIR, /* Hefty hash */ ALGO_MYR_GR, ALGO_NEOSCRYPT, @@ -82,6 +83,7 @@ static const char *algo_names[] = { "luffa", "lyra2", "lyra2v2", + "lyra2z", "mjollnir", "myr-gr", "neoscrypt", diff --git a/bench.cpp b/bench.cpp index 7a36908..46db206 100644 --- a/bench.cpp +++ b/bench.cpp @@ -63,6 +63,7 @@ void algo_free_all(int thr_id) free_luffa(thr_id); 
free_lyra2(thr_id); free_lyra2v2(thr_id); + free_lyra2Z(thr_id); free_myriad(thr_id); free_neoscrypt(thr_id); free_nist5(thr_id); diff --git a/ccminer.cpp b/ccminer.cpp index 5729c2a..a1fae8d 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -244,6 +244,7 @@ Options:\n\ luffa Joincoin\n\ lyra2 CryptoCoin\n\ lyra2v2 VertCoin\n\ + lyra2z ZeroCoin (3rd impl)\n\ mjollnir Mjollnircoin\n\ myr-gr Myriad-Groestl\n\ neoscrypt FeatherCoin, Phoenix, UFO...\n\ @@ -1616,6 +1617,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) case ALGO_GROESTL: case ALGO_LBRY: case ALGO_LYRA2v2: + case ALGO_LYRA2Z: work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty)); break; case ALGO_KECCAK: @@ -2131,6 +2133,7 @@ static void *miner_thread(void *userdata) minmax = 0x300000; break; case ALGO_LYRA2: + case ALGO_LYRA2Z: case ALGO_NEOSCRYPT: case ALGO_SIB: case ALGO_SCRYPT: @@ -2272,6 +2275,9 @@ static void *miner_thread(void *userdata) case ALGO_LYRA2v2: rc = scanhash_lyra2v2(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_LYRA2Z: + rc = scanhash_lyra2Z(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_NEOSCRYPT: rc = scanhash_neoscrypt(thr_id, &work, max_nonce, &hashes_done); break; diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 321b366..9e926a0 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -39,10 +39,10 @@ - + - + @@ -256,7 +256,7 @@ - + @@ -383,7 +383,7 @@ - + @@ -505,6 +505,11 @@ + + + + + 64 @@ -567,11 +572,8 @@ - - - - - + + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index b2e0d04..0febc2c 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -255,6 +255,9 @@ Source Files\sph + + Source Files\sph + Source Files\CUDA\scrypt @@ -473,6 +476,9 @@ Header Files\lyra2 + + Header Files\lyra2 + Header Files\lyra2 @@ -506,6 +512,9 @@ Source Files\CUDA\lyra2 + + Source Files\CUDA\lyra2 + Source Files\CUDA\quark @@ -820,6 +829,12 @@ Source Files\CUDA\lyra2 + + Source Files\CUDA\lyra2 + + + 
Source Files\CUDA\lyra2 + Source Files\CUDA\Algo256 diff --git a/lyra2/Lyra2Z.c b/lyra2/Lyra2Z.c new file mode 100644 index 0000000..edf463b --- /dev/null +++ b/lyra2/Lyra2Z.c @@ -0,0 +1,215 @@ +/** + * Implementation of the Lyra2 Password Hashing Scheme (PHS). + * + * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014. + * + * This software is hereby placed in the public domain. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include +#include + +#include "Lyra2Z.h" +#include "Sponge.h" + +/** + * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords + * whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits, + * where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all + * integer parameters (treated as type "unsigned int") in the order they are provided, plus the value + * of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols). 
+ * + * @param K The derived key to be output by the algorithm + * @param kLen Desired key length + * @param pwd User password + * @param pwdlen Password length + * @param salt Salt + * @param saltlen Salt length + * @param timeCost Parameter to determine the processing time (T) + * @param nRows Number of rows of the memory matrix (R) + * @param nCols Number of columns of the memory matrix (C) + * + * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation), in which case nothing stays allocated + */ +int LYRA2Z(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols) +{ + //============================= Basic variables ============================// + int64_t row = 2; //index of row to be processed + int64_t prev = 1; //index of prev (last row ever computed/modified) + int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering) + int64_t tau; //Time Loop iterator + int64_t step = 1; //Visitation step (used during Setup and Wandering phases) + int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) + int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 + int64_t i; //auxiliary iteration counter + int64_t v64; // 64bit var for memcpy + //==========================================================================/ + + //========== Initializing the Memory Matrix and pointers to it =============// + //Tries to allocate enough zero-initialized space for the whole memory matrix + + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; + const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + // for Lyra2REv2, nCols = 4, v1 was using 8 + const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64; + + size_t sz = (size_t)ROW_LEN_BYTES * nRows; + uint64_t *wholeMatrix = calloc(1, sz); + if (wholeMatrix == NULL) { + return -1; + } + + 
//Allocates pointers to each row of the matrix (on failure the matrix itself is released before returning) + uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows); + if (memMatrix == NULL) { + free(wholeMatrix); + return -1; + } + //Places the pointers in the correct positions + uint64_t *ptrWord = wholeMatrix; + for (i = 0; i < nRows; i++) { + memMatrix[i] = ptrWord; + ptrWord += ROW_LEN_INT64; + } + //==========================================================================/ + + //============= Getting the password + salt + basil padded with 10*1 ===============// + //OBS.:The memory matrix will temporarily hold the password: not for saving memory, + //but this ensures that the password copied locally will be overwritten as soon as possible + + //First, we clean enough blocks for the password, salt, basil and padding + int64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof(uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1; + + byte *ptrByte = (byte*) wholeMatrix; + + //Prepends the password + memcpy(ptrByte, pwd, pwdlen); + ptrByte += pwdlen; + + //Concatenates the salt + memcpy(ptrByte, salt, saltlen); + ptrByte += saltlen; + + memset(ptrByte, 0, (size_t) (nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - (saltlen + pwdlen))); + + //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface + memcpy(ptrByte, &kLen, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = pwdlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = saltlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = timeCost; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nRows; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nCols; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + + //Now comes the padding + *ptrByte = 0x80; //first byte of padding: right after the password + ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix + ptrByte += 
nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block + *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block + //==========================================================================/ + + //======================= Initializing the Sponge State ====================// + //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) + uint64_t state[16]; + initState(state); + //==========================================================================/ + + //================================ Setup Phase =============================// + //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits + ptrWord = wholeMatrix; + for (i = 0; i < nBlocksInput; i++) { + absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil) + ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil) + } + + //Initializes M[0] and M[1] + reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here + + reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols); + + do { + //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) + + reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); + + //updates the value of row* (deterministically picked during Setup) + rowa = (rowa + step) & (window - 1); + //update prev: it now points to the last row ever computed + prev = row; + //updates row: goes to the next row to be computed + row++; + + //Checks if all rows in the window were visited. 
+ if (rowa == 0) { + step = window + gap; //changes the step: approximately doubles its value + window *= 2; //doubles the size of the re-visitation window + gap = -gap; //inverts the modifier to the step + } + + } while (row < nRows); + //==========================================================================/ + + //============================ Wandering Phase =============================// + row = 0; //Resets the visitation to the first row of the memory matrix + for (tau = 1; tau <= timeCost; tau++) { + //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1 + step = (tau % 2 == 0) ? -1 : nRows / 2 - 1; + do { + //Selects a pseudorandom index row* + //------------------------------------------------------------------------------------------ + rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------------------------------------------------------ + + //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] + reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); + + //update prev: it now points to the last row ever computed + prev = row; + + //updates row: goes to the next row to be computed + //------------------------------------------------------------------------------------------ + row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------------------------------------------------------ + + } while (row != 0); + } + + //============================ Wrap-up Phase ===============================// + //Absorbs the last block of the memory matrix + absorbBlock(state, memMatrix[rowa]); + + //Squeezes the key + squeeze(state, K, (unsigned int) kLen); + + 
//========================= Freeing the memory =============================// + free(memMatrix); + free(wholeMatrix); + + return 0; +} + diff --git a/lyra2/Lyra2Z.h b/lyra2/Lyra2Z.h new file mode 100644 index 0000000..aaade36 --- /dev/null +++ b/lyra2/Lyra2Z.h @@ -0,0 +1,42 @@ +/** + * Header file for the Lyra2 Password Hashing Scheme (PHS). + * + * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014. + * + * This software is hereby placed in the public domain. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef LYRA2Z_H_ +#define LYRA2Z_H_ + +#include + +typedef unsigned char byte; + +//Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED) +#define BLOCK_LEN_BLAKE2_SAFE_INT64 8 //512 bits (=64 bytes, =8 uint64_t) +#define BLOCK_LEN_BLAKE2_SAFE_BYTES (BLOCK_LEN_BLAKE2_SAFE_INT64 * 8) //same as above, in bytes + + +#ifdef BLOCK_LEN_BITS + #define BLOCK_LEN_INT64 (BLOCK_LEN_BITS/64) //Block length in uint64_t words (12 when BLOCK_LEN_BITS is 768) + #define BLOCK_LEN_BYTES (BLOCK_LEN_BITS/8) //Block length, in bytes +#else //default block length: 768 bits + #define BLOCK_LEN_INT64 12 //Block length: 768 bits (=96 bytes, =12 uint64_t) + #define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes +#endif + +int LYRA2Z(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols); + +#endif /* LYRA2Z_H_ */ diff --git a/lyra2/cuda_lyra2Z.cu b/lyra2/cuda_lyra2Z.cu new file mode 100644 index 0000000..1dc215a --- /dev/null +++ b/lyra2/cuda_lyra2Z.cu @@ -0,0 +1,966 @@ +/** + * Lyra2 (v1) cuda implementation based on djm34 work + * tpruvot@github 2015, Nanashi 08/2016 (from 1.8-r2) + * Lyra2Z implementation for Zcoin based on all the previous + * djm34 2017 + **/ + +#include +#include + +#define TPB52 32 +#define TPB30 160 +#define TPB20 160 + +#include "cuda_lyra2Z_sm5.cuh" + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +__device__ uint32_t __shfl(uint32_t a, uint32_t b, uint32_t c); +#define atomicMin() +#define __CUDA_ARCH__ 520 +#endif + +static uint32_t *h_GNonces[16]; // this needs to get fixed as the rest of that routine +static uint32_t *d_GNonces[16]; + +#define reduceDuplexRow(rowIn, rowInOut, rowOut) { \ + for (int i = 0; i < 8; i++) { \ + for (int j = 0; j < 12; j++) \ + state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \ + round_lyra_sm2(state); \ + for (int j = 0; j < 12; j++) \ + 
Matrix[j + 12 * i][rowOut] ^= state[j]; \ + Matrix[0 + 12 * i][rowInOut] ^= state[11]; \ + Matrix[1 + 12 * i][rowInOut] ^= state[0]; \ + Matrix[2 + 12 * i][rowInOut] ^= state[1]; \ + Matrix[3 + 12 * i][rowInOut] ^= state[2]; \ + Matrix[4 + 12 * i][rowInOut] ^= state[3]; \ + Matrix[5 + 12 * i][rowInOut] ^= state[4]; \ + Matrix[6 + 12 * i][rowInOut] ^= state[5]; \ + Matrix[7 + 12 * i][rowInOut] ^= state[6]; \ + Matrix[8 + 12 * i][rowInOut] ^= state[7]; \ + Matrix[9 + 12 * i][rowInOut] ^= state[8]; \ + Matrix[10+ 12 * i][rowInOut] ^= state[9]; \ + Matrix[11+ 12 * i][rowInOut] ^= state[10]; \ + } \ + } + +#define absorbblock(in) { \ + state[0] ^= Matrix[0][in]; \ + state[1] ^= Matrix[1][in]; \ + state[2] ^= Matrix[2][in]; \ + state[3] ^= Matrix[3][in]; \ + state[4] ^= Matrix[4][in]; \ + state[5] ^= Matrix[5][in]; \ + state[6] ^= Matrix[6][in]; \ + state[7] ^= Matrix[7][in]; \ + state[8] ^= Matrix[8][in]; \ + state[9] ^= Matrix[9][in]; \ + state[10] ^= Matrix[10][in]; \ + state[11] ^= Matrix[11][in]; \ + round_lyra_sm2(state); \ + round_lyra_sm2(state); \ + round_lyra_sm2(state); \ + round_lyra_sm2(state); \ + round_lyra_sm2(state); \ + round_lyra_sm2(state); \ + round_lyra_sm2(state); \ + round_lyra_sm2(state); \ + round_lyra_sm2(state); \ + round_lyra_sm2(state); \ + round_lyra_sm2(state); \ + round_lyra_sm2(state); \ + } + +__device__ __forceinline__ +static void round_lyra_sm2(uint2 *s) +{ + Gfunc(s[0], s[4], s[8], s[12]); + Gfunc(s[1], s[5], s[9], s[13]); + Gfunc(s[2], s[6], s[10], s[14]); + Gfunc(s[3], s[7], s[11], s[15]); + Gfunc(s[0], s[5], s[10], s[15]); + Gfunc(s[1], s[6], s[11], s[12]); + Gfunc(s[2], s[7], s[8], s[13]); + Gfunc(s[3], s[4], s[9], s[14]); +} + +__device__ __forceinline__ +void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2 state[16], uint2 Matrix[96][8]) +{ +#if __CUDA_ARCH__ > 500 +#pragma unroll +#endif + for (int i = 0; i < 8; i++) + { + #pragma unroll + for (int j = 0; j < 12; j++) + state[j] ^= Matrix[12 
* i + j][rowIn] + Matrix[12 * i + j][rowInOut]; + + round_lyra_sm2(state); + + #pragma unroll + for (int j = 0; j < 12; j++) + Matrix[j + 84 - 12 * i][rowOut] = Matrix[12 * i + j][rowIn] ^ state[j]; + + Matrix[0 + 12 * i][rowInOut] ^= state[11]; + Matrix[1 + 12 * i][rowInOut] ^= state[0]; + Matrix[2 + 12 * i][rowInOut] ^= state[1]; + Matrix[3 + 12 * i][rowInOut] ^= state[2]; + Matrix[4 + 12 * i][rowInOut] ^= state[3]; + Matrix[5 + 12 * i][rowInOut] ^= state[4]; + Matrix[6 + 12 * i][rowInOut] ^= state[5]; + Matrix[7 + 12 * i][rowInOut] ^= state[6]; + Matrix[8 + 12 * i][rowInOut] ^= state[7]; + Matrix[9 + 12 * i][rowInOut] ^= state[8]; + Matrix[10 + 12 * i][rowInOut] ^= state[9]; + Matrix[11 + 12 * i][rowInOut] ^= state[10]; + } +} + +#if __CUDA_ARCH__ < 350 + +__constant__ static uint2 blake2b_IV_sm2[8] = { + { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, + { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a }, + { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, + { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } +}; + +__global__ __launch_bounds__(TPB30, 1) +void lyra2Z_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resNonces) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint2 Mask[8] = { + { 0x00000020, 0x00000000 },{ 0x00000020, 0x00000000 }, + { 0x00000020, 0x00000000 },{ 0x00000008, 0x00000000 }, + { 0x00000008, 0x00000000 },{ 0x00000008, 0x00000000 }, + { 0x00000080, 0x00000000 },{ 0x00000000, 0x01000000 } + }; + if (thread < threads) + { + uint2 state[16]; + + #pragma unroll + for (int i = 0; i<4; i++) { + LOHI(state[i].x, state[i].y, g_hash[threads*i + thread]); + } //password + + #pragma unroll + for (int i = 0; i<4; i++) { + state[i + 4] = state[i]; + } //salt + + #pragma unroll + for (int i = 0; i<8; i++) { + state[i + 8] = blake2b_IV_sm2[i]; + } + + // blake2blyra x2 + //#pragma unroll 24 + for (int i = 0; i<12; i++) { + round_lyra_sm2(state); + } + + for (int i = 0; 
i<8; i++) + state[i] ^= Mask[i]; + + for (int i = 0; i<12; i++) { + round_lyra_sm2(state); + } + + uint2 Matrix[96][8]; // not cool + + // reducedSqueezeRow0 + #pragma unroll 8 + for (int i = 0; i < 8; i++) + { + #pragma unroll 12 + for (int j = 0; j<12; j++) { + Matrix[j + 84 - 12 * i][0] = state[j]; + } + round_lyra_sm2(state); + } + + // reducedSqueezeRow1 + #pragma unroll 8 + for (int i = 0; i < 8; i++) + { + #pragma unroll 12 + for (int j = 0; j<12; j++) { + state[j] ^= Matrix[j + 12 * i][0]; + } + round_lyra_sm2(state); + #pragma unroll 12 + for (int j = 0; j<12; j++) { + Matrix[j + 84 - 12 * i][1] = Matrix[j + 12 * i][0] ^ state[j]; + } + } + + reduceDuplexRowSetup(1, 0, 2, state, Matrix); + reduceDuplexRowSetup(2, 1, 3, state, Matrix); + reduceDuplexRowSetup(3, 0, 4, state, Matrix); + reduceDuplexRowSetup(4, 3, 5, state, Matrix); + reduceDuplexRowSetup(5, 2, 6, state, Matrix); + reduceDuplexRowSetup(6, 1, 7, state, Matrix); + + uint32_t rowa; + uint32_t prev = 7; + uint32_t iterator = 0; + for (uint32_t i = 0; i<8; i++) { + rowa = state[0].x & 7; + reduceDuplexRow(prev, rowa, iterator); + prev = iterator; + iterator = (iterator + 3) & 7; + } + for (uint32_t i = 0; i<8; i++) { + rowa = state[0].x & 7; + reduceDuplexRow(prev, rowa, iterator); + prev = iterator; + iterator = (iterator - 1) & 7; + } + + for (uint32_t i = 0; i<8; i++) { + rowa = state[0].x & 7; + reduceDuplexRow(prev, rowa, iterator); + prev = iterator; + iterator = (iterator + 3) & 7; + } + for (uint32_t i = 0; i<8; i++) { + rowa = state[0].x & 7; + reduceDuplexRow(prev, rowa, iterator); + prev = iterator; + iterator = (iterator - 1) & 7; + } + + for (uint32_t i = 0; i<8; i++) { + rowa = state[0].x & 7; + reduceDuplexRow(prev, rowa, iterator); + prev = iterator; + iterator = (iterator + 3) & 7; + } + for (uint32_t i = 0; i<8; i++) { + rowa = state[0].x & 7; + reduceDuplexRow(prev, rowa, iterator); + prev = iterator; + iterator = (iterator - 1) & 7; + } + + for (uint32_t i = 0; i<8; i++) { + 
rowa = state[0].x & 7; + reduceDuplexRow(prev, rowa, iterator); + prev = iterator; + iterator = (iterator + 3) & 7; + } + for (uint32_t i = 0; i<8; i++) { + rowa = state[0].x & 7; + reduceDuplexRow(prev, rowa, iterator); + prev = iterator; + iterator = (iterator - 1) & 7; + } + + absorbblock(rowa); + uint32_t nonce = startNounce + thread; + if (((uint64_t*)state)[3] <= ((uint64_t*)pTarget)[3]) { + atomicMin(&resNonces[1], resNonces[0]); + atomicMin(&resNonces[0], nonce); + } + } //thread +} +#else +__global__ void lyra2Z_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resNonces) {} +#endif + +#if __CUDA_ARCH__ > 500 + +#include "cuda_lyra2_vectors.h" +//#include "cuda_vector_uint2x4.h" + +#define Nrow 8 +#define Ncol 8 +#define memshift 3 + +#define BUF_COUNT 0 + +__device__ uint2 *DMatrix; + +__device__ __forceinline__ +void LD4S(uint2 res[3], const int row, const int col, const int thread, const int threads) +{ +#if BUF_COUNT != 8 + extern __shared__ uint2 shared_mem[]; + const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift; +#endif +#if BUF_COUNT != 0 + const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x; +#endif + +#if BUF_COUNT == 8 + #pragma unroll + for (int j = 0; j < 3; j++) + res[j] = *(DMatrix + d0 + j * threads * blockDim.x); +#elif BUF_COUNT == 0 + #pragma unroll + for (int j = 0; j < 3; j++) + res[j] = shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; +#else + if (row < BUF_COUNT) { + #pragma unroll + for (int j = 0; j < 3; j++) + res[j] = *(DMatrix + d0 + j * threads * blockDim.x); + } else { + #pragma unroll + for (int j = 0; j < 3; j++) + res[j] = shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; + } +#endif +} + +__device__ __forceinline__ +void ST4S(const int row, const int col, const uint2 data[3], const int thread, const int threads) +{ +#if BUF_COUNT != 8 + extern __shared__ uint2 shared_mem[]; + const int s0 
= (Ncol * (row - BUF_COUNT) + col) * memshift; +#endif +#if BUF_COUNT != 0 + const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x; +#endif + +#if BUF_COUNT == 8 + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + d0 + j * threads * blockDim.x) = data[j]; + +#elif BUF_COUNT == 0 + #pragma unroll + for (int j = 0; j < 3; j++) + shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data[j]; +#else + if (row < BUF_COUNT) { + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + d0 + j * threads * blockDim.x) = data[j]; + } else { + #pragma unroll + for (int j = 0; j < 3; j++) + shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data[j]; + } +#endif +} + +#if __CUDA_ARCH__ >= 300 +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + return __shfl(a, b, c); +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c)); +} + +__device__ __forceinline__ +void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + a1 = WarpShuffle(a1, b1, c); + a2 = WarpShuffle(a2, b2, c); + a3 = WarpShuffle(a3, b3, c); +} + +#else +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + uint32_t *_ptr = (uint32_t*)shared_mem; + + __threadfence_block(); + uint32_t buf = _ptr[thread]; + + _ptr[thread] = a; + __threadfence_block(); + uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + _ptr[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + 
__threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a; + __threadfence_block(); + uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a1; + __threadfence_block(); + a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a2; + __threadfence_block(); + a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a3; + __threadfence_block(); + a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + __threadfence_block(); +} +#endif + +__device__ __forceinline__ void round_lyra(uint2 s[4]) +{ + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4); + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4); +} + +static __device__ __forceinline__ +void round_lyra(uint2x4* s) +{ + Gfunc(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc(s[0].z, s[1].z, s[2].z, s[3].z); + Gfunc(s[0].w, s[1].w, s[2].w, s[3].w); + Gfunc(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc(s[0].y, s[1].z, s[2].w, s[3].x); + Gfunc(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc(s[0].w, s[1].x, s[2].y, s[3].z); +} + +static __device__ __forceinline__ +void reduceDuplex(uint2 state[4], uint32_t thread, const uint32_t threads) +{ + uint2 state1[3]; + +#if __CUDA_ARCH__ > 500 +#pragma unroll +#endif + for (int i = 0; i < Nrow; i++) + { + ST4S(0, Ncol - i - 1, state, 
thread, threads); + + round_lyra(state); + } + + #pragma unroll 4 + for (int i = 0; i < Nrow; i++) + { + LD4S(state1, 0, i, thread, threads); + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + + round_lyra(state); + + for (int j = 0; j < 3; j++) + state1[j] ^= state[j]; + ST4S(1, Ncol - i - 1, state1, thread, threads); + } +} + +static __device__ __forceinline__ +void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], uint32_t thread, const uint32_t threads) +{ + uint2 state1[3], state2[3]; + + #pragma unroll 1 + for (int i = 0; i < Nrow; i++) + { + LD4S(state1, rowIn, i, thread, threads); + LD4S(state2, rowInOut, i, thread, threads); + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] ^= state[j]; + + ST4S(rowOut, Ncol - i - 1, state1, thread, threads); + + //Receive data from the previous thread (while sending data to the next thread) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + ST4S(rowInOut, i, state2, thread, threads); + } +} + +static __device__ __forceinline__ +void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + for (int i = 0; i < Nrow; i++) + { + uint2 state1[3], state2[3]; + + LD4S(state1, rowIn, i, thread, threads); + LD4S(state2, rowInOut, i, thread, threads); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + //Receive data from the previous thread (while sending data to the next thread) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, 
threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + ST4S(rowInOut, i, state2, thread, threads); + + LD4S(state1, rowOut, i, thread, threads); + + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] ^= state[j]; + + ST4S(rowOut, i, state1, thread, threads); + } +} + +#if 0 +static __device__ __forceinline__ +void reduceDuplexRowt_8(const int rowInOut, uint2* state, const uint32_t thread, const uint32_t threads) +{ + uint2 state1[3], state2[3], last[3]; + + LD4S(state1, 2, 0, thread, threads); + LD4S(last, rowInOut, 0, thread, threads); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + last[j]; + + round_lyra(state); + + //Receive data from the previous thread (while sending data to the next thread) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } else { + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; + } + + if (rowInOut == 5) + { + #pragma unroll + for (int j = 0; j < 3; j++) + last[j] ^= state[j]; + } + + for (int i = 1; i < Nrow; i++) + { + LD4S(state1, 2, i, thread, threads); + LD4S(state2, rowInOut, i, thread, threads); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + } + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= last[j]; +} +#endif + +static __device__ __forceinline__ +void reduceDuplexRowt_8_v2(const int rowIn, const int rowOut, const int rowInOut, uint2* state, const uint32_t thread, const uint32_t threads) +{ + uint2 state1[3], state2[3], last[3]; + + LD4S(state1, rowIn, 0, thread, threads); + LD4S(last, rowInOut, 0, thread, threads); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + 
last[j]; + + round_lyra(state); + + //Receive data from the previous thread (while sending data to the next thread) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } + else { + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; + } + + if (rowInOut == rowOut) { + #pragma unroll + for (int j = 0; j < 3; j++) + last[j] ^= state[j]; + } + + for (int i = 1; i < Nrow; i++) + { + LD4S(state1, rowIn, i, thread, threads); + LD4S(state2, rowInOut, i, thread, threads); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + } + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= last[j]; +} + +__global__ +__launch_bounds__(64, 1) +void lyra2Z_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint2x4 Mask[2] = { + 0x00000020UL, 0x00000000UL, 0x00000020UL, 0x00000000UL, + 0x00000020UL, 0x00000000UL, 0x00000008UL, 0x00000000UL, + 0x00000008UL, 0x00000000UL, 0x00000008UL, 0x00000000UL, + 0x00000080UL, 0x00000000UL, 0x00000000UL, 0x01000000UL + }; + const uint2x4 blake2b_IV[2] = { + 0xf3bcc908lu, 0x6a09e667lu, + 0x84caa73blu, 0xbb67ae85lu, + 0xfe94f82blu, 0x3c6ef372lu, + 0x5f1d36f1lu, 0xa54ff53alu, + 0xade682d1lu, 0x510e527flu, + 0x2b3e6c1flu, 0x9b05688clu, + 0xfb41bd6blu, 0x1f83d9ablu, + 0x137e2179lu, 0x5be0cd19lu + }; + if (thread < threads) + { + uint2x4 state[4]; + + state[0].x = state[1].x = __ldg(&g_hash[thread + threads * 0]); + state[0].y = state[1].y = __ldg(&g_hash[thread + threads * 1]); + state[0].z = state[1].z = __ldg(&g_hash[thread + threads * 2]); + state[0].w = state[1].w = __ldg(&g_hash[thread + threads * 3]); + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + + for (int i = 0; i<12; i++) + round_lyra(state); + + 
state[0] ^= Mask[0]; + state[1] ^= Mask[1]; + + for (int i = 0; i<12; i++) + round_lyra(state); //because 12 is not enough + + ((uint2x4*)DMatrix)[threads * 0 + thread] = state[0]; + ((uint2x4*)DMatrix)[threads * 1 + thread] = state[1]; + ((uint2x4*)DMatrix)[threads * 2 + thread] = state[2]; + ((uint2x4*)DMatrix)[threads * 3 + thread] = state[3]; + } +} + +__global__ +__launch_bounds__(TPB52, 1) +void lyra2Z_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +{ + const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; + + if (thread < threads) + { + uint2 state[4]; + state[0] = __ldg(&DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x]); + state[1] = __ldg(&DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x]); + state[2] = __ldg(&DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x]); + state[3] = __ldg(&DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x]); + + reduceDuplex(state, thread, threads); + reduceDuplexRowSetup(1, 0, 2, state, thread, threads); + reduceDuplexRowSetup(2, 1, 3, state, thread, threads); + reduceDuplexRowSetup(3, 0, 4, state, thread, threads); + reduceDuplexRowSetup(4, 3, 5, state, thread, threads); + reduceDuplexRowSetup(5, 2, 6, state, thread, threads); + reduceDuplexRowSetup(6, 1, 7, state, thread, threads); + + uint32_t rowa; // = WarpShuffle(state[0].x, 0, 4) & 7; + uint32_t prev = 7; + uint32_t iterator = 0; + + //for (uint32_t j=0;j<4;j++) { + + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator + 3) & 7; + } + + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator - 1) & 7; + } + + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(prev, rowa, iterator, state, thread, 
threads); + prev = iterator; + iterator = (iterator + 3) & 7; + } + + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator - 1) & 7; + } + + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator + 3) & 7; + } + + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator - 1) & 7; + } + + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator + 3) & 7; + } + + for (uint32_t i = 0; i<7; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator - 1) & 7; + } + + //} + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt_8_v2(prev,iterator,rowa, state, thread, threads); + + DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x] = state[0]; + DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x] = state[1]; + DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x] = state[2]; + DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x] = state[3]; + } +} + +__global__ +__launch_bounds__(64, 1) +void lyra2Z_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash, uint32_t *resNonces) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + + uint28 state[4]; + + if (thread < threads) + { + state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]); + state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]); + state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]); + state[3] = 
__ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]); + + for (int i = 0; i < 12; i++) + round_lyra(state); + + uint32_t nonce = startNounce + thread; + if (((uint64_t*)state)[3] <= ((uint64_t*)pTarget)[3]) { + atomicMin(&resNonces[1], resNonces[0]); + atomicMin(&resNonces[0], nonce); + } +/* + g_hash[thread + threads * 0] = state[0].x; + g_hash[thread + threads * 1] = state[0].y; + g_hash[thread + threads * 2] = state[0].z; + g_hash[thread + threads * 3] = state[0].w; +*/ + } +} +#else +#if __CUDA_ARCH__ < 350 +__device__ void* DMatrix; +#endif +__global__ void lyra2Z_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +__global__ void lyra2Z_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {} +__global__ void lyra2Z_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash, uint32_t *resNonces) {} +#endif + +__host__ +void lyra2Z_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix) +{ + // just assign the device pointer allocated in main loop + cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice); + cudaMalloc(&d_GNonces[thr_id], 2 * sizeof(uint32_t)); + cudaMallocHost(&h_GNonces[thr_id], 2 * sizeof(uint32_t)); +} + +__host__ +void lyra2Z_cpu_init_sm2(int thr_id, uint32_t threads) +{ + // just assign the device pointer allocated in main loop + cudaMalloc(&d_GNonces[thr_id], 2 * sizeof(uint32_t)); + cudaMallocHost(&h_GNonces[thr_id], 2 * sizeof(uint32_t)); +} + +__host__ +uint32_t lyra2Z_getSecNonce(int thr_id, int num) +{ + uint32_t results[2]; + memset(results, 0xFF, sizeof(results)); + cudaMemcpy(results, d_GNonces[thr_id], sizeof(results), cudaMemcpyDeviceToHost); + if (results[1] == results[0]) + return UINT32_MAX; + return results[num]; +} + +__host__ +void lyra2Z_setTarget(const void *pTargetIn) +{ + cudaMemcpyToSymbol(pTarget, pTargetIn, 32, 0, cudaMemcpyHostToDevice); +} + +__host__ +uint32_t lyra2Z_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, 
uint64_t *d_hash, bool gtx750ti) +{ + uint32_t result = UINT32_MAX; + cudaMemset(d_GNonces[thr_id], 0xff, 2 * sizeof(uint32_t)); + int dev_id = device_map[thr_id % MAX_GPUS]; + + uint32_t tpb = TPB52; + + if (device_sm[dev_id] == 500) + tpb = TPB50; + if (device_sm[dev_id] == 200) + tpb = TPB20; + + dim3 grid1((threads * 4 + tpb - 1) / tpb); + dim3 block1(4, tpb >> 2); + + dim3 grid2((threads + 64 - 1) / 64); + dim3 block2(64); + + dim3 grid3((threads + tpb - 1) / tpb); + dim3 block3(tpb); + + if (device_sm[dev_id] >= 520) + { + lyra2Z_gpu_hash_32_1 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + + lyra2Z_gpu_hash_32_2 <<< grid1, block1, 24 * (8 - 0) * sizeof(uint2) * tpb >>> (threads, startNounce, d_hash); + + lyra2Z_gpu_hash_32_3 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash, d_GNonces[thr_id]); + } + else if (device_sm[dev_id] == 500 || device_sm[dev_id] == 350) + { + size_t shared_mem = 0; + + if (gtx750ti) + // 8Warpに調整のため、8192バイト確保する + shared_mem = 8192; + else + // 10Warpに調整のため、6144バイト確保する + shared_mem = 6144; + + lyra2Z_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + + lyra2Z_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, startNounce, (uint2*)d_hash); + + lyra2Z_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash, d_GNonces[thr_id]); + } + else + lyra2Z_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, startNounce, d_hash, d_GNonces[thr_id]); + + // get first found nonce + cudaMemcpy(h_GNonces[thr_id], d_GNonces[thr_id], 1 * sizeof(uint32_t), cudaMemcpyDeviceToHost); + result = *h_GNonces[thr_id]; + + return result; +} diff --git a/lyra2/cuda_lyra2Z_sm5.cuh b/lyra2/cuda_lyra2Z_sm5.cuh new file mode 100644 index 0000000..2a3bd5d --- /dev/null +++ b/lyra2/cuda_lyra2Z_sm5.cuh @@ -0,0 +1,819 @@ +#include + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +//#define __CUDA_ARCH__ 500 +#define __threadfence_block() +#define __ldg(x) *(x) +#define 
atomicMin(p,y) y +#endif + +#include "cuda_helper.h" + +#define TPB50 32 + +__constant__ uint32_t pTarget[8]; + +static __device__ __forceinline__ +void Gfunc(uint2 & a, uint2 &b, uint2 &c, uint2 &d) +{ +#if __CUDA_ARCH__ > 500 + a += b; uint2 tmp = d; d.y = a.x ^ tmp.x; d.x = a.y ^ tmp.y; + c += d; b ^= c; b = ROR24(b); + a += b; d ^= a; d = ROR16(d); + c += d; b ^= c; b = ROR2(b, 63); +#else + a += b; d ^= a; d = SWAPUINT2(d); + c += d; b ^= c; b = ROR2(b, 24); + a += b; d ^= a; d = ROR2(d, 16); + c += d; b ^= c; b = ROR2(b, 63); +#endif +} + +#if __CUDA_ARCH__ == 500 || __CUDA_ARCH__ == 350 +#include "cuda_lyra2_vectors.h" + +#define Nrow 8 +#define Ncol 8 +#define memshift 3 + +__device__ uint2 *DMatrix; + +__device__ __forceinline__ uint2 LD4S(const int index) +{ + extern __shared__ uint2 shared_mem[]; + + return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; +} + +__device__ __forceinline__ void ST4S(const int index, const uint2 data) +{ + extern __shared__ uint2 shared_mem[]; + + shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data; +} + +#if __CUDA_ARCH__ == 300 +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + return __shfl(a, b, c); +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c)); +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + a1 = WarpShuffle(a1, b1, c); + a2 = WarpShuffle(a2, b2, c); + a3 = WarpShuffle(a3, b3, c); +} +#else // != 300 + +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + uint32_t *_ptr = (uint32_t*)shared_mem; + + __threadfence_block(); + uint32_t buf = _ptr[thread]; + + _ptr[thread] = a; + __threadfence_block(); 
+ uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + _ptr[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a; + __threadfence_block(); + uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a1; + __threadfence_block(); + a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a2; + __threadfence_block(); + a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a3; + __threadfence_block(); + a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + __threadfence_block(); +} + +#endif // != 300 + +__device__ __forceinline__ void round_lyra(uint2 s[4]) +{ + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4); + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4); +} + +static __device__ __forceinline__ +void round_lyra(uint2x4* s) +{ + Gfunc(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc(s[0].z, s[1].z, s[2].z, s[3].z); + Gfunc(s[0].w, s[1].w, s[2].w, s[3].w); + Gfunc(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc(s[0].y, 
s[1].z, s[2].w, s[3].x); + Gfunc(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc(s[0].w, s[1].x, s[2].y, s[3].z); +} + +static __device__ __forceinline__ +void reduceDuplexV5(uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + uint2 state1[3], state2[3]; + + const uint32_t ps0 = (memshift * Ncol * 0 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps1 = (memshift * Ncol * 1 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps2 = (memshift * Ncol * 2 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps3 = (memshift * Ncol * 3 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps4 = (memshift * Ncol * 4 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps5 = (memshift * Ncol * 5 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps6 = (memshift * Ncol * 6 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps7 = (memshift * Ncol * 7 * threads + thread)*blockDim.x + threadIdx.x; + + for (int i = 0; i < 8; i++) + { + const uint32_t s0 = memshift * Ncol * 0 + (Ncol - 1 - i) * memshift; + #pragma unroll + for (int j = 0; j < 3; j++) + ST4S(s0 + j, state[j]); + round_lyra(state); + } + + for (int i = 0; i < 8; i++) + { + const uint32_t s0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s1 = ps1 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = LD4S(s0 + j); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s1 + j*threads*blockDim.x) = state1[j] ^ state[j]; + } + + // 1, 0, 2 + for (int i = 0; i < 8; i++) + { + const uint32_t s0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x; + const uint32_t s2 = ps2 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s1 + j*threads*blockDim.x); + #pragma unroll + for 
(int j = 0; j < 3; j++) + state2[j] = LD4S(s0 + j); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s2 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + //一個手前のスレッドからデータを貰う(同時に一個先のスレッドにデータを送る) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + ST4S(s0 + j, state2[j]); + } + + // 2, 1, 3 + for (int i = 0; i < 8; i++) + { + const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x; + const uint32_t s2 = ps2 + i * memshift* threads*blockDim.x; + const uint32_t s3 = ps3 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s2 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = *(DMatrix + s1 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s3 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + //一個手前のスレッドからデータを貰う(同時に一個先のスレッドにデータを送る) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s1 + j*threads*blockDim.x) = state2[j]; + } + + // 3, 0, 4 + for (int i = 0; i < 8; i++) + { + const uint32_t ls0 = memshift * Ncol * 0 + 
i * memshift; + const uint32_t s0 = ps0 + i * memshift* threads*blockDim.x; + const uint32_t s3 = ps3 + i * memshift* threads*blockDim.x; + const uint32_t s4 = ps4 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s3 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = LD4S(ls0 + j); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s4 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + //一個手前のスレッドからデータを貰う(同時に一個先のスレッドにデータを送る) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s0 + j*threads*blockDim.x) = state2[j]; + } + + // 4, 3, 5 + for (int i = 0; i < 8; i++) + { + const uint32_t s3 = ps3 + i * memshift* threads*blockDim.x; + const uint32_t s4 = ps4 + i * memshift* threads*blockDim.x; + const uint32_t s5 = ps5 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s4 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = *(DMatrix + s3 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s5 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + //一個手前のスレッドからデータを貰う(同時に一個先のスレッドにデータを送る) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) 
+ { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s3 + j*threads*blockDim.x) = state2[j]; + } + + // 5, 2, 6 + for (int i = 0; i < 8; i++) + { + const uint32_t s2 = ps2 + i * memshift* threads*blockDim.x; + const uint32_t s5 = ps5 + i * memshift* threads*blockDim.x; + const uint32_t s6 = ps6 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s5 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = *(DMatrix + s2 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s6 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + //一個手前のスレッドからデータを貰う(同時に一個先のスレッドにデータを送る) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s2 + j*threads*blockDim.x) = state2[j]; + } + + // 6, 1, 7 + for (int i = 0; i < 8; i++) + { + const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x; + const uint32_t s6 = ps6 + i * memshift* threads*blockDim.x; + const uint32_t s7 = ps7 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s6 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = *(DMatrix + s1 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for 
(int j = 0; j < 3; j++) + *(DMatrix + s7 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + //一個手前のスレッドからデータを貰う(同時に一個先のスレッドにデータを送る) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s1 + j*threads*blockDim.x) = state2[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexRowV50(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + const uint32_t ps1 = (memshift * Ncol * rowIn*threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps2 = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps3 = (memshift * Ncol * rowOut*threads + thread)*blockDim.x + threadIdx.x; + + #pragma unroll 1 + for (int i = 0; i < 8; i++) + { + uint2 state1[3], state2[3]; + + const uint32_t s1 = ps1 + i*memshift*threads *blockDim.x; + const uint32_t s2 = ps2 + i*memshift*threads *blockDim.x; + const uint32_t s3 = ps3 + i*memshift*threads *blockDim.x; + + #pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] = *(DMatrix + s1 + j*threads*blockDim.x); + state2[j] = *(DMatrix + s2 + j*threads*blockDim.x); + } + + #pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] += state2[j]; + state[j] ^= state1[j]; + } + + round_lyra(state); + + //一個手前のスレッドからデータを貰う(同時に一個先のスレッドにデータを送る) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + 
} + + #pragma unroll + for (int j = 0; j < 3; j++) + { + *(DMatrix + s2 + j*threads*blockDim.x) = state2[j]; + *(DMatrix + s3 + j*threads*blockDim.x) ^= state[j]; + } + } +} + +static __device__ __forceinline__ +void reduceDuplexRowV50_8(const int rowInOut, uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + const uint32_t ps1 = (memshift * Ncol * 2*threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps2 = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x; + // const uint32_t ps3 = (memshift * Ncol * 5*threads + thread)*blockDim.x + threadIdx.x; + + uint2 state1[3], last[3]; + + #pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] = *(DMatrix + ps1 + j*threads*blockDim.x); + last[j] = *(DMatrix + ps2 + j*threads*blockDim.x); + } + + #pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] += last[j]; + state[j] ^= state1[j]; + } + + round_lyra(state); + + //一個手前のスレッドからデータを貰う(同時に一個先のスレッドにデータを送る) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } else { + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; + } + + if (rowInOut == 5) + { + #pragma unroll + for (int j = 0; j < 3; j++) + last[j] ^= state[j]; + } + + for (int i = 1; i < 8; i++) + { + const uint32_t s1 = ps1 + i*memshift*threads *blockDim.x; + const uint32_t s2 = ps2 + i*memshift*threads *blockDim.x; + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= *(DMatrix + s1 + j*threads*blockDim.x) + *(DMatrix + s2 + j*threads*blockDim.x); + + round_lyra(state); + } + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= last[j]; +} + +static __device__ __forceinline__ +void reduceDuplexRowV50_8_v2(const int rowIn, const int rowOut,const int rowInOut, uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + const 
uint32_t ps1 = (memshift * Ncol * rowIn * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps2 = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x; + // const uint32_t ps3 = (memshift * Ncol * 5*threads + thread)*blockDim.x + threadIdx.x; + + uint2 state1[3], last[3]; + + #pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] = *(DMatrix + ps1 + j*threads*blockDim.x); + last[j] = *(DMatrix + ps2 + j*threads*blockDim.x); + } + + #pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] += last[j]; + state[j] ^= state1[j]; + } + + round_lyra(state); + + //一個手前のスレッドからデータを貰う(同時に一個先のスレッドにデータを送る) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } + else { + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; + } + + if (rowInOut == rowOut) + { +#pragma unroll + for (int j = 0; j < 3; j++) + last[j] ^= state[j]; + } + + for (int i = 1; i < 8; i++) + { + const uint32_t s1 = ps1 + i*memshift*threads *blockDim.x; + const uint32_t s2 = ps2 + i*memshift*threads *blockDim.x; + +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= *(DMatrix + s1 + j*threads*blockDim.x) + *(DMatrix + s2 + j*threads*blockDim.x); + + round_lyra(state); + } + + +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= last[j]; + +} + + +__global__ __launch_bounds__(64, 1) +void lyra2Z_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + const uint2x4 blake2b_IV[2] = { + { { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } }, + { { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } } + }; + const uint2x4 Mask[2] = { + 0x00000020UL, 
0x00000000UL, 0x00000020UL, 0x00000000UL, + 0x00000020UL, 0x00000000UL, 0x00000008UL, 0x00000000UL, + 0x00000008UL, 0x00000000UL, 0x00000008UL, 0x00000000UL, + 0x00000080UL, 0x00000000UL, 0x00000000UL, 0x01000000UL + }; + if (thread < threads) + { + uint2x4 state[4]; + + ((uint2*)state)[0] = __ldg(&g_hash[thread]); + ((uint2*)state)[1] = __ldg(&g_hash[thread + threads]); + ((uint2*)state)[2] = __ldg(&g_hash[thread + threads * 2]); + ((uint2*)state)[3] = __ldg(&g_hash[thread + threads * 3]); + + state[1] = state[0]; + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + + for (int i = 0; i < 12; i++) + round_lyra(state); //because 12 is not enough + + state[0] ^= Mask[0]; + state[1] ^= Mask[1]; + + for (int i = 0; i < 12; i++) + round_lyra(state); //because 12 is not enough + + + ((uint2x4*)DMatrix)[0 * threads + thread] = state[0]; + ((uint2x4*)DMatrix)[1 * threads + thread] = state[1]; + ((uint2x4*)DMatrix)[2 * threads + thread] = state[2]; + ((uint2x4*)DMatrix)[3 * threads + thread] = state[3]; + } +} + +__global__ __launch_bounds__(TPB50, 1) +void lyra2Z_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +{ + const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y); + + if (thread < threads) + { + uint2 state[4]; + + state[0] = __ldg(&DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x]); + state[1] = __ldg(&DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x]); + state[2] = __ldg(&DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x]); + state[3] = __ldg(&DMatrix[(3 * threads + thread)*blockDim.x + threadIdx.x]); + + reduceDuplexV5(state, thread, threads); + + uint32_t rowa; // = WarpShuffle(state[0].x, 0, 4) & 7; + uint32_t prev = 7; + uint32_t iterator = 0; + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator + 3) & 7; + } + for (uint32_t i = 0; i<8; i++) { + rowa = 
WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator - 1) & 7; + } + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator + 3) & 7; + } + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator - 1) & 7; + } + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator + 3) & 7; + } + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator - 1) & 7; + } + for (uint32_t i = 0; i<8; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator + 3) & 7; + } + for (uint32_t i = 0; i<7; i++) { + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(prev, rowa, iterator, state, thread, threads); + prev = iterator; + iterator = (iterator - 1) & 7; + } + + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50_8_v2(prev,iterator,rowa, state, thread, threads); + + DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x] = state[0]; + DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x] = state[1]; + DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x] = state[2]; + DMatrix[(3 * threads + thread)*blockDim.x + threadIdx.x] = state[3]; + } +} + +__global__ __launch_bounds__(64, 1) +void lyra2Z_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash, uint32_t *resNonces) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + 
threadIdx.x); + + if (thread < threads) + { + uint2x4 state[4]; + + state[0] = __ldg4(&((uint2x4*)DMatrix)[0 * threads + thread]); + state[1] = __ldg4(&((uint2x4*)DMatrix)[1 * threads + thread]); + state[2] = __ldg4(&((uint2x4*)DMatrix)[2 * threads + thread]); + state[3] = __ldg4(&((uint2x4*)DMatrix)[3 * threads + thread]); + + for (int i = 0; i < 12; i++) + round_lyra(state); + + uint32_t nonce = startNounce + thread; + if (((uint64_t*)state)[3] <= ((uint64_t*)pTarget)[3]) { + atomicMin(&resNonces[1], resNonces[0]); + atomicMin(&resNonces[0], nonce); + } + } +} + +#else +/* if __CUDA_ARCH__ != 500 .. host */ +__global__ void lyra2Z_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +__global__ void lyra2Z_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +__global__ void lyra2Z_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash, uint32_t *resNonces) {} +#endif diff --git a/lyra2/cuda_lyra2_vectors.h b/lyra2/cuda_lyra2_vectors.h index d69efa2..63e3fcb 100644 --- a/lyra2/cuda_lyra2_vectors.h +++ b/lyra2/cuda_lyra2_vectors.h @@ -36,11 +36,11 @@ typedef struct __align__(128) ulonglong8to16 { ulonglong2to8 lo, hi; } ulonglong8to16; -typedef struct __align__(256) ulonglong16to32 { +typedef struct __align__(128) ulonglong16to32{ ulonglong8to16 lo, hi; } ulonglong16to32; -typedef struct __align__(512) ulonglong32to64 { +typedef struct __align__(128) ulonglong32to64{ ulonglong16to32 lo, hi; } ulonglong32to64; @@ -79,7 +79,7 @@ struct __align__(128) ulong8 { }; typedef __device_builtin__ struct ulong8 ulong8; -typedef struct __align__(256) ulonglong16 { +typedef struct __align__(128) ulonglong16{ ulonglong4 s0, s1, s2, s3, s4, s5, s6, s7; } ulonglong16; @@ -92,7 +92,7 @@ typedef struct __builtin_align__(32) uint48 { uint4 s0,s1; } uint48; -typedef struct __align__(256) uint4x16 { +typedef struct __builtin_align__(128) uint4x16{ uint4 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; 
} uint4x16;

diff --git a/lyra2/lyra2Z.cu b/lyra2/lyra2Z.cu
new file mode 100644
index 0000000..42799af
--- /dev/null
+++ b/lyra2/lyra2Z.cu
@@ -0,0 +1,176 @@
+extern "C" {
+#include <sph/sph_blake.h>
+#include "Lyra2Z.h"
+}
+
+#include <miner.h>
+#include <cuda_helper.h>
+
+// device buffers, one per mining thread (allocated in scanhash_lyra2Z)
+static uint64_t* d_hash[MAX_GPUS];
+static uint64_t* d_matrix[MAX_GPUS];
+
+extern void blake256_cpu_init(int thr_id, uint32_t threads);
+extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
+extern void blake256_cpu_setBlock_80(uint32_t *pdata);
+
+extern void lyra2Z_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix);
+extern void lyra2Z_cpu_init_sm2(int thr_id, uint32_t threads);
+extern uint32_t lyra2Z_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti);
+
+extern void lyra2Z_setTarget(const void *ptarget);
+extern uint32_t lyra2Z_getSecNonce(int thr_id, int num);
+
+// CPU reference hash, used to re-check nonces reported by the GPU:
+// 14-round blake256 of the 80-byte input, then LYRA2Z with that digest
+// passed as both 32-byte inputs. Writes 32 bytes to state.
+extern "C" void lyra2Z_hash(void *state, const void *input)
+{
+	uint32_t _ALIGN(64) hashA[8], hashB[8];
+	sph_blake256_context ctx_blake;
+
+	sph_blake256_set_rounds(14);
+	sph_blake256_init(&ctx_blake);
+	sph_blake256(&ctx_blake, input, 80);
+	sph_blake256_close(&ctx_blake, hashA);
+
+	LYRA2Z(hashB, 32, hashA, 32, hashA, 32, 8, 8, 8);
+
+	memcpy(state, hashB, 32);
+}
+
+static bool init[MAX_GPUS] = { 0 };
+static __thread uint32_t throughput = 0;
+static __thread bool gtx750ti = false;
+
+// Scan nonces from work->data[19] up to max_nonce for a hash meeting
+// work->target. Returns the number of valid nonces stored in work->nonces
+// (1 or 2), or 0 when the range is exhausted or a work restart is requested.
+// *hashes_done receives the number of nonces covered; pdata[19] is left at
+// the position where the next scan should resume.
+extern "C" int scanhash_lyra2Z(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+{
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	uint32_t _ALIGN(64) endiandata[20];
+	const uint32_t first_nonce = pdata[19];
+	int dev_id = device_map[thr_id];
+
+	if (opt_benchmark)
+		ptarget[7] = 0x00ff;
+
+	// one-time device setup and buffer allocation for this thread
+	if (!init[thr_id])
+	{
+		cudaSetDevice(dev_id);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
+
+		int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 17 : 16;
+		if (device_sm[dev_id] <= 500) intensity = 15;
+		throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4;
+
+		cudaDeviceProp props;
+		cudaGetDeviceProperties(&props, dev_id);
+		gtx750ti = (strstr(props.name, "750 Ti") != NULL);
+
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
+
+		blake256_cpu_init(thr_id, throughput);
+
+		if (device_sm[dev_id] >= 350)
+		{
+			size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4;
+			CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput));
+			lyra2Z_cpu_init(thr_id, throughput, d_matrix[thr_id]);
+		}
+		else
+			lyra2Z_cpu_init_sm2(thr_id, throughput);
+
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));
+
+		init[thr_id] = true;
+	}
+
+	for (int k=0; k < 20; k++)
+		be32enc(&endiandata[k], pdata[k]);
+
+	blake256_cpu_setBlock_80(pdata);
+	lyra2Z_setTarget(ptarget);
+
+	do {
+		int order = 0;
+
+		blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+
+		*hashes_done = pdata[19] - first_nonce + throughput;
+
+		work->nonces[0] = lyra2Z_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti);
+
+		if (work->nonces[0] != UINT32_MAX)
+		{
+			uint32_t _ALIGN(64) vhash[8];
+
+			// re-hash the candidate on the CPU before reporting it
+			be32enc(&endiandata[19], work->nonces[0]);
+			lyra2Z_hash(vhash, endiandata);
+
+			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work->nonces[1] = lyra2Z_getSecNonce(thr_id, 1);
+				work_set_target_ratio(work, vhash);
+				pdata[19] = work->nonces[0] + 1;
+				if (work->nonces[1] != UINT32_MAX)
+				{
+					be32enc(&endiandata[19], work->nonces[1]);
+					lyra2Z_hash(vhash, endiandata);
+					if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
+						bn_set_target_ratio(work, vhash, 1);
+						work->valid_nonces++;
+					}
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; // cursor
+				}
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > ptarget[7]) {
+				gpu_increment_reject(thr_id);
+				if (!opt_quiet) gpulog(LOG_WARNING, thr_id,
+					"result for %08x does not validate on CPU!", work->nonces[0]);
+				// resume after the rejected nonce; restarting on it would
+				// rescan it and can report the same nonce again each pass
+				pdata[19] = work->nonces[0] + 1;
+				continue;
+			}
+		}
+
+		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+		pdata[19] += throughput;
+
+	} while (!work_restart[thr_id].restart);
+
+	*hashes_done = pdata[19] - first_nonce;
+	return 0;
+}
+
+// cleanup: release the device buffers allocated by scanhash_lyra2Z
+extern "C" void free_lyra2Z(int thr_id)
+{
+	int dev_id = device_map[thr_id];
+	if (!init[thr_id])
+		return;
+
+	cudaDeviceSynchronize();
+
+	cudaFree(d_hash[thr_id]);
+	if (device_sm[dev_id] >= 350)
+		cudaFree(d_matrix[thr_id]);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
diff --git a/miner.h b/miner.h
index 4902771..53d2f70 100644
--- a/miner.h
+++ b/miner.h
@@ -292,6 +292,7 @@ extern int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsi
 extern int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_lyra2v2(int thr_id,struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
+extern int scanhash_lyra2Z(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_myriad(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_neoscrypt(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
@@ -344,6 +345,7 @@ extern void free_lbry(int thr_id);
 extern void free_luffa(int thr_id);
 extern void free_lyra2(int thr_id);
 extern void free_lyra2v2(int thr_id);
+extern void free_lyra2Z(int
thr_id); extern void free_myriad(int thr_id); extern void free_neoscrypt(int thr_id); extern void free_nist5(int thr_id); @@ -863,6 +865,7 @@ void groestlhash(void *state, const void *input); void lbry_hash(void *output, const void *input); void lyra2re_hash(void *state, const void *input); void lyra2v2_hash(void *state, const void *input); +void lyra2Z_hash(void *state, const void *input); void myriadhash(void *state, const void *input); void neoscrypt(uchar *output, const uchar *input, uint32_t profile); void nist5hash(void *state, const void *input); diff --git a/util.cpp b/util.cpp index 825f0c8..cb26559 100644 --- a/util.cpp +++ b/util.cpp @@ -2201,6 +2201,9 @@ void print_hash_tests(void) lyra2v2_hash(&hash[0], &buf[0]); printpfx("lyra2v2", hash); + lyra2Z_hash(&hash[0], &buf[0]); + printpfx("lyra2z", hash); + myriadhash(&hash[0], &buf[0]); printpfx("myriad", hash);