Tanguy Pruvot
8 years ago
14 changed files with 2252 additions and 13 deletions
@ -0,0 +1,215 @@
@@ -0,0 +1,215 @@
|
||||
/**
|
||||
* Implementation of the Lyra2 Password Hashing Scheme (PHS). |
||||
* |
||||
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
|
||||
* |
||||
* This software is hereby placed in the public domain. |
||||
* |
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS |
||||
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE |
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
||||
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
||||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE |
||||
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, |
||||
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
||||
*/ |
||||
#include <stdio.h> |
||||
#include <stdlib.h> |
||||
#include <string.h> |
||||
#include <time.h> |
||||
|
||||
#include "Lyra2Z.h" |
||||
#include "Sponge.h" |
||||
|
||||
/**
|
||||
* Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords |
||||
* whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits, |
||||
* where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all |
||||
* integer parameters (treated as type "unsigned int") in the order they are provided, plus the value |
||||
* of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols). |
||||
* |
||||
* @param K The derived key to be output by the algorithm |
||||
* @param kLen Desired key length |
||||
* @param pwd User password |
||||
* @param pwdlen Password length |
||||
* @param salt Salt |
||||
* @param saltlen Salt length |
||||
* @param timeCost Parameter to determine the processing time (T) |
||||
* @param nRows Number or rows of the memory matrix (R) |
||||
* @param nCols Number of columns of the memory matrix (C) |
||||
* |
||||
* @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation) |
||||
*/ |
||||
int LYRA2Z(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols) |
||||
{ |
||||
//============================= Basic variables ============================//
|
||||
int64_t row = 2; //index of row to be processed
|
||||
int64_t prev = 1; //index of prev (last row ever computed/modified)
|
||||
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
|
||||
int64_t tau; //Time Loop iterator
|
||||
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
|
||||
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
|
||||
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
|
||||
int64_t i; //auxiliary iteration counter
|
||||
int64_t v64; // 64bit var for memcpy
|
||||
//==========================================================================/
|
||||
|
||||
//========== Initializing the Memory Matrix and pointers to it =============//
|
||||
//Tries to allocate enough space for the whole memory matrix
|
||||
|
||||
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; |
||||
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; |
||||
// for Lyra2REv2, nCols = 4, v1 was using 8
|
||||
const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64; |
||||
|
||||
size_t sz = (size_t)ROW_LEN_BYTES * nRows; |
||||
uint64_t *wholeMatrix = malloc(sz); |
||||
if (wholeMatrix == NULL) { |
||||
return -1; |
||||
} |
||||
memset(wholeMatrix, 0, sz); |
||||
|
||||
//Allocates pointers to each row of the matrix
|
||||
uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows); |
||||
if (memMatrix == NULL) { |
||||
return -1; |
||||
} |
||||
//Places the pointers in the correct positions
|
||||
uint64_t *ptrWord = wholeMatrix; |
||||
for (i = 0; i < nRows; i++) { |
||||
memMatrix[i] = ptrWord; |
||||
ptrWord += ROW_LEN_INT64; |
||||
} |
||||
//==========================================================================/
|
||||
|
||||
//============= Getting the password + salt + basil padded with 10*1 ===============//
|
||||
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
|
||||
//but this ensures that the password copied locally will be overwritten as soon as possible
|
||||
|
||||
//First, we clean enough blocks for the password, salt, basil and padding
|
||||
int64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof(uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1; |
||||
|
||||
byte *ptrByte = (byte*) wholeMatrix; |
||||
|
||||
//Prepends the password
|
||||
memcpy(ptrByte, pwd, pwdlen); |
||||
ptrByte += pwdlen; |
||||
|
||||
//Concatenates the salt
|
||||
memcpy(ptrByte, salt, saltlen); |
||||
ptrByte += saltlen; |
||||
|
||||
memset(ptrByte, 0, (size_t) (nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - (saltlen + pwdlen))); |
||||
|
||||
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
|
||||
memcpy(ptrByte, &kLen, sizeof(int64_t)); |
||||
ptrByte += sizeof(uint64_t); |
||||
v64 = pwdlen; |
||||
memcpy(ptrByte, &v64, sizeof(int64_t)); |
||||
ptrByte += sizeof(uint64_t); |
||||
v64 = saltlen; |
||||
memcpy(ptrByte, &v64, sizeof(int64_t)); |
||||
ptrByte += sizeof(uint64_t); |
||||
v64 = timeCost; |
||||
memcpy(ptrByte, &v64, sizeof(int64_t)); |
||||
ptrByte += sizeof(uint64_t); |
||||
v64 = nRows; |
||||
memcpy(ptrByte, &v64, sizeof(int64_t)); |
||||
ptrByte += sizeof(uint64_t); |
||||
v64 = nCols; |
||||
memcpy(ptrByte, &v64, sizeof(int64_t)); |
||||
ptrByte += sizeof(uint64_t); |
||||
|
||||
//Now comes the padding
|
||||
*ptrByte = 0x80; //first byte of padding: right after the password
|
||||
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
|
||||
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
|
||||
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
|
||||
//==========================================================================/
|
||||
|
||||
//======================= Initializing the Sponge State ====================//
|
||||
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
|
||||
uint64_t state[16]; |
||||
initState(state); |
||||
//==========================================================================/
|
||||
|
||||
//================================ Setup Phase =============================//
|
||||
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
|
||||
ptrWord = wholeMatrix; |
||||
for (i = 0; i < nBlocksInput; i++) { |
||||
absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
|
||||
ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
|
||||
} |
||||
|
||||
//Initializes M[0] and M[1]
|
||||
reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here
|
||||
|
||||
reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols); |
||||
|
||||
do { |
||||
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
|
||||
|
||||
reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); |
||||
|
||||
//updates the value of row* (deterministically picked during Setup))
|
||||
rowa = (rowa + step) & (window - 1); |
||||
//update prev: it now points to the last row ever computed
|
||||
prev = row; |
||||
//updates row: goes to the next row to be computed
|
||||
row++; |
||||
|
||||
//Checks if all rows in the window where visited.
|
||||
if (rowa == 0) { |
||||
step = window + gap; //changes the step: approximately doubles its value
|
||||
window *= 2; //doubles the size of the re-visitation window
|
||||
gap = -gap; //inverts the modifier to the step
|
||||
} |
||||
|
||||
} while (row < nRows); |
||||
//==========================================================================/
|
||||
|
||||
//============================ Wandering Phase =============================//
|
||||
row = 0; //Resets the visitation to the first row of the memory matrix
|
||||
for (tau = 1; tau <= timeCost; tau++) { |
||||
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
|
||||
step = (tau % 2 == 0) ? -1 : nRows / 2 - 1; |
||||
do { |
||||
//Selects a pseudorandom index row*
|
||||
//------------------------------------------------------------------------------------------
|
||||
rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
|
||||
//rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
|
||||
//------------------------------------------------------------------------------------------
|
||||
|
||||
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
|
||||
reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); |
||||
|
||||
//update prev: it now points to the last row ever computed
|
||||
prev = row; |
||||
|
||||
//updates row: goes to the next row to be computed
|
||||
//------------------------------------------------------------------------------------------
|
||||
row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
|
||||
//row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
|
||||
//------------------------------------------------------------------------------------------
|
||||
|
||||
} while (row != 0); |
||||
} |
||||
|
||||
//============================ Wrap-up Phase ===============================//
|
||||
//Absorbs the last block of the memory matrix
|
||||
absorbBlock(state, memMatrix[rowa]); |
||||
|
||||
//Squeezes the key
|
||||
squeeze(state, K, (unsigned int) kLen); |
||||
|
||||
//========================= Freeing the memory =============================//
|
||||
free(memMatrix); |
||||
free(wholeMatrix); |
||||
|
||||
return 0; |
||||
} |
||||
|
@ -0,0 +1,42 @@
@@ -0,0 +1,42 @@
|
||||
/**
|
||||
* Header file for the Lyra2 Password Hashing Scheme (PHS). |
||||
* |
||||
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
|
||||
* |
||||
* This software is hereby placed in the public domain. |
||||
* |
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS |
||||
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE |
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
||||
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
||||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE |
||||
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, |
||||
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
||||
*/ |
||||
#ifndef LYRA2Z_H_
#define LYRA2Z_H_

#include <stdint.h>

/* Byte-sized unit, used when a buffer is addressed byte by byte. */
typedef unsigned char byte;

/* Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED) */
#define BLOCK_LEN_BLAKE2_SAFE_INT64 8                                 /* 512 bits (=64 bytes, =8 uint64_t) */
#define BLOCK_LEN_BLAKE2_SAFE_BYTES (BLOCK_LEN_BLAKE2_SAFE_INT64 * 8) /* same as above, in bytes */

#if defined(BLOCK_LEN_BITS)
	/* Block length supplied by the build through BLOCK_LEN_BITS */
	#define BLOCK_LEN_INT64 (BLOCK_LEN_BITS/64) /* block length, in uint64_t words */
	#define BLOCK_LEN_BYTES (BLOCK_LEN_BITS/8)  /* block length, in bytes */
#else
	/* Default block length: 768 bits (=96 bytes, =12 uint64_t) */
	#define BLOCK_LEN_INT64 12                    /* block length, in uint64_t words */
	#define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) /* block length, in bytes */
#endif

/* Derives a kLen-byte key into K from pwd/salt; returns 0 on success and -1 on error. */
int LYRA2Z(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);

#endif /* LYRA2Z_H_ */
@ -0,0 +1,966 @@
@@ -0,0 +1,966 @@
|
||||
/** |
||||
* Lyra2 (v1) cuda implementation based on djm34 work |
||||
* tpruvot@github 2015, Nanashi 08/2016 (from 1.8-r2) |
||||
* Lyra2Z implentation for Zcoin based on all the previous |
||||
* djm34 2017 |
||||
**/ |
||||
|
||||
#include <stdio.h> |
||||
#include <memory.h> |
||||
|
||||
// Threads-per-block constants. TPB30 is the launch bound of the SM2 kernel in this file;
// TPB52 and TPB20 are not referenced in the visible part of the file
// (presumably used by the host-side launcher for other architectures -- confirm).
#define TPB52 32
#define TPB30 160
#define TPB20 160

#include "cuda_lyra2Z_sm5.cuh"

#if defined(__INTELLISENSE__)
/* Editor-only shims (Visual Studio code colouring); never seen by a real compile */
__device__ uint32_t __shfl(uint32_t a, uint32_t b, uint32_t c);
#define atomicMin()
#define __CUDA_ARCH__ 520
#endif

// Host-side / device-side nonce buffer pointers, 16 slots each
// (presumably one slot per GPU -- confirm against the host code).
// Original note: this needs to get fixed as the rest of that routine.
static uint32_t *h_GNonces[16];
static uint32_t *d_GNonces[16];
||||
|
||||
/*
 * Duplexes one matrix row in 8 blocks of 12 words.
 * For each block: state ^= (rowIn + rowInOut), one round_lyra_sm2, then
 * rowOut ^= state and rowInOut ^= state rotated by one word
 * (word 0 receives state[11], word j receives state[j-1]).
 * Expands in place: `state` and `Matrix` must be visible at the call site.
 */
#define reduceDuplexRow(rowIn, rowInOut, rowOut) { \
	for (int i = 0; i < 8; i++) { \
		for (int j = 0; j < 12; j++) \
			state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \
		round_lyra_sm2(state); \
		for (int j = 0; j < 12; j++) \
			Matrix[12 * i + j][rowOut] ^= state[j]; \
		Matrix[12 * i][rowInOut] ^= state[11]; \
		for (int j = 1; j < 12; j++) \
			Matrix[12 * i + j][rowInOut] ^= state[j - 1]; \
	} \
}
||||
|
||||
/*
 * XORs the first 12 words of matrix row `in` into the sponge state, then
 * applies 12 rounds of round_lyra_sm2.
 * Expands in place: `state` and `Matrix` must be visible at the call site.
 */
#define absorbblock(in) { \
	for (int absorb_w_ = 0; absorb_w_ < 12; absorb_w_++) \
		state[absorb_w_] ^= Matrix[absorb_w_][in]; \
	for (int absorb_r_ = 0; absorb_r_ < 12; absorb_r_++) \
		round_lyra_sm2(state); \
}
||||
|
||||
/**
 * One round over the full 16-word state held by a single thread.
 * Gfunc is first applied to the four index groups (c, c+4, c+8, c+12),
 * then to the four shifted groups (0,5,10,15), (1,6,11,12), (2,7,8,13), (3,4,9,14).
 */
__device__ __forceinline__
static void round_lyra_sm2(uint2 *s)
{
	// groups with a fixed stride of 4
	#pragma unroll
	for (int c = 0; c < 4; c++)
		Gfunc(s[c], s[4 + c], s[8 + c], s[12 + c]);

	// groups where each following word is taken one position further (mod 4)
	#pragma unroll
	for (int c = 0; c < 4; c++)
		Gfunc(s[c], s[4 + ((c + 1) & 3)], s[8 + ((c + 2) & 3)], s[12 + ((c + 3) & 3)]);
}
||||
|
||||
/**
 * Setup-phase duplexing for the single-thread (SM2) layout.
 * For each of the 8 blocks of 12 words:
 *   state ^= rowIn + rowInOut, followed by one round_lyra_sm2;
 *   rowOut, written in reverse block order, = rowIn ^ state;
 *   rowInOut ^= state rotated by one word.
 *
 * @param rowIn     index of the row that is only read
 * @param rowInOut  index of the row that is read and updated
 * @param rowOut    index of the row that is overwritten
 * @param state     sponge state (the first 12 words are combined with the matrix)
 * @param Matrix    per-thread matrix, indexed [word][row]
 */
__device__ __forceinline__
void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2 state[16], uint2 Matrix[96][8])
{
#if __CUDA_ARCH__ > 500
	#pragma unroll
#endif
	for (int blk = 0; blk < 8; blk++)
	{
		const int fwd = 12 * blk;      // first word of the block being read
		const int rev = 84 - 12 * blk; // first word of the mirrored block being written

		#pragma unroll
		for (int w = 0; w < 12; w++)
			state[w] ^= Matrix[fwd + w][rowIn] + Matrix[fwd + w][rowInOut];

		round_lyra_sm2(state);

		#pragma unroll
		for (int w = 0; w < 12; w++)
			Matrix[rev + w][rowOut] = Matrix[fwd + w][rowIn] ^ state[w];

		// rowInOut ^= state rotated by one word
		Matrix[fwd][rowInOut] ^= state[11];
		#pragma unroll
		for (int w = 1; w < 12; w++)
			Matrix[fwd + w][rowInOut] ^= state[w - 1];
	}
}
||||
|
||||
#if __CUDA_ARCH__ < 350 |
||||
|
||||
// Eight constants loaded into state[8..15] by the SM2 kernel before the first rounds.
// Each entry is an { x, y } pair of 32-bit halves; by its name this is the Blake2b IV,
// with the low half stored first.
__constant__ static uint2 blake2b_IV_sm2[8] = {
	{ 0xf3bcc908, 0x6a09e667 },
	{ 0x84caa73b, 0xbb67ae85 },
	{ 0xfe94f82b, 0x3c6ef372 },
	{ 0x5f1d36f1, 0xa54ff53a },
	{ 0xade682d1, 0x510e527f },
	{ 0x2b3e6c1f, 0x9b05688c },
	{ 0xfb41bd6b, 0x1f83d9ab },
	{ 0x137e2179, 0x5be0cd19 }
};
||||
|
||||
/**
 * Lyra2Z for pre-SM3.5 devices: every thread hashes one nonce on its own,
 * with the whole 8-row matrix held in a per-thread array.
 *
 * Launch layout: 1D grid of 1D blocks (at most TPB30 threads per block);
 * thread index = blockDim.x * blockIdx.x + threadIdx.x, threads beyond `threads` do nothing.
 *
 * @param threads      number of nonces to process
 * @param startNounce  nonce assigned to thread 0
 * @param g_hash       input words, laid out so word i of a thread is at g_hash[threads*i + thread]
 * @param resNonces    result slots; [0] gets the smallest matching nonce, [1] the previous value of [0]
 */
__global__ __launch_bounds__(TPB30, 1)
void lyra2Z_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resNonces)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);

	// XORed into state[0..7] between the two 12-round passes. The values match the
	// CPU "basil" for kLen = pwdlen = saltlen = 32 and timeCost = nRows = nCols = 8,
	// followed by the 0x80 / 0x01 padding bytes (NOTE(review): confirm against the CPU path).
	const uint2 Mask[8] = {
		{ 0x00000020, 0x00000000 },{ 0x00000020, 0x00000000 },
		{ 0x00000020, 0x00000000 },{ 0x00000008, 0x00000000 },
		{ 0x00000008, 0x00000000 },{ 0x00000008, 0x00000000 },
		{ 0x00000080, 0x00000000 },{ 0x00000000, 0x01000000 }
	};

	if (thread >= threads)
		return;

	uint2 state[16];

	// words 0..3: password, words 4..7: salt (a copy of the password words)
	#pragma unroll
	for (int i = 0; i < 4; i++) {
		LOHI(state[i].x, state[i].y, g_hash[threads*i + thread]);
	}
	#pragma unroll
	for (int i = 0; i < 4; i++) {
		state[4 + i] = state[i];
	}

	// words 8..15: IV
	#pragma unroll
	for (int i = 0; i < 8; i++) {
		state[8 + i] = blake2b_IV_sm2[i];
	}

	// blake2blyra x2: 12 rounds, XOR the mask block, 12 more rounds
	for (int r = 0; r < 12; r++) {
		round_lyra_sm2(state);
	}
	for (int i = 0; i < 8; i++)
		state[i] ^= Mask[i];
	for (int r = 0; r < 12; r++) {
		round_lyra_sm2(state);
	}

	uint2 Matrix[96][8]; // not cool

	// reducedSqueezeRow0: row 0 is filled block by block, in reverse block order
	#pragma unroll 8
	for (int i = 0; i < 8; i++)
	{
		#pragma unroll 12
		for (int j = 0; j < 12; j++) {
			Matrix[j + 84 - 12 * i][0] = state[j];
		}
		round_lyra_sm2(state);
	}

	// reducedSqueezeRow1: row 1 is derived from row 0
	#pragma unroll 8
	for (int i = 0; i < 8; i++)
	{
		#pragma unroll 12
		for (int j = 0; j < 12; j++) {
			state[j] ^= Matrix[j + 12 * i][0];
		}
		round_lyra_sm2(state);
		#pragma unroll 12
		for (int j = 0; j < 12; j++) {
			Matrix[j + 84 - 12 * i][1] = Matrix[j + 12 * i][0] ^ state[j];
		}
	}

	// Setup phase for rows 2..7: (rowIn, rowInOut, rowOut)
	reduceDuplexRowSetup(1, 0, 2, state, Matrix);
	reduceDuplexRowSetup(2, 1, 3, state, Matrix);
	reduceDuplexRowSetup(3, 0, 4, state, Matrix);
	reduceDuplexRowSetup(4, 3, 5, state, Matrix);
	reduceDuplexRowSetup(5, 2, 6, state, Matrix);
	reduceDuplexRowSetup(6, 1, 7, state, Matrix);

	// Wandering phase: 8 passes of 8 row visits each. The row index advances by +3
	// on even passes and by -1 (written as +7, identical under the "& 7") on odd passes.
	uint32_t rowa;
	uint32_t prev = 7;
	uint32_t iterator = 0;
	for (uint32_t pass = 0; pass < 8; pass++) {
		const uint32_t stride = (pass & 1) ? 7u : 3u;
		for (uint32_t visit = 0; visit < 8; visit++) {
			rowa = state[0].x & 7;
			reduceDuplexRow(prev, rowa, iterator);
			prev = iterator;
			iterator = (iterator + stride) & 7;
		}
	}

	// Wrap-up: absorb the last row* and compare the fourth 64-bit state word with the target
	absorbblock(rowa);
	const uint32_t nonce = startNounce + thread;
	if (((uint64_t*)state)[3] <= ((uint64_t*)pTarget)[3]) {
		atomicMin(&resNonces[1], resNonces[0]);
		atomicMin(&resNonces[0], nonce);
	}
}
||||
#else |
||||
// Empty stand-in compiled when __CUDA_ARCH__ >= 350, so the kernel symbol exists in every build.
__global__
void lyra2Z_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resNonces)
{
}
||||
#endif |
||||
|
||||
#if __CUDA_ARCH__ > 500 |
||||
|
||||
#include "cuda_lyra2_vectors.h" |
||||
//#include "cuda_vector_uint2x4.h" |
||||
|
||||
// Matrix geometry used by the SM5+ code path
#define Nrow 8
#define Ncol 8
// uint2 slots reserved per (row, col) cell in the LD4S/ST4S index math (each call moves 3 words)
#define memshift 3

// Rows with index below BUF_COUNT are served from DMatrix by LD4S/ST4S, the others
// from dynamic shared memory: 0 = everything in shared memory, 8 = everything in DMatrix.
#define BUF_COUNT 0

// Base pointer of the DMatrix buffer (only dereferenced when BUF_COUNT != 0)
__device__ uint2 *DMatrix;
||||
|
||||
/**
 * Loads the three uint2 words of matrix cell (row, col) owned by the calling thread.
 * The backing store is chosen at compile time by BUF_COUNT: rows below BUF_COUNT
 * come from DMatrix, the remaining rows from dynamic shared memory.
 *
 * @param res      receives the three words
 * @param row      matrix row
 * @param col      matrix column
 * @param thread   index used for DMatrix addressing (unused when BUF_COUNT == 0)
 * @param threads  count used for DMatrix addressing (unused when BUF_COUNT == 0)
 */
__device__ __forceinline__
void LD4S(uint2 res[3], const int row, const int col, const int thread, const int threads)
{
#if BUF_COUNT != 8
	extern __shared__ uint2 shared_mem[];
	// first shared-memory slot of the cell, before the per-thread interleave
	const int sharedCell = (Ncol * (row - BUF_COUNT) + col) * memshift;
#endif
#if BUF_COUNT != 0
	// offset of the first word of the cell inside DMatrix
	const int globalCell = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x;
#endif

#if BUF_COUNT == 8
	const uint2 *src = DMatrix + globalCell;
	#pragma unroll
	for (int w = 0; w < 3; w++)
		res[w] = src[w * threads * blockDim.x];
#elif BUF_COUNT == 0
	#pragma unroll
	for (int w = 0; w < 3; w++) {
		const unsigned int slot = ((sharedCell + w) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
		res[w] = shared_mem[slot];
	}
#else
	if (row < BUF_COUNT) {
		const uint2 *src = DMatrix + globalCell;
		#pragma unroll
		for (int w = 0; w < 3; w++)
			res[w] = src[w * threads * blockDim.x];
	} else {
		#pragma unroll
		for (int w = 0; w < 3; w++) {
			const unsigned int slot = ((sharedCell + w) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
			res[w] = shared_mem[slot];
		}
	}
#endif
}
||||
|
||||
/**
 * Stores the three uint2 words of matrix cell (row, col) owned by the calling thread.
 * Mirror image of LD4S: rows below BUF_COUNT go to DMatrix, the remaining rows
 * to dynamic shared memory.
 *
 * @param row      matrix row
 * @param col      matrix column
 * @param data     the three words to store
 * @param thread   index used for DMatrix addressing (unused when BUF_COUNT == 0)
 * @param threads  count used for DMatrix addressing (unused when BUF_COUNT == 0)
 */
__device__ __forceinline__
void ST4S(const int row, const int col, const uint2 data[3], const int thread, const int threads)
{
#if BUF_COUNT != 8
	extern __shared__ uint2 shared_mem[];
	// first shared-memory slot of the cell, before the per-thread interleave
	const int sharedCell = (Ncol * (row - BUF_COUNT) + col) * memshift;
#endif
#if BUF_COUNT != 0
	// offset of the first word of the cell inside DMatrix
	const int globalCell = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x;
#endif

#if BUF_COUNT == 8
	uint2 *dst = DMatrix + globalCell;
	#pragma unroll
	for (int w = 0; w < 3; w++)
		dst[w * threads * blockDim.x] = data[w];
#elif BUF_COUNT == 0
	#pragma unroll
	for (int w = 0; w < 3; w++) {
		const unsigned int slot = ((sharedCell + w) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
		shared_mem[slot] = data[w];
	}
#else
	if (row < BUF_COUNT) {
		uint2 *dst = DMatrix + globalCell;
		#pragma unroll
		for (int w = 0; w < 3; w++)
			dst[w * threads * blockDim.x] = data[w];
	} else {
		#pragma unroll
		for (int w = 0; w < 3; w++) {
			const unsigned int slot = ((sharedCell + w) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
			shared_mem[slot] = data[w];
		}
	}
#endif
}
||||
|
||||
#if __CUDA_ARCH__ >= 300 |
||||
/**
 * Returns the value of `a` held by lane `b` of the calling thread's `c`-wide lane group.
 *
 * On CUDA 9 and later the mask-less __shfl is deprecated (and unavailable for sm_70+),
 * so __shfl_sync is used with a full-warp mask; older toolkits keep the legacy intrinsic.
 * NOTE(review): the full mask assumes every lane of the warp reaches this call -- confirm
 * for callers that shuffle inside divergent code.
 *
 * @param a  value contributed by the calling lane
 * @param b  source lane (taken modulo the group width by the intrinsic)
 * @param c  group width (a power of two, at most 32)
 */
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
{
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
	return __shfl_sync(0xFFFFFFFFu, a, b, c);
#else
	return __shfl(a, b, c);
#endif
}
||||
|
||||
/**
 * uint2 flavour of the lane exchange: both 32-bit halves of `a` are fetched from
 * lane `b` of the calling thread's `c`-wide lane group.
 *
 * On CUDA 9 and later the mask-less __shfl is deprecated (and unavailable for sm_70+),
 * so __shfl_sync is used with a full-warp mask; older toolkits keep the legacy intrinsic.
 * NOTE(review): the full mask assumes every lane of the warp reaches this call -- confirm
 * for callers that shuffle inside divergent code.
 *
 * @param a  value contributed by the calling lane
 * @param b  source lane (taken modulo the group width by the intrinsic)
 * @param c  group width (a power of two, at most 32)
 */
__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
{
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
	return make_uint2(__shfl_sync(0xFFFFFFFFu, a.x, b, c), __shfl_sync(0xFFFFFFFFu, a.y, b, c));
#else
	return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c));
#endif
}
||||
|
||||
/**
 * Exchanges three values at once: a1, a2 and a3 are each replaced by the value the
 * lanes b1, b2 and b3 (respectively) hold for them, within a lane group of width c.
 */
__device__ __forceinline__
void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3,
                  uint32_t b1, uint32_t b2, uint32_t b3,
                  uint32_t c)
{
	const uint2 from1 = WarpShuffle(a1, b1, c);
	a1 = from1;
	const uint2 from2 = WarpShuffle(a2, b2, c);
	a2 = from2;
	const uint2 from3 = WarpShuffle(a3, b3, c);
	a3 = from3;
}
||||
|
||||
#else |
||||
/**
 * Shared-memory emulation of the 32-bit lane exchange, used when the shuffle
 * intrinsic is unavailable. The caller's slot at the start of shared_mem is
 * borrowed: its content is saved, overwritten with `a`, read by the peer, then restored.
 * Ordering relies only on __threadfence_block() between the steps.
 */
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
{
	extern __shared__ uint2 shared_mem[];

	uint32_t *slots = (uint32_t*)shared_mem;
	const uint32_t self = blockDim.x * threadIdx.y + threadIdx.x;
	// same c-wide group as `self`, lane b within that group
	const uint32_t peer = (self&~(c - 1)) + (b&(c - 1));

	__threadfence_block();
	const uint32_t saved = slots[self];

	slots[self] = a;
	__threadfence_block();
	const uint32_t fetched = slots[peer];

	__threadfence_block();
	slots[self] = saved;

	__threadfence_block();
	return fetched;
}
||||
|
||||
/**
 * Shared-memory emulation of the uint2 lane exchange, used when the shuffle
 * intrinsic is unavailable. The caller's slot at the start of shared_mem is
 * borrowed: its content is saved, overwritten with `a`, read by the peer, then restored.
 * Ordering relies only on __threadfence_block() between the steps.
 */
__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
{
	extern __shared__ uint2 shared_mem[];

	const uint32_t self = blockDim.x * threadIdx.y + threadIdx.x;
	// same c-wide group as `self`, lane b within that group
	const uint32_t peer = (self&~(c - 1)) + (b&(c - 1));

	__threadfence_block();
	const uint2 saved = shared_mem[self];

	shared_mem[self] = a;
	__threadfence_block();
	const uint2 fetched = shared_mem[peer];

	__threadfence_block();
	shared_mem[self] = saved;

	__threadfence_block();
	return fetched;
}
||||
|
||||
/**
 * Shared-memory emulation of the triple lane exchange. The caller's slot at the
 * start of shared_mem is saved once, reused for a1, a2 and a3 in turn (each value is
 * published, then replaced by the one read from the peer lane b1/b2/b3), and finally restored.
 * Ordering relies only on __threadfence_block() between the steps.
 */
__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
{
	extern __shared__ uint2 shared_mem[];

	const uint32_t self = blockDim.x * threadIdx.y + threadIdx.x;
	const uint32_t groupBase = self&~(c - 1); // first slot of the caller's c-wide group
	const uint32_t laneMask = c - 1;

	__threadfence_block();
	const uint2 saved = shared_mem[self];

	shared_mem[self] = a1;
	__threadfence_block();
	a1 = shared_mem[groupBase + (b1&laneMask)];
	__threadfence_block();

	shared_mem[self] = a2;
	__threadfence_block();
	a2 = shared_mem[groupBase + (b2&laneMask)];
	__threadfence_block();

	shared_mem[self] = a3;
	__threadfence_block();
	a3 = shared_mem[groupBase + (b3&laneMask)];

	__threadfence_block();
	shared_mem[self] = saved;
	__threadfence_block();
}
||||
#endif |
||||
|
||||
/**
 * One round with the state split over a group of 4 lanes, 4 words per lane.
 * Gfunc runs on the lane's own words, then words 1..3 are fetched from the lanes
 * 1, 2 and 3 positions ahead, Gfunc runs again, and the words are sent back
 * (fetch from 3, 2 and 1 positions ahead, i.e. the inverse shift within the group of 4).
 */
__device__ __forceinline__ void round_lyra(uint2 s[4])
{
	const uint32_t lane = threadIdx.x;

	Gfunc(s[0], s[1], s[2], s[3]);
	WarpShuffle3(s[1], s[2], s[3], lane + 1, lane + 2, lane + 3, 4);

	Gfunc(s[0], s[1], s[2], s[3]);
	WarpShuffle3(s[1], s[2], s[3], lane + 3, lane + 2, lane + 1, 4);
}
||||
|
||||
/**
 * One round over a full state held by one thread as four uint2x4 vectors.
 * Gfunc is applied to the four same-named components first, then to the four
 * component combinations shifted by one position per vector.
 */
static __device__ __forceinline__
void round_lyra(uint2x4* s)
{
	uint2x4 &v0 = s[0];
	uint2x4 &v1 = s[1];
	uint2x4 &v2 = s[2];
	uint2x4 &v3 = s[3];

	// same component of every vector
	Gfunc(v0.x, v1.x, v2.x, v3.x);
	Gfunc(v0.y, v1.y, v2.y, v3.y);
	Gfunc(v0.z, v1.z, v2.z, v3.z);
	Gfunc(v0.w, v1.w, v2.w, v3.w);

	// component shifted by one position per vector
	Gfunc(v0.x, v1.y, v2.z, v3.w);
	Gfunc(v0.y, v1.z, v2.w, v3.x);
	Gfunc(v0.z, v1.w, v2.x, v3.y);
	Gfunc(v0.w, v1.x, v2.y, v3.z);
}
||||
|
||||
/**
 * Initialises matrix rows 0 and 1 from the sponge state.
 * Pass 1 stores the state into row 0, last column first, with one round after each store.
 * Pass 2 reads row 0 front to back, absorbs each cell, runs one round and writes
 * (cell ^ state) into row 1, last column first.
 * NOTE(review): the loops are bounded by Nrow although the counter is used as a
 * column index; Nrow and Ncol are both 8 in this file.
 *
 * @param state    this lane's part of the sponge state (words 0..2 are exchanged with the matrix)
 * @param thread   forwarded to LD4S/ST4S
 * @param threads  forwarded to LD4S/ST4S
 */
static __device__ __forceinline__
void reduceDuplex(uint2 state[4], uint32_t thread, const uint32_t threads)
{
	uint2 cell[3];

#if __CUDA_ARCH__ > 500
	#pragma unroll
#endif
	for (int col = 0; col < Nrow; col++)
	{
		const int mirrored = Ncol - col - 1;
		ST4S(0, mirrored, state, thread, threads);

		round_lyra(state);
	}

	#pragma unroll 4
	for (int col = 0; col < Nrow; col++)
	{
		const int mirrored = Ncol - col - 1;

		LD4S(cell, 0, col, thread, threads);
		for (int w = 0; w < 3; w++)
			state[w] ^= cell[w];

		round_lyra(state);

		for (int w = 0; w < 3; w++)
			cell[w] ^= state[w];
		ST4S(1, mirrored, cell, thread, threads);
	}
}
||||
|
||||
/**
 * Setup-phase duplexing for the 4-lanes-per-hash layout.
 * Per column: state ^= rowIn + rowInOut, one round, rowOut (mirrored column) = rowIn ^ state,
 * and rowInOut ^= the state words handed over from the previous lane.
 *
 * @param rowIn     row that is only read
 * @param rowInOut  row that is read and updated in place
 * @param rowOut    row that is written, last column first
 * @param state     this lane's part of the sponge state
 * @param thread    forwarded to LD4S/ST4S
 * @param threads   forwarded to LD4S/ST4S
 */
static __device__ __forceinline__
void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], uint32_t thread, const uint32_t threads)
{
	uint2 inCell[3], inOutCell[3];

	#pragma unroll 1
	for (int col = 0; col < Nrow; col++)
	{
		LD4S(inCell, rowIn, col, thread, threads);
		LD4S(inOutCell, rowInOut, col, thread, threads);
		for (int w = 0; w < 3; w++)
			state[w] ^= inCell[w] + inOutCell[w];

		round_lyra(state);

		#pragma unroll
		for (int w = 0; w < 3; w++)
			inCell[w] ^= state[w];

		ST4S(rowOut, Ncol - col - 1, inCell, thread, threads);

		// Receive data from the previous thread (and, at the same time, send data to the next thread)
		const uint32_t donor = threadIdx.x - 1;
		uint2 got0 = state[0];
		uint2 got1 = state[1];
		uint2 got2 = state[2];
		WarpShuffle3(got0, got1, got2, donor, donor, donor, 4);

		// Lane 0 receives from the last lane of its group and also shifts the word
		// order by one; together this rotates the state across the group by one word.
		const bool firstLane = (threadIdx.x == 0);
		inOutCell[0] ^= firstLane ? got2 : got0;
		inOutCell[1] ^= firstLane ? got0 : got1;
		inOutCell[2] ^= firstLane ? got1 : got2;

		ST4S(rowInOut, col, inOutCell, thread, threads);
	}
}
||||
|
||||
/**
 * Wandering-phase duplexing for the 4-lanes-per-hash layout.
 * Per column: state ^= rowIn + rowInOut, one round, rowInOut ^= the state words
 * handed over from the previous lane, then rowOut ^= state.
 * rowOut is re-read after rowInOut has been stored, so the result is also right
 * when both name the same row.
 *
 * @param rowIn     row that is only read
 * @param rowInOut  row that is read and updated in place
 * @param rowOut    row that is XOR-updated with the state
 * @param state     this lane's part of the sponge state
 * @param thread    forwarded to LD4S/ST4S
 * @param threads   forwarded to LD4S/ST4S
 */
static __device__ __forceinline__
void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads)
{
	for (int col = 0; col < Nrow; col++)
	{
		uint2 cellA[3], cellB[3];

		LD4S(cellA, rowIn, col, thread, threads);
		LD4S(cellB, rowInOut, col, thread, threads);

		#pragma unroll
		for (int w = 0; w < 3; w++)
			state[w] ^= cellA[w] + cellB[w];

		round_lyra(state);

		// Receive data from the previous thread (and, at the same time, send data to the next thread)
		const uint32_t donor = threadIdx.x - 1;
		uint2 got0 = state[0];
		uint2 got1 = state[1];
		uint2 got2 = state[2];
		WarpShuffle3(got0, got1, got2, donor, donor, donor, 4);

		// Lane 0 receives from the last lane of its group and also shifts the word order by one
		const bool firstLane = (threadIdx.x == 0);
		cellB[0] ^= firstLane ? got2 : got0;
		cellB[1] ^= firstLane ? got0 : got1;
		cellB[2] ^= firstLane ? got1 : got2;

		ST4S(rowInOut, col, cellB, thread, threads);

		// cellA is reused for the output row from here on
		LD4S(cellA, rowOut, col, thread, threads);

		#pragma unroll
		for (int w = 0; w < 3; w++)
			cellA[w] ^= state[w];

		ST4S(rowOut, col, cellA, thread, threads);
	}
}
||||
|
||||
#if 0 |
||||
/**
 * Disabled (#if 0) predecessor of reduceDuplexRowt_8_v2 with rowIn fixed to 2 and
 * the output row fixed to 5. Only the state is updated; nothing is written to the matrix.
 */
static __device__ __forceinline__
void reduceDuplexRowt_8(const int rowInOut, uint2* state, const uint32_t thread, const uint32_t threads)
{
	uint2 cellA[3], cellB[3], last[3];

	// Column 0: keep the updated rowInOut cell in `last` instead of storing it
	LD4S(cellA, 2, 0, thread, threads);
	LD4S(last, rowInOut, 0, thread, threads);

	#pragma unroll
	for (int w = 0; w < 3; w++)
		state[w] ^= cellA[w] + last[w];

	round_lyra(state);

	// Receive data from the previous thread (and, at the same time, send data to the next thread)
	const uint32_t donor = threadIdx.x - 1;
	uint2 got0 = state[0];
	uint2 got1 = state[1];
	uint2 got2 = state[2];
	WarpShuffle3(got0, got1, got2, donor, donor, donor, 4);

	// Lane 0 receives from the last lane of its group and also shifts the word order by one
	const bool firstLane = (threadIdx.x == 0);
	last[0] ^= firstLane ? got2 : got0;
	last[1] ^= firstLane ? got0 : got1;
	last[2] ^= firstLane ? got1 : got2;

	if (rowInOut == 5)
	{
		#pragma unroll
		for (int w = 0; w < 3; w++)
			last[w] ^= state[w];
	}

	// Columns 1..Nrow-1: absorb only
	for (int col = 1; col < Nrow; col++)
	{
		LD4S(cellA, 2, col, thread, threads);
		LD4S(cellB, rowInOut, col, thread, threads);

		#pragma unroll
		for (int w = 0; w < 3; w++)
			state[w] ^= cellA[w] + cellB[w];

		round_lyra(state);
	}

	#pragma unroll
	for (int w = 0; w < 3; w++)
		state[w] ^= last[w];
}
||||
#endif |
||||
|
||||
/**
 * Last duplexing step of the wandering phase: absorbs all columns of rowIn
 * and rowInOut into `state` without storing anything back to the matrix.
 *
 * Column 0 of rowInOut is kept in registers, updated with the rotated state
 * (and additionally with the state itself when rowInOut == rowOut), and is
 * XORed into `state` after the remaining columns have been absorbed.
 *
 * @param rowIn     first input row
 * @param rowOut    row that would have been the output row; only compared
 *                  against rowInOut
 * @param rowInOut  second input row
 * @param state     per-thread sponge words, updated in place
 * @param thread    global hash index, forwarded to LD4S
 * @param threads   total number of hashes, forwarded to LD4S
 */
static __device__ __forceinline__
void reduceDuplexRowt_8_v2(const int rowIn, const int rowOut, const int rowInOut, uint2* state, const uint32_t thread, const uint32_t threads)
{
	uint2 inCol[3], ioCol[3], firstIo[3];

	LD4S(inCol, rowIn, 0, thread, threads);
	LD4S(firstIo, rowInOut, 0, thread, threads);

	#pragma unroll
	for (int w = 0; w < 3; w++)
		state[w] ^= inCol[w] + firstIo[w];

	round_lyra(state);

	// Receive data from the previous thread of the 4-thread group
	// (and at the same time send our data to the next thread).
	uint2 fromPrev0 = state[0];
	uint2 fromPrev1 = state[1];
	uint2 fromPrev2 = state[2];
	WarpShuffle3(fromPrev0, fromPrev1, fromPrev2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);

	// Thread 0 received the words of the last thread: they are applied
	// shifted by one word position.
	const bool wrapLane = (threadIdx.x == 0);
	firstIo[0] ^= wrapLane ? fromPrev2 : fromPrev0;
	firstIo[1] ^= wrapLane ? fromPrev0 : fromPrev1;
	firstIo[2] ^= wrapLane ? fromPrev1 : fromPrev2;

	if (rowInOut == rowOut)
	{
		#pragma unroll
		for (int w = 0; w < 3; w++)
			firstIo[w] ^= state[w];
	}

	// Remaining columns are only absorbed.
	for (int col = 1; col < Nrow; col++)
	{
		LD4S(inCol, rowIn, col, thread, threads);
		LD4S(ioCol, rowInOut, col, thread, threads);

		#pragma unroll
		for (int w = 0; w < 3; w++)
			state[w] ^= inCol[w] + ioCol[w];

		round_lyra(state);
	}

	#pragma unroll
	for (int w = 0; w < 3; w++)
		state[w] ^= firstIo[w];
}
||||
|
||||
/**
 * Stage 1 of the Lyra2Z pipeline: builds the initial sponge state of each
 * hash and stores it into DMatrix for the next kernel.
 *
 * Launch layout: 1D grid of 1D blocks (at most 64 threads, see
 * __launch_bounds__), one thread per hash.
 *
 * @param threads      number of hashes; threads with a larger index do nothing
 * @param startNounce  unused in this kernel
 * @param g_hash       input hashes as four uint2 planes of `threads` entries
 *                     (word k of hash t is at g_hash[t + threads * k])
 */
__global__
__launch_bounds__(64, 1)
void lyra2Z_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);

	// Blake2b initialization vector, each 64-bit word as a (low, high) pair.
	const uint2x4 blake2b_IV[2] = {
		{ { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } },
		{ { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } }
	};
	// Constant block XORed into the state between the two absorb passes
	// (presumably the Lyra2 parameter/padding block -- TODO confirm).
	const uint2x4 Mask[2] = {
		{ { 0x00000020, 0x00000000 }, { 0x00000020, 0x00000000 }, { 0x00000020, 0x00000000 }, { 0x00000008, 0x00000000 } },
		{ { 0x00000008, 0x00000000 }, { 0x00000008, 0x00000000 }, { 0x00000080, 0x00000000 }, { 0x00000000, 0x01000000 } }
	};

	if (thread >= threads)
		return;

	uint2x4 state[4];

	// The 32-byte input hash fills both state[0] and state[1].
	state[0].x = __ldg(&g_hash[thread + threads * 0]);
	state[0].y = __ldg(&g_hash[thread + threads * 1]);
	state[0].z = __ldg(&g_hash[thread + threads * 2]);
	state[0].w = __ldg(&g_hash[thread + threads * 3]);
	state[1] = state[0];
	state[2] = blake2b_IV[0];
	state[3] = blake2b_IV[1];

	for (int r = 0; r < 12; r++)
		round_lyra(state);

	state[0] ^= Mask[0];
	state[1] ^= Mask[1];

	// second batch of 12 rounds ("because 12 is not enough")
	for (int r = 0; r < 12; r++)
		round_lyra(state);

	uint2x4 *matrix = (uint2x4*)DMatrix;
	#pragma unroll
	for (int k = 0; k < 4; k++)
		matrix[threads * k + thread] = state[k];
}
||||
|
||||
/**
 * Stage 2 of the Lyra2Z pipeline: setup phase plus wandering phase.
 *
 * Launch layout: 2D blocks; threadIdx.x selects the state word handled by
 * the thread (the shuffles use groups of 4), threadIdx.y selects the hash
 * inside the block. Dynamic shared memory is supplied by the launcher.
 *
 * The wandering phase performs 63 reduceDuplexRowt steps followed by one
 * reduceDuplexRowt_8_v2 step. The row pointer `iterator` advances by +3
 * for 8 steps, then by -1 for 8 steps, and so on (the very last -1 run has
 * only 7 steps).
 *
 * @param threads      number of hashes
 * @param startNounce  unused in this kernel
 * @param g_hash       unused in this kernel (state is exchanged via DMatrix)
 */
__global__
__launch_bounds__(TPB52, 1)
void lyra2Z_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
{
	const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y;

	if (thread < threads)
	{
		uint2 state[4];

		#pragma unroll
		for (int w = 0; w < 4; w++)
			state[w] = __ldg(&DMatrix[(w * threads + thread) * blockDim.x + threadIdx.x]);

		// Setup phase: fill rows 0..7 of the matrix.
		reduceDuplex(state, thread, threads);
		reduceDuplexRowSetup(1, 0, 2, state, thread, threads);
		reduceDuplexRowSetup(2, 1, 3, state, thread, threads);
		reduceDuplexRowSetup(3, 0, 4, state, thread, threads);
		reduceDuplexRowSetup(4, 3, 5, state, thread, threads);
		reduceDuplexRowSetup(5, 2, 6, state, thread, threads);
		reduceDuplexRowSetup(6, 1, 7, state, thread, threads);

		uint32_t rowa;
		uint32_t prev = 7;
		uint32_t iterator = 0;

		// Wandering phase: four passes of a "+3" run and a "-1" run.
		for (uint32_t pass = 0; pass < 4; pass++)
		{
			for (uint32_t step = 0; step < 8; step++)
			{
				// pseudo-random row taken from thread 0 of the 4-thread group
				rowa = WarpShuffle(state[0].x, 0, 4) & 7;
				reduceDuplexRowt(prev, rowa, iterator, state, thread, threads);
				prev = iterator;
				iterator = (iterator + 3) & 7;
			}

			// The final step of the last pass is done by reduceDuplexRowt_8_v2 below.
			const uint32_t backSteps = (pass == 3) ? 7 : 8;
			for (uint32_t step = 0; step < backSteps; step++)
			{
				rowa = WarpShuffle(state[0].x, 0, 4) & 7;
				reduceDuplexRowt(prev, rowa, iterator, state, thread, threads);
				prev = iterator;
				iterator = (iterator - 1) & 7;
			}
		}

		rowa = WarpShuffle(state[0].x, 0, 4) & 7;
		reduceDuplexRowt_8_v2(prev, iterator, rowa, state, thread, threads);

		#pragma unroll
		for (int w = 0; w < 4; w++)
			DMatrix[(w * threads + thread) * blockDim.x + threadIdx.x] = state[w];
	}
}
||||
|
||||
/**
 * Stage 3 of the Lyra2Z pipeline: final 12 sponge rounds and target test.
 *
 * Launch layout: 1D grid of 1D blocks (at most 64 threads), one thread per
 * hash.
 *
 * Result reporting: resNonces[0] receives the smallest nonce whose hash
 * meets the target, resNonces[1] the second smallest. Both slots must be
 * preset to 0xFFFFFFFF by the host before the launch.
 *
 * @param threads      number of hashes
 * @param startNounce  nonce of thread 0
 * @param g_hash       unused (the hash itself is not written back)
 * @param resNonces    two-entry device buffer for the found nonces
 */
__global__
__launch_bounds__(64, 1)
void lyra2Z_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash, uint32_t *resNonces)
{
	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;

	uint28 state[4];

	if (thread < threads)
	{
		state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]);
		state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]);
		state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]);
		state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]);

		for (int i = 0; i < 12; i++)
			round_lyra(state);

		const uint32_t nonce = startNounce + thread;

		// Compare the last 64-bit word of the 32-byte hash with the target.
		if (((uint64_t*)state)[3] <= ((uint64_t*)pTarget)[3]) {
			// Use the value returned by atomicMin instead of a plain read of
			// resNonces[0]: the plain read raced with other threads and, when
			// the later nonce was the larger one, left both slots equal so
			// the second nonce was lost.
			const uint32_t previous = atomicMin(&resNonces[0], nonce);
			if (previous != UINT32_MAX) {
				// Whichever of the two did not end up in slot 0 is a
				// candidate for the second slot.
				const uint32_t runnerUp = (previous > nonce) ? previous : nonce;
				atomicMin(&resNonces[1], runnerUp);
			}
		}
	}
}
||||
#else |
||||
// Fallback branch of a preprocessor conditional opened earlier in the file:
// the three kernels are defined with empty bodies so that the host-side
// launches still have a symbol to reference in this compilation pass.
// DMatrix is declared here only when __CUDA_ARCH__ < 350 (including the
// host pass, where the macro is undefined and evaluates to 0).
#if __CUDA_ARCH__ < 350 |
||||
__device__ void* DMatrix; |
||||
#endif |
||||
__global__ void lyra2Z_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} |
||||
__global__ void lyra2Z_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {} |
||||
__global__ void lyra2Z_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash, uint32_t *resNonces) {} |
||||
#endif |
||||
|
||||
/**
 * Per-thread initialization for the SM 3.5+ code paths.
 *
 * Copies the caller-allocated matrix pointer into the device symbol DMatrix
 * and allocates the two-entry nonce result buffers (device and pinned host).
 * Return codes of the CUDA calls are not checked.
 *
 * @param thr_id    miner thread index, selects the buffer slots
 * @param threads   unused
 * @param d_matrix  device pointer to the Lyra2 matrix storage
 */
__host__
void lyra2Z_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix)
{
	const size_t nonceBufBytes = 2 * sizeof(uint32_t);

	// Only the pointer value is copied; the matrix itself stays where the
	// caller allocated it.
	cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(d_matrix), 0, cudaMemcpyHostToDevice);

	cudaMalloc(&d_GNonces[thr_id], nonceBufBytes);
	cudaMallocHost(&h_GNonces[thr_id], nonceBufBytes);
}
||||
|
||||
/**
 * Per-thread initialization for the SM 2.x code path.
 *
 * Unlike lyra2Z_cpu_init no matrix pointer is installed; only the two-entry
 * nonce result buffers (device and pinned host) are allocated. Return codes
 * of the CUDA calls are not checked.
 *
 * @param thr_id   miner thread index, selects the buffer slots
 * @param threads  unused
 */
__host__
void lyra2Z_cpu_init_sm2(int thr_id, uint32_t threads)
{
	const size_t nonceBufBytes = 2 * sizeof(uint32_t);

	cudaMalloc(&d_GNonces[thr_id], nonceBufBytes);
	cudaMallocHost(&h_GNonces[thr_id], nonceBufBytes);
}
||||
|
||||
/**
 * Reads back one of the two nonce result slots of a miner thread.
 *
 * The local copy is preset to 0xFF bytes, so a failed cudaMemcpy leaves
 * both entries equal and the function reports "nothing found".
 *
 * @param thr_id  miner thread index
 * @param num     slot to return: 0 or 1
 * @return the requested nonce, or UINT32_MAX when both slots hold the same
 *         value or `num` is not a valid slot index
 */
__host__
uint32_t lyra2Z_getSecNonce(int thr_id, int num)
{
	uint32_t results[2];

	// Guard the array access below: previously any num outside 0..1 read
	// past the local buffer.
	if (num < 0 || num >= (int)(sizeof(results) / sizeof(results[0])))
		return UINT32_MAX;

	memset(results, 0xFF, sizeof(results));
	cudaMemcpy(results, d_GNonces[thr_id], sizeof(results), cudaMemcpyDeviceToHost);
	if (results[1] == results[0])
		return UINT32_MAX;
	return results[num];
}
||||
|
||||
/**
 * Uploads the 32-byte target to the device symbol pTarget.
 *
 * @param pTargetIn  host pointer to at least 32 readable bytes
 */
__host__
void lyra2Z_setTarget(const void *pTargetIn)
{
	const size_t targetBytes = 8 * sizeof(uint32_t); // 32 bytes
	cudaMemcpyToSymbol(pTarget, pTargetIn, targetBytes, 0, cudaMemcpyHostToDevice);
}
||||
|
||||
/**
 * Runs the Lyra2Z kernels for one batch of nonces and returns the first
 * result slot.
 *
 * The kernel set is chosen from device_sm[]: >= 520 uses the three-stage
 * SM 5.2 kernels, 500 and 350 use the *_sm5 kernels, everything else the
 * single SM 2 kernel. Launch errors are not checked.
 *
 * @param thr_id       miner thread index
 * @param threads      number of hashes in the batch
 * @param startNounce  nonce of the first hash
 * @param d_hash       device buffer holding the input hashes
 * @param gtx750ti     selects the shared-memory size on the SM 5.0/3.5 path
 * @return content of the first nonce slot after the kernels ran (stays
 *         0xFFFFFFFF, the value it is reset to here, unless a kernel
 *         stored a nonce)
 */
__host__
uint32_t lyra2Z_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, bool gtx750ti)
{
	// reset both result slots to "nothing found"
	cudaMemset(d_GNonces[thr_id], 0xff, 2 * sizeof(uint32_t));

	const int dev_id = device_map[thr_id % MAX_GPUS];
	const int sm = device_sm[dev_id];

	uint32_t tpb;
	switch (sm) {
	case 500: tpb = TPB50; break;
	case 200: tpb = TPB20; break;
	default:  tpb = TPB52; break;
	}

	// 4 cooperating threads per hash, used by the stage-2 kernels
	dim3 gridQuad((threads * 4 + tpb - 1) / tpb);
	dim3 blockQuad(4, tpb >> 2);

	// one thread per hash, 64 threads per block (stages 1 and 3)
	dim3 gridFlat((threads + 64 - 1) / 64);
	dim3 blockFlat(64);

	// one thread per hash, tpb threads per block (SM 2 kernel)
	dim3 gridSm2((threads + tpb - 1) / tpb);
	dim3 blockSm2(tpb);

	if (sm >= 520)
	{
		const size_t sharedBytes = 24 * (8 - 0) * sizeof(uint2) * tpb;

		lyra2Z_gpu_hash_32_1 <<< gridFlat, blockFlat >>> (threads, startNounce, (uint2*)d_hash);
		lyra2Z_gpu_hash_32_2 <<< gridQuad, blockQuad, sharedBytes >>> (threads, startNounce, d_hash);
		lyra2Z_gpu_hash_32_3 <<< gridFlat, blockFlat >>> (threads, startNounce, (uint2*)d_hash, d_GNonces[thr_id]);
	}
	else if (sm == 500 || sm == 350)
	{
		// 8192 bytes are reserved to tune for 8 warps,
		// 6144 bytes to tune for 10 warps.
		const size_t shared_mem = gtx750ti ? 8192 : 6144;

		lyra2Z_gpu_hash_32_1_sm5 <<< gridFlat, blockFlat >>> (threads, startNounce, (uint2*)d_hash);
		lyra2Z_gpu_hash_32_2_sm5 <<< gridQuad, blockQuad, shared_mem >>> (threads, startNounce, (uint2*)d_hash);
		lyra2Z_gpu_hash_32_3_sm5 <<< gridFlat, blockFlat >>> (threads, startNounce, (uint2*)d_hash, d_GNonces[thr_id]);
	}
	else
	{
		lyra2Z_gpu_hash_32_sm2 <<< gridSm2, blockSm2 >>> (threads, startNounce, d_hash, d_GNonces[thr_id]);
	}

	// fetch the first found nonce (blocking copy, so the kernels are done)
	cudaMemcpy(h_GNonces[thr_id], d_GNonces[thr_id], 1 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
	return *h_GNonces[thr_id];
}
@ -0,0 +1,819 @@
@@ -0,0 +1,819 @@
|
||||
#include <memory.h> |
||||
|
||||
#ifdef __INTELLISENSE__ |
||||
/* just for vstudio code colors */ |
||||
//#define __CUDA_ARCH__ 500 |
||||
#define __threadfence_block() |
||||
#define __ldg(x) *(x) |
||||
#define atomicMin(p,y) y |
||||
#endif |
||||
|
||||
#include "cuda_helper.h" |
||||
|
||||
#define TPB50 32 |
||||
|
||||
__constant__ uint32_t pTarget[8]; |
||||
|
||||
/**
 * Blake2b-style G mixing function on four 64-bit words, each held as a
 * uint2. The rotation amounts are 32, 24, 16 and 63. No message words are
 * added.
 *
 * The two preprocessor branches differ only in the helpers used for the
 * first three quarter-steps; the last quarter-step is common to both.
 */
static __device__ __forceinline__
void Gfunc(uint2 & a, uint2 &b, uint2 &c, uint2 &d)
{
#if __CUDA_ARCH__ > 500
	// d = swap32(d ^ a), written as a single xor-and-swap of the halves
	a += b;
	const uint2 dOld = d;
	d.y = a.x ^ dOld.x;
	d.x = a.y ^ dOld.y;

	c += d;
	b ^= c;
	b = ROR24(b);

	a += b;
	d ^= a;
	d = ROR16(d);
#else
	a += b;
	d ^= a;
	d = SWAPUINT2(d);

	c += d;
	b ^= c;
	b = ROR2(b, 24);

	a += b;
	d ^= a;
	d = ROR2(d, 16);
#endif

	c += d;
	b ^= c;
	b = ROR2(b, 63);
}
||||
|
||||
#if __CUDA_ARCH__ == 500 || __CUDA_ARCH__ == 350 |
||||
#include "cuda_lyra2_vectors.h" |
||||
|
||||
#define Nrow 8 |
||||
#define Ncol 8 |
||||
#define memshift 3 |
||||
|
||||
__device__ uint2 *DMatrix; |
||||
|
||||
/**
 * Reads this thread's uint2 from slot `index` of the dynamic shared-memory
 * buffer. Each slot holds blockDim.y * blockDim.x entries, one per thread
 * of the block.
 */
__device__ __forceinline__ uint2 LD4S(const int index)
{
	extern __shared__ uint2 shared_mem[];

	const uint32_t pos = (index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
	return shared_mem[pos];
}
||||
|
||||
/**
 * Writes `data` into this thread's entry of slot `index` of the dynamic
 * shared-memory buffer (same addressing as LD4S).
 */
__device__ __forceinline__ void ST4S(const int index, const uint2 data)
{
	extern __shared__ uint2 shared_mem[];

	const uint32_t pos = (index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
	shared_mem[pos] = data;
}
||||
|
||||
#if __CUDA_ARCH__ == 300 |
||||
/**
 * 32-bit shuffle via the hardware __shfl intrinsic.
 *
 * @param a  value contributed by the calling thread
 * @param b  source lane
 * @param c  width of the shuffle group
 */
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
{
	const uint32_t received = __shfl(a, b, c);
	return received;
}
||||
|
||||
/**
 * 64-bit shuffle: both halves of the uint2 are fetched from lane `b` of the
 * `c`-wide group with two __shfl calls.
 */
__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
{
	const uint32_t lo = __shfl(a.x, b, c);
	const uint32_t hi = __shfl(a.y, b, c);
	return make_uint2(lo, hi);
}
||||
|
||||
/**
 * Shuffles three uint2 values in place, each from its own source lane
 * (a1 from b1, a2 from b2, a3 from b3) within a `c`-wide group.
 */
__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
{
	const uint2 r1 = WarpShuffle(a1, b1, c);
	a1 = r1;
	const uint2 r2 = WarpShuffle(a2, b2, c);
	a2 = r2;
	const uint2 r3 = WarpShuffle(a3, b3, c);
	a3 = r3;
}
||||
#else // != 300 |
||||
|
||||
/**
 * Shuffle emulation through shared memory (32-bit value).
 *
 * The calling thread temporarily parks `a` in its own 32-bit cell at the
 * start of the dynamic shared buffer, reads the cell of lane `b` of its
 * `c`-wide group, and then restores the cell's previous content.
 *
 * `c` is assumed to be a power of two (the group is selected by masking).
 * NOTE(review): ordering relies on __threadfence_block() only, i.e. on the
 * threads of a group executing in lockstep -- confirm for the target archs.
 */
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
{
	extern __shared__ uint2 shared_mem[];

	uint32_t *cells = (uint32_t*)shared_mem;
	const uint32_t self = blockDim.x * threadIdx.y + threadIdx.x;
	const uint32_t groupMask = c - 1;
	const uint32_t source = (self & ~groupMask) + (b & groupMask);

	// save whatever currently lives in our scratch cell
	__threadfence_block();
	const uint32_t saved = cells[self];

	// publish, then fetch the peer's value
	cells[self] = a;
	__threadfence_block();
	const uint32_t received = cells[source];

	// put the original content back
	__threadfence_block();
	cells[self] = saved;

	__threadfence_block();
	return received;
}
||||
|
||||
/**
 * Shuffle emulation through shared memory (uint2 value).
 *
 * Same scheme as the 32-bit overload, using one uint2 cell per thread at
 * the start of the dynamic shared buffer; the cell's previous content is
 * restored before returning.
 *
 * `c` is assumed to be a power of two (the group is selected by masking).
 * NOTE(review): ordering relies on __threadfence_block() only -- confirm.
 */
__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
{
	extern __shared__ uint2 shared_mem[];

	const uint32_t self = blockDim.x * threadIdx.y + threadIdx.x;
	const uint32_t groupMask = c - 1;
	const uint32_t source = (self & ~groupMask) + (b & groupMask);

	// save whatever currently lives in our scratch cell
	__threadfence_block();
	const uint2 saved = shared_mem[self];

	// publish, then fetch the peer's value
	shared_mem[self] = a;
	__threadfence_block();
	const uint2 received = shared_mem[source];

	// put the original content back
	__threadfence_block();
	shared_mem[self] = saved;

	__threadfence_block();
	return received;
}
||||
|
||||
/**
 * Three consecutive shared-memory shuffles that share one save/restore of
 * the scratch cell: a1, a2 and a3 are replaced in place by the values held
 * by lanes b1, b2 and b3 of the caller's `c`-wide group.
 *
 * `c` is assumed to be a power of two (the group is selected by masking).
 * NOTE(review): ordering relies on __threadfence_block() only -- confirm.
 */
__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
{
	extern __shared__ uint2 shared_mem[];

	const uint32_t self = blockDim.x * threadIdx.y + threadIdx.x;
	const uint32_t groupMask = c - 1;
	const uint32_t groupBase = self & ~groupMask;

	// save whatever currently lives in our scratch cell
	__threadfence_block();
	const uint2 saved = shared_mem[self];

	// exchange #1
	shared_mem[self] = a1;
	__threadfence_block();
	a1 = shared_mem[groupBase + (b1 & groupMask)];

	// exchange #2
	__threadfence_block();
	shared_mem[self] = a2;
	__threadfence_block();
	a2 = shared_mem[groupBase + (b2 & groupMask)];

	// exchange #3
	__threadfence_block();
	shared_mem[self] = a3;
	__threadfence_block();
	a3 = shared_mem[groupBase + (b3 & groupMask)];

	// put the original content back
	__threadfence_block();
	shared_mem[self] = saved;
	__threadfence_block();
}
||||
|
||||
#endif // != 300 |
||||
|
||||
/**
 * One sponge round with the state spread over a group of 4 threads (each
 * thread holds one uint2 of each of the four state rows).
 *
 * Gfunc is applied once on the words as held, then s[1..3] are exchanged
 * between the threads by 1, 2 and 3 lanes, Gfunc is applied again, and the
 * exchange is undone with the offsets 3, 2 and 1.
 */
__device__ __forceinline__ void round_lyra(uint2 s[4])
{
	const uint32_t lane = threadIdx.x;

	Gfunc(s[0], s[1], s[2], s[3]);

	WarpShuffle3(s[1], s[2], s[3], lane + 1, lane + 2, lane + 3, 4);
	Gfunc(s[0], s[1], s[2], s[3]);

	WarpShuffle3(s[1], s[2], s[3], lane + 3, lane + 2, lane + 1, 4);
}
||||
|
||||
/**
 * One sponge round with the whole 4x4 state held by a single thread:
 * Gfunc on the four columns, then on the four diagonals.
 */
static __device__ __forceinline__
void round_lyra(uint2x4* s)
{
	uint2x4 &r0 = s[0];
	uint2x4 &r1 = s[1];
	uint2x4 &r2 = s[2];
	uint2x4 &r3 = s[3];

	// columns
	Gfunc(r0.x, r1.x, r2.x, r3.x);
	Gfunc(r0.y, r1.y, r2.y, r3.y);
	Gfunc(r0.z, r1.z, r2.z, r3.z);
	Gfunc(r0.w, r1.w, r2.w, r3.w);

	// diagonals
	Gfunc(r0.x, r1.y, r2.z, r3.w);
	Gfunc(r0.y, r1.z, r2.w, r3.x);
	Gfunc(r0.z, r1.w, r2.x, r3.y);
	Gfunc(r0.w, r1.x, r2.y, r3.z);
}
||||
|
||||
/* Loads the three words of one matrix column from DMatrix; `base` is the
 * offset of word 0, consecutive words are `wordStride` entries apart. */
static __device__ __forceinline__
void lyra2Z_v5_ldCol(uint2 dst[3], const uint32_t base, const uint32_t wordStride)
{
	#pragma unroll
	for (int w = 0; w < 3; w++)
		dst[w] = *(DMatrix + base + w * wordStride);
}

/* Stores the three words of one matrix column to DMatrix. */
static __device__ __forceinline__
void lyra2Z_v5_stCol(const uint32_t base, const uint32_t wordStride, uint2 src[3])
{
	#pragma unroll
	for (int w = 0; w < 3; w++)
		*(DMatrix + base + w * wordStride) = src[w];
}

/* Stores a[w] ^ b[w] as the three words of one matrix column in DMatrix. */
static __device__ __forceinline__
void lyra2Z_v5_stXorCol(const uint32_t base, const uint32_t wordStride, uint2 a[3], uint2 b[3])
{
	#pragma unroll
	for (int w = 0; w < 3; w++)
		*(DMatrix + base + w * wordStride) = a[w] ^ b[w];
}

/* XORs the state words of the previous thread of the 4-thread group into
 * `col`. Thread 0 receives the words of the last thread and applies them
 * shifted by one word position. */
static __device__ __forceinline__
void lyra2Z_v5_mixRotated(uint2 col[3], uint2 state[4])
{
	// Receive data from the previous thread
	// (and at the same time send our data to the next thread).
	uint2 fromPrev0 = state[0];
	uint2 fromPrev1 = state[1];
	uint2 fromPrev2 = state[2];
	WarpShuffle3(fromPrev0, fromPrev1, fromPrev2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);

	if (threadIdx.x == 0)
	{
		col[0] ^= fromPrev2;
		col[1] ^= fromPrev0;
		col[2] ^= fromPrev1;
	}
	else
	{
		col[0] ^= fromPrev0;
		col[1] ^= fromPrev1;
		col[2] ^= fromPrev2;
	}
}

/* One setup step with all three rows in DMatrix: row `rowOut` is written in
 * reverse column order from rowIn and the sponge state, and rowInOut is
 * updated in place with the rotated state. */
static __device__ __forceinline__
void lyra2Z_v5_setupRow(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads)
{
	const uint32_t wordStride = threads * blockDim.x;
	const uint32_t colStride = memshift * wordStride;
	const uint32_t inBase = (memshift * Ncol * rowIn * threads + thread)*blockDim.x + threadIdx.x;
	const uint32_t ioBase = (memshift * Ncol * rowInOut * threads + thread)*blockDim.x + threadIdx.x;
	const uint32_t outBase = (memshift * Ncol * rowOut * threads + thread)*blockDim.x + threadIdx.x;

	for (int i = 0; i < 8; i++)
	{
		uint2 inCol[3], ioCol[3];
		const uint32_t ioOff = ioBase + i * colStride;

		lyra2Z_v5_ldCol(inCol, inBase + i * colStride, wordStride);
		lyra2Z_v5_ldCol(ioCol, ioOff, wordStride);

		#pragma unroll
		for (int w = 0; w < 3; w++)
			state[w] ^= inCol[w] + ioCol[w];

		round_lyra(state);

		lyra2Z_v5_stXorCol(outBase + (7 - i) * colStride, wordStride, inCol, state);

		lyra2Z_v5_mixRotated(ioCol, state);
		lyra2Z_v5_stCol(ioOff, wordStride, ioCol);
	}
}

/**
 * Setup phase for the SM 5.0 / 3.5 path: fills matrix rows 0..7.
 *
 * Row 0 is first built in shared memory (slots addressed through LD4S /
 * ST4S); rows 1..7 are written to DMatrix. Row 0 stays in shared memory
 * until the "3, 0, 4" step, which stores its updated columns to DMatrix.
 *
 * @param state    per-thread sponge words (one uint2 of each state row),
 *                 updated in place
 * @param thread   global hash index
 * @param threads  total number of hashes
 */
static __device__ __forceinline__
void reduceDuplexV5(uint2 state[4], const uint32_t thread, const uint32_t threads)
{
	uint2 inCol[3], ioCol[3];

	const uint32_t wordStride = threads * blockDim.x;
	const uint32_t colStride = memshift * wordStride;

	const uint32_t base0 = (memshift * Ncol * 0 * threads + thread)*blockDim.x + threadIdx.x;
	const uint32_t base1 = (memshift * Ncol * 1 * threads + thread)*blockDim.x + threadIdx.x;
	const uint32_t base2 = (memshift * Ncol * 2 * threads + thread)*blockDim.x + threadIdx.x;
	const uint32_t base3 = (memshift * Ncol * 3 * threads + thread)*blockDim.x + threadIdx.x;
	const uint32_t base4 = (memshift * Ncol * 4 * threads + thread)*blockDim.x + threadIdx.x;

	// Row 0: squeeze 8 columns into shared memory, last column first.
	for (int col = Ncol - 1; col >= 0; col--)
	{
		const uint32_t slot = col * memshift;
		#pragma unroll
		for (int w = 0; w < 3; w++)
			ST4S(slot + w, state[w]);
		round_lyra(state);
	}

	// Row 1: absorb row 0 (shared), write row 1 in reverse column order.
	for (int i = 0; i < 8; i++)
	{
		const uint32_t slot = i * memshift;
		#pragma unroll
		for (int w = 0; w < 3; w++)
			inCol[w] = LD4S(slot + w);
		#pragma unroll
		for (int w = 0; w < 3; w++)
			state[w] ^= inCol[w];

		round_lyra(state);

		lyra2Z_v5_stXorCol(base1 + (7 - i) * colStride, wordStride, inCol, state);
	}

	// 1, 0, 2 -- row 0 is read from and written back to shared memory
	for (int i = 0; i < 8; i++)
	{
		const uint32_t slot = i * memshift;

		lyra2Z_v5_ldCol(inCol, base1 + i * colStride, wordStride);
		#pragma unroll
		for (int w = 0; w < 3; w++)
			ioCol[w] = LD4S(slot + w);
		#pragma unroll
		for (int w = 0; w < 3; w++)
			state[w] ^= inCol[w] + ioCol[w];

		round_lyra(state);

		lyra2Z_v5_stXorCol(base2 + (7 - i) * colStride, wordStride, inCol, state);

		lyra2Z_v5_mixRotated(ioCol, state);
		#pragma unroll
		for (int w = 0; w < 3; w++)
			ST4S(slot + w, ioCol[w]);
	}

	// 2, 1, 3
	lyra2Z_v5_setupRow(2, 1, 3, state, thread, threads);

	// 3, 0, 4 -- row 0 is read from shared memory and stored to DMatrix
	for (int i = 0; i < 8; i++)
	{
		const uint32_t slot = i * memshift;

		lyra2Z_v5_ldCol(inCol, base3 + i * colStride, wordStride);
		#pragma unroll
		for (int w = 0; w < 3; w++)
			ioCol[w] = LD4S(slot + w);
		#pragma unroll
		for (int w = 0; w < 3; w++)
			state[w] ^= inCol[w] + ioCol[w];

		round_lyra(state);

		lyra2Z_v5_stXorCol(base4 + (7 - i) * colStride, wordStride, inCol, state);

		lyra2Z_v5_mixRotated(ioCol, state);
		lyra2Z_v5_stCol(base0 + i * colStride, wordStride, ioCol);
	}

	// 4, 3, 5
	lyra2Z_v5_setupRow(4, 3, 5, state, thread, threads);

	// 5, 2, 6
	lyra2Z_v5_setupRow(5, 2, 6, state, thread, threads);

	// 6, 1, 7
	lyra2Z_v5_setupRow(6, 1, 7, state, thread, threads);
}
||||
|
||||
/**
 * One wandering-phase step on rows stored in DMatrix.
 *
 * For each of the 8 columns: absorbs rowIn + rowInOut into `state`, runs a
 * sponge round, stores rowInOut updated with the rotated state, and XORs
 * the state into rowOut.
 *
 * The two stores are interleaved word by word, rowInOut first; this order
 * is kept because rowOut may be the same row as rowInOut.
 *
 * @param rowIn     row that is only read
 * @param rowInOut  row that is read and rewritten
 * @param rowOut    row that receives `^= state`
 * @param state     per-thread sponge words, updated in place
 * @param thread    global hash index
 * @param threads   total number of hashes
 */
static __device__ __forceinline__
void reduceDuplexRowV50(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads)
{
	const uint32_t inBase = (memshift * Ncol * rowIn*threads + thread)*blockDim.x + threadIdx.x;
	const uint32_t ioBase = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x;
	const uint32_t outBase = (memshift * Ncol * rowOut*threads + thread)*blockDim.x + threadIdx.x;
	const uint32_t wordStride = threads * blockDim.x;

	#pragma unroll 1
	for (int col = 0; col < 8; col++)
	{
		uint2 inCol[3], ioCol[3];

		const uint32_t inOff = inBase + col*memshift*threads *blockDim.x;
		const uint32_t ioOff = ioBase + col*memshift*threads *blockDim.x;
		const uint32_t outOff = outBase + col*memshift*threads *blockDim.x;

		uint2 *pIn = DMatrix + inOff;
		uint2 *pIo = DMatrix + ioOff;
		uint2 *pOut = DMatrix + outOff;

		#pragma unroll
		for (int w = 0; w < 3; w++) {
			inCol[w] = *(pIn + w * wordStride);
			ioCol[w] = *(pIo + w * wordStride);
		}

		#pragma unroll
		for (int w = 0; w < 3; w++) {
			inCol[w] += ioCol[w];
			state[w] ^= inCol[w];
		}

		round_lyra(state);

		// Receive data from the previous thread of the 4-thread group
		// (and at the same time send our data to the next thread).
		uint2 fromPrev0 = state[0];
		uint2 fromPrev1 = state[1];
		uint2 fromPrev2 = state[2];
		WarpShuffle3(fromPrev0, fromPrev1, fromPrev2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);

		// Thread 0 received the words of the last thread: they are applied
		// shifted by one word position.
		const bool wrapLane = (threadIdx.x == 0);
		ioCol[0] ^= wrapLane ? fromPrev2 : fromPrev0;
		ioCol[1] ^= wrapLane ? fromPrev0 : fromPrev1;
		ioCol[2] ^= wrapLane ? fromPrev1 : fromPrev2;

		#pragma unroll
		for (int w = 0; w < 3; w++)
		{
			*(pIo + w * wordStride) = ioCol[w];
			*(pOut + w * wordStride) ^= state[w];
		}
	}
}
||||
|
||||
/**
 * Final wandering step with the input row fixed to 2 and the output-row
 * test fixed to row 5. Nothing is written to DMatrix.
 *
 * Column 0 of rowInOut is kept in registers, updated with the rotated state
 * (and with the state itself when rowInOut == 5), and XORed into `state`
 * after columns 1..7 have been absorbed.
 *
 * @param rowInOut  row combined with row 2
 * @param state     per-thread sponge words, updated in place
 * @param thread    global hash index
 * @param threads   total number of hashes
 */
static __device__ __forceinline__
void reduceDuplexRowV50_8(const int rowInOut, uint2 state[4], const uint32_t thread, const uint32_t threads)
{
	const uint32_t inBase = (memshift * Ncol * 2*threads + thread)*blockDim.x + threadIdx.x;
	const uint32_t ioBase = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x;
	const uint32_t wordStride = threads * blockDim.x;

	uint2 inCol[3], firstIo[3];

	#pragma unroll
	for (int w = 0; w < 3; w++) {
		inCol[w] = *(DMatrix + inBase + w * wordStride);
		firstIo[w] = *(DMatrix + ioBase + w * wordStride);
	}

	#pragma unroll
	for (int w = 0; w < 3; w++) {
		inCol[w] += firstIo[w];
		state[w] ^= inCol[w];
	}

	round_lyra(state);

	// Receive data from the previous thread of the 4-thread group
	// (and at the same time send our data to the next thread).
	uint2 fromPrev0 = state[0];
	uint2 fromPrev1 = state[1];
	uint2 fromPrev2 = state[2];
	WarpShuffle3(fromPrev0, fromPrev1, fromPrev2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);

	// Thread 0 received the words of the last thread: they are applied
	// shifted by one word position.
	const bool wrapLane = (threadIdx.x == 0);
	firstIo[0] ^= wrapLane ? fromPrev2 : fromPrev0;
	firstIo[1] ^= wrapLane ? fromPrev0 : fromPrev1;
	firstIo[2] ^= wrapLane ? fromPrev1 : fromPrev2;

	if (rowInOut == 5)
	{
		#pragma unroll
		for (int w = 0; w < 3; w++)
			firstIo[w] ^= state[w];
	}

	// Columns 1..7 are only absorbed.
	for (int col = 1; col < 8; col++)
	{
		const uint32_t inOff = inBase + col*memshift*threads *blockDim.x;
		const uint32_t ioOff = ioBase + col*memshift*threads *blockDim.x;

		#pragma unroll
		for (int w = 0; w < 3; w++)
			state[w] ^= *(DMatrix + inOff + w * wordStride) + *(DMatrix + ioOff + w * wordStride);

		round_lyra(state);
	}

	#pragma unroll
	for (int w = 0; w < 3; w++)
		state[w] ^= firstIo[w];
}
||||
|
||||
/**
 * Final wandering step with caller-supplied rows. Nothing is written to
 * DMatrix.
 *
 * Column 0 of rowInOut is kept in registers, updated with the rotated state
 * (and with the state itself when rowInOut == rowOut), and XORed into
 * `state` after columns 1..7 have been absorbed.
 *
 * @param rowIn     first input row
 * @param rowOut    row that would have been the output row; only compared
 *                  against rowInOut
 * @param rowInOut  second input row
 * @param state     per-thread sponge words, updated in place
 * @param thread    global hash index
 * @param threads   total number of hashes
 */
static __device__ __forceinline__
void reduceDuplexRowV50_8_v2(const int rowIn, const int rowOut,const int rowInOut, uint2 state[4], const uint32_t thread, const uint32_t threads)
{
	const uint32_t inBase = (memshift * Ncol * rowIn * threads + thread)*blockDim.x + threadIdx.x;
	const uint32_t ioBase = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x;
	const uint32_t wordStride = threads * blockDim.x;

	uint2 inCol[3], firstIo[3];

	#pragma unroll
	for (int w = 0; w < 3; w++) {
		inCol[w] = *(DMatrix + inBase + w * wordStride);
		firstIo[w] = *(DMatrix + ioBase + w * wordStride);
	}

	#pragma unroll
	for (int w = 0; w < 3; w++) {
		inCol[w] += firstIo[w];
		state[w] ^= inCol[w];
	}

	round_lyra(state);

	// Receive data from the previous thread of the 4-thread group
	// (and at the same time send our data to the next thread).
	uint2 fromPrev0 = state[0];
	uint2 fromPrev1 = state[1];
	uint2 fromPrev2 = state[2];
	WarpShuffle3(fromPrev0, fromPrev1, fromPrev2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);

	// Thread 0 received the words of the last thread: they are applied
	// shifted by one word position.
	const bool wrapLane = (threadIdx.x == 0);
	firstIo[0] ^= wrapLane ? fromPrev2 : fromPrev0;
	firstIo[1] ^= wrapLane ? fromPrev0 : fromPrev1;
	firstIo[2] ^= wrapLane ? fromPrev1 : fromPrev2;

	if (rowInOut == rowOut)
	{
		#pragma unroll
		for (int w = 0; w < 3; w++)
			firstIo[w] ^= state[w];
	}

	// Columns 1..7 are only absorbed.
	for (int col = 1; col < 8; col++)
	{
		const uint32_t inOff = inBase + col*memshift*threads *blockDim.x;
		const uint32_t ioOff = ioBase + col*memshift*threads *blockDim.x;

		#pragma unroll
		for (int w = 0; w < 3; w++)
			state[w] ^= *(DMatrix + inOff + w * wordStride) + *(DMatrix + ioOff + w * wordStride);

		round_lyra(state);
	}

	#pragma unroll
	for (int w = 0; w < 3; w++)
		state[w] ^= firstIo[w];
}
||||
|
||||
|
||||
/**
 * Stage 1 of the sm_50 lyra2Z pipeline: build the initial sponge state for one
 * hash per thread and park it in DMatrix.
 *
 * Uses a 1-D thread index (blockDim.x * blockIdx.x + threadIdx.x); threads with
 * an index >= `threads` do nothing. The input words are read from g_hash at
 * `thread + k * threads` for k = 0..3. `startNounce` is not used here.
 */
__global__ __launch_bounds__(64, 1)
void lyra2Z_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);

	const uint2x4 blake2b_IV[2] = {
		{ { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } },
		{ { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } }
	};
	// NOTE(review): the low words read 0x20, 0x20, 0x20, 8, 8, 8, 0x80 and the last high
	// word 0x01000000 - presumably the padded Lyra2 parameter block matching the
	// (32, 32, 32, 8, 8, 8) arguments of the CPU reference; confirm before changing.
	const uint2x4 Mask[2] = {
		{ { 0x00000020UL, 0x00000000UL }, { 0x00000020UL, 0x00000000UL }, { 0x00000020UL, 0x00000000UL }, { 0x00000008UL, 0x00000000UL } },
		{ { 0x00000008UL, 0x00000000UL }, { 0x00000008UL, 0x00000000UL }, { 0x00000080UL, 0x00000000UL }, { 0x00000000UL, 0x01000000UL } }
	};

	if (thread >= threads)
		return;

	uint2x4 state[4];

	// Fill state[0] with the four 64-bit input words of this hash.
	uint2 *inputWords = (uint2*)state;
	#pragma unroll
	for (int k = 0; k < 4; k++)
		inputWords[k] = __ldg(&g_hash[thread + threads * k]);

	// The same 32 bytes occupy the first two state rows; the rest is the Blake2b IV.
	state[1] = state[0];
	state[2] = blake2b_IV[0];
	state[3] = blake2b_IV[1];

	// First absorb: 12 rounds of the sponge permutation.
	for (int r = 0; r < 12; r++)
		round_lyra(state);

	state[0] ^= Mask[0];
	state[1] ^= Mask[1];

	// Second absorb: another 12 rounds.
	for (int r = 0; r < 12; r++)
		round_lyra(state);

	// Hand the state over to the next kernel, one uint2x4 per state row.
	#pragma unroll
	for (int k = 0; k < 4; k++)
		((uint2x4*)DMatrix)[k * threads + thread] = state[k];
}
||||
|
||||
/**
 * Stage 2 of the sm_50 lyra2Z pipeline: setup phase followed by the row
 * wandering phase.
 *
 * The hash index comes from the y dimension (blockDim.y * blockIdx.x +
 * threadIdx.y) and threadIdx.x selects the word handled by this thread inside
 * the hash. NOTE(review): the shuffles below use a width of 4, so this
 * presumably expects blockDim.x == 4 - confirm against the launch site.
 * `startNounce` and `g_hash` are not used here.
 */
__global__ __launch_bounds__(TPB50, 1)
void lyra2Z_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
{
	const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y);

	if (thread < threads)
	{
		uint2 state[4];

		// Pick up the state written by the previous kernel.
		#pragma unroll
		for (int k = 0; k < 4; k++)
			state[k] = __ldg(&DMatrix[(k * threads + thread)*blockDim.x + threadIdx.x]);

		reduceDuplexV5(state, thread, threads);

		uint32_t rowa;
		uint32_t prev = 7;
		uint32_t iterator = 0;

		// Eight passes over the rows. Even passes advance the row cursor by +3,
		// odd passes by -1 (written as +7, identical modulo 8). The last pass
		// makes only 7 calls because its final visit is done by the _8_v2
		// variant after the loop.
		for (uint32_t pass = 0; pass < 8; pass++) {
			const uint32_t step = (pass & 1) ? 7 : 3;
			const uint32_t visits = (pass == 7) ? 7 : 8;
			for (uint32_t i = 0; i < visits; i++) {
				// The row to mix with is taken from the low bits of thread 0's first state word.
				rowa = WarpShuffle(state[0].x, 0, 4) & 7;
				reduceDuplexRowV50(prev, rowa, iterator, state, thread, threads);
				prev = iterator;
				iterator = (iterator + step) & 7;
			}
		}

		rowa = WarpShuffle(state[0].x, 0, 4) & 7;
		reduceDuplexRowV50_8_v2(prev,iterator,rowa, state, thread, threads);

		// Publish the final state for the last kernel.
		#pragma unroll
		for (int k = 0; k < 4; k++)
			DMatrix[(k * threads + thread)*blockDim.x + threadIdx.x] = state[k];
	}
}
||||
|
||||
/**
 * Stage 3 of the sm_50 lyra2Z pipeline: final 12 permutation rounds and
 * target test.
 *
 * Uses a 1-D thread index; threads with an index >= `threads` do nothing.
 * A thread whose fourth 64-bit state word is <= the fourth 64-bit word of
 * pTarget records its nonce (startNounce + thread) in resNonces:
 *   resNonces[0] - smallest matching nonce,
 *   resNonces[1] - second smallest matching nonce.
 * Both slots are maintained with atomicMin, so they are assumed to be preset
 * to UINT32_MAX by the host before the launch (the host treats UINT32_MAX as
 * "no result") - verify against the launcher. `g_hash` is not used here.
 */
__global__ __launch_bounds__(64, 1)
void lyra2Z_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash, uint32_t *resNonces)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);

	if (thread < threads)
	{
		uint2x4 state[4];

		state[0] = __ldg4(&((uint2x4*)DMatrix)[0 * threads + thread]);
		state[1] = __ldg4(&((uint2x4*)DMatrix)[1 * threads + thread]);
		state[2] = __ldg4(&((uint2x4*)DMatrix)[2 * threads + thread]);
		state[3] = __ldg4(&((uint2x4*)DMatrix)[3 * threads + thread]);

		for (int i = 0; i < 12; i++)
			round_lyra(state);

		uint32_t nonce = startNounce + thread;
		if (((uint64_t*)state)[3] <= ((uint64_t*)pTarget)[3]) {
			// Use the value returned by the atomic instead of a plain read of
			// resNonces[0]: whichever of (previous best, this nonce) loses the
			// race for slot 0 is the candidate for slot 1. With a plain read, a
			// later and larger nonce would copy slot 0 into slot 1 and be lost,
			// making the host see the same nonce twice.
			const uint32_t previousBest = atomicMin(&resNonces[0], nonce);
			const uint32_t runnerUp = (previousBest > nonce) ? previousBest : nonce;
			// If slot 0 was still UINT32_MAX, runnerUp is UINT32_MAX and this is a no-op.
			atomicMin(&resNonces[1], runnerUp);
		}
	}
}
||||
|
||||
#else |
||||
/* if __CUDA_ARCH__ != 500 .. host */ |
||||
// Empty bodies for the three sm_50 kernels, used in the #else branch
// (__CUDA_ARCH__ != 500 and the host pass) so that the kernel names are still
// declared with the same signatures.
__global__ void lyra2Z_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
{
}

__global__ void lyra2Z_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
{
}

__global__ void lyra2Z_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash, uint32_t *resNonces)
{
}
||||
#endif |
@ -0,0 +1,164 @@
@@ -0,0 +1,164 @@
|
||||
extern "C" { |
||||
#include <sph/sph_blake.h> |
||||
#include "Lyra2Z.h" |
||||
} |
||||
|
||||
#include <miner.h> |
||||
#include <cuda_helper.h> |
||||
|
||||
// Device buffer receiving the blake256 output, one per mining thread
// (allocated as 32 bytes per nonce in scanhash_lyra2Z).
static uint64_t *d_hash[MAX_GPUS];

// Device scratch matrix for the lyra2 kernels, one per mining thread
// (only allocated when the device's SM version is >= 350).
static uint64_t *d_matrix[MAX_GPUS];
||||
|
||||
extern void blake256_cpu_init(int thr_id, uint32_t threads); |
||||
extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); |
||||
extern void blake256_cpu_setBlock_80(uint32_t *pdata); |
||||
|
||||
extern void lyra2Z_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); |
||||
extern void lyra2Z_cpu_init_sm2(int thr_id, uint32_t threads); |
||||
extern uint32_t lyra2Z_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti); |
||||
|
||||
extern void lyra2Z_setTarget(const void *ptarget); |
||||
extern uint32_t lyra2Z_getSecNonce(int thr_id, int num); |
||||
|
||||
/**
 * CPU reference implementation of the lyra2Z hash.
 *
 * Runs 14-round BLAKE-256 over the first 80 bytes of `input`, feeds the
 * 32-byte digest to LYRA2Z as both its second and third buffer argument, and
 * copies the 32-byte result to `state`.
 *
 * @param state  output buffer, receives 32 bytes
 * @param input  block header, 80 bytes are read
 */
extern "C" void lyra2Z_hash(void *state, const void *input)
{
	uint32_t _ALIGN(64) blakeDigest[8], lyraDigest[8];
	sph_blake256_context blake;

	sph_blake256_set_rounds(14);
	sph_blake256_init(&blake);
	sph_blake256(&blake, input, 80);
	sph_blake256_close(&blake, blakeDigest);

	// Trailing 8, 8, 8 are presumably timeCost, nRows and nCols - confirm in Lyra2Z.h.
	LYRA2Z(lyraDigest, 32, blakeDigest, 32, blakeDigest, 32, 8, 8, 8);

	memcpy(state, lyraDigest, 32);
}
||||
|
||||
// Set once scanhash_lyra2Z has allocated the device buffers for a mining
// thread; cleared again by free_lyra2Z.
static bool init[MAX_GPUS] = { false };

// Number of nonces processed per scan iteration (thread-local, chosen during initialisation).
static __thread uint32_t throughput = 0;

// True when the device name contains "750 Ti" (thread-local, forwarded to lyra2Z_cpu_hash_32).
static __thread bool gtx750ti = false;
||||
|
||||
/**
 * Scan a nonce range for lyra2Z solutions on the GPU mapped to `thr_id`.
 *
 * On the first call for a mining thread this selects the device, chooses the
 * throughput, and allocates the device buffers. Each iteration then hashes
 * `throughput` nonces starting at pdata[19]; every candidate reported by the
 * GPU is re-hashed on the CPU before it is accepted.
 *
 * @param thr_id       mining thread index (indexes device_map, init, d_hash, d_matrix)
 * @param work         work item; data[19] is the nonce cursor and is advanced,
 *                     nonces[] / valid_nonces receive the results
 * @param max_nonce    upper bound of the nonce range
 * @param hashes_done  receives the number of nonces covered by this call
 * @return work->valid_nonces (1 or 2) when a CPU-validated nonce was found,
 *         0 when the range was exhausted or a work restart was requested
 */
extern "C" int scanhash_lyra2Z(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
	uint32_t *pdata = work->data;
	uint32_t *ptarget = work->target;
	uint32_t _ALIGN(64) endiandata[20];
	const uint32_t first_nonce = pdata[19];
	int dev_id = device_map[thr_id];

	if (opt_benchmark)
		ptarget[7] = 0x00ff;

	if (!init[thr_id])
	{
		cudaSetDevice(dev_id);
		if (opt_cudaschedule == -1 && gpu_threads == 1) {
			cudaDeviceReset();
			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
			CUDA_LOG_ERROR();
		}

		// Default intensity: 15 for SM <= 500, otherwise 17 (16 on Windows).
		int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 17 : 16;
		if (device_sm[dev_id] <= 500) intensity = 15;
		throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4;

		cudaDeviceProp props;
		cudaGetDeviceProperties(&props, dev_id);
		gtx750ti = (strstr(props.name, "750 Ti") != NULL);

		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);

		blake256_cpu_init(thr_id, throughput);

		if (device_sm[dev_id] >= 350)
		{
			// Per-nonce scratch size depends on which kernel family will be used.
			size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4;
			CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput));
			lyra2Z_cpu_init(thr_id, throughput, d_matrix[thr_id]);
		}
		else
			lyra2Z_cpu_init_sm2(thr_id, throughput);

		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));

		init[thr_id] = true;
	}

	// Big-endian copy of the header, used for the CPU re-check of candidates.
	for (int k=0; k < 20; k++)
		be32enc(&endiandata[k], pdata[k]);

	blake256_cpu_setBlock_80(pdata);
	lyra2Z_setTarget(ptarget);

	do {
		int order = 0;

		blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);

		*hashes_done = pdata[19] - first_nonce + throughput;

		work->nonces[0] = lyra2Z_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti);

		if (work->nonces[0] != UINT32_MAX)
		{
			uint32_t _ALIGN(64) vhash[8];

			be32enc(&endiandata[19], work->nonces[0]);
			lyra2Z_hash(vhash, endiandata);

			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
				work->valid_nonces = 1;
				work->nonces[1] = lyra2Z_getSecNonce(thr_id, 1);
				work_set_target_ratio(work, vhash);
				pdata[19] = work->nonces[0] + 1;
				if (work->nonces[1] != UINT32_MAX)
				{
					be32enc(&endiandata[19], work->nonces[1]);
					lyra2Z_hash(vhash, endiandata);
					if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
						bn_set_target_ratio(work, vhash, 1);
						work->valid_nonces++;
					}
					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; // cursor
				}
				return work->valid_nonces;
			}
			else if (vhash[7] > ptarget[7]) {
				gpu_increment_reject(thr_id);
				if (!opt_quiet) gpulog(LOG_WARNING, thr_id,
					"result for %08x does not validate on CPU!", work->nonces[0]);
				// Resume just past the rejected nonce. Restarting at the nonce itself
				// would make the GPU report the same candidate again on every pass.
				// nonces[0] != UINT32_MAX here, so the increment cannot wrap.
				pdata[19] = work->nonces[0] + 1;
				continue;
			}
		}

		// 64-bit sum so the range check itself cannot wrap.
		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
			pdata[19] = max_nonce;
			break;
		}
		pdata[19] += throughput;

	} while (!work_restart[thr_id].restart);

	*hashes_done = pdata[19] - first_nonce;
	return 0;
}
||||
|
||||
// cleanup |
||||
/**
 * Release the device buffers owned by mining thread `thr_id`.
 *
 * Does nothing when the thread was never initialised. d_matrix is only freed
 * for devices with SM >= 350, mirroring the condition under which
 * scanhash_lyra2Z allocates it.
 */
extern "C" void free_lyra2Z(int thr_id)
{
	int dev_id = device_map[thr_id];
	if (!init[thr_id])
		return;

	// Wait for outstanding work before releasing buffers it may still use.
	// (cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize.)
	cudaDeviceSynchronize();

	cudaFree(d_hash[thr_id]);
	d_hash[thr_id] = NULL;
	if (device_sm[dev_id] >= 350) {
		cudaFree(d_matrix[thr_id]);
		d_matrix[thr_id] = NULL;
	}
	init[thr_id] = false;

	cudaDeviceSynchronize();
}
Loading…
Reference in new issue