Browse Source

Handle lyra2v3 algo, for VTC fork

mostly imported from opensourced vertcoin-miner with a few fixes
pull/5/head
Tanguy Pruvot 6 years ago
parent
commit
9a1f20d455
  1. 1
      Makefile.am
  2. 7
      README.txt
  3. 4
      algos.h
  4. 1
      bench.cpp
  5. 8
      ccminer.cpp
  6. 3
      ccminer.vcxproj
  7. 9
      ccminer.vcxproj.filters
  8. 2
      compat/ccminer-config.h
  9. 173
      lyra2/Lyra2.c
  10. 1
      lyra2/Lyra2.h
  11. 481
      lyra2/cuda_lyra2v3.cu
  12. 348
      lyra2/cuda_lyra2v3_sm3.cuh
  13. 182
      lyra2/lyra2REv3.cu
  14. 3
      miner.h
  15. 3
      util.cpp

1
Makefile.am

@ -38,6 +38,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
lyra2/Lyra2.c lyra2/Sponge.c \ lyra2/Lyra2.c lyra2/Sponge.c \
lyra2/lyra2RE.cu lyra2/cuda_lyra2.cu \ lyra2/lyra2RE.cu lyra2/cuda_lyra2.cu \
lyra2/lyra2REv2.cu lyra2/cuda_lyra2v2.cu \ lyra2/lyra2REv2.cu lyra2/cuda_lyra2v2.cu \
lyra2/lyra2REv3.cu lyra2/cuda_lyra2v3.cu \
lyra2/Lyra2Z.c lyra2/lyra2Z.cu lyra2/cuda_lyra2Z.cu \ lyra2/Lyra2Z.c lyra2/lyra2Z.cu lyra2/cuda_lyra2Z.cu \
lyra2/allium.cu \ lyra2/allium.cu \
Algo256/cuda_bmw256.cu Algo256/cuda_cubehash256.cu \ Algo256/cuda_bmw256.cu Algo256/cuda_cubehash256.cu \

7
README.txt

@ -1,5 +1,5 @@
ccminer 2.3 "phi2 and cryptonight variants" ccminer 2.3.1 "lyra2v3, exosis and sha256q"
--------------------------------------------------------------- ---------------------------------------------------------------
*************************************************************** ***************************************************************
@ -100,7 +100,8 @@ its command line interface and options.
lbry use to mine LBRY Credits lbry use to mine LBRY Credits
luffa use to mine Joincoin luffa use to mine Joincoin
lyra2 use to mine CryptoCoin lyra2 use to mine CryptoCoin
lyra2v2 use to mine Vertcoin lyra2v2 use to mine Monacoin
lyra2v3 use to mine Vertcoin
lyra2z use to mine Zerocoin (XZC) lyra2z use to mine Zerocoin (XZC)
monero use to mine Monero (XMR) monero use to mine Monero (XMR)
myr-gr use to mine Myriad-Groest myr-gr use to mine Myriad-Groest
@ -117,7 +118,7 @@ its command line interface and options.
scrypt-jane use to mine Chacha coins like Cache and Ultracoin scrypt-jane use to mine Chacha coins like Cache and Ultracoin
s3 use to mine 1coin (ONE) s3 use to mine 1coin (ONE)
sha256t use to mine OneCoin (OC) sha256t use to mine OneCoin (OC)
sha256q use to mine Pyrite sha256q use to mine Pyrite
sia use to mine SIA sia use to mine SIA
sib use to mine Sibcoin sib use to mine Sibcoin
skein use to mine Skeincoin skein use to mine Skeincoin

4
algos.h

@ -34,6 +34,7 @@ enum sha_algos {
ALGO_LUFFA, ALGO_LUFFA,
ALGO_LYRA2, ALGO_LYRA2,
ALGO_LYRA2v2, ALGO_LYRA2v2,
ALGO_LYRA2v3,
ALGO_LYRA2Z, ALGO_LYRA2Z,
ALGO_MJOLLNIR, /* Hefty hash */ ALGO_MJOLLNIR, /* Hefty hash */
ALGO_MYR_GR, ALGO_MYR_GR,
@ -115,6 +116,7 @@ static const char *algo_names[] = {
"luffa", "luffa",
"lyra2", "lyra2",
"lyra2v2", "lyra2v2",
"lyra2v3",
"lyra2z", "lyra2z",
"mjollnir", "mjollnir",
"myr-gr", "myr-gr",
@ -199,6 +201,8 @@ static inline int algo_to_int(char* arg)
i = ALGO_LYRA2; i = ALGO_LYRA2;
else if (!strcasecmp("lyra2rev2", arg)) else if (!strcasecmp("lyra2rev2", arg))
i = ALGO_LYRA2v2; i = ALGO_LYRA2v2;
else if (!strcasecmp("lyra2rev3", arg))
i = ALGO_LYRA2v3;
else if (!strcasecmp("phi1612", arg)) else if (!strcasecmp("phi1612", arg))
i = ALGO_PHI; i = ALGO_PHI;
else if (!strcasecmp("bitcoin", arg)) else if (!strcasecmp("bitcoin", arg))

1
bench.cpp

@ -78,6 +78,7 @@ void algo_free_all(int thr_id)
free_luffa(thr_id); free_luffa(thr_id);
free_lyra2(thr_id); free_lyra2(thr_id);
free_lyra2v2(thr_id); free_lyra2v2(thr_id);
free_lyra2v3(thr_id);
free_lyra2Z(thr_id); free_lyra2Z(thr_id);
free_myriad(thr_id); free_myriad(thr_id);
free_neoscrypt(thr_id); free_neoscrypt(thr_id);

8
ccminer.cpp

@ -269,7 +269,8 @@ Options:\n\
lbry LBRY Credits (Sha/Ripemd)\n\ lbry LBRY Credits (Sha/Ripemd)\n\
luffa Joincoin\n\ luffa Joincoin\n\
lyra2 CryptoCoin\n\ lyra2 CryptoCoin\n\
lyra2v2 VertCoin\n\ lyra2v2 MonaCoin\n\
lyra2v3 Vertcoin\n\
lyra2z ZeroCoin (3rd impl)\n\ lyra2z ZeroCoin (3rd impl)\n\
myr-gr Myriad-Groestl\n\ myr-gr Myriad-Groestl\n\
monero XMR cryptonight (v7)\n\ monero XMR cryptonight (v7)\n\
@ -1742,6 +1743,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
case ALGO_KECCAKC: case ALGO_KECCAKC:
case ALGO_LBRY: case ALGO_LBRY:
case ALGO_LYRA2v2: case ALGO_LYRA2v2:
case ALGO_LYRA2v3:
case ALGO_LYRA2Z: case ALGO_LYRA2Z:
case ALGO_PHI2: case ALGO_PHI2:
case ALGO_TIMETRAVEL: case ALGO_TIMETRAVEL:
@ -2283,6 +2285,7 @@ static void *miner_thread(void *userdata)
case ALGO_JHA: case ALGO_JHA:
case ALGO_HSR: case ALGO_HSR:
case ALGO_LYRA2v2: case ALGO_LYRA2v2:
case ALGO_LYRA2v3:
case ALGO_PHI: case ALGO_PHI:
case ALGO_PHI2: case ALGO_PHI2:
case ALGO_POLYTIMOS: case ALGO_POLYTIMOS:
@ -2474,6 +2477,9 @@ static void *miner_thread(void *userdata)
case ALGO_LYRA2v2: case ALGO_LYRA2v2:
rc = scanhash_lyra2v2(thr_id, &work, max_nonce, &hashes_done); rc = scanhash_lyra2v2(thr_id, &work, max_nonce, &hashes_done);
break; break;
case ALGO_LYRA2v3:
rc = scanhash_lyra2v3(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_LYRA2Z: case ALGO_LYRA2Z:
rc = scanhash_lyra2Z(thr_id, &work, max_nonce, &hashes_done); rc = scanhash_lyra2Z(thr_id, &work, max_nonce, &hashes_done);
break; break;

3
ccminer.vcxproj

@ -530,6 +530,9 @@
<ClInclude Include="lyra2\cuda_lyra2_sm2.cuh" /> <ClInclude Include="lyra2\cuda_lyra2_sm2.cuh" />
<ClInclude Include="lyra2\cuda_lyra2_sm5.cuh" /> <ClInclude Include="lyra2\cuda_lyra2_sm5.cuh" />
<ClInclude Include="lyra2\cuda_lyra2v2_sm3.cuh" /> <ClInclude Include="lyra2\cuda_lyra2v2_sm3.cuh" />
<CudaCompile Include="lyra2\lyra2REv3.cu" />
<CudaCompile Include="lyra2\cuda_lyra2v3.cu" />
<ClInclude Include="lyra2\cuda_lyra2v3_sm3.cuh" />
<CudaCompile Include="lyra2\lyra2Z.cu" /> <CudaCompile Include="lyra2\lyra2Z.cu" />
<CudaCompile Include="lyra2\cuda_lyra2Z.cu" /> <CudaCompile Include="lyra2\cuda_lyra2Z.cu" />
<ClInclude Include="lyra2\cuda_lyra2Z_sm5.cuh" /> <ClInclude Include="lyra2\cuda_lyra2Z_sm5.cuh" />

9
ccminer.vcxproj.filters

@ -946,6 +946,15 @@
<CudaCompile Include="lyra2\lyra2REv2.cu"> <CudaCompile Include="lyra2\lyra2REv2.cu">
<Filter>Source Files\CUDA\lyra2</Filter> <Filter>Source Files\CUDA\lyra2</Filter>
</CudaCompile> </CudaCompile>
<CudaCompile Include="lyra2\cuda_lyra2v3.cu">
<Filter>Source Files\CUDA\lyra2</Filter>
</CudaCompile>
<ClInclude Include="lyra2\cuda_lyra2v3_sm3.cuh">
<Filter>Source Files\CUDA\lyra2</Filter>
</ClInclude>
<CudaCompile Include="lyra2\lyra2REv3.cu">
<Filter>Source Files\CUDA\lyra2</Filter>
</CudaCompile>
<CudaCompile Include="lyra2\cuda_lyra2Z.cu"> <CudaCompile Include="lyra2\cuda_lyra2Z.cu">
<Filter>Source Files\CUDA\lyra2</Filter> <Filter>Source Files\CUDA\lyra2</Filter>
</CudaCompile> </CudaCompile>

2
compat/ccminer-config.h

@ -164,7 +164,7 @@
#define PACKAGE_URL "http://github.com/tpruvot/ccminer" #define PACKAGE_URL "http://github.com/tpruvot/ccminer"
/* Define to the version of this package. */ /* Define to the version of this package. */
#define PACKAGE_VERSION "2.3" #define PACKAGE_VERSION "2.3.1"
/* If using the C implementation of alloca, define if you know the /* If using the C implementation of alloca, define if you know the
direction of stack growth for your system; otherwise it will be direction of stack growth for your system; otherwise it will be

173
lyra2/Lyra2.c

@ -212,3 +212,176 @@ int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *sa
return 0; return 0;
} }
int LYRA2_3(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols)
{
//============================= Basic variables ============================//
int64_t row = 2; //index of row to be processed
int64_t prev = 1; //index of prev (last row ever computed/modified)
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
int64_t tau; //Time Loop iterator
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
int64_t i; //auxiliary iteration counter
int64_t v64; // 64bit var for memcpy
uint64_t instance = 0;
//==========================================================================/
//========== Initializing the Memory Matrix and pointers to it =============//
//Tries to allocate enough space for the whole memory matrix
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
// for Lyra2REv2, nCols = 4, v1 was using 8
const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64 : BLOCK_LEN_BLAKE2_SAFE_BYTES;
size_t sz = (size_t)ROW_LEN_BYTES * nRows;
uint64_t *wholeMatrix = malloc(sz);
if (wholeMatrix == NULL) {
return -1;
}
memset(wholeMatrix, 0, sz);
//Allocates pointers to each row of the matrix
uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows);
if (memMatrix == NULL) {
return -1;
}
//Places the pointers in the correct positions
uint64_t *ptrWord = wholeMatrix;
for (i = 0; i < nRows; i++) {
memMatrix[i] = ptrWord;
ptrWord += ROW_LEN_INT64;
}
//==========================================================================/
//============= Getting the password + salt + basil padded with 10*1 ===============//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
//but this ensures that the password copied locally will be overwritten as soon as possible
//First, we clean enough blocks for the password, salt, basil and padding
int64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof(uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
byte *ptrByte = (byte*) wholeMatrix;
//Prepends the password
memcpy(ptrByte, pwd, pwdlen);
ptrByte += pwdlen;
//Concatenates the salt
memcpy(ptrByte, salt, saltlen);
ptrByte += saltlen;
memset(ptrByte, 0, (size_t) (nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - (saltlen + pwdlen)));
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
memcpy(ptrByte, &kLen, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = pwdlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = saltlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = timeCost;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nRows;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nCols;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
//Now comes the padding
*ptrByte = 0x80; //first byte of padding: right after the password
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
//==========================================================================/
//======================= Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
uint64_t state[16];
initState(state);
//==========================================================================/
//================================ Setup Phase =============================//
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
ptrWord = wholeMatrix;
for (i = 0; i < nBlocksInput; i++) {
absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
}
//Initializes M[0] and M[1]
reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here
reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols);
do {
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
//updates the value of row* (deterministically picked during Setup))
rowa = (rowa + step) & (window - 1);
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
row++;
//Checks if all rows in the window where visited.
if (rowa == 0) {
step = window + gap; //changes the step: approximately doubles its value
window *= 2; //doubles the size of the re-visitation window
gap = -gap; //inverts the modifier to the step
}
} while (row < nRows);
//==========================================================================/
//============================ Wandering Phase =============================//
row = 0; //Resets the visitation to the first row of the memory matrix
for (tau = 1; tau <= timeCost; tau++) {
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
do {
//Selects a pseudorandom index row* (the only change in REv3)
//------------------------------------------------------------------------------------------
instance = state[instance & 0xF];
rowa = state[instance & 0xF] & (unsigned int)(nRows-1);
//rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
//rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//------------------------------------------------------------------------------------------
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
//------------------------------------------------------------------------------------------
row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
//row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//------------------------------------------------------------------------------------------
} while (row != 0);
}
//============================ Wrap-up Phase ===============================//
//Absorbs the last block of the memory matrix
absorbBlock(state, memMatrix[rowa]);
//Squeezes the key
squeeze(state, K, (unsigned int) kLen);
//========================= Freeing the memory =============================//
free(memMatrix);
free(wholeMatrix);
return 0;
}

1
lyra2/Lyra2.h

@ -38,5 +38,6 @@ typedef unsigned char byte;
#endif #endif
int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols); int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
int LYRA2_3(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
#endif /* LYRA2_H_ */ #endif /* LYRA2_H_ */

481
lyra2/cuda_lyra2v3.cu

@ -0,0 +1,481 @@
/**
* Lyra2 (v3) CUDA Implementation
*
* Based on VTC sources
*/
#include <stdio.h>
#include <stdint.h>
#include <memory.h>
#include "cuda_helper.h"
#include "cuda_lyra2v3_sm3.cuh"
#ifdef __INTELLISENSE__
/* just for vstudio code colors */
#define __CUDA_ARCH__ 500
#endif
#define TPB 32
#if __CUDA_ARCH__ >= 500
#include "cuda_lyra2_vectors.h"
#define Nrow 4
#define Ncol 4
#define memshift 3
__device__ uint2x4 *DMatrix;
__device__ __forceinline__ uint2 LD4S(const int index)
{
extern __shared__ uint2 shared_mem[];
return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x];
}
__device__ __forceinline__ void ST4S(const int index, const uint2 data)
{
extern __shared__ uint2 shared_mem[];
shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data;
}
__device__ __forceinline__ uint2 shuffle2(uint2 a, uint32_t b, uint32_t c)
{
return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c));
}
__device__ __forceinline__
void Gfunc_v5(uint2 &a, uint2 &b, uint2 &c, uint2 &d)
{
a += b; d ^= a; d = SWAPUINT2(d);
c += d; b ^= c; b = ROR2(b, 24);
a += b; d ^= a; d = ROR2(d, 16);
c += d; b ^= c; b = ROR2(b, 63);
}
__device__ __forceinline__
void round_lyra_v5(uint2x4 s[4])
{
Gfunc_v5(s[0].x, s[1].x, s[2].x, s[3].x);
Gfunc_v5(s[0].y, s[1].y, s[2].y, s[3].y);
Gfunc_v5(s[0].z, s[1].z, s[2].z, s[3].z);
Gfunc_v5(s[0].w, s[1].w, s[2].w, s[3].w);
Gfunc_v5(s[0].x, s[1].y, s[2].z, s[3].w);
Gfunc_v5(s[0].y, s[1].z, s[2].w, s[3].x);
Gfunc_v5(s[0].z, s[1].w, s[2].x, s[3].y);
Gfunc_v5(s[0].w, s[1].x, s[2].y, s[3].z);
}
__device__ __forceinline__
void round_lyra_v5(uint2 s[4])
{
Gfunc_v5(s[0], s[1], s[2], s[3]);
s[1] = shuffle2(s[1], threadIdx.x + 1, 4);
s[2] = shuffle2(s[2], threadIdx.x + 2, 4);
s[3] = shuffle2(s[3], threadIdx.x + 3, 4);
Gfunc_v5(s[0], s[1], s[2], s[3]);
s[1] = shuffle2(s[1], threadIdx.x + 3, 4);
s[2] = shuffle2(s[2], threadIdx.x + 2, 4);
s[3] = shuffle2(s[3], threadIdx.x + 1, 4);
}
__device__ __forceinline__
void reduceDuplexRowSetup2(uint2 state[4])
{
uint2 state1[Ncol][3], state0[Ncol][3], state2[3];
int i, j;
#pragma unroll
for (int i = 0; i < Ncol; i++)
{
#pragma unroll
for (j = 0; j < 3; j++)
state0[Ncol - i - 1][j] = state[j];
round_lyra_v5(state);
}
//#pragma unroll 4
for (i = 0; i < Ncol; i++)
{
#pragma unroll
for (j = 0; j < 3; j++)
state[j] ^= state0[i][j];
round_lyra_v5(state);
#pragma unroll
for (j = 0; j < 3; j++)
state1[Ncol - i - 1][j] = state0[i][j];
#pragma unroll
for (j = 0; j < 3; j++)
state1[Ncol - i - 1][j] ^= state[j];
}
for (i = 0; i < Ncol; i++)
{
const uint32_t s0 = memshift * Ncol * 0 + i * memshift;
const uint32_t s2 = memshift * Ncol * 2 + memshift * (Ncol - 1) - i*memshift;
#pragma unroll
for (j = 0; j < 3; j++)
state[j] ^= state1[i][j] + state0[i][j];
round_lyra_v5(state);
#pragma unroll
for (j = 0; j < 3; j++)
state2[j] = state1[i][j];
#pragma unroll
for (j = 0; j < 3; j++)
state2[j] ^= state[j];
#pragma unroll
for (j = 0; j < 3; j++)
ST4S(s2 + j, state2[j]);
uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4);
uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4);
uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4);
if (threadIdx.x == 0) {
state0[i][0] ^= Data2;
state0[i][1] ^= Data0;
state0[i][2] ^= Data1;
} else {
state0[i][0] ^= Data0;
state0[i][1] ^= Data1;
state0[i][2] ^= Data2;
}
#pragma unroll
for (j = 0; j < 3; j++)
ST4S(s0 + j, state0[i][j]);
#pragma unroll
for (j = 0; j < 3; j++)
state0[i][j] = state2[j];
}
for (i = 0; i < Ncol; i++)
{
const uint32_t s1 = memshift * Ncol * 1 + i*memshift;
const uint32_t s3 = memshift * Ncol * 3 + memshift * (Ncol - 1) - i*memshift;
#pragma unroll
for (j = 0; j < 3; j++)
state[j] ^= state1[i][j] + state0[Ncol - i - 1][j];
round_lyra_v5(state);
#pragma unroll
for (j = 0; j < 3; j++)
state0[Ncol - i - 1][j] ^= state[j];
#pragma unroll
for (j = 0; j < 3; j++)
ST4S(s3 + j, state0[Ncol - i - 1][j]);
uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4);
uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4);
uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4);
if (threadIdx.x == 0) {
state1[i][0] ^= Data2;
state1[i][1] ^= Data0;
state1[i][2] ^= Data1;
} else {
state1[i][0] ^= Data0;
state1[i][1] ^= Data1;
state1[i][2] ^= Data2;
}
#pragma unroll
for (j = 0; j < 3; j++)
ST4S(s1 + j, state1[i][j]);
}
}
__device__
void reduceDuplexRowt2(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4])
{
uint2 state1[3], state2[3];
const uint32_t ps1 = memshift * Ncol * rowIn;
const uint32_t ps2 = memshift * Ncol * rowInOut;
const uint32_t ps3 = memshift * Ncol * rowOut;
for (int i = 0; i < Ncol; i++)
{
const uint32_t s1 = ps1 + i*memshift;
const uint32_t s2 = ps2 + i*memshift;
const uint32_t s3 = ps3 + i*memshift;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = LD4S(s1 + j);
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] = LD4S(s2 + j);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra_v5(state);
uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4);
uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4);
uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4);
if (threadIdx.x == 0) {
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
} else {
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
ST4S(s2 + j, state2[j]);
#pragma unroll
for (int j = 0; j < 3; j++)
ST4S(s3 + j, LD4S(s3 + j) ^ state[j]);
}
}
__device__
void reduceDuplexRowt2x4(const int rowInOut, uint2 state[4])
{
const int rowIn = 2;
const int rowOut = 3;
int i, j;
uint2 last[3];
const uint32_t ps1 = memshift * Ncol * rowIn;
const uint32_t ps2 = memshift * Ncol * rowInOut;
#pragma unroll
for (int j = 0; j < 3; j++)
last[j] = LD4S(ps2 + j);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= LD4S(ps1 + j) + last[j];
round_lyra_v5(state);
uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4);
uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4);
uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4);
if (threadIdx.x == 0) {
last[0] ^= Data2;
last[1] ^= Data0;
last[2] ^= Data1;
} else {
last[0] ^= Data0;
last[1] ^= Data1;
last[2] ^= Data2;
}
if (rowInOut == rowOut)
{
#pragma unroll
for (j = 0; j < 3; j++)
last[j] ^= state[j];
}
for (i = 1; i < Ncol; i++)
{
const uint32_t s1 = ps1 + i*memshift;
const uint32_t s2 = ps2 + i*memshift;
#pragma unroll
for (j = 0; j < 3; j++)
state[j] ^= LD4S(s1 + j) + LD4S(s2 + j);
round_lyra_v5(state);
}
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= last[j];
}
__global__
__launch_bounds__(TPB, 1)
void lyra2v3_gpu_hash_32_1(uint32_t threads, uint2 *inputHash)
{
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
const uint2x4 blake2b_IV[2] = {
0xf3bcc908UL, 0x6a09e667UL, 0x84caa73bUL, 0xbb67ae85UL,
0xfe94f82bUL, 0x3c6ef372UL, 0x5f1d36f1UL, 0xa54ff53aUL,
0xade682d1UL, 0x510e527fUL, 0x2b3e6c1fUL, 0x9b05688cUL,
0xfb41bd6bUL, 0x1f83d9abUL, 0x137e2179UL, 0x5be0cd19UL
};
const uint2x4 Mask[2] = {
0x00000020UL, 0x00000000UL, 0x00000020UL, 0x00000000UL,
0x00000020UL, 0x00000000UL, 0x00000001UL, 0x00000000UL,
0x00000004UL, 0x00000000UL, 0x00000004UL, 0x00000000UL,
0x00000080UL, 0x00000000UL, 0x00000000UL, 0x01000000UL
};
uint2x4 state[4];
if (thread < threads)
{
state[0].x = state[1].x = __ldg(&inputHash[thread + threads * 0]);
state[0].y = state[1].y = __ldg(&inputHash[thread + threads * 1]);
state[0].z = state[1].z = __ldg(&inputHash[thread + threads * 2]);
state[0].w = state[1].w = __ldg(&inputHash[thread + threads * 3]);
state[2] = blake2b_IV[0];
state[3] = blake2b_IV[1];
for (int i = 0; i<12; i++)
round_lyra_v5(state);
state[0] ^= Mask[0];
state[1] ^= Mask[1];
for (int i = 0; i<12; i++)
round_lyra_v5(state);
DMatrix[blockDim.x * gridDim.x * 0 + thread] = state[0];
DMatrix[blockDim.x * gridDim.x * 1 + thread] = state[1];
DMatrix[blockDim.x * gridDim.x * 2 + thread] = state[2];
DMatrix[blockDim.x * gridDim.x * 3 + thread] = state[3];
}
}
__global__
__launch_bounds__(TPB, 1)
void lyra2v3_gpu_hash_32_2(uint32_t threads)
{
const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y;
if (thread < threads)
{
uint2 state[4];
state[0] = ((uint2*)DMatrix)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x];
state[1] = ((uint2*)DMatrix)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x];
state[2] = ((uint2*)DMatrix)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x];
state[3] = ((uint2*)DMatrix)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x];
reduceDuplexRowSetup2(state);
uint32_t rowa;
int prev = 3;
unsigned int instance = 0;
for (int i = 0; i < 3; i++)
{
instance = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4);
rowa = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4) & 0x3;
//rowa = __shfl(state[0].x, 0, 4) & 3;
reduceDuplexRowt2(prev, rowa, i, state);
prev = i;
}
instance = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4);
rowa = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4) & 0x3;
//rowa = __shfl(state[0].x, 0, 4) & 3;
reduceDuplexRowt2x4(rowa, state);
((uint2*)DMatrix)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[0];
((uint2*)DMatrix)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[1];
((uint2*)DMatrix)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[2];
((uint2*)DMatrix)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[3];
}
}
__global__
__launch_bounds__(TPB, 1)
void lyra2v3_gpu_hash_32_3(uint32_t threads, uint2 *outputHash)
{
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
uint2x4 state[4];
if (thread < threads)
{
state[0] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 0 + thread]);
state[1] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 1 + thread]);
state[2] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 2 + thread]);
state[3] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 3 + thread]);
for (int i = 0; i < 12; i++)
round_lyra_v5(state);
outputHash[thread + threads * 0] = state[0].x;
outputHash[thread + threads * 1] = state[0].y;
outputHash[thread + threads * 2] = state[0].z;
outputHash[thread + threads * 3] = state[0].w;
}
}
#else
#include "cuda_helper.h"
#if __CUDA_ARCH__ < 200
__device__ void* DMatrix;
#endif
__global__ void lyra2v3_gpu_hash_32_1(uint32_t threads, uint2 *inputHash) {}
__global__ void lyra2v3_gpu_hash_32_2(uint32_t threads) {}
__global__ void lyra2v3_gpu_hash_32_3(uint32_t threads, uint2 *outputHash) {}
#endif
__host__
void lyra2v3_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix)
{
cuda_get_arch(thr_id);
// just assign the device pointer allocated in main loop
cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice);
}
__host__
void lyra2v3_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, int order)
{
int dev_id = device_map[thr_id % MAX_GPUS];
if (device_sm[dev_id] >= 500) {
const uint32_t tpb = TPB;
dim3 grid2((threads + tpb - 1) / tpb);
dim3 block2(tpb);
dim3 grid4((threads * 4 + tpb - 1) / tpb);
dim3 block4(4, tpb / 4);
lyra2v3_gpu_hash_32_1 <<< grid2, block2 >>> (threads, (uint2*)g_hash);
lyra2v3_gpu_hash_32_2 <<< grid4, block4, 48 * sizeof(uint2) * tpb >>> (threads);
lyra2v3_gpu_hash_32_3 <<< grid2, block2 >>> (threads, (uint2*)g_hash);
} else {
uint32_t tpb = 16;
if (cuda_arch[dev_id] >= 350) tpb = TPB35;
else if (cuda_arch[dev_id] >= 300) tpb = TPB30;
else if (cuda_arch[dev_id] >= 200) tpb = TPB20;
dim3 grid((threads + tpb - 1) / tpb);
dim3 block(tpb);
lyra2v3_gpu_hash_32_v3 <<< grid, block >>> (threads, startNounce, (uint2*)g_hash);
}
}

348
lyra2/cuda_lyra2v3_sm3.cuh

@ -0,0 +1,348 @@
/* SM 2/3/3.5 Variant for lyra2REv2 */
#ifdef __INTELLISENSE__
/* just for vstudio code colors, only uncomment that temporary, dont commit it */
//#undef __CUDA_ARCH__
//#define __CUDA_ARCH__ 500
#endif
#define TPB20 64
#define TPB30 64
#define TPB35 64
#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ < 500
#include "cuda_lyra2_vectors.h"
#define Nrow 4
#define Ncol 4
#define vectype ulonglong4
#define memshift 4
__device__ vectype *DMatrix;
static __device__ __forceinline__
void Gfunc_v35(unsigned long long &a, unsigned long long &b, unsigned long long &c, unsigned long long &d)
{
a += b; d ^= a; d = ROTR64(d, 32);
c += d; b ^= c; b = ROTR64(b, 24);
a += b; d ^= a; d = ROTR64(d, 16);
c += d; b ^= c; b = ROTR64(b, 63);
}
static __device__ __forceinline__
void round_lyra_v35(vectype* s)
{
Gfunc_v35(s[0].x, s[1].x, s[2].x, s[3].x);
Gfunc_v35(s[0].y, s[1].y, s[2].y, s[3].y);
Gfunc_v35(s[0].z, s[1].z, s[2].z, s[3].z);
Gfunc_v35(s[0].w, s[1].w, s[2].w, s[3].w);
Gfunc_v35(s[0].x, s[1].y, s[2].z, s[3].w);
Gfunc_v35(s[0].y, s[1].z, s[2].w, s[3].x);
Gfunc_v35(s[0].z, s[1].w, s[2].x, s[3].y);
Gfunc_v35(s[0].w, s[1].x, s[2].y, s[3].z);
}
static __device__ __forceinline__
void reduceDuplexV3(vectype state[4], uint32_t thread)
{
vectype state1[3];
uint32_t ps1 = (Nrow * Ncol * memshift * thread);
uint32_t ps2 = (memshift * (Ncol - 1) * Nrow + memshift * 1 + Nrow * Ncol * memshift * thread);
#pragma unroll 4
for (int i = 0; i < Ncol; i++)
{
uint32_t s1 = ps1 + Nrow * i *memshift;
uint32_t s2 = ps2 - Nrow * i *memshift;
for (int j = 0; j < 3; j++)
state1[j] = __ldg4(&(DMatrix + s1)[j]);
for (int j = 0; j < 3; j++)
state[j] ^= state1[j];
round_lyra_v35(state);
for (int j = 0; j < 3; j++)
state1[j] ^= state[j];
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = state1[j];
}
}
static __device__ __forceinline__
void reduceDuplexRowSetupV3(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread)
{
vectype state2[3], state1[3];
uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread);
uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread);
uint32_t ps3 = (Nrow * memshift * (Ncol - 1) + memshift * rowOut + Nrow * Ncol * memshift * thread);
for (int i = 0; i < Ncol; i++)
{
uint32_t s1 = ps1 + Nrow*i*memshift;
uint32_t s2 = ps2 + Nrow*i*memshift;
uint32_t s3 = ps3 - Nrow*i*memshift;
for (int j = 0; j < 3; j++)
state1[j] = __ldg4(&(DMatrix + s1 )[j]);
for (int j = 0; j < 3; j++)
state2[j] = __ldg4(&(DMatrix + s2 )[j]);
for (int j = 0; j < 3; j++) {
vectype tmp = state1[j] + state2[j];
state[j] ^= tmp;
}
round_lyra_v35(state);
for (int j = 0; j < 3; j++) {
state1[j] ^= state[j];
(DMatrix + s3)[j] = state1[j];
}
((uint2*)state2)[0] ^= ((uint2*)state)[11];
for (int j = 0; j < 11; j++)
((uint2*)state2)[j + 1] ^= ((uint2*)state)[j];
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = state2[j];
}
}
static __device__ __forceinline__
void reduceDuplexRowtV3(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread)
{
vectype state1[3], state2[3];
uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread);
uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread);
uint32_t ps3 = (memshift * rowOut + Nrow * Ncol * memshift * thread);
#pragma nounroll
for (int i = 0; i < Ncol; i++)
{
uint32_t s1 = ps1 + Nrow * i*memshift;
uint32_t s2 = ps2 + Nrow * i*memshift;
uint32_t s3 = ps3 + Nrow * i*memshift;
for (int j = 0; j < 3; j++)
state1[j] = __ldg4(&(DMatrix + s1)[j]);
for (int j = 0; j < 3; j++)
state2[j] = __ldg4(&(DMatrix + s2)[j]);
for (int j = 0; j < 3; j++)
state1[j] += state2[j];
for (int j = 0; j < 3; j++)
state[j] ^= state1[j];
round_lyra_v35(state);
((uint2*)state2)[0] ^= ((uint2*)state)[11];
for (int j = 0; j < 11; j++)
((uint2*)state2)[j + 1] ^= ((uint2*)state)[j];
if (rowInOut != rowOut) {
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = state2[j];
for (int j = 0; j < 3; j++)
(DMatrix + s3)[j] ^= state[j];
} else {
for (int j = 0; j < 3; j++)
state2[j] ^= state[j];
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = state2[j];
}
}
}
#if __CUDA_ARCH__ >= 300
__global__ __launch_bounds__(TPB35, 1)
void lyra2v3_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
vectype state[4];
vectype blake2b_IV[2];
vectype padding[2];
if (threadIdx.x == 0) {
((uint16*)blake2b_IV)[0] = make_uint16(
0xf3bcc908, 0x6a09e667 , 0x84caa73b, 0xbb67ae85,
0xfe94f82b, 0x3c6ef372 , 0x5f1d36f1, 0xa54ff53a,
0xade682d1, 0x510e527f , 0x2b3e6c1f, 0x9b05688c,
0xfb41bd6b, 0x1f83d9ab , 0x137e2179, 0x5be0cd19
);
((uint16*)padding)[0] = make_uint16(
0x20, 0x0 , 0x20, 0x0 , 0x20, 0x0 , 0x01, 0x0,
0x04, 0x0 , 0x04, 0x0 , 0x80, 0x0 , 0x0, 0x01000000
);
}
if (thread < threads)
{
((uint2*)state)[0] = __ldg(&outputHash[thread]);
((uint2*)state)[1] = __ldg(&outputHash[thread + threads]);
((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]);
((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]);
state[1] = state[0];
state[2] = shuffle4(((vectype*)blake2b_IV)[0], 0);
state[3] = shuffle4(((vectype*)blake2b_IV)[1], 0);
for (int i = 0; i<12; i++)
round_lyra_v35(state);
state[0] ^= shuffle4(((vectype*)padding)[0], 0);
state[1] ^= shuffle4(((vectype*)padding)[1], 0);
for (int i = 0; i<12; i++)
round_lyra_v35(state);
uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread);
//#pragma unroll 4
for (int i = 0; i < 4; i++)
{
uint32_t s1 = ps1 - 4 * memshift * i;
for (int j = 0; j < 3; j++)
(DMatrix + s1)[j] = (state)[j];
round_lyra_v35(state);
}
reduceDuplexV3(state, thread);
reduceDuplexRowSetupV3(1, 0, 2, state, thread);
reduceDuplexRowSetupV3(2, 1, 3, state, thread);
unsigned int instance = 0;
uint32_t rowa;
int prev = 3;
for (int i = 0; i < 4; i++)
{
//rowa = ((uint2*)state)[0].x & 3;
instance = ((uint2*)state)[instance & 0xf].x;
rowa = ((uint2*)state)[instance & 0xf].x & 0x3;
reduceDuplexRowtV3(prev, rowa, i, state, thread);
prev = i;
}
uint32_t shift = (memshift * rowa + 16 * memshift * thread);
for (int j = 0; j < 3; j++)
state[j] ^= __ldg4(&(DMatrix + shift)[j]);
for (int i = 0; i < 12; i++)
round_lyra_v35(state);
outputHash[thread] = ((uint2*)state)[0];
outputHash[thread + threads] = ((uint2*)state)[1];
outputHash[thread + 2 * threads] = ((uint2*)state)[2];
outputHash[thread + 3 * threads] = ((uint2*)state)[3];
} //thread
}
#elif __CUDA_ARCH__ >= 200
__global__ __launch_bounds__(TPB20, 1)
void lyra2v3_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
vectype state[4];
vectype blake2b_IV[2];
vectype padding[2];
((uint16*)blake2b_IV)[0] = make_uint16(
0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85,
0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a,
0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c,
0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19
);
((uint16*)padding)[0] = make_uint16(
0x20, 0x0, 0x20, 0x0, 0x20, 0x0, 0x01, 0x0,
0x04, 0x0, 0x04, 0x0, 0x80, 0x0, 0x0, 0x01000000
);
if (thread < threads)
{
((uint2*)state)[0] = outputHash[thread];
((uint2*)state)[1] = outputHash[thread + threads];
((uint2*)state)[2] = outputHash[thread + 2 * threads];
((uint2*)state)[3] = outputHash[thread + 3 * threads];
state[1] = state[0];
state[2] = ((vectype*)blake2b_IV)[0];
state[3] = ((vectype*)blake2b_IV)[1];
for (int i = 0; i<12; i++)
round_lyra_v35(state);
state[0] ^= ((vectype*)padding)[0];
state[1] ^= ((vectype*)padding)[1];
for (int i = 0; i<12; i++)
round_lyra_v35(state);
uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread);
//#pragma unroll 4
for (int i = 0; i < 4; i++)
{
uint32_t s1 = ps1 - 4 * memshift * i;
for (int j = 0; j < 3; j++)
(DMatrix + s1)[j] = (state)[j];
round_lyra_v35(state);
}
reduceDuplexV3(state, thread);
reduceDuplexRowSetupV3(1, 0, 2, state, thread);
reduceDuplexRowSetupV3(2, 1, 3, state, thread);
uint instance = 0;
uint32_t rowa;
int prev = 3;
for (int i = 0; i < 4; i++)
{
// rowa = ((uint2*)state)[0].x & 3;
instance = ((uint2*)state)[instance & 0xf];
rowa = ((uint2*)state)[instance & 0xf] & 0x3;
reduceDuplexRowtV3(prev, rowa, i, state, thread);
prev = i;
}
uint32_t shift = (memshift * rowa + 16 * memshift * thread);
for (int j = 0; j < 3; j++)
state[j] ^= __ldg4(&(DMatrix + shift)[j]);
for (int i = 0; i < 12; i++)
round_lyra_v35(state);
outputHash[thread] = ((uint2*)state)[0];
outputHash[thread + threads] = ((uint2*)state)[1];
outputHash[thread + 2 * threads] = ((uint2*)state)[2];
outputHash[thread + 3 * threads] = ((uint2*)state)[3];
} //thread
}
#endif
#else
/* host & sm5+ */
__global__ void lyra2v3_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) {}
#endif

182
lyra2/lyra2REv3.cu

@ -0,0 +1,182 @@
extern "C" {
#include "sph/sph_blake.h"
#include "sph/sph_bmw.h"
#include "sph/sph_cubehash.h"
#include "lyra2/Lyra2.h"
}
#include <miner.h>
#include <cuda_helper.h>
static uint64_t *d_hash[MAX_GPUS];
static uint64_t* d_matrix[MAX_GPUS];
extern void blake256_cpu_init(int thr_id, uint32_t threads);
extern void blake256_cpu_setBlock_80(uint32_t *pdata);
extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order);
extern void lyra2v3_setTarget(const void *pTargetIn);
extern void lyra2v3_cpu_init(int thr_id, uint32_t threads, uint64_t* d_matrix);
extern void lyra2v3_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void lyra2v3_cpu_hash_32_targ(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces);
extern void bmw256_setTarget(const void *ptarget);
extern void bmw256_cpu_init(int thr_id, uint32_t threads);
extern void bmw256_cpu_free(int thr_id);
extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces);
extern "C" void lyra2v3_hash(void *state, const void *input)
{
uint32_t hashA[8], hashB[8];
sph_blake256_context ctx_blake;
sph_cubehash256_context ctx_cube;
sph_bmw256_context ctx_bmw;
sph_blake256_set_rounds(14);
sph_blake256_init(&ctx_blake);
sph_blake256(&ctx_blake, input, 80);
sph_blake256_close(&ctx_blake, hashA);
LYRA2_3(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4);
sph_cubehash256_init(&ctx_cube);
sph_cubehash256(&ctx_cube, hashB, 32);
sph_cubehash256_close(&ctx_cube, hashA);
LYRA2_3(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4);
sph_bmw256_init(&ctx_bmw);
sph_bmw256(&ctx_bmw, hashB, 32);
sph_bmw256_close(&ctx_bmw, hashA);
memcpy(state, hashA, 32);
}
static bool init[MAX_GPUS] = { 0 };
extern "C" int scanhash_lyra2v3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
int dev_id = device_map[thr_id];
int intensity = (device_sm[dev_id] < 500) ? 18 : is_windows() ? 19 : 20;
if (strstr(device_name[dev_id], "GTX 10")) intensity = 20;
uint32_t throughput = cuda_default_throughput(dev_id, 1UL << intensity);
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark)
ptarget[7] = 0x000f;
if (!init[thr_id])
{
size_t matrix_sz = 16 * sizeof(uint64_t) * 4 * 3;
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
CUDA_LOG_ERROR();
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
blake256_cpu_init(thr_id, throughput);
bmw256_cpu_init(thr_id, throughput);
cuda_get_arch(thr_id); // cuda_arch[] also used in cubehash256
// SM 3 implentation requires a bit more memory
if (device_sm[dev_id] < 500 || cuda_arch[dev_id] < 500)
matrix_sz = 16 * sizeof(uint64_t) * 4 * 4;
CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput));
lyra2v3_cpu_init(thr_id, throughput, d_matrix[thr_id]);
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));
api_set_throughput(thr_id, throughput);
init[thr_id] = true;
}
uint32_t endiandata[20];
for (int k=0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
blake256_cpu_setBlock_80(pdata);
bmw256_setTarget(ptarget);
do {
int order = 0;
blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
lyra2v3_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
lyra2v3_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
memset(work->nonces, 0, sizeof(work->nonces));
bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], work->nonces);
*hashes_done = pdata[19] - first_nonce + throughput;
if (work->nonces[0] != 0)
{
const uint32_t Htarg = ptarget[7];
uint32_t _ALIGN(64) vhash[8];
be32enc(&endiandata[19], work->nonces[0]);
lyra2v3_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
work->valid_nonces = 1;
work_set_target_ratio(work, vhash);
if (work->nonces[1] != 0) {
be32enc(&endiandata[19], work->nonces[1]);
lyra2v3_hash(vhash, endiandata);
bn_set_target_ratio(work, vhash, 1);
work->valid_nonces++;
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
} else {
pdata[19] = work->nonces[0] + 1; // cursor
}
return work->valid_nonces;
}
else if (vhash[7] > Htarg) {
gpu_increment_reject(thr_id);
if (!opt_quiet)
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
pdata[19] = work->nonces[0] + 1;
continue;
}
}
if ((uint64_t)throughput + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
} while (!work_restart[thr_id].restart && !abort_flag);
*hashes_done = pdata[19] - first_nonce;
return 0;
}
// cleanup
extern "C" void free_lyra2v3(int thr_id)
{
if (!init[thr_id])
return;
cudaThreadSynchronize();
cudaFree(d_hash[thr_id]);
cudaFree(d_matrix[thr_id]);
init[thr_id] = false;
cudaDeviceSynchronize();
}

3
miner.h

@ -298,6 +298,7 @@ extern int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsi
extern int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_lyra2v2(int thr_id,struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_lyra2v2(int thr_id,struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_lyra2v3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_lyra2Z(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_lyra2Z(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_myriad(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_myriad(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_neoscrypt(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_neoscrypt(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
@ -372,6 +373,7 @@ extern void free_lbry(int thr_id);
extern void free_luffa(int thr_id); extern void free_luffa(int thr_id);
extern void free_lyra2(int thr_id); extern void free_lyra2(int thr_id);
extern void free_lyra2v2(int thr_id); extern void free_lyra2v2(int thr_id);
extern void free_lyra2v3(int thr_id);
extern void free_lyra2Z(int thr_id); extern void free_lyra2Z(int thr_id);
extern void free_myriad(int thr_id); extern void free_myriad(int thr_id);
extern void free_neoscrypt(int thr_id); extern void free_neoscrypt(int thr_id);
@ -929,6 +931,7 @@ void jha_hash(void *output, const void *input);
void lbry_hash(void *output, const void *input); void lbry_hash(void *output, const void *input);
void lyra2re_hash(void *state, const void *input); void lyra2re_hash(void *state, const void *input);
void lyra2v2_hash(void *state, const void *input); void lyra2v2_hash(void *state, const void *input);
void lyra2v3_hash(void *state, const void *input);
void lyra2Z_hash(void *state, const void *input); void lyra2Z_hash(void *state, const void *input);
void myriadhash(void *state, const void *input); void myriadhash(void *state, const void *input);
void neoscrypt(uchar *output, const uchar *input, uint32_t profile); void neoscrypt(uchar *output, const uchar *input, uint32_t profile);

3
util.cpp

@ -2246,6 +2246,9 @@ void print_hash_tests(void)
lyra2v2_hash(&hash[0], &buf[0]); lyra2v2_hash(&hash[0], &buf[0]);
printpfx("lyra2v2", hash); printpfx("lyra2v2", hash);
lyra2v3_hash(&hash[0], &buf[0]);
printpfx("lyra2v3", hash);
lyra2Z_hash(&hash[0], &buf[0]); lyra2Z_hash(&hash[0], &buf[0]);
printpfx("lyra2z", hash); printpfx("lyra2z", hash);

Loading…
Cancel
Save