lyra2: import latest nanashi code (v1)
parent 35bfe8d191
commit 2520f9a388
@@ -247,6 +247,7 @@ features.
Aug. 10th 2016 v1.8.1
SIA Blake2-B Algo (getwork over stratum for Suprnova)
SIA Nanopool RPC (getwork over http)
Update also the older lyra2 with Nanashi version

July 20th 2016 v1.8.0
Pascal support with cuda 8

@@ -1,40 +1,214 @@
|
||||
/**
|
||||
* Lyra2 (v1) cuda implementation based on djm34 work - SM 5/5.2
|
||||
* tpruvot@github 2015
|
||||
* Lyra2 (v1) cuda implementation based on djm34 work
|
||||
* tpruvot@github 2015, Nanashi 08/2016 (from 1.8-r2)
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
|
||||
#define TPB50 16
|
||||
#define TPB52 8
|
||||
#define TPB52 32
|
||||
|
||||
#include "cuda_lyra2_sm2.cuh"
|
||||
#include "cuda_lyra2_sm5.cuh"
|
||||
|
||||
#ifdef __INTELLISENSE__
|
||||
/* just for vstudio code colors */
|
||||
#define __CUDA_ARCH__ 500
|
||||
#define __CUDA_ARCH__ 520
|
||||
#endif
|
||||
|
||||
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 500
|
||||
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ > 500
|
||||
|
||||
#include "cuda_vector_uint2x4.h"
|
||||
#include "cuda_lyra2_vectors.h"
|
||||
|
||||
#ifdef __INTELLISENSE__
|
||||
/* just for vstudio code colors */
|
||||
__device__ uint32_t __shfl(uint32_t a, uint32_t b, uint32_t c);
|
||||
#endif
|
||||
|
||||
#define Nrow 8
|
||||
#define Ncol 8
|
||||
#define memshift 3
|
||||
|
||||
#define Ncol 8
|
||||
#define NcolMask 0x7
|
||||
#define BUF_COUNT 0
|
||||
|
||||
__device__ uint2x4* DMatrix;
|
||||
__device__ uint2 *DMatrix;
|
||||
|
||||
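// LD4S: load the 3-element (uint2) slice of cell (row, col) for this thread, from shared memory and/or global DMatrix depending on BUF_COUNT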
__device__ __forceinline__ void LD4S(uint2 res[3], const int row, const int col, const int thread, const int threads)
|
||||
{
|
||||
#if BUF_COUNT != 8
|
||||
extern __shared__ uint2 shared_mem[];
|
||||
const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift;
|
||||
#endif
|
||||
#if BUF_COUNT != 0
|
||||
const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x;
|
||||
#endif
|
||||
|
||||
#if BUF_COUNT == 8
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
res[j] = *(DMatrix + d0 + j * threads * blockDim.x);
|
||||
#elif BUF_COUNT == 0
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
res[j] = shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x];
|
||||
#else
|
||||
if (row < BUF_COUNT)
|
||||
{
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
res[j] = *(DMatrix + d0 + j * threads * blockDim.x);
|
||||
}
|
||||
else
|
||||
{
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
res[j] = shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x];
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
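// ST4S: store the 3-element (uint2) slice of cell (row, col), mirroring LD4S (shared memory and/or global DMatrix depending on BUF_COUNT)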
__device__ __forceinline__ void ST4S(const int row, const int col, const uint2 data[3], const int thread, const int threads)
|
||||
{
|
||||
#if BUF_COUNT != 8
|
||||
extern __shared__ uint2 shared_mem[];
|
||||
const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift;
|
||||
#endif
|
||||
#if BUF_COUNT != 0
|
||||
const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x;
|
||||
#endif
|
||||
|
||||
#if BUF_COUNT == 8
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
*(DMatrix + d0 + j * threads * blockDim.x) = data[j];
|
||||
|
||||
#elif BUF_COUNT == 0
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data[j];
|
||||
|
||||
#else
|
||||
if (row < BUF_COUNT)
|
||||
{
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
*(DMatrix + d0 + j * threads * blockDim.x) = data[j];
|
||||
}
|
||||
else
|
||||
{
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data[j];
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
|
||||
{
|
||||
return __shfl(a, b, c);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
|
||||
{
|
||||
return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c));
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
|
||||
{
|
||||
a1 = WarpShuffle(a1, b1, c);
|
||||
a2 = WarpShuffle(a2, b2, c);
|
||||
a3 = WarpShuffle(a3, b3, c);
|
||||
}
|
||||
|
||||
#else
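// fallback when __shfl is unavailable (__CUDA_ARCH__ < 300): emulate the warp shuffle through shared memory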
|
||||
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
|
||||
{
|
||||
extern __shared__ uint2 shared_mem[];
|
||||
|
||||
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
|
||||
uint32_t *_ptr = (uint32_t*)shared_mem;
|
||||
|
||||
__threadfence_block();
|
||||
uint32_t buf = _ptr[thread];
|
||||
|
||||
_ptr[thread] = a;
|
||||
__threadfence_block();
|
||||
uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))];
|
||||
|
||||
__threadfence_block();
|
||||
_ptr[thread] = buf;
|
||||
|
||||
__threadfence_block();
|
||||
return result;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
|
||||
{
|
||||
extern __shared__ uint2 shared_mem[];
|
||||
|
||||
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
|
||||
|
||||
__threadfence_block();
|
||||
uint2 buf = shared_mem[thread];
|
||||
|
||||
shared_mem[thread] = a;
|
||||
__threadfence_block();
|
||||
uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))];
|
||||
|
||||
__threadfence_block();
|
||||
shared_mem[thread] = buf;
|
||||
|
||||
__threadfence_block();
|
||||
return result;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
|
||||
{
|
||||
extern __shared__ uint2 shared_mem[];
|
||||
|
||||
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
|
||||
|
||||
__threadfence_block();
|
||||
uint2 buf = shared_mem[thread];
|
||||
|
||||
shared_mem[thread] = a1;
|
||||
__threadfence_block();
|
||||
a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))];
|
||||
__threadfence_block();
|
||||
shared_mem[thread] = a2;
|
||||
__threadfence_block();
|
||||
a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))];
|
||||
__threadfence_block();
|
||||
shared_mem[thread] = a3;
|
||||
__threadfence_block();
|
||||
a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))];
|
||||
|
||||
__threadfence_block();
|
||||
shared_mem[thread] = buf;
|
||||
__threadfence_block();
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if __CUDA_ARCH__ > 500 || !defined(__CUDA_ARCH__)
|
||||
static __device__ __forceinline__
|
||||
void Gfunc(uint2 &a, uint2 &b, uint2 &c, uint2 &d)
|
||||
{
|
||||
a += b; d ^= a; d = SWAPUINT2(d);
|
||||
c += d; b ^= c; b = ROR2(b, 24);
|
||||
a += b; d ^= a; d = ROR2(d, 16);
|
||||
a += b; uint2 tmp = d; d.y = a.x ^ tmp.x; d.x = a.y ^ tmp.y;
|
||||
c += d; b ^= c; b = ROR24(b);
|
||||
a += b; d ^= a; d = ROR16(d);
|
||||
c += d; b ^= c; b = ROR2(b, 63);
|
||||
}
|
||||
#endif
|
||||
|
||||
__device__ __forceinline__ void round_lyra(uint2 s[4])
|
||||
{
|
||||
Gfunc(s[0], s[1], s[2], s[3]);
|
||||
WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4);
|
||||
Gfunc(s[0], s[1], s[2], s[3]);
|
||||
WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__
|
||||
void round_lyra(uint2x4* s)
|
||||
@@ -50,21 +224,24 @@ void round_lyra(uint2x4* s)
|
||||
}
|
||||
|
||||
static __device__ __forceinline__
|
||||
void reduceDuplex(uint2x4 state[4], uint32_t thread)
|
||||
void reduceDuplex(uint2 state[4], uint32_t thread, const uint32_t threads)
|
||||
{
|
||||
uint2x4 state1[3];
|
||||
uint2 state1[3];
|
||||
|
||||
const uint32_t ps1 = (256 * thread);
|
||||
const uint32_t ps2 = (memshift * 7 + memshift * 8 + 256 * thread);
|
||||
#if __CUDA_ARCH__ > 500
|
||||
#pragma unroll
|
||||
#endif
|
||||
for (int i = 0; i < Nrow; i++)
|
||||
{
|
||||
ST4S(0, Ncol - i - 1, state, thread, threads);
|
||||
|
||||
round_lyra(state);
|
||||
}
|
||||
|
||||
#pragma unroll 4
|
||||
for (int i = 0; i < 8; i++)
|
||||
for (int i = 0; i < Nrow; i++)
|
||||
{
|
||||
const uint32_t s1 = ps1 + i*memshift;
|
||||
const uint32_t s2 = ps2 - i*memshift;
|
||||
|
||||
for (int j = 0; j < 3; j++)
|
||||
state1[j] = __ldg4(&(DMatrix+s1)[j]);
|
||||
LD4S(state1, 0, i, thread, threads);
|
||||
for (int j = 0; j < 3; j++)
|
||||
state[j] ^= state1[j];
|
||||
|
||||
@@ -72,208 +249,324 @@ void reduceDuplex(uint2x4 state[4], uint32_t thread)
|
||||
|
||||
for (int j = 0; j < 3; j++)
|
||||
state1[j] ^= state[j];
|
||||
for (int j = 0; j < 3; j++)
|
||||
(DMatrix + s2)[j] = state1[j];
|
||||
ST4S(1, Ncol - i - 1, state1, thread, threads);
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ __forceinline__
|
||||
void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2x4 state[4], uint32_t thread)
|
||||
void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], uint32_t thread, const uint32_t threads)
|
||||
{
|
||||
uint2x4 state1[3], state2[3];
|
||||
|
||||
const uint32_t ps1 = ( memshift*8 * rowIn + 256 * thread);
|
||||
const uint32_t ps2 = ( memshift*8 * rowInOut + 256 * thread);
|
||||
const uint32_t ps3 = (memshift*7 + memshift*8 * rowOut + 256 * thread);
|
||||
uint2 state1[3], state2[3];
|
||||
|
||||
#pragma unroll 1
|
||||
for (int i = 0; i < 8; i++)
|
||||
for (int i = 0; i < Nrow; i++)
|
||||
{
|
||||
const uint32_t s1 = ps1 + i*memshift;
|
||||
const uint32_t s2 = ps2 + i*memshift;
|
||||
LD4S(state1, rowIn, i, thread, threads);
|
||||
LD4S(state2, rowInOut, i, thread, threads);
|
||||
for (int j = 0; j < 3; j++)
|
||||
state1[j]= __ldg4(&(DMatrix + s1)[j]);
|
||||
for (int j = 0; j < 3; j++)
|
||||
state2[j]= __ldg4(&(DMatrix + s2)[j]);
|
||||
for (int j = 0; j < 3; j++) {
|
||||
uint2x4 tmp = state1[j] + state2[j];
|
||||
state[j] ^= tmp;
|
||||
}
|
||||
state[j] ^= state1[j] + state2[j];
|
||||
|
||||
round_lyra(state);
|
||||
|
||||
for (int j = 0; j < 3; j++) {
|
||||
const uint32_t s3 = ps3 - i*memshift;
|
||||
state1[j] ^= state[j];
|
||||
(DMatrix + s3)[j] = state1[j];
|
||||
}
|
||||
|
||||
((uint2*)state2)[0] ^= ((uint2*)state)[11];
|
||||
|
||||
for (int j = 0; j < 11; j++)
|
||||
((uint2*)state2)[j+1] ^= ((uint2*)state)[j];
|
||||
|
||||
for (int j = 0; j < 3; j++)
|
||||
(DMatrix + s2)[j] = state2[j];
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ __forceinline__
|
||||
void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uint2x4* state, const uint32_t thread)
|
||||
{
|
||||
const uint32_t ps1 = (memshift * 8 * rowIn + 256 * thread);
|
||||
const uint32_t ps2 = (memshift * 8 * rowInOut + 256 * thread);
|
||||
const uint32_t ps3 = (memshift * 8 * rowOut + 256 * thread);
|
||||
|
||||
#pragma unroll 1
|
||||
for (int i = 0; i < 8; i++)
|
||||
{
|
||||
uint2x4 state1[3], state2[3];
|
||||
|
||||
const uint32_t s1 = ps1 + i*memshift;
|
||||
const uint32_t s2 = ps2 + i*memshift;
|
||||
|
||||
for (int j = 0; j < 3; j++) {
|
||||
state1[j] = __ldg4(&(DMatrix + s1)[j]);
|
||||
state2[j] = __ldg4(&(DMatrix + s2)[j]);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++) {
|
||||
state1[j] += state2[j];
|
||||
state[j] ^= state1[j];
|
||||
}
|
||||
for (int j = 0; j < 3; j++)
|
||||
state1[j] ^= state[j];
|
||||
|
||||
round_lyra(state);
|
||||
ST4S(rowOut, Ncol - i - 1, state1, thread, threads);
|
||||
|
||||
((uint2*)state2)[0] ^= ((uint2*)state)[11];
|
||||
// get data from the previous thread (and simultaneously pass data to the next thread)
|
||||
uint2 Data0 = state[0];
|
||||
uint2 Data1 = state[1];
|
||||
uint2 Data2 = state[2];
|
||||
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
|
||||
|
||||
for (int j = 0; j < 11; j++)
|
||||
((uint2*)state2)[j + 1] ^= ((uint2*)state)[j];
|
||||
|
||||
if (rowInOut == rowOut) {
|
||||
for (int j = 0; j < 3; j++) {
|
||||
state2[j] ^= state[j];
|
||||
(DMatrix + s2)[j]=state2[j];
|
||||
}
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
state2[0] ^= Data2;
|
||||
state2[1] ^= Data0;
|
||||
state2[2] ^= Data1;
|
||||
} else {
|
||||
const uint32_t s3 = ps3 + i*memshift;
|
||||
for (int j = 0; j < 3; j++) {
|
||||
(DMatrix + s2)[j] = state2[j];
|
||||
(DMatrix + s3)[j] ^= state[j];
|
||||
}
|
||||
state2[0] ^= Data0;
|
||||
state2[1] ^= Data1;
|
||||
state2[2] ^= Data2;
|
||||
}
|
||||
|
||||
ST4S(rowInOut, i, state2, thread, threads);
|
||||
}
|
||||
}
|
||||
|
||||
#if __CUDA_ARCH__ == 500
|
||||
__global__ __launch_bounds__(TPB50, 1)
|
||||
#else
|
||||
__global__ __launch_bounds__(TPB52, 2)
|
||||
#endif
|
||||
void lyra2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
|
||||
static __device__ __forceinline__
|
||||
void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads)
|
||||
{
|
||||
for (int i = 0; i < Nrow; i++)
|
||||
{
|
||||
uint2 state1[3], state2[3];
|
||||
|
||||
LD4S(state1, rowIn, i, thread, threads);
|
||||
LD4S(state2, rowInOut, i, thread, threads);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state[j] ^= state1[j] + state2[j];
|
||||
|
||||
round_lyra(state);
|
||||
|
||||
// get data from the previous thread (and simultaneously pass data to the next thread)
|
||||
uint2 Data0 = state[0];
|
||||
uint2 Data1 = state[1];
|
||||
uint2 Data2 = state[2];
|
||||
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
state2[0] ^= Data2;
|
||||
state2[1] ^= Data0;
|
||||
state2[2] ^= Data1;
|
||||
}
|
||||
else
|
||||
{
|
||||
state2[0] ^= Data0;
|
||||
state2[1] ^= Data1;
|
||||
state2[2] ^= Data2;
|
||||
}
|
||||
|
||||
ST4S(rowInOut, i, state2, thread, threads);
|
||||
|
||||
LD4S(state1, rowOut, i, thread, threads);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state1[j] ^= state[j];
|
||||
|
||||
ST4S(rowOut, i, state1, thread, threads);
|
||||
}
|
||||
}
|
||||
|
||||
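// reduceDuplexRowt_8: final wandering-phase step (rowIn = 2, rowOut = 5 hard-coded); the result stays in registers (state) and is not written back to the matrix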
static __device__ __forceinline__
|
||||
void reduceDuplexRowt_8(const int rowInOut, uint2* state, const uint32_t thread, const uint32_t threads)
|
||||
{
|
||||
uint2 state1[3], state2[3], last[3];
|
||||
|
||||
LD4S(state1, 2, 0, thread, threads);
|
||||
LD4S(last, rowInOut, 0, thread, threads);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state[j] ^= state1[j] + last[j];
|
||||
|
||||
round_lyra(state);
|
||||
|
||||
// get data from the previous thread (and simultaneously pass data to the next thread)
|
||||
uint2 Data0 = state[0];
|
||||
uint2 Data1 = state[1];
|
||||
uint2 Data2 = state[2];
|
||||
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
last[0] ^= Data2;
|
||||
last[1] ^= Data0;
|
||||
last[2] ^= Data1;
|
||||
} else {
|
||||
last[0] ^= Data0;
|
||||
last[1] ^= Data1;
|
||||
last[2] ^= Data2;
|
||||
}
|
||||
|
||||
if (rowInOut == 5)
|
||||
{
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
last[j] ^= state[j];
|
||||
}
|
||||
|
||||
for (int i = 1; i < Nrow; i++)
|
||||
{
|
||||
LD4S(state1, 2, i, thread, threads);
|
||||
LD4S(state2, rowInOut, i, thread, threads);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state[j] ^= state1[j] + state2[j];
|
||||
|
||||
round_lyra(state);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state[j] ^= last[j];
|
||||
}
|
||||
|
||||
__constant__ uint2x4 blake2b_IV[2] = {
|
||||
0xf3bcc908lu, 0x6a09e667lu,
|
||||
0x84caa73blu, 0xbb67ae85lu,
|
||||
0xfe94f82blu, 0x3c6ef372lu,
|
||||
0x5f1d36f1lu, 0xa54ff53alu,
|
||||
0xade682d1lu, 0x510e527flu,
|
||||
0x2b3e6c1flu, 0x9b05688clu,
|
||||
0xfb41bd6blu, 0x1f83d9ablu,
|
||||
0x137e2179lu, 0x5be0cd19lu
|
||||
};
|
||||
|
||||
__global__ __launch_bounds__(64, 1)
|
||||
void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
|
||||
{
|
||||
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
|
||||
const uint2x4 blake2b_IV[2] = {
|
||||
{{ 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a }},
|
||||
{{ 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 }}
|
||||
};
|
||||
|
||||
if (thread < threads)
|
||||
{
|
||||
uint2x4 state[4];
|
||||
|
||||
((uint2*)state)[0] = __ldg(&g_hash[thread]);
|
||||
((uint2*)state)[1] = __ldg(&g_hash[thread + threads]);
|
||||
((uint2*)state)[2] = __ldg(&g_hash[thread + threads*2]);
|
||||
((uint2*)state)[3] = __ldg(&g_hash[thread + threads*3]);
|
||||
|
||||
state[1] = state[0];
|
||||
state[0].x = state[1].x = __ldg(&g_hash[thread + threads * 0]);
|
||||
state[0].y = state[1].y = __ldg(&g_hash[thread + threads * 1]);
|
||||
state[0].z = state[1].z = __ldg(&g_hash[thread + threads * 2]);
|
||||
state[0].w = state[1].w = __ldg(&g_hash[thread + threads * 3]);
|
||||
state[2] = blake2b_IV[0];
|
||||
state[3] = blake2b_IV[1];
|
||||
|
||||
for (int i = 0; i<24; i++)
|
||||
round_lyra(state); //because 12 is not enough
|
||||
|
||||
const uint32_t ps1 = (memshift * 7 + 256 * thread);
|
||||
for (int i = 0; i < 8; i++)
|
||||
{
|
||||
const uint32_t s1 = ps1 - memshift * i;
|
||||
for (int j = 0; j < 3; j++)
|
||||
(DMatrix + s1)[j] = (state)[j];
|
||||
round_lyra(state);
|
||||
}
|
||||
((uint2x4*)DMatrix)[threads * 0 + thread] = state[0];
|
||||
((uint2x4*)DMatrix)[threads * 1 + thread] = state[1];
|
||||
((uint2x4*)DMatrix)[threads * 2 + thread] = state[2];
|
||||
((uint2x4*)DMatrix)[threads * 3 + thread] = state[3];
|
||||
}
|
||||
}
|
||||
|
||||
reduceDuplex(state, thread);
|
||||
__global__
|
||||
__launch_bounds__(TPB52, 1)
|
||||
void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
|
||||
{
|
||||
const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y;
|
||||
|
||||
reduceDuplexRowSetup(1, 0, 2, state, thread);
|
||||
reduceDuplexRowSetup(2, 1, 3, state, thread);
|
||||
reduceDuplexRowSetup(3, 0, 4, state, thread);
|
||||
reduceDuplexRowSetup(4, 3, 5, state, thread);
|
||||
reduceDuplexRowSetup(5, 2, 6, state, thread);
|
||||
reduceDuplexRowSetup(6, 1, 7, state, thread);
|
||||
if (thread < threads)
|
||||
{
|
||||
uint2 state[4];
|
||||
state[0] = __ldg(&DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x]);
|
||||
state[1] = __ldg(&DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x]);
|
||||
state[2] = __ldg(&DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x]);
|
||||
state[3] = __ldg(&DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x]);
|
||||
|
||||
uint32_t rowa = state[0].x.x & 7;
|
||||
reduceDuplexRowt(7, rowa, 0, state, thread);
|
||||
rowa = state[0].x.x & 7;
|
||||
reduceDuplexRowt(0, rowa, 3, state, thread);
|
||||
rowa = state[0].x.x & 7;
|
||||
reduceDuplexRowt(3, rowa, 6, state, thread);
|
||||
rowa = state[0].x.x & 7;
|
||||
reduceDuplexRowt(6, rowa, 1, state, thread);
|
||||
rowa = state[0].x.x & 7;
|
||||
reduceDuplexRowt(1, rowa, 4, state, thread);
|
||||
rowa = state[0].x.x & 7;
|
||||
reduceDuplexRowt(4, rowa, 7, state, thread);
|
||||
rowa = state[0].x.x & 7;
|
||||
reduceDuplexRowt(7, rowa, 2, state, thread);
|
||||
rowa = state[0].x.x & 7;
|
||||
reduceDuplexRowt(2, rowa, 5, state, thread);
|
||||
reduceDuplex(state, thread, threads);
|
||||
reduceDuplexRowSetup(1, 0, 2, state, thread, threads);
|
||||
reduceDuplexRowSetup(2, 1, 3, state, thread, threads);
|
||||
reduceDuplexRowSetup(3, 0, 4, state, thread, threads);
|
||||
reduceDuplexRowSetup(4, 3, 5, state, thread, threads);
|
||||
reduceDuplexRowSetup(5, 2, 6, state, thread, threads);
|
||||
reduceDuplexRowSetup(6, 1, 7, state, thread, threads);
|
||||
|
||||
const int32_t shift = (memshift * 8 * rowa + 256 * thread);
|
||||
uint32_t rowa = WarpShuffle(state[0].x, 0, 4) & 7;
|
||||
reduceDuplexRowt(7, rowa, 0, state, thread, threads);
|
||||
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
|
||||
reduceDuplexRowt(0, rowa, 3, state, thread, threads);
|
||||
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
|
||||
reduceDuplexRowt(3, rowa, 6, state, thread, threads);
|
||||
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
|
||||
reduceDuplexRowt(6, rowa, 1, state, thread, threads);
|
||||
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
|
||||
reduceDuplexRowt(1, rowa, 4, state, thread, threads);
|
||||
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
|
||||
reduceDuplexRowt(4, rowa, 7, state, thread, threads);
|
||||
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
|
||||
reduceDuplexRowt(7, rowa, 2, state, thread, threads);
|
||||
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
|
||||
reduceDuplexRowt_8(rowa, state, thread, threads);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state[j] ^= __ldg4(&(DMatrix + shift)[j]);
|
||||
DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x] = state[0];
|
||||
DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x] = state[1];
|
||||
DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x] = state[2];
|
||||
DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x] = state[3];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ __launch_bounds__(64, 1)
|
||||
void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
|
||||
{
|
||||
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
|
||||
uint28 state[4];
|
||||
|
||||
if (thread < threads)
|
||||
{
|
||||
state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]);
|
||||
state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]);
|
||||
state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]);
|
||||
state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]);
|
||||
|
||||
for (int i = 0; i < 12; i++)
|
||||
round_lyra(state);
|
||||
|
||||
g_hash[thread] = ((uint2*)state)[0];
|
||||
g_hash[thread + threads] = ((uint2*)state)[1];
|
||||
g_hash[thread + threads*2] = ((uint2*)state)[2];
|
||||
g_hash[thread + threads*3] = ((uint2*)state)[3];
|
||||
}
|
||||
g_hash[thread + threads * 0] = state[0].x;
|
||||
g_hash[thread + threads * 1] = state[0].y;
|
||||
g_hash[thread + threads * 2] = state[0].z;
|
||||
g_hash[thread + threads * 3] = state[0].w;
|
||||
|
||||
} //thread
|
||||
}
|
||||
#else
|
||||
#if __CUDA_ARCH__ < 500
|
||||
|
||||
/* for unsupported SM arch */
|
||||
__device__ void* DMatrix;
|
||||
__global__ void lyra2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
|
||||
#endif
|
||||
__global__ void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
|
||||
__global__ void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {}
|
||||
__global__ void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
|
||||
#endif
|
||||
|
||||
__host__
|
||||
void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t* d_matrix)
|
||||
void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix)
|
||||
{
|
||||
cuda_get_arch(thr_id);
|
||||
// just assign the device pointer allocated in main loop
|
||||
cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice);
|
||||
}
|
||||
|
||||
__host__
|
||||
void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order)
|
||||
void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, bool gtx750ti)
|
||||
{
|
||||
int dev_id = device_map[thr_id % MAX_GPUS];
|
||||
|
||||
uint32_t tpb = TPB52;
|
||||
if (device_sm[dev_id] == 500) tpb = TPB50;
|
||||
if (device_sm[dev_id] == 350) tpb = TPB30; // to enhance (or not)
|
||||
if (device_sm[dev_id] <= 300) tpb = TPB30;
|
||||
|
||||
dim3 grid((threads + tpb - 1) / tpb);
|
||||
dim3 block(tpb);
|
||||
if (cuda_arch[dev_id] >= 520) tpb = TPB52;
|
||||
else if (cuda_arch[dev_id] >= 500) tpb = TPB50;
|
||||
else if (cuda_arch[dev_id] >= 200) tpb = TPB20;
|
||||
|
||||
if (device_sm[dev_id] >= 500)
|
||||
lyra2_gpu_hash_32 <<< grid, block >>> (threads, startNounce, (uint2*)d_hash);
|
||||
dim3 grid1((threads * 4 + tpb - 1) / tpb);
|
||||
dim3 block1(4, tpb >> 2);
|
||||
|
||||
dim3 grid2((threads + 64 - 1) / 64);
|
||||
dim3 block2(64);
|
||||
|
||||
dim3 grid3((threads + tpb - 1) / tpb);
|
||||
dim3 block3(tpb);
|
||||
|
||||
if (cuda_arch[dev_id] >= 520)
|
||||
{
|
||||
lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash);
|
||||
|
||||
lyra2_gpu_hash_32_2 <<< grid1, block1, 24 * (8 - 0) * sizeof(uint2) * tpb >>> (threads, startNounce, d_hash);
|
||||
|
||||
lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash);
|
||||
}
|
||||
else if (cuda_arch[dev_id] >= 500)
|
||||
{
|
||||
size_t shared_mem = 0;
|
||||
|
||||
if (gtx750ti)
|
||||
// reserve 8192 bytes to tune occupancy to 8 warps
|
||||
shared_mem = 8192;
|
||||
else
|
||||
// reserve 6144 bytes to tune occupancy to 10 warps
|
||||
shared_mem = 6144;
|
||||
|
||||
lyra2_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash);
|
||||
|
||||
lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, startNounce, (uint2*)d_hash);
|
||||
|
||||
lyra2_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash);
|
||||
}
|
||||
else
|
||||
lyra2_gpu_hash_32_sm2 <<< grid, block >>> (threads, startNounce, d_hash);
|
||||
|
||||
lyra2_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, startNounce, d_hash);
|
||||
}
|
||||
|
@@ -3,15 +3,16 @@
#ifdef __INTELLISENSE__
/* just for vstudio code colors */
#undef __CUDA_ARCH__
#define __CUDA_ARCH__ 300
#define __CUDA_ARCH__ 500
#endif

#include "cuda_helper.h"

#define TPB30 160
#define TPB20 160

#if (__CUDA_ARCH__ >= 200 && __CUDA_ARCH__ <= 350) || !defined(__CUDA_ARCH__)
__constant__ static uint2 blake2b_IV[8] = {
__constant__ static uint2 blake2b_IV_sm2[8] = {
	{ 0xf3bcc908, 0x6a09e667 },
	{ 0x84caa73b, 0xbb67ae85 },
	{ 0xfe94f82b, 0x3c6ef372 },
@@ -149,7 +150,7 @@ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_h

#pragma unroll
for (int i = 0; i<8; i++) {
	state[i + 8] = blake2b_IV[i];
	state[i + 8] = blake2b_IV_sm2[i];
}

// blake2blyra x2

lyra2/cuda_lyra2_sm5.cuh (new file, 693 lines)
@@ -0,0 +1,693 @@
|
||||
#include <memory.h>
|
||||
|
||||
#ifdef __INTELLISENSE__
|
||||
/* just for vstudio code colors */
|
||||
#undef __CUDA_ARCH__
|
||||
#define __CUDA_ARCH__ 500
|
||||
#endif
|
||||
|
||||
#include "cuda_helper.h"
|
||||
|
||||
#define TPB50 32
|
||||
|
||||
#if __CUDA_ARCH__ == 500
|
||||
#include "cuda_lyra2_vectors.h"
|
||||
|
||||
#define Nrow 8
|
||||
#define Ncol 8
|
||||
#define memshift 3
|
||||
|
||||
__device__ uint2 *DMatrix;
|
||||
|
||||
__device__ __forceinline__ uint2 LD4S(const int index)
|
||||
{
|
||||
extern __shared__ uint2 shared_mem[];
|
||||
|
||||
return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x];
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void ST4S(const int index, const uint2 data)
|
||||
{
|
||||
extern __shared__ uint2 shared_mem[];
|
||||
|
||||
shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data;
|
||||
}
|
||||
|
||||
#if __CUDA_ARCH__ == 300
|
||||
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
|
||||
{
|
||||
return __shfl(a, b, c);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
|
||||
{
|
||||
return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c));
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
|
||||
{
|
||||
a1 = WarpShuffle(a1, b1, c);
|
||||
a2 = WarpShuffle(a2, b2, c);
|
||||
a3 = WarpShuffle(a3, b3, c);
|
||||
}
|
||||
|
||||
#else
|
||||
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
|
||||
{
|
||||
extern __shared__ uint2 shared_mem[];
|
||||
|
||||
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
|
||||
uint32_t *_ptr = (uint32_t*)shared_mem;
|
||||
|
||||
__threadfence_block();
|
||||
uint32_t buf = _ptr[thread];
|
||||
|
||||
_ptr[thread] = a;
|
||||
__threadfence_block();
|
||||
uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))];
|
||||
|
||||
__threadfence_block();
|
||||
_ptr[thread] = buf;
|
||||
|
||||
__threadfence_block();
|
||||
return result;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
|
||||
{
|
||||
extern __shared__ uint2 shared_mem[];
|
||||
|
||||
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
|
||||
|
||||
__threadfence_block();
|
||||
uint2 buf = shared_mem[thread];
|
||||
|
||||
shared_mem[thread] = a;
|
||||
__threadfence_block();
|
||||
uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))];
|
||||
|
||||
__threadfence_block();
|
||||
shared_mem[thread] = buf;
|
||||
|
||||
__threadfence_block();
|
||||
return result;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
|
||||
{
|
||||
extern __shared__ uint2 shared_mem[];
|
||||
|
||||
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
|
||||
|
||||
__threadfence_block();
|
||||
uint2 buf = shared_mem[thread];
|
||||
|
||||
shared_mem[thread] = a1;
|
||||
__threadfence_block();
|
||||
a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))];
|
||||
__threadfence_block();
|
||||
shared_mem[thread] = a2;
|
||||
__threadfence_block();
|
||||
a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))];
|
||||
__threadfence_block();
|
||||
shared_mem[thread] = a3;
|
||||
__threadfence_block();
|
||||
a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))];
|
||||
|
||||
__threadfence_block();
|
||||
shared_mem[thread] = buf;
|
||||
__threadfence_block();
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if __CUDA_ARCH__ >= 300
|
||||
static __device__ __forceinline__
|
||||
void Gfunc(uint2 &a, uint2 &b, uint2 &c, uint2 &d)
|
||||
{
|
||||
a += b; d ^= a; d = SWAPUINT2(d);
|
||||
c += d; b ^= c; b = ROR24(b); //ROR2(b, 24);
|
||||
a += b; d ^= a; d = ROR16(d);
|
||||
c += d; b ^= c; b = ROR2(b, 63);
|
||||
}
|
||||
#endif
|
||||
|
||||
__device__ __forceinline__ void round_lyra(uint2 s[4])
|
||||
{
|
||||
Gfunc(s[0], s[1], s[2], s[3]);
|
||||
WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4);
|
||||
Gfunc(s[0], s[1], s[2], s[3]);
|
||||
WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__
|
||||
void round_lyra(uint2x4* s)
|
||||
{
|
||||
Gfunc(s[0].x, s[1].x, s[2].x, s[3].x);
|
||||
Gfunc(s[0].y, s[1].y, s[2].y, s[3].y);
|
||||
Gfunc(s[0].z, s[1].z, s[2].z, s[3].z);
|
||||
Gfunc(s[0].w, s[1].w, s[2].w, s[3].w);
|
||||
Gfunc(s[0].x, s[1].y, s[2].z, s[3].w);
|
||||
Gfunc(s[0].y, s[1].z, s[2].w, s[3].x);
|
||||
Gfunc(s[0].z, s[1].w, s[2].x, s[3].y);
|
||||
Gfunc(s[0].w, s[1].x, s[2].y, s[3].z);
|
||||
}
|
||||
|
||||
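// reduceDuplexV5: SM 5.0 setup phase, filling rows 0..7 of the matrix (row 0 cached in shared memory via LD4S/ST4S, the remaining rows in global DMatrix)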
static __device__ __forceinline__
|
||||
void reduceDuplexV5(uint2 state[4], const uint32_t thread, const uint32_t threads)
|
||||
{
|
||||
uint2 state1[3], state2[3];
|
||||
|
||||
const uint32_t ps0 = (memshift * Ncol * 0 * threads + thread)*blockDim.x + threadIdx.x;
|
||||
const uint32_t ps1 = (memshift * Ncol * 1 * threads + thread)*blockDim.x + threadIdx.x;
|
||||
const uint32_t ps2 = (memshift * Ncol * 2 * threads + thread)*blockDim.x + threadIdx.x;
|
||||
const uint32_t ps3 = (memshift * Ncol * 3 * threads + thread)*blockDim.x + threadIdx.x;
|
||||
const uint32_t ps4 = (memshift * Ncol * 4 * threads + thread)*blockDim.x + threadIdx.x;
|
||||
const uint32_t ps5 = (memshift * Ncol * 5 * threads + thread)*blockDim.x + threadIdx.x;
|
||||
const uint32_t ps6 = (memshift * Ncol * 6 * threads + thread)*blockDim.x + threadIdx.x;
|
||||
const uint32_t ps7 = (memshift * Ncol * 7 * threads + thread)*blockDim.x + threadIdx.x;
|
||||
|
||||
for (int i = 0; i < 8; i++)
|
||||
{
|
||||
const uint32_t s0 = memshift * Ncol * 0 + (Ncol - 1 - i) * memshift;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
ST4S(s0 + j, state[j]);
|
||||
round_lyra(state);
|
||||
}
|
||||
|
||||
for (int i = 0; i < 8; i++)
|
||||
{
|
||||
const uint32_t s0 = memshift * Ncol * 0 + i * memshift;
|
||||
const uint32_t s1 = ps1 + (7 - i)*memshift* threads*blockDim.x;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state1[j] = LD4S(s0 + j);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state[j] ^= state1[j];
|
||||
|
||||
round_lyra(state);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
*(DMatrix + s1 + j*threads*blockDim.x) = state1[j] ^ state[j];
|
||||
}
|
||||
|
||||
// 1, 0, 2
|
||||
for (int i = 0; i < 8; i++)
|
||||
{
|
||||
const uint32_t s0 = memshift * Ncol * 0 + i * memshift;
|
||||
const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x;
|
||||
const uint32_t s2 = ps2 + (7 - i)*memshift* threads*blockDim.x;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state1[j] = *(DMatrix + s1 + j*threads*blockDim.x);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state2[j] = LD4S(s0 + j);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state[j] ^= state1[j] + state2[j];
|
||||
|
||||
round_lyra(state);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
*(DMatrix + s2 + j*threads*blockDim.x) = state1[j] ^ state[j];
|
||||
|
||||
// get data from the previous thread (and simultaneously pass data to the next thread)
|
||||
uint2 Data0 = state[0];
|
||||
uint2 Data1 = state[1];
|
||||
uint2 Data2 = state[2];
|
||||
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
state2[0] ^= Data2;
|
||||
state2[1] ^= Data0;
|
||||
state2[2] ^= Data1;
|
||||
}
|
||||
else
|
||||
{
|
||||
state2[0] ^= Data0;
|
||||
state2[1] ^= Data1;
|
||||
state2[2] ^= Data2;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
ST4S(s0 + j, state2[j]);
|
||||
}
|
||||
|
||||
// 2, 1, 3
|
||||
for (int i = 0; i < 8; i++)
|
||||
{
|
||||
const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x;
|
||||
const uint32_t s2 = ps2 + i * memshift* threads*blockDim.x;
|
||||
const uint32_t s3 = ps3 + (7 - i)*memshift* threads*blockDim.x;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state1[j] = *(DMatrix + s2 + j*threads*blockDim.x);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state2[j] = *(DMatrix + s1 + j*threads*blockDim.x);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state[j] ^= state1[j] + state2[j];
|
||||
|
||||
round_lyra(state);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
*(DMatrix + s3 + j*threads*blockDim.x) = state1[j] ^ state[j];
|
||||
|
||||
// get data from the previous thread (and simultaneously pass data to the next thread)
|
||||
uint2 Data0 = state[0];
|
||||
uint2 Data1 = state[1];
|
||||
uint2 Data2 = state[2];
|
||||
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
state2[0] ^= Data2;
|
||||
state2[1] ^= Data0;
|
||||
state2[2] ^= Data1;
|
||||
} else {
|
||||
state2[0] ^= Data0;
|
||||
state2[1] ^= Data1;
|
||||
state2[2] ^= Data2;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
*(DMatrix + s1 + j*threads*blockDim.x) = state2[j];
|
||||
}
|
||||
|
||||
// 3, 0, 4
|
||||
for (int i = 0; i < 8; i++)
|
||||
{
|
||||
const uint32_t ls0 = memshift * Ncol * 0 + i * memshift;
|
||||
const uint32_t s0 = ps0 + i * memshift* threads*blockDim.x;
|
||||
const uint32_t s3 = ps3 + i * memshift* threads*blockDim.x;
|
||||
const uint32_t s4 = ps4 + (7 - i)*memshift* threads*blockDim.x;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state1[j] = *(DMatrix + s3 + j*threads*blockDim.x);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state2[j] = LD4S(ls0 + j);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state[j] ^= state1[j] + state2[j];
|
||||
|
||||
round_lyra(state);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
*(DMatrix + s4 + j*threads*blockDim.x) = state1[j] ^ state[j];
|
||||
|
||||
// get data from the previous thread (and simultaneously pass data to the next thread)
|
||||
uint2 Data0 = state[0];
|
||||
uint2 Data1 = state[1];
|
||||
uint2 Data2 = state[2];
|
||||
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
state2[0] ^= Data2;
|
||||
state2[1] ^= Data0;
|
||||
state2[2] ^= Data1;
|
||||
} else {
|
||||
state2[0] ^= Data0;
|
||||
state2[1] ^= Data1;
|
||||
state2[2] ^= Data2;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
*(DMatrix + s0 + j*threads*blockDim.x) = state2[j];
|
||||
}
|
||||
|
||||
// 4, 3, 5
|
||||
for (int i = 0; i < 8; i++)
|
||||
{
|
||||
const uint32_t s3 = ps3 + i * memshift* threads*blockDim.x;
|
||||
const uint32_t s4 = ps4 + i * memshift* threads*blockDim.x;
|
||||
const uint32_t s5 = ps5 + (7 - i)*memshift* threads*blockDim.x;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state1[j] = *(DMatrix + s4 + j*threads*blockDim.x);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state2[j] = *(DMatrix + s3 + j*threads*blockDim.x);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state[j] ^= state1[j] + state2[j];
|
||||
|
||||
round_lyra(state);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
*(DMatrix + s5 + j*threads*blockDim.x) = state1[j] ^ state[j];
|
||||
|
||||
// get data from the previous thread (and simultaneously pass data to the next thread)
|
||||
uint2 Data0 = state[0];
|
||||
uint2 Data1 = state[1];
|
||||
uint2 Data2 = state[2];
|
||||
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
state2[0] ^= Data2;
|
||||
state2[1] ^= Data0;
|
||||
state2[2] ^= Data1;
|
||||
}
|
||||
else
|
||||
{
|
||||
state2[0] ^= Data0;
|
||||
state2[1] ^= Data1;
|
||||
state2[2] ^= Data2;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
*(DMatrix + s3 + j*threads*blockDim.x) = state2[j];
|
||||
}
|
||||
|
||||
// 5, 2, 6
|
||||
for (int i = 0; i < 8; i++)
|
||||
{
|
||||
const uint32_t s2 = ps2 + i * memshift* threads*blockDim.x;
|
||||
const uint32_t s5 = ps5 + i * memshift* threads*blockDim.x;
|
||||
const uint32_t s6 = ps6 + (7 - i)*memshift* threads*blockDim.x;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state1[j] = *(DMatrix + s5 + j*threads*blockDim.x);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state2[j] = *(DMatrix + s2 + j*threads*blockDim.x);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state[j] ^= state1[j] + state2[j];
|
||||
|
||||
round_lyra(state);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
*(DMatrix + s6 + j*threads*blockDim.x) = state1[j] ^ state[j];
|
||||
|
||||
// get data from the previous thread (and simultaneously pass data to the next thread)
|
||||
uint2 Data0 = state[0];
|
||||
uint2 Data1 = state[1];
|
||||
uint2 Data2 = state[2];
|
||||
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
state2[0] ^= Data2;
|
||||
state2[1] ^= Data0;
|
||||
state2[2] ^= Data1;
|
||||
}
|
||||
else
|
||||
{
|
||||
state2[0] ^= Data0;
|
||||
state2[1] ^= Data1;
|
||||
state2[2] ^= Data2;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
*(DMatrix + s2 + j*threads*blockDim.x) = state2[j];
|
||||
}
|
||||
|
||||
// 6, 1, 7
|
||||
for (int i = 0; i < 8; i++)
|
||||
{
|
||||
const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x;
|
||||
const uint32_t s6 = ps6 + i * memshift* threads*blockDim.x;
|
||||
const uint32_t s7 = ps7 + (7 - i)*memshift* threads*blockDim.x;
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state1[j] = *(DMatrix + s6 + j*threads*blockDim.x);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state2[j] = *(DMatrix + s1 + j*threads*blockDim.x);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state[j] ^= state1[j] + state2[j];
|
||||
|
||||
round_lyra(state);
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
*(DMatrix + s7 + j*threads*blockDim.x) = state1[j] ^ state[j];
|
||||
|
||||
// get data from the previous thread (and simultaneously pass data to the next thread)
|
||||
uint2 Data0 = state[0];
|
||||
uint2 Data1 = state[1];
|
||||
uint2 Data2 = state[2];
|
||||
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
state2[0] ^= Data2;
|
||||
state2[1] ^= Data0;
|
||||
state2[2] ^= Data1;
|
||||
} else {
|
||||
state2[0] ^= Data0;
|
||||
state2[1] ^= Data1;
|
||||
state2[2] ^= Data2;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
*(DMatrix + s1 + j*threads*blockDim.x) = state2[j];
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ __forceinline__
|
||||
void reduceDuplexRowV50(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads)
|
||||
{
|
||||
const uint32_t ps1 = (memshift * Ncol * rowIn*threads + thread)*blockDim.x + threadIdx.x;
|
||||
const uint32_t ps2 = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x;
|
||||
const uint32_t ps3 = (memshift * Ncol * rowOut*threads + thread)*blockDim.x + threadIdx.x;
|
||||
|
||||
#pragma unroll 1
|
||||
for (int i = 0; i < 8; i++)
|
||||
{
|
||||
uint2 state1[3], state2[3];
|
||||
|
||||
const uint32_t s1 = ps1 + i*memshift*threads *blockDim.x;
|
||||
const uint32_t s2 = ps2 + i*memshift*threads *blockDim.x;
|
||||
const uint32_t s3 = ps3 + i*memshift*threads *blockDim.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++) {
|
||||
state1[j] = *(DMatrix + s1 + j*threads*blockDim.x);
|
||||
state2[j] = *(DMatrix + s2 + j*threads*blockDim.x);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++) {
|
||||
state1[j] += state2[j];
|
||||
state[j] ^= state1[j];
|
||||
}
|
||||
|
||||
round_lyra(state);
|
||||
|
||||
// get data from the previous thread (and simultaneously pass data to the next thread)
|
||||
uint2 Data0 = state[0];
|
||||
uint2 Data1 = state[1];
|
||||
uint2 Data2 = state[2];
|
||||
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
state2[0] ^= Data2;
|
||||
state2[1] ^= Data0;
|
||||
state2[2] ^= Data1;
|
||||
} else {
|
||||
state2[0] ^= Data0;
|
||||
state2[1] ^= Data1;
|
||||
state2[2] ^= Data2;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
{
|
||||
*(DMatrix + s2 + j*threads*blockDim.x) = state2[j];
|
||||
*(DMatrix + s3 + j*threads*blockDim.x) ^= state[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static __device__ __forceinline__
|
||||
void reduceDuplexRowV50_8(const int rowInOut, uint2 state[4], const uint32_t thread, const uint32_t threads)
|
||||
{
|
||||
const uint32_t ps1 = (memshift * Ncol * 2*threads + thread)*blockDim.x + threadIdx.x;
|
||||
const uint32_t ps2 = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x;
|
||||
// const uint32_t ps3 = (memshift * Ncol * 5*threads + thread)*blockDim.x + threadIdx.x;
|
||||
|
||||
uint2 state1[3], last[3];
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++) {
|
||||
state1[j] = *(DMatrix + ps1 + j*threads*blockDim.x);
|
||||
last[j] = *(DMatrix + ps2 + j*threads*blockDim.x);
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++) {
|
||||
state1[j] += last[j];
|
||||
state[j] ^= state1[j];
|
||||
}
|
||||
|
||||
round_lyra(state);
|
||||
|
||||
// get data from the previous thread (and simultaneously pass data to the next thread)
|
||||
uint2 Data0 = state[0];
|
||||
uint2 Data1 = state[1];
|
||||
uint2 Data2 = state[2];
|
||||
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
|
||||
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
last[0] ^= Data2;
|
||||
last[1] ^= Data0;
|
||||
last[2] ^= Data1;
|
||||
} else {
|
||||
last[0] ^= Data0;
|
||||
last[1] ^= Data1;
|
||||
last[2] ^= Data2;
|
||||
}
|
||||
|
||||
if (rowInOut == 5)
|
||||
{
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
last[j] ^= state[j];
|
||||
}
|
||||
|
||||
for (int i = 1; i < 8; i++)
|
||||
{
|
||||
const uint32_t s1 = ps1 + i*memshift*threads *blockDim.x;
|
||||
const uint32_t s2 = ps2 + i*memshift*threads *blockDim.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state[j] ^= *(DMatrix + s1 + j*threads*blockDim.x) + *(DMatrix + s2 + j*threads*blockDim.x);
|
||||
|
||||
round_lyra(state);
|
||||
}
|
||||
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < 3; j++)
|
||||
state[j] ^= last[j];
|
||||
|
||||
}
|
||||
|
||||
__global__ __launch_bounds__(64, 1)
|
||||
void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
|
||||
{
|
||||
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
|
||||
const uint2x4 blake2b_IV[2] = {
|
||||
{ { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } },
|
||||
{ { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } }
|
||||
};
|
||||
|
||||
if (thread < threads)
|
||||
{
|
||||
uint2x4 state[4];
|
||||
|
||||
((uint2*)state)[0] = __ldg(&g_hash[thread]);
|
||||
((uint2*)state)[1] = __ldg(&g_hash[thread + threads]);
|
||||
((uint2*)state)[2] = __ldg(&g_hash[thread + threads * 2]);
|
||||
((uint2*)state)[3] = __ldg(&g_hash[thread + threads * 3]);
|
||||
|
||||
state[1] = state[0];
|
||||
state[2] = blake2b_IV[0];
|
||||
state[3] = blake2b_IV[1];
|
||||
|
||||
for (int i = 0; i < 24; i++)
|
||||
round_lyra(state); //because 12 is not enough
|
||||
|
||||
((uint2x4*)DMatrix)[0 * threads + thread] = state[0];
|
||||
((uint2x4*)DMatrix)[1 * threads + thread] = state[1];
|
||||
((uint2x4*)DMatrix)[2 * threads + thread] = state[2];
|
||||
((uint2x4*)DMatrix)[3 * threads + thread] = state[3];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ __launch_bounds__(TPB50, 1)
|
||||
void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
|
||||
{
|
||||
const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y);
|
||||
|
||||
if (thread < threads)
|
||||
{
|
||||
uint2 state[4];
|
||||
|
||||
state[0] = __ldg(&DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x]);
|
||||
state[1] = __ldg(&DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x]);
|
||||
state[2] = __ldg(&DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x]);
|
||||
state[3] = __ldg(&DMatrix[(3 * threads + thread)*blockDim.x + threadIdx.x]);
|
||||
|
||||
reduceDuplexV5(state, thread, threads);
|
||||
|
||||
uint32_t rowa = WarpShuffle(state[0].x, 0, 4) & 7;
|
||||
reduceDuplexRowV50(7, rowa, 0, state, thread, threads);
|
||||
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
|
||||
reduceDuplexRowV50(0, rowa, 3, state, thread, threads);
|
||||
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
|
||||
reduceDuplexRowV50(3, rowa, 6, state, thread, threads);
|
||||
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
|
||||
reduceDuplexRowV50(6, rowa, 1, state, thread, threads);
|
||||
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
|
||||
reduceDuplexRowV50(1, rowa, 4, state, thread, threads);
|
||||
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
|
||||
reduceDuplexRowV50(4, rowa, 7, state, thread, threads);
|
||||
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
|
||||
reduceDuplexRowV50(7, rowa, 2, state, thread, threads);
|
||||
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
|
||||
reduceDuplexRowV50_8(rowa, state, thread, threads);
|
||||
|
||||
DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x] = state[0];
|
||||
DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x] = state[1];
|
||||
DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x] = state[2];
|
||||
DMatrix[(3 * threads + thread)*blockDim.x + threadIdx.x] = state[3];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ __launch_bounds__(64, 1)
|
||||
void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
|
||||
{
|
||||
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
|
||||
if (thread < threads)
|
||||
{
|
||||
uint2x4 state[4];
|
||||
|
||||
state[0] = __ldg4(&((uint2x4*)DMatrix)[0 * threads + thread]);
|
||||
state[1] = __ldg4(&((uint2x4*)DMatrix)[1 * threads + thread]);
|
||||
state[2] = __ldg4(&((uint2x4*)DMatrix)[2 * threads + thread]);
|
||||
state[3] = __ldg4(&((uint2x4*)DMatrix)[3 * threads + thread]);
|
||||
|
||||
for (int i = 0; i < 12; i++)
|
||||
round_lyra(state);
|
||||
|
||||
g_hash[thread] = ((uint2*)state)[0];
|
||||
g_hash[thread + threads] = ((uint2*)state)[1];
|
||||
g_hash[thread + threads * 2] = ((uint2*)state)[2];
|
||||
g_hash[thread + threads * 3] = ((uint2*)state)[3];
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
/* if __CUDA_ARCH__ != 500 .. host */
|
||||
__global__ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
|
||||
__global__ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
|
||||
__global__ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
|
||||
#endif
|
@@ -13,7 +13,7 @@
#include "cuda_helper.h"

#if __CUDA_ARCH__ < 300
#define __shfl(x, y) (x)
#define __shfl(x, y, z) (x)
#endif

#if __CUDA_ARCH__ < 320 && !defined(__ldg4)

@@ -23,7 +23,7 @@ extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNon
extern void skein256_cpu_init(int thr_id, uint32_t threads);

extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix);
extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti);

extern void groestl256_cpu_init(int thr_id, uint32_t threads);
extern void groestl256_cpu_free(int thr_id);
@@ -79,36 +79,55 @@ extern "C" void lyra2re_hash(void *state, const void *input)
|
||||
}
|
||||
|
||||
static bool init[MAX_GPUS] = { 0 };
|
||||
static uint32_t throughput[MAX_GPUS] = { 0 };
|
||||
|
||||
extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
|
||||
{
|
||||
uint32_t *pdata = work->data;
|
||||
uint32_t *ptarget = work->target;
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 17 : 16;
|
||||
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4;
|
||||
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
|
||||
|
||||
if (opt_benchmark)
|
||||
ptarget[7] = 0x000f;
|
||||
ptarget[7] = 0x00ff;
|
||||
|
||||
static __thread bool gtx750ti;
|
||||
if (!init[thr_id])
|
||||
{
|
||||
cudaSetDevice(device_map[thr_id]);
|
||||
int dev_id = device_map[thr_id];
|
||||
cudaSetDevice(dev_id);
|
||||
CUDA_LOG_ERROR();
|
||||
|
||||
blake256_cpu_init(thr_id, throughput);
|
||||
keccak256_cpu_init(thr_id,throughput);
|
||||
skein256_cpu_init(thr_id, throughput);
|
||||
groestl256_cpu_init(thr_id, throughput);
|
||||
int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 17 : 16;
|
||||
if (device_sm[device_map[thr_id]] == 500) intensity = 15;
|
||||
int temp = intensity;
|
||||
throughput[thr_id] = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4;
|
||||
if (init[thr_id]) throughput[thr_id] = min(throughput[thr_id], max_nonce - first_nonce);
|
||||
|
||||
// DMatrix
|
||||
cudaMalloc(&d_matrix[thr_id], (size_t)16 * 8 * 8 * sizeof(uint64_t) * throughput);
|
||||
lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]);
|
||||
cudaDeviceProp props;
|
||||
cudaGetDeviceProperties(&props, dev_id);
|
||||
|
||||
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));
|
||||
if (strstr(props.name, "750 Ti")) gtx750ti = true;
|
||||
else gtx750ti = false;
|
||||
|
||||
blake256_cpu_init(thr_id, throughput[thr_id]);
|
||||
keccak256_cpu_init(thr_id, throughput[thr_id]);
|
||||
skein256_cpu_init(thr_id, throughput[thr_id]);
|
||||
groestl256_cpu_init(thr_id, throughput[thr_id]);
|
||||
|
||||
if (device_sm[dev_id] >= 500)
|
||||
{
|
||||
size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4;
|
||||
CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput[thr_id]));
|
||||
lyra2_cpu_init(thr_id, throughput[thr_id], d_matrix[thr_id]);
|
||||
}
|
||||
|
||||
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput[thr_id]));
|
||||
|
||||
init[thr_id] = true;
|
||||
if (temp != intensity){
|
||||
gpulog(LOG_INFO, thr_id, "Intensity set to %u, %u cuda threads",
|
||||
intensity, throughput[thr_id]);
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t _ALIGN(128) endiandata[20];
|
||||
@@ -122,15 +141,15 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
|
||||
int order = 0;
|
||||
uint32_t foundNonce;
|
||||
|
||||
blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||
keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||
lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||
skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||
blake256_cpu_hash_80(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
|
||||
keccak256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
|
||||
lyra2_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], gtx750ti);
|
||||
skein256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
|
||||
TRACE("S")
|
||||
|
||||
*hashes_done = pdata[19] - first_nonce + throughput;
|
||||
*hashes_done = pdata[19] - first_nonce + throughput[thr_id];
|
||||
|
||||
foundNonce = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||
foundNonce = groestl256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
|
||||
if (foundNonce != UINT32_MAX)
|
||||
{
|
||||
uint32_t _ALIGN(64) vhash64[8];
|
||||
@@ -162,11 +181,11 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
|
||||
}
|
||||
}
|
||||
|
||||
if ((uint64_t)throughput + pdata[19] >= max_nonce) {
|
||||
if ((uint64_t)throughput[thr_id] + pdata[19] >= max_nonce) {
|
||||
pdata[19] = max_nonce;
|
||||
break;
|
||||
}
|
||||
pdata[19] += throughput;
|
||||
pdata[19] += throughput[thr_id];
|
||||
|
||||
} while (!work_restart[thr_id].restart);
|
||||
|
||||
|