
lyra2: import latest nanashi code (v1)

Tanguy Pruvot 8 years ago
parent commit 2520f9a388
6 changed files:
    1  README.txt
  603  lyra2/cuda_lyra2.cu
    7  lyra2/cuda_lyra2_sm2.cuh
  693  lyra2/cuda_lyra2_sm5.cuh
    2  lyra2/cuda_lyra2_vectors.h
   63  lyra2/lyra2RE.cu

README.txt (1 change)

@@ -247,6 +247,7 @@ features.
Aug. 10th 2016 v1.8.1
SIA Blake2-B Algo (getwork over stratum for Suprnova)
SIA Nanopool RPC (getwork over http)
+Update also the older lyra2 with Nanashi version
July 20th 2016 v1.8.0
Pascal support with cuda 8

lyra2/cuda_lyra2.cu (603 changes)

@@ -1,40 +1,214 @@
/**
- * Lyra2 (v1) cuda implementation based on djm34 work - SM 5/5.2
- * tpruvot@github 2015
 * Lyra2 (v1) cuda implementation based on djm34 work
 * tpruvot@github 2015, Nanashi 08/2016 (from 1.8-r2)
*/
#include <stdio.h>
#include <memory.h>
-#define TPB50 16
-#define TPB52 8
#define TPB52 32
#include "cuda_lyra2_sm2.cuh"
#include "cuda_lyra2_sm5.cuh"
#ifdef __INTELLISENSE__
/* just for vstudio code colors */
-#define __CUDA_ARCH__ 500
#define __CUDA_ARCH__ 520
#endif
-#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 500
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ > 500
-#include "cuda_vector_uint2x4.h"
#include "cuda_lyra2_vectors.h"
-#define memshift 3
#ifdef __INTELLISENSE__
/* just for vstudio code colors */
__device__ uint32_t __shfl(uint32_t a, uint32_t b, uint32_t c);
#endif
#define Nrow 8
#define Ncol 8
-#define NcolMask 0x7
#define memshift 3
#define BUF_COUNT 0
-__device__ uint2x4* DMatrix;
__device__ uint2 *DMatrix;
__device__ __forceinline__ void LD4S(uint2 res[3], const int row, const int col, const int thread, const int threads)
{
#if BUF_COUNT != 8
extern __shared__ uint2 shared_mem[];
const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift;
#endif
#if BUF_COUNT != 0
const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x;
#endif
#if BUF_COUNT == 8
#pragma unroll
for (int j = 0; j < 3; j++)
res[j] = *(DMatrix + d0 + j * threads * blockDim.x);
#elif BUF_COUNT == 0
#pragma unroll
for (int j = 0; j < 3; j++)
res[j] = shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x];
#else
if (row < BUF_COUNT)
{
#pragma unroll
for (int j = 0; j < 3; j++)
res[j] = *(DMatrix + d0 + j * threads * blockDim.x);
}
else
{
#pragma unroll
for (int j = 0; j < 3; j++)
res[j] = shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x];
}
#endif
}
__device__ __forceinline__ void ST4S(const int row, const int col, const uint2 data[3], const int thread, const int threads)
{
#if BUF_COUNT != 8
extern __shared__ uint2 shared_mem[];
const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift;
#endif
#if BUF_COUNT != 0
const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x;
#endif
#if BUF_COUNT == 8
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + d0 + j * threads * blockDim.x) = data[j];
#elif BUF_COUNT == 0
#pragma unroll
for (int j = 0; j < 3; j++)
shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data[j];
#else
if (row < BUF_COUNT)
{
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + d0 + j * threads * blockDim.x) = data[j];
}
else
{
#pragma unroll
for (int j = 0; j < 3; j++)
shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data[j];
}
#endif
}
#if __CUDA_ARCH__ >= 300
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
{
return __shfl(a, b, c);
}
__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
{
return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c));
}
__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
{
a1 = WarpShuffle(a1, b1, c);
a2 = WarpShuffle(a2, b2, c);
a3 = WarpShuffle(a3, b3, c);
}
#else
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
{
extern __shared__ uint2 shared_mem[];
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
uint32_t *_ptr = (uint32_t*)shared_mem;
__threadfence_block();
uint32_t buf = _ptr[thread];
_ptr[thread] = a;
__threadfence_block();
uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))];
__threadfence_block();
_ptr[thread] = buf;
__threadfence_block();
return result;
}
__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
{
extern __shared__ uint2 shared_mem[];
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
__threadfence_block();
uint2 buf = shared_mem[thread];
shared_mem[thread] = a;
__threadfence_block();
uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))];
__threadfence_block();
shared_mem[thread] = buf;
__threadfence_block();
return result;
}
__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
{
extern __shared__ uint2 shared_mem[];
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
__threadfence_block();
uint2 buf = shared_mem[thread];
shared_mem[thread] = a1;
__threadfence_block();
a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))];
__threadfence_block();
shared_mem[thread] = a2;
__threadfence_block();
a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))];
__threadfence_block();
shared_mem[thread] = a3;
__threadfence_block();
a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))];
__threadfence_block();
shared_mem[thread] = buf;
__threadfence_block();
}
#endif
#if __CUDA_ARCH__ > 500 || !defined(__CUDA_ARCH__)
static __device__ __forceinline__
void Gfunc(uint2 &a, uint2 &b, uint2 &c, uint2 &d)
{
-a += b; d ^= a; d = SWAPUINT2(d);
-c += d; b ^= c; b = ROR2(b, 24);
-a += b; d ^= a; d = ROR2(d, 16);
a += b; uint2 tmp = d; d.y = a.x ^ tmp.x; d.x = a.y ^ tmp.y;
c += d; b ^= c; b = ROR24(b);
a += b; d ^= a; d = ROR16(d);
c += d; b ^= c; b = ROR2(b, 63);
}
#endif
__device__ __forceinline__ void round_lyra(uint2 s[4])
{
Gfunc(s[0], s[1], s[2], s[3]);
WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4);
Gfunc(s[0], s[1], s[2], s[3]);
WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4);
}
static __device__ __forceinline__
void round_lyra(uint2x4* s)
@@ -50,21 +224,24 @@ void round_lyra(uint2x4* s)
}
static __device__ __forceinline__
-void reduceDuplex(uint2x4 state[4], uint32_t thread)
void reduceDuplex(uint2 state[4], uint32_t thread, const uint32_t threads)
{
-uint2x4 state1[3];
-const uint32_t ps1 = (256 * thread);
-const uint32_t ps2 = (memshift * 7 + memshift * 8 + 256 * thread);
uint2 state1[3];
#if __CUDA_ARCH__ > 500
#pragma unroll
#endif
for (int i = 0; i < Nrow; i++)
{
ST4S(0, Ncol - i - 1, state, thread, threads);
round_lyra(state);
}
#pragma unroll 4
-for (int i = 0; i < 8; i++)
for (int i = 0; i < Nrow; i++)
{
-const uint32_t s1 = ps1 + i*memshift;
-const uint32_t s2 = ps2 - i*memshift;
-for (int j = 0; j < 3; j++)
-state1[j] = __ldg4(&(DMatrix+s1)[j]);
LD4S(state1, 0, i, thread, threads);
for (int j = 0; j < 3; j++)
state[j] ^= state1[j];
@@ -72,208 +249,324 @@ void reduceDuplex(uint2x4 state[4], uint32_t thread)
for (int j = 0; j < 3; j++)
state1[j] ^= state[j];
-for (int j = 0; j < 3; j++)
-(DMatrix + s2)[j] = state1[j];
ST4S(1, Ncol - i - 1, state1, thread, threads);
}
}
-static __device__ __forceinline__
-void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2x4 state[4], uint32_t thread)
-{
-uint2x4 state1[3], state2[3];
-const uint32_t ps1 = ( memshift*8 * rowIn + 256 * thread);
-const uint32_t ps2 = ( memshift*8 * rowInOut + 256 * thread);
-const uint32_t ps3 = (memshift*7 + memshift*8 * rowOut + 256 * thread);
-#pragma unroll 1
-for (int i = 0; i < 8; i++)
-{
-const uint32_t s1 = ps1 + i*memshift;
-const uint32_t s2 = ps2 + i*memshift;
-for (int j = 0; j < 3; j++)
-state1[j]= __ldg4(&(DMatrix + s1)[j]);
-for (int j = 0; j < 3; j++)
-state2[j]= __ldg4(&(DMatrix + s2)[j]);
-for (int j = 0; j < 3; j++) {
-uint2x4 tmp = state1[j] + state2[j];
-state[j] ^= tmp;
-}
-round_lyra(state);
-for (int j = 0; j < 3; j++) {
-const uint32_t s3 = ps3 - i*memshift;
-state1[j] ^= state[j];
-(DMatrix + s3)[j] = state1[j];
-}
-((uint2*)state2)[0] ^= ((uint2*)state)[11];
-for (int j = 0; j < 11; j++)
-((uint2*)state2)[j+1] ^= ((uint2*)state)[j];
-for (int j = 0; j < 3; j++)
-(DMatrix + s2)[j] = state2[j];
-}
-}
static __device__ __forceinline__
void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], uint32_t thread, const uint32_t threads)
{
uint2 state1[3], state2[3];
#pragma unroll 1
for (int i = 0; i < Nrow; i++)
{
LD4S(state1, rowIn, i, thread, threads);
LD4S(state2, rowInOut, i, thread, threads);
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] ^= state[j];
ST4S(rowOut, Ncol - i - 1, state1, thread, threads);
//fetch data from the previous thread (and pass data to the next thread at the same time)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
} else {
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
ST4S(rowInOut, i, state2, thread, threads);
}
}
-static __device__ __forceinline__
-void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uint2x4* state, const uint32_t thread)
-{
-const uint32_t ps1 = (memshift * 8 * rowIn + 256 * thread);
-const uint32_t ps2 = (memshift * 8 * rowInOut + 256 * thread);
-const uint32_t ps3 = (memshift * 8 * rowOut + 256 * thread);
-#pragma unroll 1
-for (int i = 0; i < 8; i++)
-{
-uint2x4 state1[3], state2[3];
-const uint32_t s1 = ps1 + i*memshift;
-const uint32_t s2 = ps2 + i*memshift;
-for (int j = 0; j < 3; j++) {
-state1[j] = __ldg4(&(DMatrix + s1)[j]);
-state2[j] = __ldg4(&(DMatrix + s2)[j]);
-}
-#pragma unroll
-for (int j = 0; j < 3; j++) {
-state1[j] += state2[j];
-state[j] ^= state1[j];
-}
-round_lyra(state);
-((uint2*)state2)[0] ^= ((uint2*)state)[11];
-for (int j = 0; j < 11; j++)
-((uint2*)state2)[j + 1] ^= ((uint2*)state)[j];
-if (rowInOut == rowOut) {
-for (int j = 0; j < 3; j++) {
-state2[j] ^= state[j];
-(DMatrix + s2)[j]=state2[j];
-}
-} else {
-const uint32_t s3 = ps3 + i*memshift;
-for (int j = 0; j < 3; j++) {
-(DMatrix + s2)[j] = state2[j];
-(DMatrix + s3)[j] ^= state[j];
-}
-}
-}
-}
static __device__ __forceinline__
void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads)
{
for (int i = 0; i < Nrow; i++)
{
uint2 state1[3], state2[3];
LD4S(state1, rowIn, i, thread, threads);
LD4S(state2, rowInOut, i, thread, threads);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
//fetch data from the previous thread (and pass data to the next thread at the same time)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
}
else
{
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
ST4S(rowInOut, i, state2, thread, threads);
LD4S(state1, rowOut, i, thread, threads);
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] ^= state[j];
ST4S(rowOut, i, state1, thread, threads);
}
}
static __device__ __forceinline__
void reduceDuplexRowt_8(const int rowInOut, uint2* state, const uint32_t thread, const uint32_t threads)
{
uint2 state1[3], state2[3], last[3];
LD4S(state1, 2, 0, thread, threads);
LD4S(last, rowInOut, 0, thread, threads);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + last[j];
round_lyra(state);
//fetch data from the previous thread (and pass data to the next thread at the same time)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
last[0] ^= Data2;
last[1] ^= Data0;
last[2] ^= Data1;
} else {
last[0] ^= Data0;
last[1] ^= Data1;
last[2] ^= Data2;
}
if (rowInOut == 5)
{
#pragma unroll
for (int j = 0; j < 3; j++)
last[j] ^= state[j];
}
for (int i = 1; i < Nrow; i++)
{
LD4S(state1, 2, i, thread, threads);
LD4S(state2, rowInOut, i, thread, threads);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
}
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= last[j];
}
-#if __CUDA_ARCH__ == 500
-__global__ __launch_bounds__(TPB50, 1)
-#else
-__global__ __launch_bounds__(TPB52, 2)
-#endif
-void lyra2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
__constant__ uint2x4 blake2b_IV[2] = {
0xf3bcc908lu, 0x6a09e667lu,
0x84caa73blu, 0xbb67ae85lu,
0xfe94f82blu, 0x3c6ef372lu,
0x5f1d36f1lu, 0xa54ff53alu,
0xade682d1lu, 0x510e527flu,
0x2b3e6c1flu, 0x9b05688clu,
0xfb41bd6blu, 0x1f83d9ablu,
0x137e2179lu, 0x5be0cd19lu
};
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-const uint2x4 blake2b_IV[2] = {
-{{ 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a }},
-{{ 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 }}
-};
if (thread < threads)
{
uint2x4 state[4];
-((uint2*)state)[0] = __ldg(&g_hash[thread]);
-((uint2*)state)[1] = __ldg(&g_hash[thread + threads]);
-((uint2*)state)[2] = __ldg(&g_hash[thread + threads*2]);
-((uint2*)state)[3] = __ldg(&g_hash[thread + threads*3]);
-state[1] = state[0];
state[0].x = state[1].x = __ldg(&g_hash[thread + threads * 0]);
state[0].y = state[1].y = __ldg(&g_hash[thread + threads * 1]);
state[0].z = state[1].z = __ldg(&g_hash[thread + threads * 2]);
state[0].w = state[1].w = __ldg(&g_hash[thread + threads * 3]);
state[2] = blake2b_IV[0];
state[3] = blake2b_IV[1];
for (int i = 0; i<24; i++)
round_lyra(state); //because 12 is not enough
-const uint32_t ps1 = (memshift * 7 + 256 * thread);
-for (int i = 0; i < 8; i++)
-{
-const uint32_t s1 = ps1 - memshift * i;
-for (int j = 0; j < 3; j++)
-(DMatrix + s1)[j] = (state)[j];
-round_lyra(state);
-}
((uint2x4*)DMatrix)[threads * 0 + thread] = state[0];
((uint2x4*)DMatrix)[threads * 1 + thread] = state[1];
((uint2x4*)DMatrix)[threads * 2 + thread] = state[2];
((uint2x4*)DMatrix)[threads * 3 + thread] = state[3];
}
}
-reduceDuplex(state, thread);
-reduceDuplexRowSetup(1, 0, 2, state, thread);
-reduceDuplexRowSetup(2, 1, 3, state, thread);
-reduceDuplexRowSetup(3, 0, 4, state, thread);
-reduceDuplexRowSetup(4, 3, 5, state, thread);
-reduceDuplexRowSetup(5, 2, 6, state, thread);
-reduceDuplexRowSetup(6, 1, 7, state, thread);
-uint32_t rowa = state[0].x.x & 7;
-reduceDuplexRowt(7, rowa, 0, state, thread);
-rowa = state[0].x.x & 7;
-reduceDuplexRowt(0, rowa, 3, state, thread);
-rowa = state[0].x.x & 7;
-reduceDuplexRowt(3, rowa, 6, state, thread);
-rowa = state[0].x.x & 7;
-reduceDuplexRowt(6, rowa, 1, state, thread);
-rowa = state[0].x.x & 7;
-reduceDuplexRowt(1, rowa, 4, state, thread);
-rowa = state[0].x.x & 7;
-reduceDuplexRowt(4, rowa, 7, state, thread);
-rowa = state[0].x.x & 7;
-reduceDuplexRowt(7, rowa, 2, state, thread);
-rowa = state[0].x.x & 7;
-reduceDuplexRowt(2, rowa, 5, state, thread);
-const int32_t shift = (memshift * 8 * rowa + 256 * thread);
-#pragma unroll
-for (int j = 0; j < 3; j++)
-state[j] ^= __ldg4(&(DMatrix + shift)[j]);
__global__
__launch_bounds__(TPB52, 1)
void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
{
const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y;
if (thread < threads)
{
uint2 state[4];
state[0] = __ldg(&DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x]);
state[1] = __ldg(&DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x]);
state[2] = __ldg(&DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x]);
state[3] = __ldg(&DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x]);
reduceDuplex(state, thread, threads);
reduceDuplexRowSetup(1, 0, 2, state, thread, threads);
reduceDuplexRowSetup(2, 1, 3, state, thread, threads);
reduceDuplexRowSetup(3, 0, 4, state, thread, threads);
reduceDuplexRowSetup(4, 3, 5, state, thread, threads);
reduceDuplexRowSetup(5, 2, 6, state, thread, threads);
reduceDuplexRowSetup(6, 1, 7, state, thread, threads);
uint32_t rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowt(7, rowa, 0, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowt(0, rowa, 3, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowt(3, rowa, 6, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowt(6, rowa, 1, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowt(1, rowa, 4, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowt(4, rowa, 7, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowt(7, rowa, 2, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowt_8(rowa, state, thread, threads);
DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x] = state[0];
DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x] = state[1];
DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x] = state[2];
DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x] = state[3];
}
}
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
{
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
uint28 state[4];
if (thread < threads)
{
state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]);
state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]);
state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]);
state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]);
for (int i = 0; i < 12; i++)
round_lyra(state);
-g_hash[thread] = ((uint2*)state)[0];
-g_hash[thread + threads] = ((uint2*)state)[1];
-g_hash[thread + threads*2] = ((uint2*)state)[2];
-g_hash[thread + threads*3] = ((uint2*)state)[3];
g_hash[thread + threads * 0] = state[0].x;
g_hash[thread + threads * 1] = state[0].y;
g_hash[thread + threads * 2] = state[0].z;
g_hash[thread + threads * 3] = state[0].w;
}
-} //thread
}
#else
#if __CUDA_ARCH__ < 500
/* for unsupported SM arch */
__device__ void* DMatrix;
#endif
-__global__ void lyra2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {}
__global__ void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
#endif
__host__
-void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t* d_matrix)
void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix)
{
-cuda_get_arch(thr_id);
// just assign the device pointer allocated in main loop
cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice);
}
__host__
-void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order)
void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, bool gtx750ti)
{
int dev_id = device_map[thr_id % MAX_GPUS];
uint32_t tpb = TPB52;
-if (device_sm[dev_id] == 500) tpb = TPB50;
-if (device_sm[dev_id] == 350) tpb = TPB30; // to enhance (or not)
-if (device_sm[dev_id] <= 300) tpb = TPB30;
-dim3 grid((threads + tpb - 1) / tpb);
-dim3 block(tpb);
-if (device_sm[dev_id] >= 500)
-lyra2_gpu_hash_32 <<< grid, block >>> (threads, startNounce, (uint2*)d_hash);
-else
-lyra2_gpu_hash_32_sm2 <<< grid, block >>> (threads, startNounce, d_hash);
if (cuda_arch[dev_id] >= 520) tpb = TPB52;
else if (cuda_arch[dev_id] >= 500) tpb = TPB50;
else if (cuda_arch[dev_id] >= 200) tpb = TPB20;
dim3 grid1((threads * 4 + tpb - 1) / tpb);
dim3 block1(4, tpb >> 2);
dim3 grid2((threads + 64 - 1) / 64);
dim3 block2(64);
dim3 grid3((threads + tpb - 1) / tpb);
dim3 block3(tpb);
if (cuda_arch[dev_id] >= 520)
{
lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash);
lyra2_gpu_hash_32_2 <<< grid1, block1, 24 * (8 - 0) * sizeof(uint2) * tpb >>> (threads, startNounce, d_hash);
lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash);
}
else if (cuda_arch[dev_id] >= 500)
{
size_t shared_mem = 0;
if (gtx750ti)
// reserve 8192 bytes, to tune for 8 warps
shared_mem = 8192;
else
// reserve 6144 bytes, to tune for 10 warps
shared_mem = 6144;
lyra2_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash);
lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, startNounce, (uint2*)d_hash);
lyra2_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash);
}
else
lyra2_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, startNounce, d_hash);
}
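Note: on pre-Kepler devices (__CUDA_ARCH__ < 300) the WarpShuffle helpers above emulate __shfl() through dynamic shared memory: each thread publishes its value to a per-thread slot, then reads the slot of the requested lane inside its width-wide group. Below is a minimal self-contained sketch of that addressing scheme; the emu_shuffle/demo names are illustrative only and not part of this commit, and the real helpers additionally save and restore the slot contents, because the same shared buffer holds the Lyra2 row matrix. Like the original, it relies on the threads of one warp running in lockstep.

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

__device__ uint32_t emu_shuffle(uint32_t a, uint32_t src_lane, uint32_t width)
{
	extern __shared__ uint32_t slots[];
	const uint32_t t = blockDim.x * threadIdx.y + threadIdx.x;
	slots[t] = a;
	__threadfence_block();
	// lane selection: keep the group base (t & ~(width-1)), pick src_lane within the group
	uint32_t r = slots[(t & ~(width - 1)) + (src_lane & (width - 1))];
	__threadfence_block();
	return r;
}

__global__ void demo(uint32_t *out)
{
	// rotate values one lane to the left within groups of 4, as round_lyra() does
	out[threadIdx.x] = emu_shuffle(threadIdx.x * 10, threadIdx.x + 1, 4);
}

int main()
{
	uint32_t *d, h[8];
	cudaMalloc(&d, sizeof(h));
	demo<<<1, 8, 8 * sizeof(uint32_t)>>>(d);
	cudaMemcpy(h, d, sizeof(h), cudaMemcpyDeviceToHost);
	for (int i = 0; i < 8; i++) printf("%u ", h[i]); // 10 20 30 0 50 60 70 40
	printf("\n");
	cudaFree(d);
	return 0;
}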

lyra2/cuda_lyra2_sm2.cuh (7 changes)

@@ -3,15 +3,16 @@
#ifdef __INTELLISENSE__
/* just for vstudio code colors */
#undef __CUDA_ARCH__
-#define __CUDA_ARCH__ 300
#define __CUDA_ARCH__ 500
#endif
#include "cuda_helper.h"
#define TPB30 160
#define TPB20 160
#if (__CUDA_ARCH__ >= 200 && __CUDA_ARCH__ <= 350) || !defined(__CUDA_ARCH__)
-__constant__ static uint2 blake2b_IV[8] = {
__constant__ static uint2 blake2b_IV_sm2[8] = {
{ 0xf3bcc908, 0x6a09e667 },
{ 0x84caa73b, 0xbb67ae85 },
{ 0xfe94f82b, 0x3c6ef372 },
@@ -149,7 +150,7 @@ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_h
#pragma unroll
for (int i = 0; i<8; i++) {
-state[i + 8] = blake2b_IV[i];
state[i + 8] = blake2b_IV_sm2[i];
}
// blake2blyra x2

lyra2/cuda_lyra2_sm5.cuh (693 changes)

@@ -0,0 +1,693 @@
#include <memory.h>
#ifdef __INTELLISENSE__
/* just for vstudio code colors */
#undef __CUDA_ARCH__
#define __CUDA_ARCH__ 500
#endif
#include "cuda_helper.h"
#define TPB50 32
#if __CUDA_ARCH__ == 500
#include "cuda_lyra2_vectors.h"
#define Nrow 8
#define Ncol 8
#define memshift 3
__device__ uint2 *DMatrix;
__device__ __forceinline__ uint2 LD4S(const int index)
{
extern __shared__ uint2 shared_mem[];
return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x];
}
__device__ __forceinline__ void ST4S(const int index, const uint2 data)
{
extern __shared__ uint2 shared_mem[];
shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data;
}
#if __CUDA_ARCH__ == 300
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
{
return __shfl(a, b, c);
}
__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
{
return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c));
}
__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
{
a1 = WarpShuffle(a1, b1, c);
a2 = WarpShuffle(a2, b2, c);
a3 = WarpShuffle(a3, b3, c);
}
#else
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
{
extern __shared__ uint2 shared_mem[];
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
uint32_t *_ptr = (uint32_t*)shared_mem;
__threadfence_block();
uint32_t buf = _ptr[thread];
_ptr[thread] = a;
__threadfence_block();
uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))];
__threadfence_block();
_ptr[thread] = buf;
__threadfence_block();
return result;
}
__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
{
extern __shared__ uint2 shared_mem[];
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
__threadfence_block();
uint2 buf = shared_mem[thread];
shared_mem[thread] = a;
__threadfence_block();
uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))];
__threadfence_block();
shared_mem[thread] = buf;
__threadfence_block();
return result;
}
__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
{
extern __shared__ uint2 shared_mem[];
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
__threadfence_block();
uint2 buf = shared_mem[thread];
shared_mem[thread] = a1;
__threadfence_block();
a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))];
__threadfence_block();
shared_mem[thread] = a2;
__threadfence_block();
a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))];
__threadfence_block();
shared_mem[thread] = a3;
__threadfence_block();
a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))];
__threadfence_block();
shared_mem[thread] = buf;
__threadfence_block();
}
#endif
#if __CUDA_ARCH__ >= 300
static __device__ __forceinline__
void Gfunc(uint2 &a, uint2 &b, uint2 &c, uint2 &d)
{
a += b; d ^= a; d = SWAPUINT2(d);
c += d; b ^= c; b = ROR24(b); //ROR2(b, 24);
a += b; d ^= a; d = ROR16(d);
c += d; b ^= c; b = ROR2(b, 63);
}
#endif
__device__ __forceinline__ void round_lyra(uint2 s[4])
{
Gfunc(s[0], s[1], s[2], s[3]);
WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4);
Gfunc(s[0], s[1], s[2], s[3]);
WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4);
}
static __device__ __forceinline__
void round_lyra(uint2x4* s)
{
Gfunc(s[0].x, s[1].x, s[2].x, s[3].x);
Gfunc(s[0].y, s[1].y, s[2].y, s[3].y);
Gfunc(s[0].z, s[1].z, s[2].z, s[3].z);
Gfunc(s[0].w, s[1].w, s[2].w, s[3].w);
Gfunc(s[0].x, s[1].y, s[2].z, s[3].w);
Gfunc(s[0].y, s[1].z, s[2].w, s[3].x);
Gfunc(s[0].z, s[1].w, s[2].x, s[3].y);
Gfunc(s[0].w, s[1].x, s[2].y, s[3].z);
}
static __device__ __forceinline__
void reduceDuplexV5(uint2 state[4], const uint32_t thread, const uint32_t threads)
{
uint2 state1[3], state2[3];
const uint32_t ps0 = (memshift * Ncol * 0 * threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps1 = (memshift * Ncol * 1 * threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps2 = (memshift * Ncol * 2 * threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps3 = (memshift * Ncol * 3 * threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps4 = (memshift * Ncol * 4 * threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps5 = (memshift * Ncol * 5 * threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps6 = (memshift * Ncol * 6 * threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps7 = (memshift * Ncol * 7 * threads + thread)*blockDim.x + threadIdx.x;
for (int i = 0; i < 8; i++)
{
const uint32_t s0 = memshift * Ncol * 0 + (Ncol - 1 - i) * memshift;
#pragma unroll
for (int j = 0; j < 3; j++)
ST4S(s0 + j, state[j]);
round_lyra(state);
}
for (int i = 0; i < 8; i++)
{
const uint32_t s0 = memshift * Ncol * 0 + i * memshift;
const uint32_t s1 = ps1 + (7 - i)*memshift* threads*blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = LD4S(s0 + j);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j];
round_lyra(state);
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s1 + j*threads*blockDim.x) = state1[j] ^ state[j];
}
// 1, 0, 2
for (int i = 0; i < 8; i++)
{
const uint32_t s0 = memshift * Ncol * 0 + i * memshift;
const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x;
const uint32_t s2 = ps2 + (7 - i)*memshift* threads*blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = *(DMatrix + s1 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] = LD4S(s0 + j);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s2 + j*threads*blockDim.x) = state1[j] ^ state[j];
//fetch data from the previous thread (and pass data to the next thread at the same time)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
}
else
{
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
ST4S(s0 + j, state2[j]);
}
// 2, 1, 3
for (int i = 0; i < 8; i++)
{
const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x;
const uint32_t s2 = ps2 + i * memshift* threads*blockDim.x;
const uint32_t s3 = ps3 + (7 - i)*memshift* threads*blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = *(DMatrix + s2 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] = *(DMatrix + s1 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s3 + j*threads*blockDim.x) = state1[j] ^ state[j];
//fetch data from the previous thread (and pass data to the next thread at the same time)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
} else {
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s1 + j*threads*blockDim.x) = state2[j];
}
// 3, 0, 4
for (int i = 0; i < 8; i++)
{
const uint32_t ls0 = memshift * Ncol * 0 + i * memshift;
const uint32_t s0 = ps0 + i * memshift* threads*blockDim.x;
const uint32_t s3 = ps3 + i * memshift* threads*blockDim.x;
const uint32_t s4 = ps4 + (7 - i)*memshift* threads*blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = *(DMatrix + s3 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] = LD4S(ls0 + j);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s4 + j*threads*blockDim.x) = state1[j] ^ state[j];
//fetch data from the previous thread (and pass data to the next thread at the same time)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
} else {
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s0 + j*threads*blockDim.x) = state2[j];
}
// 4, 3, 5
for (int i = 0; i < 8; i++)
{
const uint32_t s3 = ps3 + i * memshift* threads*blockDim.x;
const uint32_t s4 = ps4 + i * memshift* threads*blockDim.x;
const uint32_t s5 = ps5 + (7 - i)*memshift* threads*blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = *(DMatrix + s4 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] = *(DMatrix + s3 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s5 + j*threads*blockDim.x) = state1[j] ^ state[j];
//fetch data from the previous thread (and pass data to the next thread at the same time)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
}
else
{
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s3 + j*threads*blockDim.x) = state2[j];
}
// 5, 2, 6
for (int i = 0; i < 8; i++)
{
const uint32_t s2 = ps2 + i * memshift* threads*blockDim.x;
const uint32_t s5 = ps5 + i * memshift* threads*blockDim.x;
const uint32_t s6 = ps6 + (7 - i)*memshift* threads*blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = *(DMatrix + s5 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] = *(DMatrix + s2 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s6 + j*threads*blockDim.x) = state1[j] ^ state[j];
//fetch data from the previous thread (and pass data to the next thread at the same time)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
}
else
{
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s2 + j*threads*blockDim.x) = state2[j];
}
// 6, 1, 7
for (int i = 0; i < 8; i++)
{
const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x;
const uint32_t s6 = ps6 + i * memshift* threads*blockDim.x;
const uint32_t s7 = ps7 + (7 - i)*memshift* threads*blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = *(DMatrix + s6 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] = *(DMatrix + s1 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s7 + j*threads*blockDim.x) = state1[j] ^ state[j];
//fetch data from the previous thread (and pass data to the next thread at the same time)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
} else {
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s1 + j*threads*blockDim.x) = state2[j];
}
}
static __device__ __forceinline__
void reduceDuplexRowV50(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads)
{
const uint32_t ps1 = (memshift * Ncol * rowIn*threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps2 = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps3 = (memshift * Ncol * rowOut*threads + thread)*blockDim.x + threadIdx.x;
#pragma unroll 1
for (int i = 0; i < 8; i++)
{
uint2 state1[3], state2[3];
const uint32_t s1 = ps1 + i*memshift*threads *blockDim.x;
const uint32_t s2 = ps2 + i*memshift*threads *blockDim.x;
const uint32_t s3 = ps3 + i*memshift*threads *blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++) {
state1[j] = *(DMatrix + s1 + j*threads*blockDim.x);
state2[j] = *(DMatrix + s2 + j*threads*blockDim.x);
}
#pragma unroll
for (int j = 0; j < 3; j++) {
state1[j] += state2[j];
state[j] ^= state1[j];
}
round_lyra(state);
//fetch data from the previous thread (and pass data to the next thread at the same time)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
} else {
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
{
*(DMatrix + s2 + j*threads*blockDim.x) = state2[j];
*(DMatrix + s3 + j*threads*blockDim.x) ^= state[j];
}
}
}
static __device__ __forceinline__
void reduceDuplexRowV50_8(const int rowInOut, uint2 state[4], const uint32_t thread, const uint32_t threads)
{
const uint32_t ps1 = (memshift * Ncol * 2*threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps2 = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x;
// const uint32_t ps3 = (memshift * Ncol * 5*threads + thread)*blockDim.x + threadIdx.x;
uint2 state1[3], last[3];
#pragma unroll
for (int j = 0; j < 3; j++) {
state1[j] = *(DMatrix + ps1 + j*threads*blockDim.x);
last[j] = *(DMatrix + ps2 + j*threads*blockDim.x);
}
#pragma unroll
for (int j = 0; j < 3; j++) {
state1[j] += last[j];
state[j] ^= state1[j];
}
round_lyra(state);
//fetch data from the previous thread (and pass data to the next thread at the same time)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
last[0] ^= Data2;
last[1] ^= Data0;
last[2] ^= Data1;
} else {
last[0] ^= Data0;
last[1] ^= Data1;
last[2] ^= Data2;
}
if (rowInOut == 5)
{
#pragma unroll
for (int j = 0; j < 3; j++)
last[j] ^= state[j];
}
for (int i = 1; i < 8; i++)
{
const uint32_t s1 = ps1 + i*memshift*threads *blockDim.x;
const uint32_t s2 = ps2 + i*memshift*threads *blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= *(DMatrix + s1 + j*threads*blockDim.x) + *(DMatrix + s2 + j*threads*blockDim.x);
round_lyra(state);
}
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= last[j];
}
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
const uint2x4 blake2b_IV[2] = {
{ { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } },
{ { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } }
};
if (thread < threads)
{
uint2x4 state[4];
((uint2*)state)[0] = __ldg(&g_hash[thread]);
((uint2*)state)[1] = __ldg(&g_hash[thread + threads]);
((uint2*)state)[2] = __ldg(&g_hash[thread + threads * 2]);
((uint2*)state)[3] = __ldg(&g_hash[thread + threads * 3]);
state[1] = state[0];
state[2] = blake2b_IV[0];
state[3] = blake2b_IV[1];
for (int i = 0; i < 24; i++)
round_lyra(state); //because 12 is not enough
((uint2x4*)DMatrix)[0 * threads + thread] = state[0];
((uint2x4*)DMatrix)[1 * threads + thread] = state[1];
((uint2x4*)DMatrix)[2 * threads + thread] = state[2];
((uint2x4*)DMatrix)[3 * threads + thread] = state[3];
}
}
__global__ __launch_bounds__(TPB50, 1)
void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
{
const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y);
if (thread < threads)
{
uint2 state[4];
state[0] = __ldg(&DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x]);
state[1] = __ldg(&DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x]);
state[2] = __ldg(&DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x]);
state[3] = __ldg(&DMatrix[(3 * threads + thread)*blockDim.x + threadIdx.x]);
reduceDuplexV5(state, thread, threads);
uint32_t rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowV50(7, rowa, 0, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowV50(0, rowa, 3, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowV50(3, rowa, 6, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowV50(6, rowa, 1, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowV50(1, rowa, 4, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowV50(4, rowa, 7, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowV50(7, rowa, 2, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowV50_8(rowa, state, thread, threads);
DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x] = state[0];
DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x] = state[1];
DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x] = state[2];
DMatrix[(3 * threads + thread)*blockDim.x + threadIdx.x] = state[3];
}
}
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint2x4 state[4];
state[0] = __ldg4(&((uint2x4*)DMatrix)[0 * threads + thread]);
state[1] = __ldg4(&((uint2x4*)DMatrix)[1 * threads + thread]);
state[2] = __ldg4(&((uint2x4*)DMatrix)[2 * threads + thread]);
state[3] = __ldg4(&((uint2x4*)DMatrix)[3 * threads + thread]);
for (int i = 0; i < 12; i++)
round_lyra(state);
g_hash[thread] = ((uint2*)state)[0];
g_hash[thread + threads] = ((uint2*)state)[1];
g_hash[thread + threads * 2] = ((uint2*)state)[2];
g_hash[thread + threads * 3] = ((uint2*)state)[3];
}
}
#else
/* if __CUDA_ARCH__ != 500 .. host */
__global__ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
#endif
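For reference, the Gfunc() above is the Blake2b G mixing function on 64-bit words; each word is held as a uint2 low/high pair on the device, so SWAPUINT2 amounts to a 32-bit rotation and ROR24/ROR16 are byte-aligned rotations. A plain host-side sketch of one G step on uint64_t follows; rotr64 is a local helper for this sketch, not something from the tree.

#include <cstdint>
#include <cstdio>

static inline uint64_t rotr64(uint64_t x, unsigned n) { return (x >> n) | (x << (64 - n)); }

static void G(uint64_t &a, uint64_t &b, uint64_t &c, uint64_t &d)
{
	a += b; d ^= a; d = rotr64(d, 32); // SWAPUINT2: swap the 32-bit halves
	c += d; b ^= c; b = rotr64(b, 24); // ROR24
	a += b; d ^= a; d = rotr64(d, 16); // ROR16
	c += d; b ^= c; b = rotr64(b, 63); // ROR2(b, 63)
}

int main()
{
	uint64_t a = 1, b = 2, c = 3, d = 4;
	G(a, b, c, d);
	printf("%016llx %016llx %016llx %016llx\n",
		(unsigned long long)a, (unsigned long long)b,
		(unsigned long long)c, (unsigned long long)d);
	return 0;
}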

lyra2/cuda_lyra2_vectors.h (2 changes)

@@ -13,7 +13,7 @@
#include "cuda_helper.h"
#if __CUDA_ARCH__ < 300
-#define __shfl(x, y) (x)
#define __shfl(x, y, z) (x)
#endif
#if __CUDA_ARCH__ < 320 && !defined(__ldg4)

lyra2/lyra2RE.cu (63 changes)

@@ -23,7 +23,7 @@ extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNon
extern void skein256_cpu_init(int thr_id, uint32_t threads);
extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix);
-extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti);
extern void groestl256_cpu_init(int thr_id, uint32_t threads);
extern void groestl256_cpu_free(int thr_id);
@@ -79,36 +79,55 @@ extern "C" void lyra2re_hash(void *state, const void *input)
}
static bool init[MAX_GPUS] = { 0 };
static uint32_t throughput[MAX_GPUS] = { 0 };
extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{ {
uint32_t *pdata = work->data; uint32_t *pdata = work->data;
uint32_t *ptarget = work->target; uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 17 : 16;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4;
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark) if (opt_benchmark)
ptarget[7] = 0x000f; ptarget[7] = 0x00ff;
static __thread bool gtx750ti;
if (!init[thr_id]) if (!init[thr_id])
{ {
cudaSetDevice(device_map[thr_id]); int dev_id = device_map[thr_id];
cudaSetDevice(dev_id);
CUDA_LOG_ERROR(); CUDA_LOG_ERROR();
blake256_cpu_init(thr_id, throughput); int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 17 : 16;
keccak256_cpu_init(thr_id,throughput); if (device_sm[device_map[thr_id]] == 500) intensity = 15;
skein256_cpu_init(thr_id, throughput); int temp = intensity;
groestl256_cpu_init(thr_id, throughput); throughput[thr_id] = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4;
if (init[thr_id]) throughput[thr_id] = min(throughput[thr_id], max_nonce - first_nonce);
// DMatrix cudaDeviceProp props;
cudaMalloc(&d_matrix[thr_id], (size_t)16 * 8 * 8 * sizeof(uint64_t) * throughput); cudaGetDeviceProperties(&props, dev_id);
lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]);
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput)); if (strstr(props.name, "750 Ti")) gtx750ti = true;
else gtx750ti = false;
blake256_cpu_init(thr_id, throughput[thr_id]);
keccak256_cpu_init(thr_id, throughput[thr_id]);
skein256_cpu_init(thr_id, throughput[thr_id]);
groestl256_cpu_init(thr_id, throughput[thr_id]);
if (device_sm[dev_id] >= 500)
{
size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4;
CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput[thr_id]));
lyra2_cpu_init(thr_id, throughput[thr_id], d_matrix[thr_id]);
}
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput[thr_id]));
init[thr_id] = true; init[thr_id] = true;
if (temp != intensity){
gpulog(LOG_INFO, thr_id, "Intensity set to %u, %u cuda threads",
intensity, throughput[thr_id]);
}
} }
uint32_t _ALIGN(128) endiandata[20]; uint32_t _ALIGN(128) endiandata[20];
@@ -122,15 +141,15 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
int order = 0;
uint32_t foundNonce;
-blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
blake256_cpu_hash_80(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
keccak256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
lyra2_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], gtx750ti);
skein256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
TRACE("S")
-*hashes_done = pdata[19] - first_nonce + throughput;
*hashes_done = pdata[19] - first_nonce + throughput[thr_id];
-foundNonce = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
foundNonce = groestl256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
if (foundNonce != UINT32_MAX)
{
uint32_t _ALIGN(64) vhash64[8];
@@ -162,11 +181,11 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
}
}
-if ((uint64_t)throughput + pdata[19] >= max_nonce) {
if ((uint64_t)throughput[thr_id] + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
-pdata[19] += throughput;
pdata[19] += throughput[thr_id];
} while (!work_restart[thr_id].restart);
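The matrix_sz arithmetic in scanhash_lyra2 above reflects the two storage layouts: on SM 5.2 only the four uint2x4 state words per thread are parked in DMatrix between the three kernels, the 8x8 row matrix itself living in shared memory, while the SM 5.0 path keeps the whole Nrow x Ncol x memshift matrix in global memory. A small host-side check of those per-thread byte counts; this reading of the layout is the editor's assumption, not something stated in the commit.

#include <cstdint>
#include <cstdio>

int main()
{
	// SM 5.2: 4 x uint2x4 state = 16 uint64 per thread (assumed layout)
	const size_t sm52_bytes = sizeof(uint64_t) * 4 * 4;
	// SM 5.0: 8 rows * 8 cols * 3 (memshift) uint2x4 slots * 4 uint64 each
	const size_t sm50_bytes = sizeof(uint64_t) * 8 * 8 * 3 * 4;
	printf("SM 5.2: %zu bytes/thread, SM 5.0: %zu bytes/thread\n",
		sm52_bytes, sm50_bytes); // 128 and 6144
	return 0;
}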
