
lyra2: improve cuda implementation (part 1, SM5+)

based on the new djm34 method, 2x faster than the first version

cleaned and tuned for the GTX 750/960 (linux / cuda 6.5)

Tanguy Pruvot, 9 years ago
commit fc84c719e9 (master)
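
For orientation, the Lyra2 (v1) flow the rewritten kernel walks through, summarized from lyra2/cuda_lyra2.cu below (the names are the ones this commit introduces):

	// 1. absorb:    load the 32-byte input from g_hash, then 24 x round_lyra()
	// 2. setup:     write row 0, reduceDuplex() for row 1,
	//               reduceDuplexRowSetup() for rows 2..7
	// 3. wandering: 8 x reduceDuplexRowt(), target row picked by state[0].x.x & 7
	// 4. wrap-up:   xor the last visited row, 12 x round_lyra(), store to g_hash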
7 changed files:

 Algo256/cuda_groestl256.cu |   4
 README.txt                 |   3
 bench.cpp                  |   1
 configure.sh               |   1
 lyra2/cuda_lyra2.cu        | 385
 lyra2/lyra2RE.cu           |  22
 lyra2/lyra2REv2.cu         |  29

--- a/Algo256/cuda_groestl256.cu
+++ b/Algo256/cuda_groestl256.cu
@@ -176,7 +176,7 @@ void groestl256_perm_Q(uint32_t thread, uint32_t *a, char *mixtabs)
 }
 
 __global__ __launch_bounds__(256,1)
-void groestl256_gpu_hash32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash, uint32_t *resNonces)
+void groestl256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash, uint32_t *resNonces)
 {
 #if USE_SHARED
 	extern __shared__ char mixtabs[];
@@ -315,7 +315,7 @@ uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNoun
 #else
 	size_t shared_size = 0;
 #endif
-	groestl256_gpu_hash32<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash, d_GNonces[thr_id]);
+	groestl256_gpu_hash_32<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash, d_GNonces[thr_id]);
 
 	MyStreamSynchronize(NULL, order, thr_id);

--- a/README.txt
+++ b/README.txt
@@ -229,9 +229,10 @@ features.
 >>> RELEASE HISTORY <<<
 
 	Under Dev... v1.7
+			Improve lyra2 (v1) cuda implementation
 			Restore whirlpool algo (and whirlcoin variant)
 			Prepare algo switch ability
-			Add --benchmark -a all to run a benchmark for all algos
+			Add --benchmark alone to run a benchmark for all algos
 			Add --cuda-schedule parameter
 			Add --show-diff parameter, which display shares diff,
 			     and is able to detect real solved blocks on pools.

--- a/bench.cpp
+++ b/bench.cpp
@@ -98,7 +98,6 @@ bool bench_algo_switch_next(int thr_id)
 	if (algo == ALGO_DMD_GR) algo++; // same as groestl
 	if (algo == ALGO_WHIRLCOIN) algo++; // same as whirlpool
 	// and unwanted ones...
-	if (algo == ALGO_LYRA2) algo++; // weird memory leak to fix (uint2 Matrix[96][8] too big)
 	if (algo == ALGO_SCRYPT) algo++;
 	if (algo == ALGO_SCRYPT_JANE) algo++;
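
The line removed above pointed at the real problem: the old kernel kept a uint2 Matrix[96][8] per thread. A quick footprint check (my arithmetic; the new per-thread share follows the cudaMalloc added in lyra2/lyra2RE.cu below):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		const size_t uint2_size = 2 * sizeof(uint32_t);          // 8 bytes
		const size_t old_local  = 96 * 8 * uint2_size;           // uint2 Matrix[96][8]
		const size_t new_global = 16 * 8 * 8 * sizeof(uint64_t); // per-thread slice of DMatrix
		printf("old per-thread local array: %zu bytes\n", old_local);  // 6144
		printf("new per-thread DMatrix:     %zu bytes\n", new_global); // 8192 (3/4 used)
		return 0;
	}

Six kilobytes per thread cannot stay in registers, so the old array spilled to slow local memory; the rewrite trades it for one global allocation sized per GPU, which also lets lyra2 back into the benchmark loop.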

--- a/configure.sh
+++ b/configure.sh
@@ -1,4 +1,5 @@
 # possible additional CUDA_CFLAGS
+#-gencode=arch=compute_52,code=\"sm_52,compute_52\"
 #-gencode=arch=compute_50,code=\"sm_50,compute_50\"
 #-gencode=arch=compute_35,code=\"sm_35,compute_35\"
 #-gencode=arch=compute_30,code=\"sm_30,compute_30\"
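
The new compute_52 line pairs with the compile-time dispatch the rewritten kernel relies on: each -gencode target builds its own copy of the kernel, and __CUDA_ARCH__ selects the launch bounds for that copy. A minimal self-contained illustration (TPB values copied from cuda_lyra2.cu; the kernel body is a placeholder):

	#include <cstdio>

	#define TPB50 16
	#define TPB52 8

	#if __CUDA_ARCH__ == 500
	__global__ __launch_bounds__(TPB50, 1) void arch_demo() { } // GTX 750 class
	#else
	__global__ __launch_bounds__(TPB52, 2) void arch_demo() { } // GTX 960 and others
	#endif

	int main()
	{
		arch_demo<<<1, 1>>>();
		return cudaDeviceSynchronize() != cudaSuccess;
	}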

--- a/lyra2/cuda_lyra2.cu
+++ b/lyra2/cuda_lyra2.cu
@@ -1,68 +1,23 @@
+/**
+ * Lyra2 (v1) cuda implementation based on djm34 work - SM 5/5.2
+ * tpruvot@github 2015
+ */
+
+#include <stdio.h>
 #include <memory.h>
-#include "cuda_helper.h"
+#include "cuda_lyra2_vectors.h"
 
-#define TPB 160
+#define TPB50 16
+#define TPB52 8
 
-static __constant__ uint2 blake2b_IV[8] = {
-	{ 0xf3bcc908, 0x6a09e667 },
-	{ 0x84caa73b, 0xbb67ae85 },
-	{ 0xfe94f82b, 0x3c6ef372 },
-	{ 0x5f1d36f1, 0xa54ff53a },
-	{ 0xade682d1, 0x510e527f },
-	{ 0x2b3e6c1f, 0x9b05688c },
-	{ 0xfb41bd6b, 0x1f83d9ab },
-	{ 0x137e2179, 0x5be0cd19 }
-};
+#define uint2x4 uint28
+#define memshift 3
 
-#define reduceDuplexRow(rowIn, rowInOut, rowOut) { \
-	for (int i = 0; i < 8; i++) { \
-		for (int j = 0; j < 12; j++) \
-			state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \
-		round_lyra(state); \
-		for (int j = 0; j < 12; j++) \
-			Matrix[j + 12 * i][rowOut] ^= state[j]; \
-		Matrix[0 + 12 * i][rowInOut] ^= state[11]; \
-		Matrix[1 + 12 * i][rowInOut] ^= state[0]; \
-		Matrix[2 + 12 * i][rowInOut] ^= state[1]; \
-		Matrix[3 + 12 * i][rowInOut] ^= state[2]; \
-		Matrix[4 + 12 * i][rowInOut] ^= state[3]; \
-		Matrix[5 + 12 * i][rowInOut] ^= state[4]; \
-		Matrix[6 + 12 * i][rowInOut] ^= state[5]; \
-		Matrix[7 + 12 * i][rowInOut] ^= state[6]; \
-		Matrix[8 + 12 * i][rowInOut] ^= state[7]; \
-		Matrix[9 + 12 * i][rowInOut] ^= state[8]; \
-		Matrix[10+ 12 * i][rowInOut] ^= state[9]; \
-		Matrix[11+ 12 * i][rowInOut] ^= state[10]; \
-	} \
-}
+#define Ncol 8
+#define NcolMask 0x7
 
-#define absorbblock(in) { \
-	state[0] ^= Matrix[0][in]; \
-	state[1] ^= Matrix[1][in]; \
-	state[2] ^= Matrix[2][in]; \
-	state[3] ^= Matrix[3][in]; \
-	state[4] ^= Matrix[4][in]; \
-	state[5] ^= Matrix[5][in]; \
-	state[6] ^= Matrix[6][in]; \
-	state[7] ^= Matrix[7][in]; \
-	state[8] ^= Matrix[8][in]; \
-	state[9] ^= Matrix[9][in]; \
-	state[10] ^= Matrix[10][in]; \
-	state[11] ^= Matrix[11][in]; \
-	round_lyra(state); \
-	round_lyra(state); \
-	round_lyra(state); \
-	round_lyra(state); \
-	round_lyra(state); \
-	round_lyra(state); \
-	round_lyra(state); \
-	round_lyra(state); \
-	round_lyra(state); \
-	round_lyra(state); \
-	round_lyra(state); \
-	round_lyra(state); \
-}
+__device__ uint2x4* DMatrix;
 
 static __device__ __forceinline__
 void Gfunc(uint2 &a, uint2 &b, uint2 &c, uint2 &d)
@@ -73,151 +28,233 @@ void Gfunc(uint2 & a, uint2 &b, uint2 &c, uint2 &d)
 	c += d; b ^= c; b = ROR2(b, 63);
 }
 
-__device__ __forceinline__
-static void round_lyra(uint2 *s)
+static __device__ __forceinline__
+void round_lyra(uint2x4* s)
 {
-	Gfunc(s[0], s[4], s[8], s[12]);
-	Gfunc(s[1], s[5], s[9], s[13]);
-	Gfunc(s[2], s[6], s[10], s[14]);
-	Gfunc(s[3], s[7], s[11], s[15]);
-	Gfunc(s[0], s[5], s[10], s[15]);
-	Gfunc(s[1], s[6], s[11], s[12]);
-	Gfunc(s[2], s[7], s[8], s[13]);
-	Gfunc(s[3], s[4], s[9], s[14]);
+	Gfunc(s[0].x, s[1].x, s[2].x, s[3].x);
+	Gfunc(s[0].y, s[1].y, s[2].y, s[3].y);
+	Gfunc(s[0].z, s[1].z, s[2].z, s[3].z);
+	Gfunc(s[0].w, s[1].w, s[2].w, s[3].w);
+	Gfunc(s[0].x, s[1].y, s[2].z, s[3].w);
+	Gfunc(s[0].y, s[1].z, s[2].w, s[3].x);
+	Gfunc(s[0].z, s[1].w, s[2].x, s[3].y);
+	Gfunc(s[0].w, s[1].x, s[2].y, s[3].z);
 }
 
-__device__ __forceinline__
-void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2 state[16], uint2 Matrix[96][8])
-{
-#if __CUDA_ARCH__ > 500
-	#pragma unroll
-#endif
-	for (int i = 0; i < 8; i++)
-	{
-		#pragma unroll
-		for (int j = 0; j < 12; j++)
-			state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut];
-		round_lyra(state);
-		#pragma unroll
-		for (int j = 0; j < 12; j++)
-			Matrix[j + 84 - 12 * i][rowOut] = Matrix[12 * i + j][rowIn] ^ state[j];
-		Matrix[0 + 12 * i][rowInOut] ^= state[11];
-		Matrix[1 + 12 * i][rowInOut] ^= state[0];
-		Matrix[2 + 12 * i][rowInOut] ^= state[1];
-		Matrix[3 + 12 * i][rowInOut] ^= state[2];
-		Matrix[4 + 12 * i][rowInOut] ^= state[3];
-		Matrix[5 + 12 * i][rowInOut] ^= state[4];
-		Matrix[6 + 12 * i][rowInOut] ^= state[5];
-		Matrix[7 + 12 * i][rowInOut] ^= state[6];
-		Matrix[8 + 12 * i][rowInOut] ^= state[7];
-		Matrix[9 + 12 * i][rowInOut] ^= state[8];
-		Matrix[10 + 12 * i][rowInOut] ^= state[9];
-		Matrix[11 + 12 * i][rowInOut] ^= state[10];
-	}
-}
-
-__global__ __launch_bounds__(TPB, 1)
-void lyra2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash)
-{
-	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if (thread < threads)
-	{
-		uint2 state[16];
-		#pragma unroll
-		for (int i = 0; i<4; i++) {
-			LOHI(state[i].x, state[i].y, outputHash[threads*i + thread]);
-		} //password
-		#pragma unroll
-		for (int i = 0; i<4; i++) {
-			state[i + 4] = state[i];
-		} //salt
-		#pragma unroll
-		for (int i = 0; i<8; i++) {
-			state[i + 8] = blake2b_IV[i];
-		}
-
-		// blake2blyra x2
-		//#pragma unroll 24
-		for (int i = 0; i<24; i++) {
-			round_lyra(state);
-		} //because 12 is not enough
-
-		uint2 Matrix[96][8]; // not cool
-
-		// reducedSqueezeRow0
-		#pragma unroll 8
-		for (int i = 0; i < 8; i++)
-		{
-			#pragma unroll 12
-			for (int j = 0; j<12; j++) {
-				Matrix[j + 84 - 12 * i][0] = state[j];
-			}
-			round_lyra(state);
-		}
-
-		// reducedSqueezeRow1
-		#pragma unroll 8
-		for (int i = 0; i < 8; i++)
-		{
-			#pragma unroll 12
-			for (int j = 0; j<12; j++) {
-				state[j] ^= Matrix[j + 12 * i][0];
-			}
-			round_lyra(state);
-			#pragma unroll 12
-			for (int j = 0; j<12; j++) {
-				Matrix[j + 84 - 12 * i][1] = Matrix[j + 12 * i][0] ^ state[j];
-			}
-		}
-
-		reduceDuplexRowSetup(1, 0, 2, state, Matrix);
-		reduceDuplexRowSetup(2, 1, 3, state, Matrix);
-		reduceDuplexRowSetup(3, 0, 4, state, Matrix);
-		reduceDuplexRowSetup(4, 3, 5, state, Matrix);
-		reduceDuplexRowSetup(5, 2, 6, state, Matrix);
-		reduceDuplexRowSetup(6, 1, 7, state, Matrix);
-
-		uint32_t rowa;
-		rowa = state[0].x & 7;
-		reduceDuplexRow(7, rowa, 0);
-		rowa = state[0].x & 7;
-		reduceDuplexRow(0, rowa, 3);
-		rowa = state[0].x & 7;
-		reduceDuplexRow(3, rowa, 6);
-		rowa = state[0].x & 7;
-		reduceDuplexRow(6, rowa, 1);
-		rowa = state[0].x & 7;
-		reduceDuplexRow(1, rowa, 4);
-		rowa = state[0].x & 7;
-		reduceDuplexRow(4, rowa, 7);
-		rowa = state[0].x & 7;
-		reduceDuplexRow(7, rowa, 2);
-		rowa = state[0].x & 7;
-		reduceDuplexRow(2, rowa, 5);
-
-		absorbblock(rowa);
-
-		#pragma unroll
-		for (int i = 0; i<4; i++) {
-			outputHash[threads*i + thread] = devectorize(state[i]);
-		} //password
-	} //thread
+static __device__ __forceinline__
+void reduceDuplex(uint2x4 state[4], uint32_t thread)
+{
+	uint2x4 state1[3];
+
+	const uint32_t ps1 = (256 * thread);
+	const uint32_t ps2 = (memshift * 7 + memshift * 8 + 256 * thread);
+
+	#pragma unroll 4
+	for (int i = 0; i < 8; i++)
+	{
+		const uint32_t s1 = ps1 + i*memshift;
+		const uint32_t s2 = ps2 - i*memshift;
+
+		for (int j = 0; j < 3; j++)
+			state1[j] = __ldg4(&(DMatrix + s1)[j]);
+		for (int j = 0; j < 3; j++)
+			state[j] ^= state1[j];
+
+		round_lyra(state);
+
+		for (int j = 0; j < 3; j++)
+			state1[j] ^= state[j];
+		for (int j = 0; j < 3; j++)
+			(DMatrix + s2)[j] = state1[j];
+	}
+}
+
+static __device__ __forceinline__
+void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2x4 state[4], uint32_t thread)
+{
+	uint2x4 state1[3], state2[3];
+
+	const uint32_t ps1 = (memshift*8 * rowIn + 256 * thread);
+	const uint32_t ps2 = (memshift*8 * rowInOut + 256 * thread);
+	const uint32_t ps3 = (memshift*7 + memshift*8 * rowOut + 256 * thread);
+
+	#pragma unroll 1
+	for (int i = 0; i < 8; i++)
+	{
+		const uint32_t s1 = ps1 + i*memshift;
+		const uint32_t s2 = ps2 + i*memshift;
+
+		for (int j = 0; j < 3; j++)
+			state1[j] = __ldg4(&(DMatrix + s1)[j]);
+		for (int j = 0; j < 3; j++)
+			state2[j] = __ldg4(&(DMatrix + s2)[j]);
+		for (int j = 0; j < 3; j++) {
+			uint2x4 tmp = state1[j] + state2[j];
+			state[j] ^= tmp;
+		}
+
+		round_lyra(state);
+
+		for (int j = 0; j < 3; j++) {
+			const uint32_t s3 = ps3 - i*memshift;
+			state1[j] ^= state[j];
+			(DMatrix + s3)[j] = state1[j];
+		}
+
+		((uint2*)state2)[0] ^= ((uint2*)state)[11];
+		for (int j = 0; j < 11; j++)
+			((uint2*)state2)[j+1] ^= ((uint2*)state)[j];
+
+		for (int j = 0; j < 3; j++)
+			(DMatrix + s2)[j] = state2[j];
+	}
+}
+
+static __device__ __forceinline__
+void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uint2x4* state, const uint32_t thread)
+{
+	const uint32_t ps1 = (memshift * 8 * rowIn + 256 * thread);
+	const uint32_t ps2 = (memshift * 8 * rowInOut + 256 * thread);
+	const uint32_t ps3 = (memshift * 8 * rowOut + 256 * thread);
+
+	#pragma unroll 1
+	for (int i = 0; i < 8; i++)
+	{
+		uint2x4 state1[3], state2[3];
+
+		const uint32_t s1 = ps1 + i*memshift;
+		const uint32_t s2 = ps2 + i*memshift;
+
+		for (int j = 0; j < 3; j++) {
+			state1[j] = __ldg4(&(DMatrix + s1)[j]);
+			state2[j] = __ldg4(&(DMatrix + s2)[j]);
+		}
+
+		#pragma unroll
+		for (int j = 0; j < 3; j++) {
+			state1[j] += state2[j];
+			state[j] ^= state1[j];
+		}
+
+		round_lyra(state);
+
+		((uint2*)state2)[0] ^= ((uint2*)state)[11];
+		for (int j = 0; j < 11; j++)
+			((uint2*)state2)[j + 1] ^= ((uint2*)state)[j];
+
+		if (rowInOut == rowOut) {
+			for (int j = 0; j < 3; j++) {
+				state2[j] ^= state[j];
+				(DMatrix + s2)[j] = state2[j];
+			}
+		} else {
+			const uint32_t s3 = ps3 + i*memshift;
+			for (int j = 0; j < 3; j++) {
+				(DMatrix + s2)[j] = state2[j];
+				(DMatrix + s3)[j] ^= state[j];
+			}
+		}
+	}
+}
+
+#if __CUDA_ARCH__ == 500
+__global__ __launch_bounds__(TPB50, 1)
+#else
+__global__ __launch_bounds__(TPB52, 2)
+#endif
+void lyra2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+
+	const uint2x4 blake2b_IV[2] = {
+		{{ 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a }},
+		{{ 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 }}
+	};
+
+	if (thread < threads)
+	{
+		uint2x4 state[4];
+
+		((uint2*)state)[0] = __ldg(&g_hash[thread]);
+		((uint2*)state)[1] = __ldg(&g_hash[thread + threads]);
+		((uint2*)state)[2] = __ldg(&g_hash[thread + threads*2]);
+		((uint2*)state)[3] = __ldg(&g_hash[thread + threads*3]);
+
+		state[1] = state[0];
+		state[2] = blake2b_IV[0];
+		state[3] = blake2b_IV[1];
+
+		for (int i = 0; i<24; i++)
+			round_lyra(state); //because 12 is not enough
+
+		const uint32_t ps1 = (memshift * 7 + 256 * thread);
+
+		for (int i = 0; i < 8; i++)
+		{
+			const uint32_t s1 = ps1 - memshift * i;
+			for (int j = 0; j < 3; j++)
+				(DMatrix + s1)[j] = (state)[j];
+
+			round_lyra(state);
+		}
+
+		reduceDuplex(state, thread);
+
+		reduceDuplexRowSetup(1, 0, 2, state, thread);
+		reduceDuplexRowSetup(2, 1, 3, state, thread);
+		reduceDuplexRowSetup(3, 0, 4, state, thread);
+		reduceDuplexRowSetup(4, 3, 5, state, thread);
+		reduceDuplexRowSetup(5, 2, 6, state, thread);
+		reduceDuplexRowSetup(6, 1, 7, state, thread);
+
+		uint32_t rowa = state[0].x.x & 7;
+		reduceDuplexRowt(7, rowa, 0, state, thread);
+		rowa = state[0].x.x & 7;
+		reduceDuplexRowt(0, rowa, 3, state, thread);
+		rowa = state[0].x.x & 7;
+		reduceDuplexRowt(3, rowa, 6, state, thread);
+		rowa = state[0].x.x & 7;
+		reduceDuplexRowt(6, rowa, 1, state, thread);
+		rowa = state[0].x.x & 7;
+		reduceDuplexRowt(1, rowa, 4, state, thread);
+		rowa = state[0].x.x & 7;
+		reduceDuplexRowt(4, rowa, 7, state, thread);
+		rowa = state[0].x.x & 7;
+		reduceDuplexRowt(7, rowa, 2, state, thread);
+		rowa = state[0].x.x & 7;
+		reduceDuplexRowt(2, rowa, 5, state, thread);
+
+		const int32_t shift = (memshift * 8 * rowa + 256 * thread);
+
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state[j] ^= __ldg4(&(DMatrix + shift)[j]);
+
+		for (int i = 0; i < 12; i++)
+			round_lyra(state);
+
+		g_hash[thread] = ((uint2*)state)[0];
+		g_hash[thread + threads] = ((uint2*)state)[1];
+		g_hash[thread + threads*2] = ((uint2*)state)[2];
+		g_hash[thread + threads*3] = ((uint2*)state)[3];
+	}
 }
 
+__host__
+void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t* d_matrix)
+{
+	cuda_get_arch(thr_id);
+	cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice);
+}
+
 __host__
-void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
+void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order)
 {
-	const uint32_t threadsperblock = TPB;
+	int dev_id = device_map[thr_id % MAX_GPUS];
+	uint32_t tpb = TPB52;
+	if (device_sm[dev_id] == 500) tpb = TPB50;
 
-	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
-	dim3 block(threadsperblock);
+	dim3 grid((threads + tpb - 1) / tpb);
+	dim3 block(tpb);
 
-	lyra2_gpu_hash_32 <<<grid, block>>> (threads, startNounce, d_outputHash);
+	lyra2_gpu_hash_32 <<< grid, block >>> (threads, startNounce, (uint2*)d_hash);
 }
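
Two things the new file leans on are worth spelling out. First, the uint28/uint2x4 vector type and __ldg4() come from cuda_lyra2_vectors.h, which this diff does not show; a stand-in for the assumed layout, plus the mapping from the old flat state to the new one (both are my sketches, not code from the tree):

	// Presumed shape: four uint2 lanes, 4 x 64 bits = 32 bytes, so memshift = 3
	// of these hold one 12-word Lyra2 block. __ldg4() would be an __ldg()-based
	// read-only load of such a vector.
	typedef struct __align__(32) uint28 {
		uint2 x, y, z, w;
	} uint28;

	// Mapping from the old flat uint2 s[16] state to the new uint2x4 state[4]:
	// row r of the 4x4 blake2b matrix becomes the four lanes of state[r], so the
	// old column call Gfunc(s[0], s[4], s[8], s[12]) is exactly the new
	// Gfunc(state[0].x, state[1].x, state[2].x, state[3].x), and the diagonal
	// calls line up the same way.
	__device__ void flat_to_vec(const uint2 s[16], uint28 state[4])
	{
		for (int r = 0; r < 4; r++) {
			state[r].x = s[4*r + 0];
			state[r].y = s[4*r + 1];
			state[r].z = s[4*r + 2];
			state[r].w = s[4*r + 3];
		}
	}

Second, every DMatrix offset above follows one formula; my reading of the constants, with the defines repeated from the new file:

	#define memshift 3   // uint2x4 slots per 12-word cell
	#define Ncol 8       // cells (columns) per row

	__device__ uint32_t cell_offset(uint32_t row, uint32_t col, uint32_t thread)
	{
		// 8 rows x 8 cols x 3 slots = 192 of a 256-slot per-thread stride;
		// the 16*8*8*sizeof(uint64_t) allocation in lyra2RE.cu is exactly
		// 256 uint2x4 per thread, so the remainder is padding.
		return memshift * (Ncol * row + col) + 256 * thread;
	}
	// e.g. s1 = ps1 + i*memshift in reduceDuplexRowSetup() is
	// cell_offset(rowIn, i, thread); the "memshift*7 ... - i*memshift" forms are
	// the same cells with the destination row walked from column 7 down to 0.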

--- a/lyra2/lyra2RE.cu
+++ b/lyra2/lyra2RE.cu
@@ -10,7 +10,7 @@ extern "C" {
 #include "cuda_helper.h"
 
 static uint64_t* d_hash[MAX_GPUS];
-//static uint64_t* d_matrix[MAX_GPUS];
+static uint64_t* d_matrix[MAX_GPUS];
 
 extern void blake256_cpu_init(int thr_id, uint32_t threads);
 extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
@@ -21,7 +21,7 @@ extern void keccak256_cpu_free(int thr_id);
 extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
 extern void skein256_cpu_init(int thr_id, uint32_t threads);
 
-//extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *hash);
+extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix);
 extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
 
 extern void groestl256_cpu_init(int thr_id, uint32_t threads);
@@ -84,17 +84,17 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
-	int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 18 : 17;
+	int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 17 : 16;
 	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4;
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
 
 	if (opt_benchmark)
-		ptarget[7] = 0x00ff;
+		ptarget[7] = 0x000f;
 
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
-		cudaGetLastError(); // reset last error
+		CUDA_LOG_ERROR();
 
 		blake256_cpu_init(thr_id, throughput);
 		keccak256_cpu_init(thr_id,throughput);
@@ -102,8 +102,8 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
 		groestl256_cpu_init(thr_id, throughput);
 
 		// DMatrix
-		// cudaMalloc(&d_matrix[thr_id], (size_t)16 * 8 * 8 * sizeof(uint64_t) * throughput);
-		// lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]);
+		cudaMalloc(&d_matrix[thr_id], (size_t)16 * 8 * 8 * sizeof(uint64_t) * throughput);
+		lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]);
 
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));
@@ -147,7 +147,7 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
 				lyra2re_hash(vhash64, endiandata);
 				if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
 					if (opt_debug)
-						applog(LOG_BLUE, "GPU #%d: found second nonce %08x", device_map[thr_id], secNonce);
+						gpulog(LOG_BLUE, thr_id, "found second nonce %08x", secNonce);
 					if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio)
 						work_set_target_ratio(work, vhash64);
 					pdata[21] = secNonce;
@@ -157,7 +157,7 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
 			pdata[19] = foundNonce;
 			return res;
 		} else {
-			applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce);
+			gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 		}
 	}
@@ -174,10 +174,10 @@ extern "C" void free_lyra2(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
+	cudaThreadSynchronize();
 
 	cudaFree(d_hash[thr_id]);
-	//cudaFree(d_matrix[thr_id]);
+	cudaFree(d_matrix[thr_id]);
 
 	keccak256_cpu_free(thr_id);
 	groestl256_cpu_free(thr_id);

--- a/lyra2/lyra2REv2.cu
+++ b/lyra2/lyra2REv2.cu
@@ -92,27 +92,27 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 	{
 		cudaSetDevice(dev_id);
 		//cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
-		//if (opt_n_gputhreads == 1)
+		//if (gpu_threads == 1)
 		//	cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
-		cudaGetLastError();
+		CUDA_LOG_ERROR();
 
 		blake256_cpu_init(thr_id, throughput);
 		keccak256_cpu_init(thr_id,throughput);
 		skein256_cpu_init(thr_id, throughput);
 		bmw256_cpu_init(thr_id, throughput);
 
+		if (device_sm[device_map[thr_id]] < 300) {
+			applog(LOG_ERR, "Device SM 3.0 or more recent required!");
+			proper_exit(1);
+			return -1;
+		}
+
 		// DMatrix (780Ti may prefer 16 instead of 12, cf djm34)
 		CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], (size_t)12 * sizeof(uint64_t) * 4 * 4 * throughput));
 		lyra2v2_cpu_init(thr_id, throughput, d_matrix[thr_id]);
 
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));
 
-		if (device_sm[dev_id] < 300) {
-			applog(LOG_ERR, "Device SM 3.0 or more recent required!");
-			proper_exit(1);
-			return -1;
-		}
-
 		init[thr_id] = true;
 	}
@@ -153,18 +153,18 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 			{
 				be32enc(&endiandata[19], foundNonces[1]);
 				lyra2v2_hash(vhash64, endiandata);
-				if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio)
-					work_set_target_ratio(work, vhash64);
 				pdata[21] = foundNonces[1];
-				//xchg(pdata[19], pdata[21]);
+				if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio) {
+					work_set_target_ratio(work, vhash64);
+					xchg(pdata[19], pdata[21]);
+				}
 				res++;
 			}
+			MyStreamSynchronize(NULL, 0, device_map[thr_id]);
 			return res;
 		}
 		else
 		{
-			applog(LOG_WARNING, "GPU #%d: result does not validate on CPU!", dev_id);
+			gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonces[0]);
 		}
 	}
@@ -173,7 +173,6 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 	} while (!work_restart[thr_id].restart && (max_nonce > ((uint64_t)(pdata[19]) + throughput)));
 
 	*hashes_done = pdata[19] - first_nonce + 1;
-	MyStreamSynchronize(NULL, 0, device_map[thr_id]);
 	return 0;
 }
@@ -183,7 +182,7 @@ extern "C" void free_lyra2v2(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
+	cudaThreadSynchronize();
 
 	cudaFree(d_hash[thr_id]);
 	cudaFree(d_matrix[thr_id]);
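
One note on the reworked second-nonce block in scanhash_lyra2v2: xchg() is not shown in this diff; ccminer's usual helper is a plain swap, roughly (assumed definition, miner.h is not part of this commit):

	#define xchg(a, b) { uint32_t _tmp = (a); (a) = (b); (b) = _tmp; }

With work_set_target_ratio() moved inside the ratio test, the swap makes pdata[19] report whichever of the two nonces scored the better share ratio.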
