Browse Source

lyra2v2: update credits, increase SM 5.0 default intensity to 19

+ small klausT cleanup.
master
Tanguy Pruvot 8 years ago
parent
commit
bbd3c6d5b9
  1. 31
      lyra2/cuda_lyra2v2.cu
  2. 2
      lyra2/lyra2REv2.cu

31
lyra2/cuda_lyra2v2.cu

@ -1,3 +1,8 @@
/**
* Lyra2 (v2) CUDA Implementation
*
* Based on djm34/VTC sources and incredible 2x boost by Nanashi Meiyo-Meijin (May 2016)
*/
#include <stdio.h> #include <stdio.h>
#include <stdint.h> #include <stdint.h>
#include <memory.h> #include <memory.h>
@ -17,6 +22,8 @@
#define Ncol 4 #define Ncol 4
#define memshift 3 #define memshift 3
#define TPB 32
__device__ uint2x4 *DMatrix; __device__ uint2x4 *DMatrix;
__device__ __forceinline__ uint2 LD4S(const int index) __device__ __forceinline__ uint2 LD4S(const int index)
@ -303,7 +310,7 @@ void reduceDuplexRowt2x4(const int rowInOut, uint2 state[4])
} }
__global__ __global__
__launch_bounds__(32, 1) __launch_bounds__(TPB, 1)
void lyra2v2_gpu_hash_32_1(uint32_t threads, uint2 *inputHash) void lyra2v2_gpu_hash_32_1(uint32_t threads, uint2 *inputHash)
{ {
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
@ -342,15 +349,15 @@ void lyra2v2_gpu_hash_32_1(uint32_t threads, uint2 *inputHash)
for (int i = 0; i<12; i++) for (int i = 0; i<12; i++)
round_lyra_v5(state); round_lyra_v5(state);
DMatrix[blockDim.x * gridDim.x * 0 + blockDim.x * blockIdx.x + threadIdx.x] = state[0]; DMatrix[blockDim.x * gridDim.x * 0 + thread] = state[0];
DMatrix[blockDim.x * gridDim.x * 1 + blockDim.x * blockIdx.x + threadIdx.x] = state[1]; DMatrix[blockDim.x * gridDim.x * 1 + thread] = state[1];
DMatrix[blockDim.x * gridDim.x * 2 + blockDim.x * blockIdx.x + threadIdx.x] = state[2]; DMatrix[blockDim.x * gridDim.x * 2 + thread] = state[2];
DMatrix[blockDim.x * gridDim.x * 3 + blockDim.x * blockIdx.x + threadIdx.x] = state[3]; DMatrix[blockDim.x * gridDim.x * 3 + thread] = state[3];
} }
} }
__global__ __global__
__launch_bounds__(32, 1) __launch_bounds__(TPB, 1)
void lyra2v2_gpu_hash_32_2(uint32_t threads) void lyra2v2_gpu_hash_32_2(uint32_t threads)
{ {
const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y;
@ -386,7 +393,7 @@ void lyra2v2_gpu_hash_32_2(uint32_t threads)
} }
__global__ __global__
__launch_bounds__(32, 1) __launch_bounds__(TPB, 1)
void lyra2v2_gpu_hash_32_3(uint32_t threads, uint2 *outputHash) void lyra2v2_gpu_hash_32_3(uint32_t threads, uint2 *outputHash)
{ {
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
@ -395,10 +402,10 @@ void lyra2v2_gpu_hash_32_3(uint32_t threads, uint2 *outputHash)
if (thread < threads) if (thread < threads)
{ {
state[0] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 0 + blockDim.x * blockIdx.x + threadIdx.x]); state[0] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 0 + thread]);
state[1] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 1 + blockDim.x * blockIdx.x + threadIdx.x]); state[1] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 1 + thread]);
state[2] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 2 + blockDim.x * blockIdx.x + threadIdx.x]); state[2] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 2 + thread]);
state[3] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 3 + blockDim.x * blockIdx.x + threadIdx.x]); state[3] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 3 + thread]);
for (int i = 0; i < 12; i++) for (int i = 0; i < 12; i++)
round_lyra_v5(state); round_lyra_v5(state);
@ -436,7 +443,7 @@ void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uin
if (device_sm[dev_id] >= 500) { if (device_sm[dev_id] >= 500) {
const uint32_t tpb = 32; const uint32_t tpb = TPB;
dim3 grid2((threads + tpb - 1) / tpb); dim3 grid2((threads + tpb - 1) / tpb);
dim3 block2(tpb); dim3 block2(tpb);

2
lyra2/lyra2REv2.cu

@ -96,7 +96,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
uint32_t *ptarget = work->target; uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
int dev_id = device_map[thr_id]; int dev_id = device_map[thr_id];
int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 18; int intensity = (device_sm[dev_id] < 500) ? 18 : is_windows() ? 19 : 20;
uint32_t throughput = cuda_default_throughput(dev_id, 1UL << intensity); uint32_t throughput = cuda_default_throughput(dev_id, 1UL << intensity);
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);

Loading…
Cancel
Save