Browse Source

lyra2v2: update credits, increase SM 5.0 default int to 19

+ small klausT cleanup..
2upstream
Tanguy Pruvot 8 years ago
parent
commit
bbd3c6d5b9
  1. 31
      lyra2/cuda_lyra2v2.cu
  2. 2
      lyra2/lyra2REv2.cu

31
lyra2/cuda_lyra2v2.cu

@ -1,3 +1,8 @@ @@ -1,3 +1,8 @@
/**
* Lyra2 (v2) CUDA Implementation
*
* Based on djm34/VTC sources and incredible 2x boost by Nanashi Meiyo-Meijin (May 2016)
*/
#include <stdio.h>
#include <stdint.h>
#include <memory.h>
@ -17,6 +22,8 @@ @@ -17,6 +22,8 @@
#define Ncol 4
#define memshift 3
#define TPB 32
__device__ uint2x4 *DMatrix;
__device__ __forceinline__ uint2 LD4S(const int index)
@ -303,7 +310,7 @@ void reduceDuplexRowt2x4(const int rowInOut, uint2 state[4]) @@ -303,7 +310,7 @@ void reduceDuplexRowt2x4(const int rowInOut, uint2 state[4])
}
__global__
__launch_bounds__(32, 1)
__launch_bounds__(TPB, 1)
void lyra2v2_gpu_hash_32_1(uint32_t threads, uint2 *inputHash)
{
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
@ -342,15 +349,15 @@ void lyra2v2_gpu_hash_32_1(uint32_t threads, uint2 *inputHash) @@ -342,15 +349,15 @@ void lyra2v2_gpu_hash_32_1(uint32_t threads, uint2 *inputHash)
for (int i = 0; i<12; i++)
round_lyra_v5(state);
DMatrix[blockDim.x * gridDim.x * 0 + blockDim.x * blockIdx.x + threadIdx.x] = state[0];
DMatrix[blockDim.x * gridDim.x * 1 + blockDim.x * blockIdx.x + threadIdx.x] = state[1];
DMatrix[blockDim.x * gridDim.x * 2 + blockDim.x * blockIdx.x + threadIdx.x] = state[2];
DMatrix[blockDim.x * gridDim.x * 3 + blockDim.x * blockIdx.x + threadIdx.x] = state[3];
DMatrix[blockDim.x * gridDim.x * 0 + thread] = state[0];
DMatrix[blockDim.x * gridDim.x * 1 + thread] = state[1];
DMatrix[blockDim.x * gridDim.x * 2 + thread] = state[2];
DMatrix[blockDim.x * gridDim.x * 3 + thread] = state[3];
}
}
__global__
__launch_bounds__(32, 1)
__launch_bounds__(TPB, 1)
void lyra2v2_gpu_hash_32_2(uint32_t threads)
{
const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y;
@ -386,7 +393,7 @@ void lyra2v2_gpu_hash_32_2(uint32_t threads) @@ -386,7 +393,7 @@ void lyra2v2_gpu_hash_32_2(uint32_t threads)
}
__global__
__launch_bounds__(32, 1)
__launch_bounds__(TPB, 1)
void lyra2v2_gpu_hash_32_3(uint32_t threads, uint2 *outputHash)
{
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
@ -395,10 +402,10 @@ void lyra2v2_gpu_hash_32_3(uint32_t threads, uint2 *outputHash) @@ -395,10 +402,10 @@ void lyra2v2_gpu_hash_32_3(uint32_t threads, uint2 *outputHash)
if (thread < threads)
{
state[0] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 0 + blockDim.x * blockIdx.x + threadIdx.x]);
state[1] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 1 + blockDim.x * blockIdx.x + threadIdx.x]);
state[2] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 2 + blockDim.x * blockIdx.x + threadIdx.x]);
state[3] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 3 + blockDim.x * blockIdx.x + threadIdx.x]);
state[0] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 0 + thread]);
state[1] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 1 + thread]);
state[2] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 2 + thread]);
state[3] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 3 + thread]);
for (int i = 0; i < 12; i++)
round_lyra_v5(state);
@ -436,7 +443,7 @@ void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uin @@ -436,7 +443,7 @@ void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uin
if (device_sm[dev_id] >= 500) {
const uint32_t tpb = 32;
const uint32_t tpb = TPB;
dim3 grid2((threads + tpb - 1) / tpb);
dim3 block2(tpb);

2
lyra2/lyra2REv2.cu

@ -96,7 +96,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc @@ -96,7 +96,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
int dev_id = device_map[thr_id];
int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 18;
int intensity = (device_sm[dev_id] < 500) ? 18 : is_windows() ? 19 : 20;
uint32_t throughput = cuda_default_throughput(dev_id, 1UL << intensity);
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);

Loading…
Cancel
Save