diff --git a/lyra2/cuda_lyra2v2.cu b/lyra2/cuda_lyra2v2.cu
index f346a05..c3f89e6 100644
--- a/lyra2/cuda_lyra2v2.cu
+++ b/lyra2/cuda_lyra2v2.cu
@@ -1,3 +1,8 @@
+/**
+ * Lyra2 (v2) CUDA Implementation
+ *
+ * Based on djm34/VTC sources and incredible 2x boost by Nanashi Meiyo-Meijin (May 2016)
+ */
 #include 
 #include 
 #include 
@@ -17,6 +22,8 @@
 #define Ncol 4
 #define memshift 3
 
+#define TPB 32
+
 __device__ uint2x4 *DMatrix;
 
 __device__ __forceinline__ uint2 LD4S(const int index)
@@ -303,7 +310,7 @@ void reduceDuplexRowt2x4(const int rowInOut, uint2 state[4])
 }
 
 __global__
-__launch_bounds__(32, 1)
+__launch_bounds__(TPB, 1)
 void lyra2v2_gpu_hash_32_1(uint32_t threads, uint2 *inputHash)
 {
 	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
@@ -342,15 +349,15 @@ void lyra2v2_gpu_hash_32_1(uint32_t threads, uint2 *inputHash)
 		for (int i = 0; i<12; i++)
 			round_lyra_v5(state);
 
-		DMatrix[blockDim.x * gridDim.x * 0 + blockDim.x * blockIdx.x + threadIdx.x] = state[0];
-		DMatrix[blockDim.x * gridDim.x * 1 + blockDim.x * blockIdx.x + threadIdx.x] = state[1];
-		DMatrix[blockDim.x * gridDim.x * 2 + blockDim.x * blockIdx.x + threadIdx.x] = state[2];
-		DMatrix[blockDim.x * gridDim.x * 3 + blockDim.x * blockIdx.x + threadIdx.x] = state[3];
+		DMatrix[blockDim.x * gridDim.x * 0 + thread] = state[0];
+		DMatrix[blockDim.x * gridDim.x * 1 + thread] = state[1];
+		DMatrix[blockDim.x * gridDim.x * 2 + thread] = state[2];
+		DMatrix[blockDim.x * gridDim.x * 3 + thread] = state[3];
 	}
 }
 
 __global__
-__launch_bounds__(32, 1)
+__launch_bounds__(TPB, 1)
 void lyra2v2_gpu_hash_32_2(uint32_t threads)
 {
 	const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y;
@@ -386,7 +393,7 @@ void lyra2v2_gpu_hash_32_2(uint32_t threads)
 }
 
 __global__
-__launch_bounds__(32, 1)
+__launch_bounds__(TPB, 1)
 void lyra2v2_gpu_hash_32_3(uint32_t threads, uint2 *outputHash)
 {
 	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
@@ -395,10 +402,10 @@ void lyra2v2_gpu_hash_32_3(uint32_t threads, uint2 *outputHash)
 
 	if (thread < threads)
 	{
-		state[0] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 0 + blockDim.x * blockIdx.x + threadIdx.x]);
-		state[1] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 1 + blockDim.x * blockIdx.x + threadIdx.x]);
-		state[2] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 2 + blockDim.x * blockIdx.x + threadIdx.x]);
-		state[3] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 3 + blockDim.x * blockIdx.x + threadIdx.x]);
+		state[0] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 0 + thread]);
+		state[1] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 1 + thread]);
+		state[2] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 2 + thread]);
+		state[3] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 3 + thread]);
 
 		for (int i = 0; i < 12; i++)
 			round_lyra_v5(state);
@@ -436,7 +443,7 @@ void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uin
 
 	if (device_sm[dev_id] >= 500)
 	{
-		const uint32_t tpb = 32;
+		const uint32_t tpb = TPB;
 
 		dim3 grid2((threads + tpb - 1) / tpb);
 		dim3 block2(tpb);
diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu
index 2308d0c..f7342f2 100644
--- a/lyra2/lyra2REv2.cu
+++ b/lyra2/lyra2REv2.cu
@@ -96,7 +96,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
 	int dev_id = device_map[thr_id];
-	int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 18;
+	int intensity = (device_sm[dev_id] < 500) ? 18 : is_windows() ? 19 : 20;
 	uint32_t throughput = cuda_default_throughput(dev_id, 1UL << intensity);
 
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
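
Aside from the TPB macro and the DMatrix index cleanup (both behaviour-preserving, since thread == blockDim.x * blockIdx.x + threadIdx.x), the one functional change is the default intensity in scanhash_lyra2v2: below SM 5.0 it stays 18, Windows hosts on SM 5.0+ move from 18 to 19, and other SM 5.0+ hosts get 20. The following is a minimal standalone sketch of that ternary for reference only; it is not part of the patch, and pick_intensity plus the sample SM/OS values are illustrative stand-ins for ccminer's device_sm[] table and is_windows() helper.

// sketch_intensity.cpp - illustrates the new intensity selection logic
#include <cstdio>

// Mirrors the rewritten ternary from lyra2REv2.cu (names are hypothetical).
static int pick_intensity(int sm, bool windows)
{
	// Pre-Maxwell -> 18, Windows on SM 5.0+ -> 19, otherwise -> 20.
	return (sm < 500) ? 18 : windows ? 19 : 20;
}

int main()
{
	// cuda_default_throughput() then scans roughly 1 << intensity nonces per call.
	printf("SM 3.5, Linux   -> intensity %d\n", pick_intensity(350, false)); // 18
	printf("SM 5.2, Windows -> intensity %d\n", pick_intensity(520, true));  // 19
	printf("SM 5.2, Linux   -> intensity %d\n", pick_intensity(520, false)); // 20
	return 0;
}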