diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 400d1f7..f97035c 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -246,6 +246,7 @@ + @@ -524,4 +525,4 @@ - + \ No newline at end of file diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index bd135ac..82aed9b 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -79,6 +79,9 @@ {2037fd0d-e7a2-4da8-956c-150aec726a99} + + {1613763f-895c-4321-b58b-6f5849868956} + @@ -419,6 +422,12 @@ Header Files + + Source Files\CUDA\lyra2 + + + Source Files\CUDA\lyra2 + @@ -607,21 +616,6 @@ Source Files\CUDA\Algo256 - - Source Files\CUDA - - - Source Files\CUDA - - - Source Files\CUDA - - - Source Files\CUDA - - - Source Files\CUDA - Source Files\CUDA @@ -670,6 +664,18 @@ Source Files\CUDA\Algo256 + + Source Files\CUDA\lyra2 + + + Source Files\CUDA\lyra2 + + + Source Files\CUDA\lyra2 + + + Source Files\CUDA\lyra2 + @@ -686,4 +692,4 @@ Ressources - + \ No newline at end of file diff --git a/lyra2/cuda_lyra2.cu b/lyra2/cuda_lyra2.cu index 82c31a8..c40b4a5 100644 --- a/lyra2/cuda_lyra2.cu +++ b/lyra2/cuda_lyra2.cu @@ -11,6 +11,11 @@ #include "cuda_lyra2_sm2.cuh" +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#define __CUDA_ARCH__ 500 +#endif + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 500 #include "cuda_lyra2_vectors.h" diff --git a/lyra2/cuda_lyra2_sm2.cuh b/lyra2/cuda_lyra2_sm2.cuh index b0ff1ec..7998d17 100644 --- a/lyra2/cuda_lyra2_sm2.cuh +++ b/lyra2/cuda_lyra2_sm2.cuh @@ -10,9 +10,8 @@ #define TPB30 160 -#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ <= 350 - -static __constant__ uint2 blake2b_IV[8] = { +#if (__CUDA_ARCH__ >= 200 && __CUDA_ARCH__ <= 350) || !defined(__CUDA_ARCH__) +__constant__ static uint2 blake2b_IV[8] = { { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, @@ -22,6 +21,9 @@ static __constant__ uint2 blake2b_IV[8] = { { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } }; +#endif + +#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ <= 350 #define reduceDuplexRow(rowIn, rowInOut, rowOut) { \ for (int i = 0; i < 8; i++) { \ diff --git a/lyra2/cuda_lyra2v2.cu b/lyra2/cuda_lyra2v2.cu index 9661a9a..5493c1a 100644 --- a/lyra2/cuda_lyra2v2.cu +++ b/lyra2/cuda_lyra2v2.cu @@ -2,27 +2,26 @@ #include #include -#ifdef __INTELLISENSE__ -/* just for vstudio code colors */ -#define __CUDA_ARCH__ 500 -#endif - #define TPB52 10 #define TPB50 16 #include "cuda_lyra2v2_sm3.cuh" +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#define __CUDA_ARCH__ 500 +#endif + #if __CUDA_ARCH__ >= 500 #include "cuda_lyra2_vectors.h" #define Nrow 4 #define Ncol 4 -#define u64type uint2 -#define vectype uint28 +#define uint4x2 uint28 #define memshift 3 -__device__ vectype *DMatrix; +__device__ uint4x2 *DMatrix; __device__ __forceinline__ void Gfunc_v5(uint2 &a, uint2 &b, uint2 &c, uint2 &d) @@ -34,7 +33,7 @@ void Gfunc_v5(uint2 &a, uint2 &b, uint2 &c, uint2 &d) } __device__ __forceinline__ -void round_lyra_v5(vectype* s) +void round_lyra_v5(uint4x2* s) { Gfunc_v5(s[0].x, s[1].x, s[2].x, s[3].x); Gfunc_v5(s[0].y, s[1].y, s[2].y, s[3].y); @@ -48,9 +47,9 @@ void round_lyra_v5(vectype* s) } __device__ __forceinline__ -void reduceDuplex(vectype state[4], uint32_t thread) +void reduceDuplex(uint4x2 state[4], uint32_t thread) { - vectype state1[3]; + uint4x2 state1[3]; uint32_t ps1 = (Nrow * Ncol * memshift * thread); uint32_t ps2 = (memshift * (Ncol-1) + memshift * Ncol + Nrow * Ncol * memshift * thread); @@ -81,7 +80,7 @@ void reduceDuplex(vectype state[4], uint32_t thread) } __device__ __forceinline__ -void reduceDuplex50(vectype state[4], uint32_t thread) +void reduceDuplex50(uint4x2 state[4], uint32_t thread) { uint32_t ps1 = (Nrow * Ncol * memshift * thread); uint32_t ps2 = (memshift * (Ncol - 1) + memshift * Ncol + Nrow * Ncol * memshift * thread); @@ -105,9 +104,9 @@ void reduceDuplex50(vectype state[4], uint32_t thread) } __device__ __forceinline__ -void reduceDuplexRowSetupV2(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread) +void reduceDuplexRowSetupV2(const int rowIn, const int rowInOut, const int rowOut, uint4x2 state[4], uint32_t thread) { - vectype state2[3], state1[3]; + uint4x2 state2[3], state1[3]; uint32_t ps1 = (memshift * Ncol * rowIn + Nrow * Ncol * memshift * thread); uint32_t ps2 = (memshift * Ncol * rowInOut + Nrow * Ncol * memshift * thread); @@ -152,7 +151,7 @@ void reduceDuplexRowSetupV2(const int rowIn, const int rowInOut, const int rowOu #pragma unroll for (int j = 0; j < 3; j++) { - vectype tmp = state1[j] + state2[j]; + uint4x2 tmp = state1[j] + state2[j]; state[j] ^= tmp; } @@ -180,9 +179,9 @@ void reduceDuplexRowSetupV2(const int rowIn, const int rowInOut, const int rowOu __device__ __forceinline__ -void reduceDuplexRowtV2(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread) +void reduceDuplexRowtV2(const int rowIn, const int rowInOut, const int rowOut, uint4x2* state, uint32_t thread) { - vectype state1[3],state2[3]; + uint4x2 state1[3],state2[3]; uint32_t ps1 = (memshift * Ncol * rowIn + Nrow * Ncol * memshift * thread); uint32_t ps2 = (memshift * Ncol * rowInOut + Nrow * Ncol * memshift * thread); uint32_t ps3 = (memshift * Ncol * rowOut + Nrow * Ncol * memshift * thread); @@ -260,9 +259,9 @@ void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHa { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - vectype state[4]; + uint4x2 state[4]; - uint28 blake2b_IV[2]; + uint4x2 blake2b_IV[2]; if (threadIdx.x == 0) { @@ -344,7 +343,9 @@ void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHa } #else #include "cuda_helper.h" +#if __CUDA_ARCH__ < 300 __device__ void* DMatrix; +#endif __global__ void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHash) {} #endif diff --git a/lyra2/cuda_lyra2v2_sm3.cuh b/lyra2/cuda_lyra2v2_sm3.cuh index e9d992d..e928e75 100644 --- a/lyra2/cuda_lyra2v2_sm3.cuh +++ b/lyra2/cuda_lyra2v2_sm3.cuh @@ -11,6 +11,8 @@ #if __CUDA_ARCH__ >= 300 && __CUDA_ARCH__ < 500 +#include "cuda_lyra2_vectors.h" + #define Nrow 4 #define Ncol 4