diff --git a/neoscrypt/cuda_neoscrypt.cu b/neoscrypt/cuda_neoscrypt.cu
index cda5f1c..dcea61f 100644
--- a/neoscrypt/cuda_neoscrypt.cu
+++ b/neoscrypt/cuda_neoscrypt.cu
@@ -15,6 +15,7 @@ typedef uint48 uint4x2;
 #ifdef __INTELLISENSE__
 #define __CUDA_ARCH__ 500
 #define __byte_perm(x,y,c) x
+#define __shfl(x,y,c) x
 #define atomicExch(p,x) x
 #endif
 
@@ -80,26 +81,6 @@ __constant__ uint32_t BLAKE2S_SIGMA[10][16] = {
 #define shf_r_clamp32(out,a,b,shift) \
 	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(out) : "r"(a), "r"(b), "r"(shift));
 
-__device__ __forceinline__
-static void shift256R4(uint32_t* ret, const uint8 &vec4, const uint32_t shift2)
-{
-#if __CUDA_ARCH__ >= 320
-	uint32_t shift = 32U - shift2;
-	asm("shf.r.clamp.b32 %0, 0, %1, %2;" : "=r"(ret[0]) : "r"(vec4.s0), "r"(shift));
-	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[1]) : "r"(vec4.s0), "r"(vec4.s1), "r"(shift));
-	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[2]) : "r"(vec4.s1), "r"(vec4.s2), "r"(shift));
-	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[3]) : "r"(vec4.s2), "r"(vec4.s3), "r"(shift));
-	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[4]) : "r"(vec4.s3), "r"(vec4.s4), "r"(shift));
-	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[5]) : "r"(vec4.s4), "r"(vec4.s5), "r"(shift));
-	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[6]) : "r"(vec4.s5), "r"(vec4.s6), "r"(shift));
-	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[7]) : "r"(vec4.s6), "r"(vec4.s7), "r"(shift));
-	asm("shr.b32 %0, %1, %2;" : "=r"(ret[8]) : "r"(vec4.s7), "r"(shift));
-#else
-	// to check
-	shift256R(ret, vec4, shift2);
-#endif
-}
-
 #if __CUDA_ARCH__ >= 300
 __device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
 {
@@ -165,6 +146,26 @@ __device__ __forceinline__ void WarpShuffle3(uint32_t &a1, uint32_t &a2, uint32_
 
 #if __CUDA_ARCH__ < 500
 
+__device__ __forceinline__
+static void shift256R4(uint32_t* ret, const uint8 &vec4, const uint32_t shift2)
+{
+#if __CUDA_ARCH__ >= 320
+	uint32_t shift = 32U - shift2;
+	asm("shf.r.clamp.b32 %0, 0, %1, %2;" : "=r"(ret[0]) : "r"(vec4.s0), "r"(shift));
+	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[1]) : "r"(vec4.s0), "r"(vec4.s1), "r"(shift));
+	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[2]) : "r"(vec4.s1), "r"(vec4.s2), "r"(shift));
+	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[3]) : "r"(vec4.s2), "r"(vec4.s3), "r"(shift));
+	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[4]) : "r"(vec4.s3), "r"(vec4.s4), "r"(shift));
+	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[5]) : "r"(vec4.s4), "r"(vec4.s5), "r"(shift));
+	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[6]) : "r"(vec4.s5), "r"(vec4.s6), "r"(shift));
+	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[7]) : "r"(vec4.s6), "r"(vec4.s7), "r"(shift));
+	asm("shr.b32 %0, %1, %2;" : "=r"(ret[8]) : "r"(vec4.s7), "r"(shift));
+#else
+	// to check
+	shift256R(ret, vec4, shift2);
+#endif
+}
+
 #define BLAKE(a, b, c, d, key1, key2) { \
 	a += key1; \
 	a += b; d = rotateL(d^a, 16); \
@@ -721,7 +722,6 @@ static __forceinline__ __device__
 void fastkdf256_v1(const uint32_t thread, const uint32_t nonce, uint32_t* const s_data)
 {
 	uint2x4 output[8];
-	uchar4 bufhelper;
 	uint32_t* B = (uint32_t*)&s_data[threadIdx.x * 64U];
 	uint32_t qbuf, rbuf, bitbuf;
 	uint32_t input[BLAKE2S_BLOCK_SIZE / 4];
@@ -787,6 +787,9 @@ void fastkdf256_v1(const uint32_t thread, const uint32_t nonce, uint32_t* const
 		for (int k = 0; k<8; k++)
 			asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[k]) : "r"(temp[k]), "r"(temp[k + 1]), "r"(bitbuf));
 
+#else
+		//#error SM 3.0 code missing here
+		printf("", data18, data20);
 #endif
 		Blake2S(input, input, key);
 	}
@@ -1095,6 +1098,7 @@ uint32_t fastkdf32_v1(uint32_t thread, const uint32_t nonce, uint32_t* const sal
 	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf));
 #else
 	//#error SM 3.0 code missing here
+	printf("", data18, data20);
 #endif
 	for (int k = 0; k < 9; k++) {
 		B0[(k + qbuf) & 0x3f] = temp[k];
@@ -1465,7 +1469,7 @@ static __thread uint32_t *Trans2 = NULL; // 2 streams
 static __thread uint32_t *Trans3 = NULL; // 2 streams
 
 __host__
-void neoscrypt_init_2stream(int thr_id, uint32_t threads)
+void neoscrypt_init(int thr_id, uint32_t threads)
 {
 	CUDA_SAFE_CALL(cudaMalloc(&d_NNonce[thr_id], 2 * sizeof(uint32_t)));
 	CUDA_SAFE_CALL(cudaMalloc(&hash1, 32 * 128 * sizeof(uint64_t) * min(8192, threads)));
@@ -1480,7 +1484,7 @@ void neoscrypt_init_2stream(int thr_id, uint32_t threads)
 }
 
 __host__
-void neoscrypt_free_2stream(int thr_id)
+void neoscrypt_free(int thr_id)
 {
 	cudaFree(d_NNonce[thr_id]);
 
@@ -1491,18 +1495,15 @@ void neoscrypt_free_2stream(int thr_id)
 }
 
 __host__
-void neoscrypt_hash_k4_2stream(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum)
+void neoscrypt_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum)
 {
 	CUDA_SAFE_CALL(cudaMemset(d_NNonce[thr_id], 0xff, 2 * sizeof(uint32_t)));
 
-	const int threadsperblock = TPB;
-	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
-	dim3 block(threadsperblock);
-
 	const int threadsperblock2 = TPB2;
 	dim3 grid2((threads + threadsperblock2 - 1) / threadsperblock2);
 	dim3 block2(threadsperblock2);
 
+	const int threadsperblock = TPB;
 	dim3 grid3((threads * 4 + threadsperblock - 1) / threadsperblock);
 	dim3 block3(4, threadsperblock >> 2);
 
diff --git a/neoscrypt/neoscrypt.cpp b/neoscrypt/neoscrypt.cpp
index ad094b6..fd38f23 100644
--- a/neoscrypt/neoscrypt.cpp
+++ b/neoscrypt/neoscrypt.cpp
@@ -6,9 +6,9 @@
 
 extern void neoscrypt_setBlockTarget(uint32_t* const data, uint32_t* const ptarget);
 
-extern void neoscrypt_init_2stream(int thr_id, uint32_t threads);
-extern void neoscrypt_free_2stream(int thr_id);
-extern void neoscrypt_hash_k4_2stream(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum);
+extern void neoscrypt_init(int thr_id, uint32_t threads);
+extern void neoscrypt_free(int thr_id);
+extern void neoscrypt_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum);
 
 static bool init[MAX_GPUS] = { 0 };
 
@@ -21,14 +21,12 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
 	int dev_id = device_map[thr_id];
 
 	int intensity = is_windows() ? 18 : 19;
-	if (strstr(device_name[dev_id], "GTX 10")) intensity = 20; // also need more than 2GB
+	if (strstr(device_name[dev_id], "GTX 10")) intensity = 21; // >= 20 need more than 2GB
 
 	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
 	throughput = throughput / 32; /* set for max intensity ~= 20 */
 	api_set_throughput(thr_id, throughput);
 
-	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce + 1);
-
 	if (opt_benchmark)
 		ptarget[7] = 0x00ff;
 
@@ -49,7 +47,7 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
 	}
 
 	gpulog(LOG_INFO, thr_id, "Using %d cuda threads", throughput);
-	neoscrypt_init_2stream(thr_id, throughput);
+	neoscrypt_init(thr_id, throughput);
 	init[thr_id] = true;
 	}
 
@@ -66,7 +64,7 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
 
 	do {
 		uint32_t foundNonces[2] = { UINT32_MAX, UINT32_MAX };
-		neoscrypt_hash_k4_2stream(thr_id, throughput, pdata[19], foundNonces, have_stratum);
+		neoscrypt_hash_k4(thr_id, throughput, pdata[19], foundNonces, have_stratum);
 
 		*hashes_done = pdata[19] - first_nonce + throughput;
 
@@ -111,7 +109,7 @@ void free_neoscrypt(int thr_id)
 
 	cudaThreadSynchronize();
 
-	neoscrypt_free_2stream(thr_id);
+	neoscrypt_free(thr_id);
 	init[thr_id] = false;
 
 	cudaDeviceSynchronize();
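
Note on the two "//#error SM 3.0 code missing here" stubs above: the fast paths in fastkdf256_v1 and fastkdf32_v1 rely on the SM 3.2+ shf.r.clamp.b32 funnel shift, and the older-architecture #else branch stays unimplemented; the added printf("", data18, data20) apparently only keeps those variables referenced so the compiler does not warn in that branch. If the branch were ever filled in, a portable equivalent could look like the sketch below. funnel_shift_r is a hypothetical helper, not part of this patch, and it assumes the shift count stays in [0,32] (which holds for bitbuf here):

// Hypothetical SM < 3.2 fallback for shf.r.clamp.b32 (a sketch, not in the patch).
// Returns the low 32 bits of the 64-bit value {hi:lo} shifted right by n, 0 <= n <= 32.
__device__ __forceinline__
static uint32_t funnel_shift_r(uint32_t lo, uint32_t hi, uint32_t n)
{
	return (uint32_t)((((uint64_t)hi << 32) | lo) >> n);
}

// Usage mirroring the inline asm loop in fastkdf256_v1:
//   for (int k = 0; k < 8; k++)
//       key[k] = funnel_shift_r(temp[k], temp[k + 1], bitbuf);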
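
Note on the intensity change in scanhash_neoscrypt: the arithmetic behind the new GTX 10xx default is easy to sanity-check from the values visible in the patch (the "more than 2GB" remark is the patch's own memory note, not derived here):

// Worked numbers for the new default, as a sketch:
uint32_t intensity  = 21;              // raised from 20
uint32_t requested  = 1U << intensity; // 2,097,152 nonces requested
uint32_t throughput = requested / 32;  // 65,536 cuda threads reported by gpulog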