lbry maxwell and pascal update (up to 10% on pascal)

Based on alexis78 work and sponsored by LBRY.IO team (thanks) Release 1.8.2, use cuda 8 for x86
8 years ago · 9f2ed5135b
9 changed files with 1588 additions and 460 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -52,7 +52,7 @@ ccminer_SOURCES	= elist.h miner.h compat.h \
 			  sph/hamsi.c sph/hamsi_helper.c sph/streebog.c \
 			  sph/shabal.c sph/whirlpool.c sph/sha2big.c sph/haval.c \
 			  sph/ripemd.c sph/sph_sha2.c \
-			  lbry/lbry.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu \
+			  lbry/lbry.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu lbry/cuda_lbry_merged.cu \
 			  qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \
 			  x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
 			  x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
--- a/README.txt
+++ b/README.txt
@ -1,5 +1,5 @@
-ccMiner 1.8.2 (August 2016) "Veltor algo Thor's Riddle streebog"
+ccMiner 1.8.2 (Sept 2016) "Veltor and lbry boost"
 ---------------------------------------------------------------
 ***************************************************************
@ -245,9 +245,10 @@ features.
 >>> RELEASE HISTORY <<<
-  Aug. 20th 2016  v1.8.2
+  Sep. 22th 2016  v1.8.2
                  lbry improvements by Alexis Provos
                  Prevent Windows hibernate while mining
-                  veltor algo
+                  veltor algo (basic implementation)
  Aug. 10th 2016  v1.8.1
                  SIA Blake2-B Algo (getwork over stratum for Suprnova)
--- a/ccminer.cpp
+++ b/ccminer.cpp
@ -2430,7 +2430,7 @@ static void *miner_thread(void *userdata)
 				pthread_mutex_lock(&stats_lock);
 				thr_hashrates[thr_id] = hashes_done / dtime;
 				thr_hashrates[thr_id] *= rate_factor;
-				if (loopcnt > 1) // ignore first (init time)
+				if (loopcnt > 2) // ignore first (init time)
 					stats_remember_speed(thr_id, hashes_done, thr_hashrates[thr_id], (uint8_t) rc, work.height);
 				pthread_mutex_unlock(&stats_lock);
 			}
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -39,7 +39,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings" Condition="'$(Platform)'=='Win32'">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.props" />
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 8.0.props" />
  </ImportGroup>
  <ImportGroup Label="ExtensionSettings" Condition="'$(Platform)'=='x64'">
    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.props" />
@ -429,6 +429,7 @@
    <CudaCompile Include="Algo256\cuda_skein256.cu" />
    <CudaCompile Include="lbry\cuda_sha256_lbry.cu" />
    <CudaCompile Include="lbry\cuda_sha512_lbry.cu" />
    <CudaCompile Include="lbry\cuda_lbry_merged.cu" />
    <CudaCompile Include="lbry\lbry.cu" />
    <CudaCompile Include="pentablake.cu">
      <MaxRegCount>80</MaxRegCount>
@ -534,7 +535,7 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets" Condition="'$(Platform)'=='Win32'">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.targets" />
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 8.0.targets" />
  </ImportGroup>
  <ImportGroup Label="ExtensionTargets" Condition="'$(Platform)'=='x64'">
    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.targets" />
--- a/cuda_helper.h
+++ b/cuda_helper.h
@ -481,7 +481,7 @@ static __device__ __forceinline__ uint2 operator~ (uint2 a) { return make_uint2(
 static __device__ __forceinline__ void operator^= (uint2 &a, uint2 b) { a = a ^ b; }
 static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b) {
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) && CUDA_VERSION < 7000
 	uint2 result;
 	asm("{ // uint2 a+b \n\t"
 		"add.cc.u32 %0, %2, %4; \n\t"
--- a/lbry/cuda_lbry_merged.cu
+++ b/lbry/cuda_lbry_merged.cu
--- a/lbry/cuda_sha256_lbry.cu
+++ b/lbry/cuda_sha256_lbry.cu
--- a/lbry/cuda_sha512_lbry.cu
+++ b/lbry/cuda_sha512_lbry.cu
@ -1,20 +1,16 @@
 /**
 * sha-512 CUDA implementation.
- * Tanguy Pruvot and Provos Alexis - JUL 2016
+ * Tanguy Pruvot and Provos Alexis - Jul / Sep 2016
 * Sponsored by LBRY.IO team
 */
 //#define USE_ROT_ASM_OPT 0
 #include <cuda_helper.h>
 #include <cuda_vector_uint2x4.h>
-#include "miner.h"
+
-
+#include <miner.h>
-static __constant__
+
-#if __CUDA_ARCH__ > 500
+static __constant__ _ALIGN(8) uint64_t K_512[80] = {
 _ALIGN(16)
 #else
 _ALIGN(8)
 #endif
 uint64_t K_512[80] = {
 	0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
 	0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
 	0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
@ -58,12 +54,12 @@ uint64_t ROR64_8(const uint64_t u64) {
 #define andor64(a,b,c) ((a & (b | c)) | (b & c))
 #define xandx64(e,f,g) (g ^ (e & (g ^ f)))
-static __device__ __forceinline__
+__device__ __forceinline__
-void sha512_step2(uint64_t* r, const uint64_t W, const uint64_t K, const int ord)
+static void sha512_step2(uint64_t *const r,const uint64_t W,const uint64_t K, const int ord)
 {
 	const uint64_t T1 = r[(15-ord) & 7] + K + W + bsg5_1(r[(12-ord) & 7]) + xandx64(r[(12-ord) & 7],r[(13-ord) & 7],r[(14-ord) & 7]);
 	r[(15-ord) & 7] = andor64(r[(8-ord) & 7],r[(9-ord) & 7],r[(10-ord) & 7]) + bsg5_0(r[(8-ord) & 7]) + T1;
-	r[(11-ord) & 7] += T1;
+	r[(11-ord) & 7]+= T1;
 }
 /**************************************************************************************************/
@ -76,7 +72,6 @@ void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
 		0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
 		0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
 	};
 	uint64_t r[8];
 	uint64_t W[16];
 	if (thread < threads)
@ -91,7 +86,7 @@ void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
 		W[4] = 0x8000000000000000; // end tag
 		#pragma unroll
-		for (int i = 5; i < 15; i++) W[i] = 0;
+		for (uint32_t i = 5; i < 15; i++) W[i] = 0;
 		W[15] = 0x100; // 256 bits
@ -100,31 +95,32 @@ void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
 			sha512_step2(r, W[i], K_512[i], i&7);
 		}
 		#pragma unroll 5
 		for (uint32_t i = 16; i < 80; i+=16){
 			#pragma unroll
-		for (int i = 16; i < 80; i+=16) {
+			for (uint32_t j = 0; j<16; j++){
 			#pragma unroll
 			for (int j = 0; j<16; j++) {
 				W[(i + j) & 15] += W[((i + j) - 7) & 15] + ssg5_0(W[((i + j) - 15) & 15]) + ssg5_1(W[((i + j) - 2) & 15]);
 			}
 			#pragma unroll
-			for (int j = 0; j<16; j++) {
+			for (uint32_t j = 0; j<16; j++){
 				sha512_step2(r, W[j], K_512[i+j], (i+j)&7);
 			}
 		}
 		#pragma unroll 8
-		for (int i = 0; i < 8; i++)
+		for (uint32_t i = 0; i < 8; i++)
 			r[i] = cuda_swab64(r[i] + IV512[i]);
 		*(uint2x4*)&pHash[0] = *(uint2x4*)&r[0];
 		*(uint2x4*)&pHash[4] = *(uint2x4*)&r[4];
 	}
 }
 __host__
 void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash)
 {
-	const uint32_t threadsperblock = 512;
+	const uint32_t threadsperblock = 256;
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
--- a/lbry/lbry.cu
+++ b/lbry/lbry.cu
@ -1,8 +1,9 @@
 /**
- * Lbry CUDA Implementation
+ * Lbry Algo (sha-256 / sha-512 / ripemd)
 *
- * by tpruvot@github - July 2016
+ * tpruvot and Provos Alexis - Jul / Sep 2016
 *
 * Sponsored by LBRY.IO team
 */
 #include <string.h>
@ -64,11 +65,14 @@ extern "C" void lbry_hash(void* output, const void* input)
 extern void lbry_sha256_init(int thr_id);
 extern void lbry_sha256_free(int thr_id);
-extern void lbry_sha256_setBlock_112(uint32_t *pdata, uint32_t *ptarget);
+extern void lbry_sha256_setBlock_112(uint32_t *pdata);
 extern void lbry_sha256d_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash);
 extern void lbry_sha512_init(int thr_id);
 extern void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash);
-extern int lbry_sha256d_hash_final(int thr_id, uint32_t threads, uint32_t *d_inputHash, uint32_t *d_resNonce);
+extern void lbry_sha256d_hash_final(int thr_id, uint32_t threads, uint32_t *d_inputHash, uint32_t *d_resNonce, const uint64_t target64);
 extern void lbry_sha256_setBlock_112_merged(uint32_t *pdata);
 extern void lbry_merged(int thr_id,uint32_t startNonce, uint32_t threads, uint32_t *d_resNonce, const uint64_t target64);
 static __inline uint32_t swab32_if(uint32_t val, bool iftrue) {
 	return iftrue ? swab32(val) : val;
@ -97,7 +101,7 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
 	if (device_sm[dev_id] < 350) intensity = 18;
 	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
-	//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
 	if (opt_benchmark) {
 		ptarget[7] = 0xf;
@ -107,17 +111,16 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
 		cudaSetDevice(dev_id);
 		if (opt_cudaschedule == -1 && gpu_threads == 1) {
 			cudaDeviceReset();
-			// reduce cpu usage (linux)
+			// reduce cpu usage
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
 			CUDA_LOG_ERROR();
 		}
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
+		if(device_sm[dev_id] <= 500)
-		CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t)));
+			CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 8 * sizeof(uint64_t) * throughput));
-		lbry_sha256_init(thr_id);
+		CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t)));
 		lbry_sha512_init(thr_id);
 		CUDA_LOG_ERROR();
 		init[thr_id] = true;
@ -127,29 +130,29 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
 		be32enc(&endiandata[i], pdata[i]);
 	}
-	lbry_sha256_setBlock_112(endiandata, ptarget);
+	if(device_sm[dev_id] <= 500)
 		lbry_sha256_setBlock_112(endiandata);
 	else
 		lbry_sha256_setBlock_112_merged(endiandata);
 	cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t));
 	do {
 		// Hash with CUDA
 		if(device_sm[dev_id] <= 500){
 			lbry_sha256d_hash_112(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id]);
 			lbry_sha512_hash_32(thr_id, throughput, d_hash[thr_id]);
-
+			lbry_sha256d_hash_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], *(uint64_t*)&ptarget[6]);
-		uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
+		}else{
-		int err = lbry_sha256d_hash_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
+			lbry_merged(thr_id,pdata[LBC_NONCE_OFT32], throughput, d_resNonce[thr_id], *(uint64_t*)&ptarget[6]);
 		if (err) {
 			// reinit
 			free_lbry(thr_id);
 			return -1;
 		}
-
+		uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
 		cudaMemcpy(resNonces, d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
 		*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce + throughput;
 		if (resNonces[0] != UINT32_MAX)
 		{
 			const uint32_t startNonce = pdata[LBC_NONCE_OFT32];
 			resNonces[0] += startNonce;
 			endiandata[LBC_NONCE_OFT32] = swab32_if(resNonces[0], !swap);
@ -157,12 +160,13 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
 			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
 				int res = 1;
 				*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce + throughput;
 				work->nonces[0] = swab32_if(resNonces[0], swap);
 				work_set_target_ratio(work, vhash);
 				if (resNonces[1] != UINT32_MAX) {
 					resNonces[1] += startNonce;
 					if (opt_debug)
-						gpulog(LOG_BLUE, thr_id, "found second nonce %08x", resNonces[1]);
+						gpulog(LOG_BLUE, thr_id, "Found second nonce %08x", swab32(resNonces[1]));
 					endiandata[LBC_NONCE_OFT32] = swab32_if(resNonces[1], !swap);
 					lbry_hash(vhash, endiandata);
 					work->nonces[1] = swab32_if(resNonces[1], swap);
@ -189,22 +193,23 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce,
 	} while (!work_restart[thr_id].restart);
-	*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce + 1;
+	*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce;
 	return 0;
 }
 // cleanup
-extern "C" void free_lbry(int thr_id)
+void free_lbry(int thr_id)
 {
 	if (!init[thr_id])
 		return;
 	cudaThreadSynchronize();
 	if(device_sm[device_map[thr_id]]<=500)
 		cudaFree(d_hash[thr_id]);
 	cudaFree(d_resNonce[thr_id]);
 	lbry_sha256_free(thr_id);
 	init[thr_id] = false;