veltor update, 10x faster :p

From Alexis work, sib hash rate 200% also..
8 years ago · 36aedbb48e
5 changed files with 971 additions and 1071 deletions
--- a/ccminer.cpp
+++ b/ccminer.cpp
@ -246,16 +246,16 @@ Options:\n\
 			skein       Skein SHA2 (Skeincoin)\n\
 			skein2      Double Skein (Woodcoin)\n\
 			s3          S3 (1Coin)\n\
 			vanilla     Blake256-8 (VNL)\n\
 			veltor      Thorsriddle streebog\n\
 			whirlcoin   Old Whirlcoin (Whirlpool algo)\n\
 			whirlpool   Whirlpool algo\n\
 			x11evo      Permuted x11 (Revolver)\n\
 			x11         X11 (DarkCoin)\n\
 			x13         X13 (MaruCoin)\n\
 			x14         X14\n\
 			x15         X15\n\
 			x17         X17\n\
 			vanilla     Blake256-8 (VNL)\n\
 			whirlcoin   Old Whirlcoin (Whirlpool algo)\n\
 			whirlpool   Whirlpool algo\n\
 			zr5         ZR5 (ZiftrCoin)\n\
  -d, --devices         Comma separated list of CUDA devices to use.\n\
                        Device IDs start counting from 0! Alternatively takes\n\
--- a/neoscrypt/cuda_vectors.h
+++ b/neoscrypt/cuda_vectors.h
@ -482,7 +482,7 @@ static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift
 // require a uint32_t[9] ret array
 // note: djm neoscrypt implementation is near the limits of gpu capabilities
 //       and weird behaviors can happen when tuning device functions code...
-__device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
+__device__ static void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
 {
 	uint8_t *v = (uint8_t*) &vec4.s0;
 	uint8_t *r = (uint8_t*) ret;
@ -496,7 +496,7 @@ __device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
 #else
 // same for SM 3.5+, really faster ?
-__device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
+__device__ static void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
 {
 	uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0;
 	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift));
--- a/x11/cuda_streebog.cu
+++ b/x11/cuda_streebog.cu
--- a/x11/sib.cu
+++ b/x11/sib.cu
@ -17,7 +17,7 @@ extern "C" {
 #include "cuda_helper.h"
 #include "cuda_x11.h"
-extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
+extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
 #include <stdio.h>
 #include <memory.h>
@ -104,9 +104,11 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
-	int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 19 : 18;
+	const int dev_id = device_map[thr_id];
-	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8;
+	int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 19 : 18; // 2^18 = 262144 cuda threads
-	//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+	if (device_sm[dev_id] >= 600) intensity = 20;
 	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
 	if (opt_benchmark)
 		ptarget[7] = 0xf;
@ -132,9 +134,9 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
 		x11_shavite512_cpu_init(thr_id, throughput);
 		x11_echo512_cpu_init(thr_id, throughput);
 		if (x11_simd512_cpu_init(thr_id, throughput) != 0) {
-			return 0;
+			return -1;
 		}
-		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), -1);
 		cuda_check_cpu_init(thr_id, throughput);
@ -165,7 +167,7 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
 		TRACE("jh512  :");
 		quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		TRACE("keccak :");
-		streebog_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
 		TRACE("gost   :");
 		x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
 		TRACE("luffa+c:");
@ -186,7 +188,6 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
 			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
 				int res = 1;
 				// check if there was some other ones...
 				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
 				work_set_target_ratio(work, vhash64);
 				*hashes_done = pdata[19] - first_nonce + throughput;
@ -200,7 +201,7 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
 				}
 				pdata[19] = foundNonce;
 				return res;
-			} else {
+			} else if (vhash64[7] > Htarg && !opt_quiet) {
 				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 				pdata[19] = foundNonce + 1;
 				continue;
--- a/x11/veltor.cu
+++ b/x11/veltor.cu
@ -9,29 +9,29 @@ extern "C" {
 #include "cuda_helper.h"
 #include "cuda_x11.h"
 extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
 extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads);
 extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void skein512_cpu_setBlock_80(void *pdata);
 extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);
 extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
 extern void streebog_set_target(const uint32_t* ptarget);
 #include <stdio.h>
 #include <memory.h>
 #define NBN 2
 static uint32_t *d_hash[MAX_GPUS];
 static uint32_t *d_resNonce[MAX_GPUS];
-// Veltor CPU Hash
+// veltorcoin CPU Hash
 extern "C" void veltorhash(void *output, const void *input)
 {
-	uint8_t _ALIGN(64) hash[128] = { 0 };
+	unsigned char _ALIGN(128) hash[128] = { 0 };
 	sph_skein512_context ctx_skein;
 	sph_shavite512_context ctx_shavite;
 	sph_shabal512_context ctx_shabal;
 	sph_gost512_context ctx_gost;
 	sph_shabal512_context ctx_shabal;
 	sph_shavite512_context ctx_shavite;
 	sph_skein512_init(&ctx_skein);
 	sph_skein512(&ctx_skein, input, 80);
@ -52,19 +52,18 @@ extern "C" void veltorhash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }
 //#define _DEBUG
 #define _DEBUG_PREFIX "veltor"
 #include "cuda_debug.cuh"
 static bool init[MAX_GPUS] = { 0 };
 extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 {
 	int dev_id = device_map[thr_id];
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
-	int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 20 : 18;
+	int intensity = (device_sm[device_map[thr_id]] > 500) ? 20 : 18;
-	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8;
+	if (strstr(device_name[dev_id], "GTX 10")) intensity = 21;
 	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
 	//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
 	if (opt_benchmark)
@ -79,58 +78,59 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
 		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 		quark_skein512_cpu_init(thr_id, throughput);
 		x11_shavite512_cpu_init(thr_id, throughput);
 		x14_shabal512_cpu_init(thr_id, throughput);
 		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
-
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)), -1);
 		cuda_check_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}
-	uint32_t endiandata[20];
+	uint32_t _ALIGN(64) h_resNonce[NBN];
 	uint32_t _ALIGN(64) endiandata[20];
 	for (int k=0; k < 20; k++)
 		be32enc(&endiandata[k], pdata[k]);
 	skein512_cpu_setBlock_80(endiandata);
-	cuda_check_cpu_setTarget(ptarget);
+
 	cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
 	streebog_set_target(ptarget);
 	do {
 		int order = 0;
 		uint32_t foundNonce;
 		// Hash with CUDA
 		skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
 		TRACE("blake  :");
 		x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		TRACE("shavite:");
 		x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		TRACE("shabal :");
+		streebog_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
-		streebog_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]);
+
-		TRACE("gost   :");
+		cudaMemcpy(h_resNonce, d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost);
 		*hashes_done = pdata[19] - first_nonce + throughput;
-		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		if (h_resNonce[0] != UINT32_MAX)
 		if (foundNonce != UINT32_MAX)
 		{
 			const uint32_t Htarg = ptarget[7];
 			uint32_t _ALIGN(64) vhash[8];
-			be32enc(&endiandata[19], foundNonce);
+			const uint32_t Htarg = ptarget[7];
-			veltorhash(vhash, endiandata);
+			const uint32_t startNounce = pdata[19];
-			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+			be32enc(&endiandata[19], startNounce + h_resNonce[0]);
 			veltorhash(vhash, endiandata);
 			if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
 			{
 				int res = 1;
 				// check if there was another one...
 				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], res);
 				work_set_target_ratio(work, vhash);
-				if (secNonce != 0) {
+				work->nonces[0] = startNounce + h_resNonce[0];
 				if (h_resNonce[1] != UINT32_MAX)
 				{
 					uint32_t secNonce = work->nonces[1] = startNounce + h_resNonce[1];
 					gpulog(LOG_DEBUG, thr_id, "Found 2nd nonce: %08x", secNonce);
 					be32enc(&endiandata[19], secNonce);
 					veltorhash(vhash, endiandata);
 					work->nonces[1] = secNonce;
 					if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) {
 						work_set_target_ratio(work, vhash);
 						xchg(work->nonces[1], work->nonces[0]);
@ -139,24 +139,25 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce
 					}
 					res++;
 				}
-				pdata[19] = work->nonces[0] = foundNonce;
+				pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; // next scan
 				return res;
-			} else {
+			}
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
+			else if (vhash[7] > Htarg && !opt_quiet) {
-				pdata[19] = foundNonce + 1;
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", h_resNonce[0]);
-				continue;
+				cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
 			}
 		}
 		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
 			pdata[19] = max_nonce;
 			break;
 		}
 		pdata[19] += throughput;
 	} while (!work_restart[thr_id].restart);
 	*hashes_done = pdata[19] - first_nonce;
 	return 0;
 }