streebog: apply skunk improvements to veltor

7 years ago · 1e71dc5782
5 changed files with 33 additions and 17 deletions
--- a/configure.ac
+++ b/configure.ac
@ -1,4 +1,4 @@
				@@ -1,4 +1,4 @@
-AC_INIT([ccminer], [2.2], [], [ccminer], [http://github.com/tpruvot/ccminer])
+AC_INIT([ccminer], [2.2.1], [], [ccminer], [http://github.com/tpruvot/ccminer])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/skunk/cuda_skunk_streebog.cu
+++ b/skunk/cuda_skunk_streebog.cu
@ -204,7 +204,7 @@ static void GOST_E12(const uint2 shared[8][256],uint2 *const __restrict__ K, uin
				@@ -204,7 +204,7 @@ static void GOST_E12(const uint2 shared[8][256],uint2 *const __restrict__ K, uin
 __constant__ uint64_t target64[4];

 __host__
-void skunk_set_target(uint32_t* ptarget)
+void skunk_streebog_set_target(uint32_t* ptarget)
 {
 	cudaMemcpyToSymbol(target64, ptarget, 4*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
 }
--- a/skunk/skunk.cu
+++ b/skunk/skunk.cu
@ -23,12 +23,12 @@ extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t s
				@@ -23,12 +23,12 @@ extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t s
 extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
 extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void x13_fugue512_cpu_free(int thr_id);
-extern void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
-extern void streebog_set_target(const uint32_t* ptarget);
+extern void streebog_sm3_set_target(uint32_t* ptarget);
+extern void streebog_sm3_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);

 // krnlx merged kernel (for high-end cards only)
 extern void skunk_cpu_init(int thr_id, uint32_t threads);
-extern void skunk_set_target(uint32_t* ptarget);
+extern void skunk_streebog_set_target(uint32_t* ptarget);
 extern void skunk_setBlock_80(int thr_id, void *pdata);
 extern void skunk_cuda_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
 extern void skunk_cuda_streebog(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
@ -117,10 +117,10 @@ extern "C" int scanhash_skunk(int thr_id, struct work* work, uint32_t max_nonce,
				@@ -117,10 +117,10 @@ extern "C" int scanhash_skunk(int thr_id, struct work* work, uint32_t max_nonce,
 	cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
 	if (use_compat_kernels[thr_id]) {
 		skein512_cpu_setBlock_80(endiandata);
-		streebog_set_target(ptarget);
+		streebog_sm3_set_target(ptarget);
 	} else {
 		skunk_setBlock_80(thr_id, endiandata);
-		skunk_set_target(ptarget);
+		skunk_streebog_set_target(ptarget);
 	}

 	do {
@ -129,7 +129,7 @@ extern "C" int scanhash_skunk(int thr_id, struct work* work, uint32_t max_nonce,
				@@ -129,7 +129,7 @@ extern "C" int scanhash_skunk(int thr_id, struct work* work, uint32_t max_nonce,
 			skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 			x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 			x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-			streebog_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
+			streebog_sm3_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
 		} else {
 			skunk_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]);
 			skunk_cuda_streebog(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
--- a/x11/cuda_streebog.cu
+++ b/x11/cuda_streebog.cu
@ -806,10 +806,11 @@ void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash)
				@@ -806,10 +806,11 @@ void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash)
 #define T6(x) shared[6][x]
 #define T7(x) shared[7][x]

+// Streebog final for Veltor and skunk on SM 3.x
 __constant__ uint64_t target64[4];

 __host__
-void streebog_set_target(const uint32_t* ptarget)
+void streebog_sm3_set_target(uint32_t* ptarget)
 {
 	cudaMemcpyToSymbol(target64,ptarget,4*sizeof(uint64_t),0,cudaMemcpyHostToDevice);
 }
@ -995,7 +996,7 @@ void streebog_gpu_hash_64_final(uint64_t *g_hash, uint32_t* resNonce)
				@@ -995,7 +996,7 @@ void streebog_gpu_hash_64_final(uint64_t *g_hash, uint32_t* resNonce)
 }

 __host__
-void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash,uint32_t* d_resNonce)
+void streebog_sm3_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash,uint32_t* d_resNonce)
 {
 	dim3 grid((threads + TPB-1) / TPB);
 	dim3 block(TPB);
--- a/x11/veltor.cu
+++ b/x11/veltor.cu
@ -10,11 +10,17 @@ extern "C" {
				@@ -10,11 +10,17 @@ extern "C" {
 #include "cuda_x11.h"

 extern void skein512_cpu_setBlock_80(void *pdata);
-extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);
 extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
-extern void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
-extern void streebog_set_target(const uint32_t* ptarget);
+
+// for SM3.x
+extern void streebog_sm3_set_target(uint32_t* ptarget);
+extern void streebog_sm3_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
+
+// for latest cards only
+extern void skunk_cpu_init(int thr_id, uint32_t threads);
+extern void skunk_streebog_set_target(uint32_t* ptarget);
+extern void skunk_cuda_streebog(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);

 #include <stdio.h>
 #include <memory.h>
@ -23,7 +29,7 @@ extern void streebog_set_target(const uint32_t* ptarget);
				@@ -23,7 +29,7 @@ extern void streebog_set_target(const uint32_t* ptarget);
 static uint32_t *d_hash[MAX_GPUS];
 static uint32_t *d_resNonce[MAX_GPUS];

-// veltorcoin CPU Hash
+// veltor CPU Hash
 extern "C" void veltorhash(void *output, const void *input)
 {
 	unsigned char _ALIGN(128) hash[128] = { 0 };
@ -53,6 +59,7 @@ extern "C" void veltorhash(void *output, const void *input)
				@@ -53,6 +59,7 @@ extern "C" void veltorhash(void *output, const void *input)
 }

 static bool init[MAX_GPUS] = { 0 };
+static bool use_compat_kernels[MAX_GPUS] = { 0 };

 extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 {
@ -80,7 +87,9 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce
				@@ -80,7 +87,9 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce
 		}
 		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);

-		quark_skein512_cpu_init(thr_id, throughput);
+		skunk_cpu_init(thr_id, throughput);
+		use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
+
 		x11_shavite512_cpu_init(thr_id, throughput);

 		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
@ -97,14 +106,20 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce
				@@ -97,14 +106,20 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce
 	skein512_cpu_setBlock_80(endiandata);

 	cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
-	streebog_set_target(ptarget);
+	if(use_compat_kernels[thr_id])
+		streebog_sm3_set_target(ptarget);
+	else
+		skunk_streebog_set_target(ptarget);

 	do {
 		int order = 0;
 		skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
 		x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		streebog_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
+		if(use_compat_kernels[thr_id])
+			streebog_sm3_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
+		else
+			skunk_cuda_streebog(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);

 		cudaMemcpy(h_resNonce, d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost);