From cbede12c4701b98356e55bf32b75cb3bb82a9b24 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Wed, 19 Jul 2017 13:52:45 +0200
Subject: [PATCH] Add skunk algo, initial version

---
 Makefile.am              |   2 +-
 README.txt               |  13 ++-
 algos.h                  |   2 +
 bench.cpp                |   1 +
 ccminer.cpp              |   4 +
 ccminer.vcxproj          |   9 +-
 ccminer.vcxproj.filters  |   3 +
 compat/ccminer-config.h  |   2 +-
 configure.ac             |   2 +-
 miner.h                  |   3 +
 skunk.cu                 | 194 +++++++++++++++++++++++++++++++++++++++
 util.cpp                 |   3 +
 x13/cuda_x13_fugue512.cu |   2 +-
 13 files changed, 228 insertions(+), 12 deletions(-)
 create mode 100644 skunk.cu

diff --git a/Makefile.am b/Makefile.am
index a9fb339..febcc9b 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -55,7 +55,7 @@ ccminer_SOURCES	= elist.h miner.h compat.h \
 			  quark/nist5.cu \
 			  quark/quarkcoin.cu quark/cuda_quark_compactionTest.cu \
 			  neoscrypt/neoscrypt.cpp neoscrypt/neoscrypt-cpu.c neoscrypt/cuda_neoscrypt.cu \
-			  pentablake.cu skein.cu cuda_skeincoin.cu skein2.cpp zr5.cu \
+			  pentablake.cu skein.cu cuda_skeincoin.cu skein2.cpp skunk.cu zr5.cu \
 			  sha256/sha256d.cu sha256/cuda_sha256d.cu sha256/sha256t.cu sha256/cuda_sha256t.cu \
 			  sia/sia.cu sia/sia-rpc.cpp sph/blake2b.c \
 			  sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \
diff --git a/README.txt b/README.txt
index 9e4a9c9..4967b5a 100644
--- a/README.txt
+++ b/README.txt
@@ -1,5 +1,5 @@
 
-ccminer 2.1-dev (June 2017) "Equihash"
+ccminer 2.2-dev (July 2017) "Equihash, tribus and skunk"
 ---------------------------------------------------------------
 
 ***************************************************************
@@ -9,7 +9,6 @@ If you find this tool useful and like to support its continuous
 tpruvot@github:
   BTC  : 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo
   DCR  : DsUCcACGcyP8McNMRXQwbtpDxaVUYLDQDeU
-  LBC  : bKe6pLqELL3HHSbpJXxSdn5RrY2bfrkRhF
 
 Alexis:
   BTC  : 14EgXD7fPYD4sHBXWUi46VeiTVXNq765B8
@@ -51,6 +50,8 @@ Scrypt and Scrypt:N
 Scrypt-Jane (Chacha)
 Sibcoin (sib)
 Skein (Skein + SHA)
+Signatum (Skein cubehash fugue Streebog)
+Tribus (JH, keccak, simd)
 Woodcoin (Double Skein)
 Vanilla (Blake256 8-rounds - double sha256)
 Vertcoin Lyra2RE
@@ -116,6 +117,7 @@ its command line interface and options.
                           sib         use to mine Sibcoin
                           skein       use to mine Skeincoin
                           skein2      use to mine Woodcoin
+                          skunk       use to mine Signatum
                           timetravel  use to mine MachineCoin
                           tribus      use to mine Denarius
                           x11evo      use to mine Revolver
@@ -280,7 +282,12 @@ features.
 
 >>> RELEASE HISTORY <<<
 
-  v2.1 (unfinished)
+  v2.2 (under dev)
+                  New skunk algo, using the heavy streebog algorithm
+                  Enhance tribus algo (+10%)
+                  equihash protocol enhancement on yiimp.ccminer.org and zpool.ca
+
+  June 16th 2017  v2.1-tribus
                   Interface equihash algo with djeZo solver (from nheqminer 0.5c)
                   New api parameters (and multicast announces for local networks)
                   New tribus algo
diff --git a/algos.h b/algos.h
index 77625f2..b7dd0f2 100644
--- a/algos.h
+++ b/algos.h
@@ -45,6 +45,7 @@ enum sha_algos {
 	ALGO_SIB,
 	ALGO_SKEIN,
 	ALGO_SKEIN2,
+	ALGO_SKUNK,
 	ALGO_S3,
 	ALGO_TIMETRAVEL,
 	ALGO_TRIBUS,
@@ -109,6 +110,7 @@ static const char *algo_names[] = {
 	"sib",
 	"skein",
 	"skein2",
+	"skunk",
 	"s3",
 	"timetravel",
 	"tribus",
diff --git a/bench.cpp b/bench.cpp
index 71c0d70..500176f 100644
--- a/bench.cpp
+++ b/bench.cpp
@@ -80,6 +80,7 @@ void algo_free_all(int thr_id)
 	free_qubit(thr_id);
 	free_skeincoin(thr_id);
 	free_skein2(thr_id);
+	free_skunk(thr_id);
 	free_sha256d(thr_id);
 	free_sha256t(thr_id);
 	free_sia(thr_id);
diff --git a/ccminer.cpp b/ccminer.cpp
index e32cdbd..2f357cc 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -275,6 +275,7 @@ Options:\n\
 			scrypt-jane Scrypt-jane Chacha\n\
 			skein       Skein SHA2 (Skeincoin)\n\
 			skein2      Double Skein (Woodcoin)\n\
+			skunk       Skein Cube Fugue Streebog\n\
 			s3          S3 (1Coin)\n\
 			timetravel  Machinecoin permuted x8\n\
 			tribus      Denerius\n\
@@ -2401,6 +2402,9 @@ static void *miner_thread(void *userdata)
 		case ALGO_SKEIN2:
 			rc = scanhash_skein2(thr_id, &work, max_nonce, &hashes_done);
 			break;
+		case ALGO_SKUNK:
+			rc = scanhash_skunk(thr_id, &work, max_nonce, &hashes_done);
+			break;
 		case ALGO_SHA256D:
 			rc = scanhash_sha256d(thr_id, &work, max_nonce, &hashes_done);
 			break;
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 552c595..a68f66b 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -532,6 +532,7 @@
     <CudaCompile Include="cuda_skeincoin.cu">
       <MaxRegCount>48</MaxRegCount>
     </CudaCompile>
+    <CudaCompile Include="skunk.cu" />
     <CudaCompile Include="tribus.cu" />
     <ClInclude Include="x11\cuda_x11_aes.cuh" />
     <CudaCompile Include="x11\cuda_x11_cubehash512.cu" />
@@ -563,10 +564,8 @@
     </CudaCompile>
     <CudaCompile Include="x13\cuda_x13_fugue512.cu">
     </CudaCompile>
-    <CudaCompile Include="x13\x13.cu">
-    </CudaCompile>
-    <CudaCompile Include="x15\x14.cu">
-    </CudaCompile>
+    <CudaCompile Include="x13\x13.cu" />
+    <CudaCompile Include="x15\x14.cu" />
     <CudaCompile Include="x15\cuda_x14_shabal512.cu" />
     <CudaCompile Include="x15\cuda_x15_whirlpool.cu" />
     <CudaCompile Include="x17\hmq17.cu" />
@@ -600,4 +599,4 @@
   <Target Name="AfterClean">
     <Delete Files="@(FilesToCopy->'$(OutDir)%(Filename)%(Extension)')" TreatErrorsAsWarnings="true" />
   </Target>
-</Project>
+</Project>
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index f28d993..f6fef2c 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -754,6 +754,9 @@
     <CudaCompile Include="pentablake.cu">
       <Filter>Source Files\CUDA</Filter>
     </CudaCompile>
+    <CudaCompile Include="skunk.cu">
+      <Filter>Source Files\CUDA</Filter>
+    </CudaCompile>
     <CudaCompile Include="tribus.cu">
       <Filter>Source Files\CUDA</Filter>
     </CudaCompile>
diff --git a/compat/ccminer-config.h b/compat/ccminer-config.h
index dfd973c..d07e736 100644
--- a/compat/ccminer-config.h
+++ b/compat/ccminer-config.h
@@ -164,7 +164,7 @@
 #define PACKAGE_URL "http://github.com/tpruvot/ccminer"
 
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "2.1"
+#define PACKAGE_VERSION "2.2"
 
 /* If using the C implementation of alloca, define if you know the
    direction of stack growth for your system; otherwise it will be
diff --git a/configure.ac b/configure.ac
index c92a6fc..c369201 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([ccminer], [2.1], [], [ccminer], [http://github.com/tpruvot/ccminer])
+AC_INIT([ccminer], [2.2], [], [ccminer], [http://github.com/tpruvot/ccminer])
 
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
diff --git a/miner.h b/miner.h
index f3406dd..b9c7256 100644
--- a/miner.h
+++ b/miner.h
@@ -308,6 +308,7 @@ extern int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsig
 extern int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
+extern int scanhash_skunk(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_timetravel(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_tribus(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@@ -368,6 +369,7 @@ extern void free_sia(int thr_id);
 extern void free_sib(int thr_id);
 extern void free_skeincoin(int thr_id);
 extern void free_skein2(int thr_id);
+extern void free_skunk(int thr_id);
 extern void free_s3(int thr_id);
 extern void free_timetravel(int thr_id);
 extern void free_tribus(int thr_id);
@@ -909,6 +911,7 @@ void sha256t_hash(void *output, const void *input);
 void sibhash(void *output, const void *input);
 void skeincoinhash(void *output, const void *input);
 void skein2hash(void *output, const void *input);
+void skunk_hash(void *state, const void *input);
 void s3hash(void *output, const void *input);
 void timetravel_hash(void *output, const void *input);
 void bitcore_hash(void *output, const void *input);
diff --git a/skunk.cu b/skunk.cu
new file mode 100644
index 0000000..d7d9deb
--- /dev/null
+++ b/skunk.cu
@@ -0,0 +1,194 @@
+/**
+ * Skunk Algo for Signatum
+ * (skein, cube, fugue, gost streebog)
+ *
+ * tpruvot@github 06 2017 - GPLv3
+ */
+extern "C" {
+#include "sph/sph_skein.h"
+#include "sph/sph_cubehash.h"
+#include "sph/sph_fugue.h"
+#include "sph/sph_streebog.h"
+}
+
+#include "miner.h"
+#include "cuda_helper.h"
+
+extern void skein512_cpu_setBlock_80(void *pdata);
+extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
+extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);
+
+extern void x11_cubehash512_cpu_init(int thr_id, uint32_t threads);
+extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
+extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x13_fugue512_cpu_free(int thr_id);
+
+extern void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
+extern void streebog_set_target(const uint32_t* ptarget);
+
+#include <stdio.h>
+#include <memory.h>
+
+#define NBN 2
+static uint32_t *d_hash[MAX_GPUS];
+static uint32_t *d_resNonce[MAX_GPUS];
+
+// CPU reference hash: skein512 -> cubehash512 -> fugue512 -> gost512 (streebog), first 32 bytes go to output
+extern "C" void skunk_hash(void *output, const void *input)
+{
+	unsigned char _ALIGN(128) digest[128] = { 0 }; // zeroed scratch buffer, reused in place by every stage
+
+	sph_skein512_context skein; // stage 1: exactly 80 bytes are read from input
+	sph_skein512_init(&skein);
+	sph_skein512(&skein, input, 80);
+	sph_skein512_close(&skein, digest);
+
+	sph_cubehash512_context cube; // stage 2: consumes the first 64 bytes of the buffer
+	sph_cubehash512_init(&cube);
+	sph_cubehash512(&cube, digest, 64);
+	sph_cubehash512_close(&cube, digest);
+
+	sph_fugue512_context fugue; // stage 3: same 64-byte in-place pattern
+	sph_fugue512_init(&fugue);
+	sph_fugue512(&fugue, digest, 64);
+	sph_fugue512_close(&fugue, digest);
+
+	sph_gost512_context gost; // stage 4: gost streebog, last stage of the chain
+	sph_gost512_init(&gost);
+	sph_gost512(&gost, digest, 64);
+	sph_gost512_close(&gost, digest);
+
+	// callers receive only the leading 32 bytes of the final buffer
+	memcpy(output, digest, 32);
+}
+
+static bool init[MAX_GPUS] = { 0 };
+// GPU scan: hashes batches of `throughput` nonces starting at pdata[19]; returns the count of nonces stored in work->nonces, or 0 when the scan ends without one
+extern "C" int scanhash_skunk(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+{
+	int dev_id = device_map[thr_id];
+	// word 19 of work->data carries the nonce and doubles as the scan cursor updated before every return
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	const uint32_t first_nonce = pdata[19];
+	int intensity = (device_sm[device_map[thr_id]] > 500) ? 18 : 17; // default batch exponent: 2^18 when device_sm > 500, else 2^17
+	if (strstr(device_name[dev_id], "GTX 10")) intensity = 19; // device names containing "GTX 10" get 2^19
+	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // helper defined elsewhere; presumably applies a user override — TODO confirm
+	//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+
+	if (opt_benchmark)
+		ptarget[7] = 0xf; // benchmark mode overwrites the top target word with a fixed value
+
+	if (!init[thr_id]) // one-time setup, skipped once init[thr_id] is set
+	{
+		cudaSetDevice(device_map[thr_id]);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
+
+		quark_skein512_cpu_init(thr_id, throughput);
+		x11_cubehash512_cpu_init(thr_id, throughput);
+		x13_fugue512_cpu_init(thr_id, throughput);
+		// device buffers: 64 bytes of hash per batch entry, plus NBN result-nonce slots
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)), -1); // NOTE(review): last macro argument differs from the line above (0 vs -1) — confirm intended
+
+		init[thr_id] = true;
+	}
+
+	uint32_t _ALIGN(64) h_resNonce[NBN];
+	uint32_t _ALIGN(64) endiandata[20];
+	for (int k=0; k < 20; k++)
+		be32enc(&endiandata[k], pdata[k]); // be32enc is defined elsewhere; presumably stores the word big-endian — TODO confirm
+
+	skein512_cpu_setBlock_80(endiandata);
+
+	cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)); // every slot becomes 0xffffffff, the "empty" marker tested against UINT32_MAX below
+	streebog_set_target(ptarget);
+
+	do {
+		int order = 0;
+		skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
+		x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		streebog_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
+		// fetch the result slots filled (or left untouched) by the final stage
+		cudaMemcpy(h_resNonce, d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost);
+
+		*hashes_done = pdata[19] - first_nonce + throughput;
+
+		if (h_resNonce[0] != UINT32_MAX) // slot 0 no longer holds the empty marker: a candidate was reported
+		{
+			uint32_t _ALIGN(64) vhash[8];
+			const uint32_t Htarg = ptarget[7];
+			const uint32_t startNounce = pdata[19];
+
+			be32enc(&endiandata[19], startNounce + h_resNonce[0]); // slot values are used as offsets from startNounce
+			skunk_hash(vhash, endiandata); // recompute the candidate with the CPU implementation
+			if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
+			{
+				work->nonces[0] = startNounce + h_resNonce[0];
+				work->valid_nonces = 1;
+				work_set_target_ratio(work, vhash);
+				if (h_resNonce[1] != UINT32_MAX) // a second candidate was reported in the same batch
+				{
+					uint32_t secNonce = work->nonces[1] = startNounce + h_resNonce[1];
+					be32enc(&endiandata[19], secNonce);
+					skunk_hash(vhash, endiandata); // NOTE(review): unlike the first candidate, this hash is never passed to fulltest()
+					work->nonces[1] = secNonce; // NOTE(review): redundant, nonces[1] was already assigned three lines above
+					if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) {
+						work_set_target_ratio(work, vhash);
+						xchg(work->nonces[1], work->nonces[0]); // the nonce with the higher ratio ends up in slot 0
+					} else {
+						bn_set_target_ratio(work, vhash, work->valid_nonces);
+					}
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; // cursor: resume after the larger of the two nonces
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
+				}
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > Htarg) { // NOTE(review): if vhash[7] <= Htarg but fulltest() fails, neither branch runs and d_resNonce is not reset — confirm
+				gpu_increment_reject(thr_id);
+				cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)); // clear the slots before scanning again
+				pdata[19] = startNounce + h_resNonce[0] + 1; // restart just after the rejected nonce
+				continue; // goes straight to the while condition at the bottom of the loop
+			}
+		}
+		if ((uint64_t) throughput + pdata[19] >= max_nonce) { // sum is done in 64 bits before comparing with max_nonce
+			pdata[19] = max_nonce;
+			break;
+		}
+
+		pdata[19] += throughput;
+
+	} while (!work_restart[thr_id].restart);
+
+	*hashes_done = pdata[19] - first_nonce;
+
+	return 0;
+}
+
+// cleanup: frees the per-thread device buffers and clears init[] so scanhash_skunk() sets up again
+extern "C" void free_skunk(int thr_id)
+{
+	if (!init[thr_id])
+		return; // nothing was allocated for this thread
+
+	// wait for pending GPU work; replaces the deprecated cudaThreadSynchronize()
+	cudaDeviceSynchronize();
+
+	x13_fugue512_cpu_free(thr_id);
+	cudaFree(d_hash[thr_id]);
+	cudaFree(d_resNonce[thr_id]);
+
+	init[thr_id] = false;
+	cudaDeviceSynchronize();
+}
diff --git a/util.cpp b/util.cpp
index 99187fd..e601756 100644
--- a/util.cpp
+++ b/util.cpp
@@ -2270,6 +2270,9 @@ void print_hash_tests(void)
 	skein2hash(&hash[0], &buf[0]);
 	printpfx("skein2", hash);
 
+	skunk_hash(&hash[0], &buf[0]);
+	printpfx("skunk", hash);
+
 	s3hash(&hash[0], &buf[0]);
 	printpfx("S3", hash);
 
diff --git a/x13/cuda_x13_fugue512.cu b/x13/cuda_x13_fugue512.cu
index ba1afd8..b69ddb6 100644
--- a/x13/cuda_x13_fugue512.cu
+++ b/x13/cuda_x13_fugue512.cu
@@ -254,7 +254,7 @@ void x13_fugue512_gpu_hash_64(uint32_t threads, uint64_t *g_hash)
 	mixtabs[thr+256] = ROR8(tmp);
 	mixtabs[thr+512] = ROL16(tmp);
 	mixtabs[thr+768] = ROL8(tmp);
-#if TPB < 256
+#if TPB <= 256
 	if (blockDim.x < 256) {
 		const uint32_t thr = (threadIdx.x + 0x80) & 0xFF;
 		const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr);