Add skunk algo, initial version

7 years ago · cbede12c47
13 changed files with 228 additions and 12 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -55,7 +55,7 @@ ccminer_SOURCES	= elist.h miner.h compat.h \
 			  quark/nist5.cu \
 			  quark/quarkcoin.cu quark/cuda_quark_compactionTest.cu \
 			  neoscrypt/neoscrypt.cpp neoscrypt/neoscrypt-cpu.c neoscrypt/cuda_neoscrypt.cu \
-			  pentablake.cu skein.cu cuda_skeincoin.cu skein2.cpp zr5.cu \
+			  pentablake.cu skein.cu cuda_skeincoin.cu skein2.cpp skunk.cu zr5.cu \
 			  sha256/sha256d.cu sha256/cuda_sha256d.cu sha256/sha256t.cu sha256/cuda_sha256t.cu \
 			  sia/sia.cu sia/sia-rpc.cpp sph/blake2b.c \
 			  sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \
--- a/README.txt
+++ b/README.txt
@ -1,5 +1,5 @@
-ccminer 2.1-dev (June 2017) "Equihash"
+ccminer 2.2-dev (July 2017) "Equihash, tribus and skunk"
 ---------------------------------------------------------------
 ***************************************************************
@ -9,7 +9,6 @@ If you find this tool useful and like to support its continuous
 tpruvot@github:
  BTC  : 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo
  DCR  : DsUCcACGcyP8McNMRXQwbtpDxaVUYLDQDeU
  LBC  : bKe6pLqELL3HHSbpJXxSdn5RrY2bfrkRhF
 Alexis:
  BTC  : 14EgXD7fPYD4sHBXWUi46VeiTVXNq765B8
@ -51,6 +50,8 @@ Scrypt and Scrypt:N
 Scrypt-Jane (Chacha)
 Sibcoin (sib)
 Skein (Skein + SHA)
 Signatum (Skein cubehash fugue Streebog)
 Tribus (JH, keccak, simd)
 Woodcoin (Double Skein)
 Vanilla (Blake256 8-rounds - double sha256)
 Vertcoin Lyra2RE
@ -116,6 +117,7 @@ its command line interface and options.
                          sib         use to mine Sibcoin
                          skein       use to mine Skeincoin
                          skein2      use to mine Woodcoin
                          skunk       use to mine Signatum
                          timetravel  use to mine MachineCoin
                          tribus      use to mine Denarius
                          x11evo      use to mine Revolver
@ -280,7 +282,12 @@ features.
 >>> RELEASE HISTORY <<<
-  v2.1 (unfinished)
+  v2.2 (under dev)
                  New skunk algo, using the heavy streebog algorithm
                  Enhance tribus algo (+10%)
                  equihash protocol enhancement on yiimp.ccminer.org and zpool.ca
  June 16th 2017  v2.1-tribus
                  Interface equihash algo with djeZo solver (from nheqminer 0.5c)
                  New api parameters (and multicast announces for local networks)
                  New tribus algo
--- a/algos.h
+++ b/algos.h
@ -45,6 +45,7 @@ enum sha_algos {
 	ALGO_SIB,
 	ALGO_SKEIN,
 	ALGO_SKEIN2,
 	ALGO_SKUNK,
 	ALGO_S3,
 	ALGO_TIMETRAVEL,
 	ALGO_TRIBUS,
@ -109,6 +110,7 @@ static const char *algo_names[] = {
 	"sib",
 	"skein",
 	"skein2",
 	"skunk",
 	"s3",
 	"timetravel",
 	"tribus",
--- a/bench.cpp
+++ b/bench.cpp
@ -80,6 +80,7 @@ void algo_free_all(int thr_id)
 	free_qubit(thr_id);
 	free_skeincoin(thr_id);
 	free_skein2(thr_id);
 	free_skunk(thr_id);
 	free_sha256d(thr_id);
 	free_sha256t(thr_id);
 	free_sia(thr_id);
--- a/ccminer.cpp
+++ b/ccminer.cpp
@ -275,6 +275,7 @@ Options:\n\
 			scrypt-jane Scrypt-jane Chacha\n\
 			skein       Skein SHA2 (Skeincoin)\n\
 			skein2      Double Skein (Woodcoin)\n\
 			skunk       Skein Cube Fugue Streebog\n\
 			s3          S3 (1Coin)\n\
 			timetravel  Machinecoin permuted x8\n\
 			tribus      Denerius\n\
@ -2401,6 +2402,9 @@ static void *miner_thread(void *userdata)
 		case ALGO_SKEIN2:
 			rc = scanhash_skein2(thr_id, &work, max_nonce, &hashes_done);
 			break;
 		case ALGO_SKUNK:
 			rc = scanhash_skunk(thr_id, &work, max_nonce, &hashes_done);
 			break;
 		case ALGO_SHA256D:
 			rc = scanhash_sha256d(thr_id, &work, max_nonce, &hashes_done);
 			break;
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -532,6 +532,7 @@
    <CudaCompile Include="cuda_skeincoin.cu">
      <MaxRegCount>48</MaxRegCount>
    </CudaCompile>
    <CudaCompile Include="skunk.cu" />
    <CudaCompile Include="tribus.cu" />
    <ClInclude Include="x11\cuda_x11_aes.cuh" />
    <CudaCompile Include="x11\cuda_x11_cubehash512.cu" />
@ -563,10 +564,8 @@
    </CudaCompile>
    <CudaCompile Include="x13\cuda_x13_fugue512.cu">
    </CudaCompile>
-    <CudaCompile Include="x13\x13.cu">
+    <CudaCompile Include="x13\x13.cu" />
-    </CudaCompile>
+    <CudaCompile Include="x15\x14.cu" />
    <CudaCompile Include="x15\x14.cu">
    </CudaCompile>
    <CudaCompile Include="x15\cuda_x14_shabal512.cu" />
    <CudaCompile Include="x15\cuda_x15_whirlpool.cu" />
    <CudaCompile Include="x17\hmq17.cu" />
@ -600,4 +599,4 @@
  <Target Name="AfterClean">
    <Delete Files="@(FilesToCopy->'$(OutDir)%(Filename)%(Extension)')" TreatErrorsAsWarnings="true" />
  </Target>
-</Project>
+</Project>
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -754,6 +754,9 @@
    <CudaCompile Include="pentablake.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
    <CudaCompile Include="skunk.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
    <CudaCompile Include="tribus.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
--- a/compat/ccminer-config.h
+++ b/compat/ccminer-config.h
@ -164,7 +164,7 @@
 #define PACKAGE_URL "http://github.com/tpruvot/ccminer"
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "2.1"
+#define PACKAGE_VERSION "2.2"
 /* If using the C implementation of alloca, define if you know the
   direction of stack growth for your system; otherwise it will be
--- a/configure.ac
+++ b/configure.ac
@ -1,4 +1,4 @@
-AC_INIT([ccminer], [2.1], [], [ccminer], [http://github.com/tpruvot/ccminer])
+AC_INIT([ccminer], [2.2], [], [ccminer], [http://github.com/tpruvot/ccminer])
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/miner.h
+++ b/miner.h
@ -308,6 +308,7 @@ extern int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsig
 extern int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_skunk(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_timetravel(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_tribus(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@ -368,6 +369,7 @@ extern void free_sia(int thr_id);
 extern void free_sib(int thr_id);
 extern void free_skeincoin(int thr_id);
 extern void free_skein2(int thr_id);
 extern void free_skunk(int thr_id);
 extern void free_s3(int thr_id);
 extern void free_timetravel(int thr_id);
 extern void free_tribus(int thr_id);
@ -909,6 +911,7 @@ void sha256t_hash(void *output, const void *input);
 void sibhash(void *output, const void *input);
 void skeincoinhash(void *output, const void *input);
 void skein2hash(void *output, const void *input);
 void skunk_hash(void *state, const void *input);
 void s3hash(void *output, const void *input);
 void timetravel_hash(void *output, const void *input);
 void bitcore_hash(void *output, const void *input);
--- a/skunk.cu
+++ b/skunk.cu
@ -0,0 +1,194 @@
 /**
 * Skunk Algo for Signatum
 * (skein, cube, fugue, gost streebog)
 *
 * tpruvot@github 06 2017 - GPLv3
 */
 extern "C" {
 #include "sph/sph_skein.h"
 #include "sph/sph_cubehash.h"
 #include "sph/sph_fugue.h"
 #include "sph/sph_streebog.h"
 }
 #include "miner.h"
 #include "cuda_helper.h"
 // Host-side entry points of the GPU hash stages used below; only declared here,
 // their definitions are not part of this file.
 // skein512 over the 80-byte block header (first stage)
 extern void skein512_cpu_setBlock_80(void *pdata);
 extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);
 // cubehash512 over the intermediate hash buffer (second stage)
 extern void x11_cubehash512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 // fugue512 over the intermediate hash buffer (third stage)
 extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
 extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void x13_fugue512_cpu_free(int thr_id);
 // streebog final stage; scanhash_skunk reads candidate nonces back from d_resNonce
 extern void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
 extern void streebog_set_target(const uint32_t* ptarget);
 #include <stdio.h>
 #include <memory.h>
 // number of uint32_t result slots in the d_resNonce / h_resNonce buffers
 #define NBN 2
 // per-thread device buffers, allocated in scanhash_skunk and released in free_skunk:
 // d_hash holds 64 bytes per cuda thread, d_resNonce holds NBN result slots
 static uint32_t *d_hash[MAX_GPUS];
 static uint32_t *d_resNonce[MAX_GPUS];
 // CPU reference hash for skunk: skein512 -> cubehash512 -> fugue512 -> gost512.
 // Reads exactly 80 bytes from `input`; every later stage consumes the first
 // 64 bytes of the scratch buffer and writes its digest back in place.
 // The first 32 bytes of the final digest are copied to `output`.
 extern "C" void skunk_hash(void *output, const void *input)
 {
 	unsigned char _ALIGN(128) hash[128] = { 0 };
 	// stage 1: skein512 over the 80-byte input
 	{
 		sph_skein512_context skein;
 		sph_skein512_init(&skein);
 		sph_skein512(&skein, input, 80);
 		sph_skein512_close(&skein, (void*) hash);
 	}
 	// stage 2: cubehash512, in place
 	{
 		sph_cubehash512_context cube;
 		sph_cubehash512_init(&cube);
 		sph_cubehash512(&cube, (const void*) hash, 64);
 		sph_cubehash512_close(&cube, (void*) hash);
 	}
 	// stage 3: fugue512, in place
 	{
 		sph_fugue512_context fugue;
 		sph_fugue512_init(&fugue);
 		sph_fugue512(&fugue, (const void*) hash, 64);
 		sph_fugue512_close(&fugue, (void*) hash);
 	}
 	// stage 4: gost512 (streebog), in place
 	{
 		sph_gost512_context gost;
 		sph_gost512_init(&gost);
 		sph_gost512(&gost, (const void*) hash, 64);
 		sph_gost512_close(&gost, (void*) hash);
 	}
 	// only the leading 256 bits are handed back to the caller
 	memcpy(output, hash, 32);
 }
 static bool init[MAX_GPUS] = { 0 };
 // Scan nonces for the skunk algo (skein, cubehash, fugue, streebog) on the GPU
 // mapped to thr_id.
 //
 // thr_id      : miner thread index, used to index the per-thread buffers
 // work        : work unit; data[19] is the nonce cursor and is advanced here,
 //               nonces[] / valid_nonces receive the results
 // max_nonce   : upper bound of the range to scan
 // hashes_done : out, number of nonces covered since the call started
 //
 // Returns work->valid_nonces (1 or 2) when a CPU-validated nonce was found,
 // 0 when the range is exhausted or a work restart was requested.
 // NOTE(review): on allocation failure CUDA_CALL_OR_RET_X presumably returns its
 // second argument (0 or -1) - confirm against the macro definition.
 extern "C" int scanhash_skunk(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 {
 	const int dev_id = device_map[thr_id];
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
 	// default intensity depends on the device SM version and name
 	int intensity = (device_sm[dev_id] > 500) ? 18 : 17;
 	if (strstr(device_name[dev_id], "GTX 10")) intensity = 19;
 	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
 	//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
 	if (opt_benchmark)
 		ptarget[7] = 0xf;
 	// one-time per-thread setup: device selection, stage init, buffer allocation
 	if (!init[thr_id])
 	{
 		cudaSetDevice(dev_id);
 		if (opt_cudaschedule == -1 && gpu_threads == 1) {
 			cudaDeviceReset();
 			// reduce cpu usage
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
 		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 		quark_skein512_cpu_init(thr_id, throughput);
 		x11_cubehash512_cpu_init(thr_id, throughput);
 		x13_fugue512_cpu_init(thr_id, throughput);
 		// 64 bytes of intermediate hash per cuda thread
 		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
 		CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)), -1);
 		init[thr_id] = true;
 	}
 	uint32_t _ALIGN(64) h_resNonce[NBN];
 	uint32_t _ALIGN(64) endiandata[20];
 	for (int k=0; k < 20; k++)
 		be32enc(&endiandata[k], pdata[k]);
 	skein512_cpu_setBlock_80(endiandata);
 	// all-ones (UINT32_MAX) marks an empty result slot
 	cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
 	streebog_set_target(ptarget);
 	do {
 		int order = 0;
 		skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
 		x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		streebog_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
 		cudaMemcpy(h_resNonce, d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost);
 		*hashes_done = pdata[19] - first_nonce + throughput;
 		if (h_resNonce[0] != UINT32_MAX)
 		{
 			uint32_t _ALIGN(64) vhash[8];
 			const uint32_t Htarg = ptarget[7];
 			const uint32_t startNounce = pdata[19];
 			// result slots are used as offsets from the start nonce of this batch
 			be32enc(&endiandata[19], startNounce + h_resNonce[0]);
 			skunk_hash(vhash, endiandata);
 			if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
 			{
 				work->nonces[0] = startNounce + h_resNonce[0];
 				work->valid_nonces = 1;
 				work_set_target_ratio(work, vhash);
 				if (h_resNonce[1] != UINT32_MAX)
 				{
 					const uint32_t secNonce = startNounce + h_resNonce[1];
 					work->nonces[1] = secNonce;
 					be32enc(&endiandata[19], secNonce);
 					skunk_hash(vhash, endiandata);
 					// keep the nonce with the best share ratio in nonces[0]
 					if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) {
 						work_set_target_ratio(work, vhash);
 						xchg(work->nonces[1], work->nonces[0]);
 					} else {
 						bn_set_target_ratio(work, vhash, work->valid_nonces);
 					}
 					work->valid_nonces++;
 					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
 				} else {
 					pdata[19] = work->nonces[0] + 1; // cursor
 				}
 				return work->valid_nonces;
 			}
 			else {
 				// CPU validation failed. Only a high-word mismatch is counted as a
 				// GPU reject; a candidate that passes the high word but fails
 				// fulltest() is dropped silently.
 				if (vhash[7] > Htarg)
 					gpu_increment_reject(thr_id);
 				// In both cases clear the result slots and resume right after the
 				// bad nonce: nothing else in this loop resets d_resNonce, so a stale
 				// offset would otherwise be read back with the next batch.
 				// NOTE(review): exact impact depends on how the final kernel writes
 				// d_resNonce, which is not visible from this file.
 				cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
 				pdata[19] = startNounce + h_resNonce[0] + 1;
 				continue;
 			}
 		}
 		// 64-bit sum so the end-of-range test cannot wrap
 		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
 			pdata[19] = max_nonce;
 			break;
 		}
 		pdata[19] += throughput;
 	} while (!work_restart[thr_id].restart);
 	*hashes_done = pdata[19] - first_nonce;
 	return 0;
 }
 // cleanup: release the per-thread GPU resources acquired by scanhash_skunk.
 // Does nothing when the thread was never initialised, so it is safe to call
 // for every thread id.
 extern "C" void free_skunk(int thr_id)
 {
 	if (!init[thr_id])
 		return;
 	// wait for pending GPU work before freeing the buffers it may still use
 	// (cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize)
 	cudaDeviceSynchronize();
 	x13_fugue512_cpu_free(thr_id);
 	cudaFree(d_hash[thr_id]);
 	cudaFree(d_resNonce[thr_id]);
 	// allow scanhash_skunk to set everything up again on its next call
 	init[thr_id] = false;
 	cudaDeviceSynchronize();
 }
--- a/util.cpp
+++ b/util.cpp
@ -2270,6 +2270,9 @@ void print_hash_tests(void)
 	skein2hash(&hash[0], &buf[0]);
 	printpfx("skein2", hash);
 	skunk_hash(&hash[0], &buf[0]);
 	printpfx("skunk", hash);
 	s3hash(&hash[0], &buf[0]);
 	printpfx("S3", hash);
--- a/x13/cuda_x13_fugue512.cu
+++ b/x13/cuda_x13_fugue512.cu
@ -254,7 +254,7 @@ void x13_fugue512_gpu_hash_64(uint32_t threads, uint64_t *g_hash)
 	mixtabs[thr+256] = ROR8(tmp);
 	mixtabs[thr+512] = ROL16(tmp);
 	mixtabs[thr+768] = ROL8(tmp);
-#if TPB < 256
+#if TPB <= 256
 	if (blockDim.x < 256) {
 		const uint32_t thr = (threadIdx.x + 0x80) & 0xFF;
 		const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr);