From d58d53f2b2a8e6167649b60e74d8b48a879984f2 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Tue, 14 Apr 2015 21:57:01 +0200
Subject: [PATCH] update README, small changes, prepare release 1.6.1

Skein still needs an SM 3.0 fix...
---
 Makefile.am             |  7 ++--
 README.md               |  6 ++--
 README.txt              | 24 +++++++------
 ccminer.cpp             |  9 +++--
 configure.ac            |  2 +-
 miner.h                 |  1 -
 quark/cuda_bmw512_30.cu |  3 +-
 skein.cu                | 74 ++++-------------------------------------
 skein2.cu               | 59 +++++++++++++-------------------
 util.cpp                |  3 --
 x11/x11.cu              |  4 +--
 11 files changed, 63 insertions(+), 129 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index 63cfb32..e8a35ca 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -7,8 +7,8 @@ else
 JANSSON_INCLUDES=
 endif
 
-EXTRA_DIST		= autogen.sh README.txt LICENSE.txt \
-			  cudaminer.sln cudaminer.vcxproj cudaminer.vcxproj.filters \
+EXTRA_DIST		= autogen.sh README.md README.txt LICENSE.txt \
+			  ccminer.sln ccminer.vcxproj ccminer.vcxproj.filters \
 			  compat/gettimeofday.c compat/getopt/getopt_long.c cpuminer-config.h.in
 
 SUBDIRS = compat
@@ -121,3 +121,6 @@ JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu
 skein.o: skein.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<
 
+skein2.o: skein2.cu
+	$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<
+
diff --git a/README.md b/README.md
index 073a7c9..a863fec 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@ ccminer
 
 Based on Christian Buchner's &amp; Christian H.'s CUDA project, no more active on github recently.
 
-Fork by tpruvot@github with X14,X15,X17,WHIRL,Blake256,BlakeCoin,Lyra2RE and some others, check the [README.txt](README.txt)
+Fork by tpruvot@github with X14,X15,X17,Blake256,BlakeCoin,Lyra2RE,Skein,ZR5 and others, check the [README.txt](README.txt)
 
    BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo
    [![tip for next commit](https://tip4commit.com/projects/927.svg)](https://tip4commit.com/github/tpruvot/ccminer)
@@ -30,5 +30,7 @@ The tree now contains recent prebuilt openssl and curl .lib for both x86 and x64
 To rebuild them, you need to clone this repository and its submodules :
     git clone https://github.com/peters/curl-for-windows.git compat/curl-for-windows
 
-There is also a [Tutorial for windows](http://cudamining.co.uk/url/tutorials/id/3) on [CudaMining](http://cudamining.co.uk) website.
+On Linux, you can use the helper ./build.sh (edit it if required)
+
+There is also an old [Tutorial for Windows](http://cudamining.co.uk/url/tutorials/id/3) on the [CudaMining](http://cudamining.co.uk) website.
 
diff --git a/README.txt b/README.txt
index f4b27d3..f169ee4 100644
--- a/README.txt
+++ b/README.txt
@@ -1,16 +1,15 @@
 
-ccMiner release 1.6.0-tpruvot (Mar 2015) - "ZR5, Pluck & WhirlX"
+ccMiner release 1.6.1-tpruvot (Apr 2015) - "Skein512 Chainsaw"
 ---------------------------------------------------------------
 
 ***************************************************************
-If you find this tool useful and like to support its continued 
+If you find this tool useful and like to support its continuous
           development, then consider a donation.
 
 tpruvot@github:
-  BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo
+  BTC  : 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo
   DRK  : XeVrkPrWB7pDbdFLfKhF1Z3xpqhsx6wkH3
-  NEOS : NaEcVrdzoCWHUYXb7X8QoafoKS9UV69Yk4
-  XST  : S9TqZucWgT6ajZLDBxQnHUtmkotCEHn9z9
+  ZRC  : ZEcubH2xp2mpuwxMjy7wZThr5AzLGu3mqT
 
 DJM34:
   BTC donation address: 1NENYmxwZGHsKFmyjTc5WferTn5VTFb7Ze
@@ -56,7 +55,7 @@ that the most of our comments are in german.
 
 >>> Command Line Interface <<<
 
-This code is based on the pooler cpuminer 2.3.2 release and inherits
+This code is based on the pooler cpuminer and inherits
 its command line interface and options.
 
   -a, --algo=ALGO       specify the algorithm to use
@@ -83,7 +82,6 @@ its command line interface and options.
                           s3          use to mine 1coin
                           skein       use to mine Skeincoin
                           skein2      use to mine Woodcoin
-                          whirl       use to mine Whirlcoin
                           whirlpoolx  use to mine Vanillacoin
                           x11         use to mine DarkCoin
                           x14         use to mine X14Coin
@@ -113,20 +111,23 @@ its command line interface and options.
   -T, --timeout=N       network timeout, in seconds (default: 270)
   -s, --scantime=N      upper bound on time spent scanning current work when
                         long polling is unavailable, in seconds (default: 5)
+  -n, --ndevs           list cuda devices
   -N, --statsavg        number of samples used to display hashrate (default: 30)
       --no-gbt          disable getblocktemplate support (height check in solo)
       --no-longpoll     disable X-Long-Polling support
       --no-stratum      disable X-Stratum support
   -q, --quiet           disable per-thread hashmeter output
+      --no-color        disable colored output
   -D, --debug           enable debug output
   -P, --protocol-dump   verbose dump of protocol-level activities
   -b, --api-bind        IP/Port for the miner API (default: 127.0.0.1:4068)
+      --api-remote      Allow remote control
+  -B, --background      run the miner in the background
       --benchmark       run in offline benchmark mode
       --cputest         debug hashes from cpu algorithms
       --cpu-affinity    set process affinity to specific cpu core(s) mask
       --cpu-priority    set process priority (default: 0 idle, 2 normal to 5 highest)
   -c, --config=FILE     load a JSON-format configuration file
-      --no-color        disable colored console output
   -V, --version         display version information and exit
   -h, --help            display this help text and exit
 
@@ -189,6 +190,9 @@ features.
 
 >>> RELEASE HISTORY <<<
 
+  Apr. 14th 2015  v1.6.1
+                  Add the Double Skein Algo for Woodcoin
+
   Mar. 27th 2015  v1.6.0
                   Add the ZR5 Algo for Ziftcoin
                   Implement Skeincoin algo (skein + sha)
@@ -347,8 +351,8 @@ Christian Buchner, Christian H. (Germany): Initial CUDA implementation
 
 djm34, tsiv, sp and klausT for cuda algos implementation and optimisation
 
-Tanguy Pruvot : 750Ti tuning, blake, colors, general code cleanup/opts
-                API monitoring, linux Config/Makefile and vstudio stuff...
+Tanguy Pruvot : 750Ti tuning, blake, colors, zr5, skein, general code cleanup
+                API monitoring, linux Config/Makefile and vstudio libs...
 
 and also many thanks to anyone else who contributed to the original
 cpuminer application (Jeff Garzik, pooler), it's original HVC-fork
diff --git a/ccminer.cpp b/ccminer.cpp
index a1ab2f4..def1420 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -263,7 +263,6 @@ Options:\n\
 			x14         X14\n\
 			x15         X15\n\
 			x17         X17 (peoplecurrency)\n\
-			whirl       Whirlcoin (old whirlpool)\n\
 			whirlpoolx  Vanilla coin\n\
 			zr5         ZR5 (ZiftrCoin)\n\
   -d, --devices         Comma separated list of CUDA devices to use.\n\
@@ -1396,7 +1395,7 @@ static void *miner_thread(void *userdata)
 			max64 = max(minmax-1, max64);
 		}
 
-		// we can't scan more than uint capacity
+		// we can't scan more than uint32 capacity
 		max64 = min(UINT32_MAX, max64);
 
 		start_nonce = nonceptr[0];
@@ -1412,6 +1411,11 @@ static void *miner_thread(void *userdata)
 
 		// todo: keep it rounded for gpu threads ?
 
+		if (unlikely(start_nonce > max_nonce)) {
+			// should not happen but seen in skein2 benchmark with 2 gpus
+			max_nonce = end_nonce = UINT32_MAX;
+		}
+
 		work.scanned_from = start_nonce;
 		nonceptr[0] = start_nonce;
 
@@ -1537,6 +1541,7 @@ static void *miner_thread(void *userdata)
 			                      max_nonce, &hashes_done);
 			break;
 
+		/* to be deleted */
 		case ALGO_WHIRLCOIN:
 			rc = scanhash_whc(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
diff --git a/configure.ac b/configure.ac
index 45b5254..61b47e7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([ccminer], [1.6.1-git])
+AC_INIT([ccminer], [1.6.1])
 
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
diff --git a/miner.h b/miner.h
index 65ee5ae..71bf153 100644
--- a/miner.h
+++ b/miner.h
@@ -686,7 +686,6 @@ void qubithash(void *state, const void *input);
 void skeincoinhash(void *output, const void *input);
 void skein2hash(void *output, const void *input);
 void s3hash(void *output, const void *input);
-void wcoinhash(void *state, const void *input);
 void whirlxHash(void *state, const void *input);
 void x11hash(void *output, const void *input);
 void x13hash(void *output, const void *input);
diff --git a/quark/cuda_bmw512_30.cu b/quark/cuda_bmw512_30.cu
index 358a3b3..8b2e858 100644
--- a/quark/cuda_bmw512_30.cu
+++ b/quark/cuda_bmw512_30.cu
@@ -12,8 +12,7 @@
 	q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \
 	q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
 
-static __constant__ uint64_t d_constMem[16];
-static uint64_t h_constMem[16] = {
+static __constant__ uint64_t d_constMem[16] = {
 	SPH_C64(0x8081828384858687),
 	SPH_C64(0x88898A8B8C8D8E8F),
 	SPH_C64(0x9091929394959697),
diff --git a/skein.cu b/skein.cu
index 6305ad5..76f8da9 100644
--- a/skein.cu
+++ b/skein.cu
@@ -3,9 +3,7 @@
  * by tpruvot@github - 2015
  */
 
-extern "C" {
 #include "sph/sph_skein.h"
-}
 
 #include "miner.h"
 #include "cuda_helper.h"
@@ -101,70 +99,10 @@ void sha256_transform_gpu(uint32_t *state, uint32_t *message)
 	}
 
 	/* 3. Mix. */
-	RNDr(S, W, 0);
-	RNDr(S, W, 1);
-	RNDr(S, W, 2);
-	RNDr(S, W, 3);
-	RNDr(S, W, 4);
-	RNDr(S, W, 5);
-	RNDr(S, W, 6);
-	RNDr(S, W, 7);
-	RNDr(S, W, 8);
-	RNDr(S, W, 9);
-	RNDr(S, W, 10);
-	RNDr(S, W, 11);
-	RNDr(S, W, 12);
-	RNDr(S, W, 13);
-	RNDr(S, W, 14);
-	RNDr(S, W, 15);
-	RNDr(S, W, 16);
-	RNDr(S, W, 17);
-	RNDr(S, W, 18);
-	RNDr(S, W, 19);
-	RNDr(S, W, 20);
-	RNDr(S, W, 21);
-	RNDr(S, W, 22);
-	RNDr(S, W, 23);
-	RNDr(S, W, 24);
-	RNDr(S, W, 25);
-	RNDr(S, W, 26);
-	RNDr(S, W, 27);
-	RNDr(S, W, 28);
-	RNDr(S, W, 29);
-	RNDr(S, W, 30);
-	RNDr(S, W, 31);
-	RNDr(S, W, 32);
-	RNDr(S, W, 33);
-	RNDr(S, W, 34);
-	RNDr(S, W, 35);
-	RNDr(S, W, 36);
-	RNDr(S, W, 37);
-	RNDr(S, W, 38);
-	RNDr(S, W, 39);
-	RNDr(S, W, 40);
-	RNDr(S, W, 41);
-	RNDr(S, W, 42);
-	RNDr(S, W, 43);
-	RNDr(S, W, 44);
-	RNDr(S, W, 45);
-	RNDr(S, W, 46);
-	RNDr(S, W, 47);
-	RNDr(S, W, 48);
-	RNDr(S, W, 49);
-	RNDr(S, W, 50);
-	RNDr(S, W, 51);
-	RNDr(S, W, 52);
-	RNDr(S, W, 53);
-	RNDr(S, W, 54);
-	RNDr(S, W, 55);
-	RNDr(S, W, 56);
-	RNDr(S, W, 57);
-	RNDr(S, W, 58);
-	RNDr(S, W, 59);
-	RNDr(S, W, 60);
-	RNDr(S, W, 61);
-	RNDr(S, W, 62);
-	RNDr(S, W, 63);
+	#pragma unroll
+	for (int i = 0; i < 64; i++) {
+		RNDr(S, W, i);
+	}
 
 	for (int i = 0; i < 8; i++)
 		state[i] += S[i];
@@ -408,8 +346,8 @@ extern "C" int scanhash_skeincoin(int thr_id, uint32_t *pdata,
 	const uint32_t first_nonce = pdata[19];
 	const int swap = 1;
 
-	uint32_t throughput =  device_intensity(thr_id, __func__, 1 << 19); // 256*256*8
-	throughput = min(throughput,  (max_nonce - first_nonce));
+	uint32_t throughput = device_intensity(thr_id, __func__, 1 << 19); // 256*256*8
+	throughput = min(throughput, (max_nonce - first_nonce));
 
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x07;
diff --git a/skein2.cu b/skein2.cu
index 82592de..17fd51e 100644
--- a/skein2.cu
+++ b/skein2.cu
@@ -1,6 +1,9 @@
-extern "C" {
+/**
+ * SKEIN512 80 + SKEIN512 64 (Woodcoin)
+ * by tpruvot@github - 2015
+ */
+
 #include "sph/sph_skein.h"
-}
 
 #include "miner.h"
 #include "cuda_helper.h"
@@ -8,21 +11,19 @@ extern "C" {
 static uint32_t *d_hash[MAX_GPUS];
 
 extern void skein512_cpu_setBlock_80(void *pdata);
-extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
+extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);
 
 extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 
 extern "C" void skein2hash(void *output, const void *input)
 {
+	uint32_t _ALIGN(64) hash[16];
 	sph_skein512_context ctx_skein;
 
-	uint32_t hash[16];
-
 	sph_skein512_init(&ctx_skein);
 	sph_skein512(&ctx_skein, input, 80);
 	sph_skein512_close(&ctx_skein, hash);
-	//applog_hash((uchar*)hash);
-	//applog_hash((uchar*)&hash[8]);
+
 	sph_skein512_init(&ctx_skein);
 	sph_skein512(&ctx_skein, hash, 64);
 	sph_skein512_close(&ctx_skein, hash);
@@ -30,21 +31,6 @@ extern "C" void skein2hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }
 
-#ifdef _DEBUG
-#define TRACE(algo) { \
-	if (max_nonce == 1 && pdata[19] <= 1) { \
-		uint32_t* debugbuf = NULL; \
-		cudaMallocHost(&debugbuf, 16*sizeof(uint32_t)); \
-		cudaMemcpy(debugbuf, d_hash[thr_id], 16*sizeof(uint32_t), cudaMemcpyDeviceToHost); \
-		printf("SK2 %s %08x %08x %08x %08x...\n", algo, \
-			swab32(debugbuf[0]), swab32(debugbuf[1]), swab32(debugbuf[2]), swab32(debugbuf[3])); \
-		cudaFree(debugbuf); \
-		} \
-}
-#else
-#define TRACE(algo) {}
-#endif
-
 static bool init[MAX_GPUS] = { 0 };
 
 extern "C" int scanhash_skein2(int thr_id, uint32_t *pdata,
@@ -53,25 +39,25 @@ extern "C" int scanhash_skein2(int thr_id, uint32_t *pdata,
 {
 	const uint32_t first_nonce = pdata[19];
 
-	uint32_t throughput =  device_intensity(thr_id, __func__, 1 << 19); // 256*256*8
-	throughput = min(throughput,  (max_nonce - first_nonce));
+	uint32_t throughput = device_intensity(thr_id, __func__, 1 << 19); // 256*256*8
+	throughput = min(throughput, (max_nonce - first_nonce));
 
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x0FFF;
+		((uint32_t*)ptarget)[7] = 0;
 
 	if (!init[thr_id])
 	{
 		cudaDeviceReset();
 		cudaSetDevice(device_map[thr_id]);
 
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64 * throughput));
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64UL * throughput));
 
 		cuda_check_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}
 
 	uint32_t endiandata[20];
-	for (int k=0; k < 20; k++)
+	for (int k=0; k < 19; k++)
 		be32enc(&endiandata[k], pdata[k]);
 
 	skein512_cpu_setBlock_80((void*)endiandata);
@@ -81,24 +67,22 @@ extern "C" int scanhash_skein2(int thr_id, uint32_t *pdata,
 		int order = 0;
 
 		// Hash with CUDA
-		skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-		TRACE("80:");
+		skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 0);
 		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		TRACE("64:");
+
+		*hashes_done = pdata[19] - first_nonce + throughput;
 
 		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != UINT32_MAX)
 		{
-			uint32_t vhash64[8];
+			uint32_t _ALIGN(64) vhash64[8];
 
 			endiandata[19] = foundNonce;
 			skein2hash(vhash64, endiandata);
 
-			#define Htarg ptarget[7]
-			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
+			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
 				int res = 1;
 				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
-				*hashes_done = pdata[19] - first_nonce + throughput;
 				if (secNonce != 0) {
 					if (!opt_quiet)
 						applog(LOG_BLUE, "GPU #%d: found second nonce %08x !", device_map[thr_id], swab32(secNonce));
@@ -113,10 +97,13 @@ extern "C" int scanhash_skein2(int thr_id, uint32_t *pdata,
 			}
 		}
 
+		if (((uint64_t) throughput + pdata[19]) > max_nonce) {
+			break;
+		}
+
 		pdata[19] += throughput;
 
-	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+	} while (!work_restart[thr_id].restart);
 
-	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
diff --git a/util.cpp b/util.cpp
index 0b9e012..366cc3e 100644
--- a/util.cpp
+++ b/util.cpp
@@ -1797,9 +1797,6 @@ void print_hash_tests(void)
 	s3hash(&hash[0], &buf[0]);
 	printpfx("S3", hash);
 
-	wcoinhash(&hash[0], &buf[0]);
-	printpfx("whirl", hash);
-
 	whirlxHash(&hash[0], &buf[0]);
 	printpfx("whirlpoolx", hash);
 
diff --git a/x11/x11.cu b/x11/x11.cu
index 6be4c4d..d5802df 100644
--- a/x11/x11.cu
+++ b/x11/x11.cu
@@ -131,7 +131,7 @@ extern "C" void x11hash(void *output, const void *input)
 		uint32_t* debugbuf = NULL; \
 		cudaMallocHost(&debugbuf, 8*sizeof(uint32_t)); \
 		cudaMemcpy(debugbuf, d_hash[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost); \
-		printf("%s %08x %08x %08x %08x...\n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \
+		printf("X11 %s %08x %08x %08x %08x...\n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \
 			swab32(debugbuf[2]), swab32(debugbuf[3])); \
 		cudaFreeHost(debugbuf); \
 	} \
@@ -209,7 +209,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		TRACE("simd   :");
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		TRACE("echo X11 =>");
+		TRACE("echo => ");
 
 		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != UINT32_MAX)