update README, small changes, prepare release 1.6.1

still needs an SM 3.0 fix for skein...
2015-04-14 21:57:01 +02:00 · 2015-04-14 21:57:01 +02:00 · d58d53f2b2
commit d58d53f2b2
parent 7963dbb438
11 changed files with 63 additions and 129 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -7,8 +7,8 @@ else
 JANSSON_INCLUDES=
 endif

-EXTRA_DIST		= autogen.sh README.txt LICENSE.txt \
-			  cudaminer.sln cudaminer.vcxproj cudaminer.vcxproj.filters \
+EXTRA_DIST		= autogen.sh README.md README.txt LICENSE.txt \
+			  ccminer.sln ccminer.vcxproj ccminer.vcxproj.filters \
 			  compat/gettimeofday.c compat/getopt/getopt_long.c cpuminer-config.h.in

 SUBDIRS = compat
@@ -121,3 +121,6 @@ JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu
 skein.o: skein.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<

+skein2.o: skein2.cu
+	$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<
+
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@ ccminer

 Based on Christian Buchner's &amp; Christian H.'s CUDA project, no more active on github recently.

-Fork by tpruvot@github with X14,X15,X17,WHIRL,Blake256,BlakeCoin,Lyra2RE and some others, check the [README.txt](README.txt)
+Fork by tpruvot@github with X14,X15,X17,Blake256,BlakeCoin,Lyra2RE,Skein,ZR5 and others, check the [README.txt](README.txt)

   BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo
   [![tip for next commit](https://tip4commit.com/projects/927.svg)](https://tip4commit.com/github/tpruvot/ccminer)
@@ -30,5 +30,7 @@ The tree now contains recent prebuilt openssl and curl .lib for both x86 and x64
 To rebuild them, you need to clone this repository and its submodules :
    git clone https://github.com/peters/curl-for-windows.git compat/curl-for-windows

-There is also a [Tutorial for windows](http://cudamining.co.uk/url/tutorials/id/3) on [CudaMining](http://cudamining.co.uk) website.
+On Linux, you can use the helper ./build.sh (edit it if required)
+
+There is also an old [Tutorial for windows](http://cudamining.co.uk/url/tutorials/id/3) on [CudaMining](http://cudamining.co.uk) website.

--- a/README.txt
+++ b/README.txt
@@ -1,16 +1,15 @@

-ccMiner release 1.6.0-tpruvot (Mar 2015) - "ZR5, Pluck & WhirlX"
+ccMiner release 1.6.1-tpruvot (Apr 2015) - "Skein512 Chainsaw"
 ---------------------------------------------------------------

 ***************************************************************
-If you find this tool useful and like to support its continued 
+If you find this tool useful and like to support its continuous
          development, then consider a donation.

 tpruvot@github:
-  BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo
+  BTC  : 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo
  DRK  : XeVrkPrWB7pDbdFLfKhF1Z3xpqhsx6wkH3
-  NEOS : NaEcVrdzoCWHUYXb7X8QoafoKS9UV69Yk4
-  XST  : S9TqZucWgT6ajZLDBxQnHUtmkotCEHn9z9
+  ZRC  : ZEcubH2xp2mpuwxMjy7wZThr5AzLGu3mqT

 DJM34:
  BTC donation address: 1NENYmxwZGHsKFmyjTc5WferTn5VTFb7Ze
@@ -56,7 +55,7 @@ that the most of our comments are in german.

 >>> Command Line Interface <<<

-This code is based on the pooler cpuminer 2.3.2 release and inherits
+This code is based on the pooler cpuminer and inherits
 its command line interface and options.

  -a, --algo=ALGO       specify the algorithm to use
@@ -83,7 +82,6 @@ its command line interface and options.
                          s3          use to mine 1coin
                          skein       use to mine Skeincoin
                          skein2      use to mine Woodcoin
-                          whirl       use to mine Whirlcoin
                          whirlpoolx  use to mine Vanillacoin
                          x11         use to mine DarkCoin
                          x14         use to mine X14Coin
@@ -113,20 +111,23 @@ its command line interface and options.
  -T, --timeout=N       network timeout, in seconds (default: 270)
  -s, --scantime=N      upper bound on time spent scanning current work when
                        long polling is unavailable, in seconds (default: 5)
+  -n, --ndevs           list cuda devices
  -N, --statsavg        number of samples used to display hashrate (default: 30)
      --no-gbt          disable getblocktemplate support (height check in solo)
      --no-longpoll     disable X-Long-Polling support
      --no-stratum      disable X-Stratum support
  -q, --quiet           disable per-thread hashmeter output
+      --no-color        disable colored output
  -D, --debug           enable debug output
  -P, --protocol-dump   verbose dump of protocol-level activities
  -b, --api-bind        IP/Port for the miner API (default: 127.0.0.1:4068)
+      --api-remote      Allow remote control
+  -B, --background      run the miner in the background
      --benchmark       run in offline benchmark mode
      --cputest         debug hashes from cpu algorithms
      --cpu-affinity    set process affinity to specific cpu core(s) mask
      --cpu-priority    set process priority (default: 0 idle, 2 normal to 5 highest)
  -c, --config=FILE     load a JSON-format configuration file
-      --no-color        disable colored console output
  -V, --version         display version information and exit
  -h, --help            display this help text and exit

@@ -189,6 +190,9 @@ features.

 >>> RELEASE HISTORY <<<

+  Apr. 14th 2015  v1.6.1
+                  Add the Double Skein Algo for Woodcoin
+
  Mar. 27th 2015  v1.6.0
                  Add the ZR5 Algo for Ziftcoin
                  Implement Skeincoin algo (skein + sha)
@@ -347,8 +351,8 @@ Christian Buchner, Christian H. (Germany): Initial CUDA implementation

 djm34, tsiv, sp and klausT for cuda algos implementation and optimisation

-Tanguy Pruvot : 750Ti tuning, blake, colors, general code cleanup/opts
-                API monitoring, linux Config/Makefile and vstudio stuff...
+Tanguy Pruvot : 750Ti tuning, blake, colors, zr5, skein, general code cleanup
+                API monitoring, linux Config/Makefile and vstudio libs...

 and also many thanks to anyone else who contributed to the original
 cpuminer application (Jeff Garzik, pooler), it's original HVC-fork
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -263,7 +263,6 @@ Options:\n\
 			x14         X14\n\
 			x15         X15\n\
 			x17         X17 (peoplecurrency)\n\
-			whirl       Whirlcoin (old whirlpool)\n\
 			whirlpoolx  Vanilla coin\n\
 			zr5         ZR5 (ZiftrCoin)\n\
  -d, --devices         Comma separated list of CUDA devices to use.\n\
@@ -1396,7 +1395,7 @@ static void *miner_thread(void *userdata)
 			max64 = max(minmax-1, max64);
 		}

-		// we can't scan more than uint capacity
+		// we can't scan more than uint32 capacity
 		max64 = min(UINT32_MAX, max64);

 		start_nonce = nonceptr[0];
@@ -1412,6 +1411,11 @@ static void *miner_thread(void *userdata)

 		// todo: keep it rounded for gpu threads ?

+		if (unlikely(start_nonce > max_nonce)) {
+			// should not happen but seen in skein2 benchmark with 2 gpus
+			max_nonce = end_nonce = UINT32_MAX;
+		}
+
 		work.scanned_from = start_nonce;
 		nonceptr[0] = start_nonce;

@@ -1537,6 +1541,7 @@ static void *miner_thread(void *userdata)
 			                      max_nonce, &hashes_done);
 			break;

+		/* to be deleted */
 		case ALGO_WHIRLCOIN:
 			rc = scanhash_whc(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([ccminer], [1.6.1-git])
+AC_INIT([ccminer], [1.6.1])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/miner.h
+++ b/miner.h
@@ -686,7 +686,6 @@ void qubithash(void *state, const void *input);
 void skeincoinhash(void *output, const void *input);
 void skein2hash(void *output, const void *input);
 void s3hash(void *output, const void *input);
-void wcoinhash(void *state, const void *input);
 void whirlxHash(void *state, const void *input);
 void x11hash(void *output, const void *input);
 void x13hash(void *output, const void *input);
--- a/quark/cuda_bmw512_30.cu
+++ b/quark/cuda_bmw512_30.cu
@@ -12,8 +12,7 @@
 	q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \
 	q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])

-static __constant__ uint64_t d_constMem[16];
-static uint64_t h_constMem[16] = {
+static __constant__ uint64_t d_constMem[16] = {
 	SPH_C64(0x8081828384858687),
 	SPH_C64(0x88898A8B8C8D8E8F),
 	SPH_C64(0x9091929394959697),
--- a/skein.cu
+++ b/skein.cu
@@ -3,9 +3,7 @@
 * by tpruvot@github - 2015
 */

-extern "C" {
 #include "sph/sph_skein.h"
-}

 #include "miner.h"
 #include "cuda_helper.h"
@@ -101,70 +99,10 @@ void sha256_transform_gpu(uint32_t *state, uint32_t *message)
 	}

 	/* 3. Mix. */
-	RNDr(S, W, 0);
-	RNDr(S, W, 1);
-	RNDr(S, W, 2);
-	RNDr(S, W, 3);
-	RNDr(S, W, 4);
-	RNDr(S, W, 5);
-	RNDr(S, W, 6);
-	RNDr(S, W, 7);
-	RNDr(S, W, 8);
-	RNDr(S, W, 9);
-	RNDr(S, W, 10);
-	RNDr(S, W, 11);
-	RNDr(S, W, 12);
-	RNDr(S, W, 13);
-	RNDr(S, W, 14);
-	RNDr(S, W, 15);
-	RNDr(S, W, 16);
-	RNDr(S, W, 17);
-	RNDr(S, W, 18);
-	RNDr(S, W, 19);
-	RNDr(S, W, 20);
-	RNDr(S, W, 21);
-	RNDr(S, W, 22);
-	RNDr(S, W, 23);
-	RNDr(S, W, 24);
-	RNDr(S, W, 25);
-	RNDr(S, W, 26);
-	RNDr(S, W, 27);
-	RNDr(S, W, 28);
-	RNDr(S, W, 29);
-	RNDr(S, W, 30);
-	RNDr(S, W, 31);
-	RNDr(S, W, 32);
-	RNDr(S, W, 33);
-	RNDr(S, W, 34);
-	RNDr(S, W, 35);
-	RNDr(S, W, 36);
-	RNDr(S, W, 37);
-	RNDr(S, W, 38);
-	RNDr(S, W, 39);
-	RNDr(S, W, 40);
-	RNDr(S, W, 41);
-	RNDr(S, W, 42);
-	RNDr(S, W, 43);
-	RNDr(S, W, 44);
-	RNDr(S, W, 45);
-	RNDr(S, W, 46);
-	RNDr(S, W, 47);
-	RNDr(S, W, 48);
-	RNDr(S, W, 49);
-	RNDr(S, W, 50);
-	RNDr(S, W, 51);
-	RNDr(S, W, 52);
-	RNDr(S, W, 53);
-	RNDr(S, W, 54);
-	RNDr(S, W, 55);
-	RNDr(S, W, 56);
-	RNDr(S, W, 57);
-	RNDr(S, W, 58);
-	RNDr(S, W, 59);
-	RNDr(S, W, 60);
-	RNDr(S, W, 61);
-	RNDr(S, W, 62);
-	RNDr(S, W, 63);
+	#pragma unroll
+	for (int i = 0; i < 64; i++) {
+		RNDr(S, W, i);
+	}

 	for (int i = 0; i < 8; i++)
 		state[i] += S[i];
@@ -408,8 +346,8 @@ extern "C" int scanhash_skeincoin(int thr_id, uint32_t *pdata,
 	const uint32_t first_nonce = pdata[19];
 	const int swap = 1;

-	uint32_t throughput =  device_intensity(thr_id, __func__, 1 << 19); // 256*256*8
-	throughput = min(throughput,  (max_nonce - first_nonce));
+	uint32_t throughput = device_intensity(thr_id, __func__, 1 << 19); // 256*256*8
+	throughput = min(throughput, (max_nonce - first_nonce));

 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x07;
--- a/skein2.cu
+++ b/skein2.cu
@@ -1,6 +1,9 @@
-extern "C" {
+/**
+ * SKEIN512 80 + SKEIN512 64 (Woodcoin)
+ * by tpruvot@github - 2015
+ */
+
 #include "sph/sph_skein.h"
-}

 #include "miner.h"
 #include "cuda_helper.h"
@@ -8,21 +11,19 @@ extern "C" {
 static uint32_t *d_hash[MAX_GPUS];

 extern void skein512_cpu_setBlock_80(void *pdata);
-extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
+extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);

 extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

 extern "C" void skein2hash(void *output, const void *input)
 {
+	uint32_t _ALIGN(64) hash[16];
 	sph_skein512_context ctx_skein;

-	uint32_t hash[16];
-
 	sph_skein512_init(&ctx_skein);
 	sph_skein512(&ctx_skein, input, 80);
 	sph_skein512_close(&ctx_skein, hash);
-	//applog_hash((uchar*)hash);
-	//applog_hash((uchar*)&hash[8]);
+
 	sph_skein512_init(&ctx_skein);
 	sph_skein512(&ctx_skein, hash, 64);
 	sph_skein512_close(&ctx_skein, hash);
@@ -30,21 +31,6 @@ extern "C" void skein2hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }

-#ifdef _DEBUG
-#define TRACE(algo) { \
-	if (max_nonce == 1 && pdata[19] <= 1) { \
-		uint32_t* debugbuf = NULL; \
-		cudaMallocHost(&debugbuf, 16*sizeof(uint32_t)); \
-		cudaMemcpy(debugbuf, d_hash[thr_id], 16*sizeof(uint32_t), cudaMemcpyDeviceToHost); \
-		printf("SK2 %s %08x %08x %08x %08x...\n", algo, \
-			swab32(debugbuf[0]), swab32(debugbuf[1]), swab32(debugbuf[2]), swab32(debugbuf[3])); \
-		cudaFree(debugbuf); \
-		} \
-}
-#else
-#define TRACE(algo) {}
-#endif
-
 static bool init[MAX_GPUS] = { 0 };

 extern "C" int scanhash_skein2(int thr_id, uint32_t *pdata,
@@ -53,25 +39,25 @@ extern "C" int scanhash_skein2(int thr_id, uint32_t *pdata,
 {
 	const uint32_t first_nonce = pdata[19];

-	uint32_t throughput =  device_intensity(thr_id, __func__, 1 << 19); // 256*256*8
-	throughput = min(throughput,  (max_nonce - first_nonce));
+	uint32_t throughput = device_intensity(thr_id, __func__, 1 << 19); // 256*256*8
+	throughput = min(throughput, (max_nonce - first_nonce));

 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x0FFF;
+		((uint32_t*)ptarget)[7] = 0;

 	if (!init[thr_id])
 	{
 		cudaDeviceReset();
 		cudaSetDevice(device_map[thr_id]);

-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64 * throughput));
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64UL * throughput));

 		cuda_check_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}

 	uint32_t endiandata[20];
-	for (int k=0; k < 20; k++)
+	for (int k=0; k < 19; k++)
 		be32enc(&endiandata[k], pdata[k]);

 	skein512_cpu_setBlock_80((void*)endiandata);
@@ -81,24 +67,22 @@ extern "C" int scanhash_skein2(int thr_id, uint32_t *pdata,
 		int order = 0;

 		// Hash with CUDA
-		skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-		TRACE("80:");
+		skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 0);
 		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		TRACE("64:");
+
+		*hashes_done = pdata[19] - first_nonce + throughput;

 		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != UINT32_MAX)
 		{
-			uint32_t vhash64[8];
+			uint32_t _ALIGN(64) vhash64[8];

 			endiandata[19] = foundNonce;
 			skein2hash(vhash64, endiandata);

-			#define Htarg ptarget[7]
-			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
+			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
 				int res = 1;
 				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
-				*hashes_done = pdata[19] - first_nonce + throughput;
 				if (secNonce != 0) {
 					if (!opt_quiet)
 						applog(LOG_BLUE, "GPU #%d: found second nonce %08x !", device_map[thr_id], swab32(secNonce));
@@ -113,10 +97,13 @@ extern "C" int scanhash_skein2(int thr_id, uint32_t *pdata,
 			}
 		}

+		if (((uint64_t) throughput + pdata[19]) > max_nonce) {
+			break;
+		}
+
 		pdata[19] += throughput;

-	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+	} while (!work_restart[thr_id].restart);

-	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
--- a/util.cpp
+++ b/util.cpp
@@ -1797,9 +1797,6 @@ void print_hash_tests(void)
 	s3hash(&hash[0], &buf[0]);
 	printpfx("S3", hash);

-	wcoinhash(&hash[0], &buf[0]);
-	printpfx("whirl", hash);
-
 	whirlxHash(&hash[0], &buf[0]);
 	printpfx("whirlpoolx", hash);

--- a/x11/x11.cu
+++ b/x11/x11.cu
@@ -131,7 +131,7 @@ extern "C" void x11hash(void *output, const void *input)
 		uint32_t* debugbuf = NULL; \
 		cudaMallocHost(&debugbuf, 8*sizeof(uint32_t)); \
 		cudaMemcpy(debugbuf, d_hash[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost); \
-		printf("%s %08x %08x %08x %08x...\n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \
+		printf("X11 %s %08x %08x %08x %08x...\n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \
 			swab32(debugbuf[2]), swab32(debugbuf[3])); \
 		cudaFreeHost(debugbuf); \
 	} \
@@ -209,7 +209,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		TRACE("simd   :");
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		TRACE("echo X11 =>");
+		TRACE("echo => ");

 		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != UINT32_MAX)