update README, small changes, prepare release 1.6.1

still need an SM 3.0 fix for skein...
10 years ago · d58d53f2b2
11 changed files with 63 additions and 129 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -7,8 +7,8 @@ else
 JANSSON_INCLUDES=
 endif
-EXTRA_DIST		= autogen.sh README.txt LICENSE.txt \
+EXTRA_DIST		= autogen.sh README.md README.txt LICENSE.txt \
-			  cudaminer.sln cudaminer.vcxproj cudaminer.vcxproj.filters \
+			  ccminer.sln ccminer.vcxproj ccminer.vcxproj.filters \
 			  compat/gettimeofday.c compat/getopt/getopt_long.c cpuminer-config.h.in
 SUBDIRS = compat
@@ -121,3 +121,6 @@ JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu
 skein.o: skein.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<
 skein2.o: skein2.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@ ccminer
 Based on Christian Buchner's & Christian H.'s CUDA project, no more active on github recently.
-Fork by tpruvot@github with X14,X15,X17,WHIRL,Blake256,BlakeCoin,Lyra2RE and some others, check the [README.txt](README.txt)
+Fork by tpruvot@github with X14,X15,X17,Blake256,BlakeCoin,Lyra2RE,Skein,ZR5 and others, check the [README.txt](README.txt)
   BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo
   [![tip for next commit](https://tip4commit.com/projects/927.svg)](https://tip4commit.com/github/tpruvot/ccminer)
@@ -30,5 +30,7 @@ The tree now contains recent prebuilt openssl and curl .lib for both x86 and x64
 To rebuild them, you need to clone this repository and its submodules :
    git clone https://github.com/peters/curl-for-windows.git compat/curl-for-windows
-There is also a [Tutorial for windows](http://cudamining.co.uk/url/tutorials/id/3) on [CudaMining](http://cudamining.co.uk) website.
+On Linux, you can use the helper ./build.sh (edit it if required)
 There is also an old [Tutorial for windows](http://cudamining.co.uk/url/tutorials/id/3) on [CudaMining](http://cudamining.co.uk) website.
--- a/README.txt
+++ b/README.txt
@@ -1,16 +1,15 @@
-ccMiner release 1.6.0-tpruvot (Mar 2015) - "ZR5, Pluck & WhirlX"
+ccMiner release 1.6.1-tpruvot (Apr 2015) - "Skein512 Chainsaw"
 ---------------------------------------------------------------
 ***************************************************************
-If you find this tool useful and like to support its continued 
+If you find this tool useful and like to support its continuous
          development, then consider a donation.
 tpruvot@github:
-  BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo
+  BTC  : 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo
  DRK  : XeVrkPrWB7pDbdFLfKhF1Z3xpqhsx6wkH3
-  NEOS : NaEcVrdzoCWHUYXb7X8QoafoKS9UV69Yk4
+  ZRC  : ZEcubH2xp2mpuwxMjy7wZThr5AzLGu3mqT
  XST  : S9TqZucWgT6ajZLDBxQnHUtmkotCEHn9z9
 DJM34:
  BTC donation address: 1NENYmxwZGHsKFmyjTc5WferTn5VTFb7Ze
@@ -56,7 +55,7 @@ that the most of our comments are in german.
 >>> Command Line Interface <<<
-This code is based on the pooler cpuminer 2.3.2 release and inherits
+This code is based on the pooler cpuminer and inherits
 its command line interface and options.
  -a, --algo=ALGO       specify the algorithm to use
@@ -83,7 +82,6 @@ its command line interface and options.
                          s3          use to mine 1coin
                          skein       use to mine Skeincoin
                          skein2      use to mine Woodcoin
                          whirl       use to mine Whirlcoin
                          whirlpoolx  use to mine Vanillacoin
                          x11         use to mine DarkCoin
                          x14         use to mine X14Coin
@@ -113,20 +111,23 @@ its command line interface and options.
  -T, --timeout=N       network timeout, in seconds (default: 270)
  -s, --scantime=N      upper bound on time spent scanning current work when
                        long polling is unavailable, in seconds (default: 5)
  -n, --ndevs           list cuda devices
  -N, --statsavg        number of samples used to display hashrate (default: 30)
      --no-gbt          disable getblocktemplate support (height check in solo)
      --no-longpoll     disable X-Long-Polling support
      --no-stratum      disable X-Stratum support
  -q, --quiet           disable per-thread hashmeter output
      --no-color        disable colored output
  -D, --debug           enable debug output
  -P, --protocol-dump   verbose dump of protocol-level activities
  -b, --api-bind        IP/Port for the miner API (default: 127.0.0.1:4068)
      --api-remote      Allow remote control
  -B, --background      run the miner in the background
      --benchmark       run in offline benchmark mode
      --cputest         debug hashes from cpu algorithms
      --cpu-affinity    set process affinity to specific cpu core(s) mask
      --cpu-priority    set process priority (default: 0 idle, 2 normal to 5 highest)
  -c, --config=FILE     load a JSON-format configuration file
      --no-color        disable colored console output
  -V, --version         display version information and exit
  -h, --help            display this help text and exit
@@ -189,6 +190,9 @@ features.
 >>> RELEASE HISTORY <<<
  Apr. 14th 2015  v1.6.1
                  Add the Double Skein Algo for Woodcoin
  Mar. 27th 2015  v1.6.0
                  Add the ZR5 Algo for Ziftcoin
                  Implement Skeincoin algo (skein + sha)
@@ -347,8 +351,8 @@ Christian Buchner, Christian H. (Germany): Initial CUDA implementation
 djm34, tsiv, sp and klausT for cuda algos implementation and optimisation
-Tanguy Pruvot : 750Ti tuning, blake, colors, general code cleanup/opts
+Tanguy Pruvot : 750Ti tuning, blake, colors, zr5, skein, general code cleanup
-                API monitoring, linux Config/Makefile and vstudio stuff...
+                API monitoring, linux Config/Makefile and vstudio libs...
 and also many thanks to anyone else who contributed to the original
 cpuminer application (Jeff Garzik, pooler), its original HVC-fork
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -263,7 +263,6 @@ Options:\n\
 			x14         X14\n\
 			x15         X15\n\
 			x17         X17 (peoplecurrency)\n\
 			whirl       Whirlcoin (old whirlpool)\n\
 			whirlpoolx  Vanilla coin\n\
 			zr5         ZR5 (ZiftrCoin)\n\
  -d, --devices         Comma separated list of CUDA devices to use.\n\
@@ -1396,7 +1395,7 @@ static void *miner_thread(void *userdata)
 			max64 = max(minmax-1, max64);
 		}
-		// we can't scan more than uint capacity
+		// we can't scan more than uint32 capacity
 		max64 = min(UINT32_MAX, max64);
 		start_nonce = nonceptr[0];
@@ -1412,6 +1411,11 @@ static void *miner_thread(void *userdata)
 		// todo: keep it rounded for gpu threads ?
 		if (unlikely(start_nonce > max_nonce)) {
 			// should not happen but seen in skein2 benchmark with 2 gpus
 			max_nonce = end_nonce = UINT32_MAX;
 		}
 		work.scanned_from = start_nonce;
 		nonceptr[0] = start_nonce;
@@ -1537,6 +1541,7 @@ static void *miner_thread(void *userdata)
 			                      max_nonce, &hashes_done);
 			break;
 		/* to be deleted */
 		case ALGO_WHIRLCOIN:
 			rc = scanhash_whc(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([ccminer], [1.6.1-git])
+AC_INIT([ccminer], [1.6.1])
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/miner.h
+++ b/miner.h
@@ -686,7 +686,6 @@ void qubithash(void *state, const void *input);
 void skeincoinhash(void *output, const void *input);
 void skein2hash(void *output, const void *input);
 void s3hash(void *output, const void *input);
 void wcoinhash(void *state, const void *input);
 void whirlxHash(void *state, const void *input);
 void x11hash(void *output, const void *input);
 void x13hash(void *output, const void *input);
--- a/quark/cuda_bmw512_30.cu
+++ b/quark/cuda_bmw512_30.cu
@@ -12,8 +12,7 @@
 	q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \
 	q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
-static __constant__ uint64_t d_constMem[16];
+static __constant__ uint64_t d_constMem[16] = {
 static uint64_t h_constMem[16] = {
 	SPH_C64(0x8081828384858687),
 	SPH_C64(0x88898A8B8C8D8E8F),
 	SPH_C64(0x9091929394959697),
--- a/skein.cu
+++ b/skein.cu
@@ -3,9 +3,7 @@
 * by tpruvot@github - 2015
 */
 extern "C" {
 #include "sph/sph_skein.h"
 }
 #include "miner.h"
 #include "cuda_helper.h"
@@ -101,70 +99,10 @@ void sha256_transform_gpu(uint32_t *state, uint32_t *message)
 	}
 	/* 3. Mix. */
-	RNDr(S, W, 0);
+	#pragma unroll
-	RNDr(S, W, 1);
+	for (int i = 0; i < 64; i++) {
-	RNDr(S, W, 2);
+		RNDr(S, W, i);
-	RNDr(S, W, 3);
+	}
 	RNDr(S, W, 4);
 	RNDr(S, W, 5);
 	RNDr(S, W, 6);
 	RNDr(S, W, 7);
 	RNDr(S, W, 8);
 	RNDr(S, W, 9);
 	RNDr(S, W, 10);
 	RNDr(S, W, 11);
 	RNDr(S, W, 12);
 	RNDr(S, W, 13);
 	RNDr(S, W, 14);
 	RNDr(S, W, 15);
 	RNDr(S, W, 16);
 	RNDr(S, W, 17);
 	RNDr(S, W, 18);
 	RNDr(S, W, 19);
 	RNDr(S, W, 20);
 	RNDr(S, W, 21);
 	RNDr(S, W, 22);
 	RNDr(S, W, 23);
 	RNDr(S, W, 24);
 	RNDr(S, W, 25);
 	RNDr(S, W, 26);
 	RNDr(S, W, 27);
 	RNDr(S, W, 28);
 	RNDr(S, W, 29);
 	RNDr(S, W, 30);
 	RNDr(S, W, 31);
 	RNDr(S, W, 32);
 	RNDr(S, W, 33);
 	RNDr(S, W, 34);
 	RNDr(S, W, 35);
 	RNDr(S, W, 36);
 	RNDr(S, W, 37);
 	RNDr(S, W, 38);
 	RNDr(S, W, 39);
 	RNDr(S, W, 40);
 	RNDr(S, W, 41);
 	RNDr(S, W, 42);
 	RNDr(S, W, 43);
 	RNDr(S, W, 44);
 	RNDr(S, W, 45);
 	RNDr(S, W, 46);
 	RNDr(S, W, 47);
 	RNDr(S, W, 48);
 	RNDr(S, W, 49);
 	RNDr(S, W, 50);
 	RNDr(S, W, 51);
 	RNDr(S, W, 52);
 	RNDr(S, W, 53);
 	RNDr(S, W, 54);
 	RNDr(S, W, 55);
 	RNDr(S, W, 56);
 	RNDr(S, W, 57);
 	RNDr(S, W, 58);
 	RNDr(S, W, 59);
 	RNDr(S, W, 60);
 	RNDr(S, W, 61);
 	RNDr(S, W, 62);
 	RNDr(S, W, 63);
 	for (int i = 0; i < 8; i++)
 		state[i] += S[i];
@@ -408,8 +346,8 @@ extern "C" int scanhash_skeincoin(int thr_id, uint32_t *pdata,
 	const uint32_t first_nonce = pdata[19];
 	const int swap = 1;
-	uint32_t throughput =  device_intensity(thr_id, __func__, 1 << 19); // 256*256*8
+	uint32_t throughput = device_intensity(thr_id, __func__, 1 << 19); // 256*256*8
-	throughput = min(throughput,  (max_nonce - first_nonce));
+	throughput = min(throughput, (max_nonce - first_nonce));
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x07;
--- a/skein2.cu
+++ b/skein2.cu
@@ -1,6 +1,9 @@
-extern "C" {
+/**
 * SKEIN512 80 + SKEIN512 64 (Woodcoin)
 * by tpruvot@github - 2015
 */
 #include "sph/sph_skein.h"
 }
 #include "miner.h"
 #include "cuda_helper.h"
@@ -8,21 +11,19 @@ extern "C" {
 static uint32_t *d_hash[MAX_GPUS];
 extern void skein512_cpu_setBlock_80(void *pdata);
-extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
+extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);
 extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern "C" void skein2hash(void *output, const void *input)
 {
 	uint32_t _ALIGN(64) hash[16];
 	sph_skein512_context ctx_skein;
 	uint32_t hash[16];
 	sph_skein512_init(&ctx_skein);
 	sph_skein512(&ctx_skein, input, 80);
 	sph_skein512_close(&ctx_skein, hash);
-	//applog_hash((uchar*)hash);
+
 	//applog_hash((uchar*)&hash[8]);
 	sph_skein512_init(&ctx_skein);
 	sph_skein512(&ctx_skein, hash, 64);
 	sph_skein512_close(&ctx_skein, hash);
@@ -30,21 +31,6 @@ extern "C" void skein2hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }
 #ifdef _DEBUG
 #define TRACE(algo) { \
 	if (max_nonce == 1 && pdata[19] <= 1) { \
 		uint32_t* debugbuf = NULL; \
 		cudaMallocHost(&debugbuf, 16*sizeof(uint32_t)); \
 		cudaMemcpy(debugbuf, d_hash[thr_id], 16*sizeof(uint32_t), cudaMemcpyDeviceToHost); \
 		printf("SK2 %s %08x %08x %08x %08x...\n", algo, \
 			swab32(debugbuf[0]), swab32(debugbuf[1]), swab32(debugbuf[2]), swab32(debugbuf[3])); \
 		cudaFree(debugbuf); \
 		} \
 }
 #else
 #define TRACE(algo) {}
 #endif
 static bool init[MAX_GPUS] = { 0 };
 extern "C" int scanhash_skein2(int thr_id, uint32_t *pdata,
@@ -53,25 +39,25 @@ extern "C" int scanhash_skein2(int thr_id, uint32_t *pdata,
 {
 	const uint32_t first_nonce = pdata[19];
-	uint32_t throughput =  device_intensity(thr_id, __func__, 1 << 19); // 256*256*8
+	uint32_t throughput = device_intensity(thr_id, __func__, 1 << 19); // 256*256*8
-	throughput = min(throughput,  (max_nonce - first_nonce));
+	throughput = min(throughput, (max_nonce - first_nonce));
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x0FFF;
+		((uint32_t*)ptarget)[7] = 0;
 	if (!init[thr_id])
 	{
 		cudaDeviceReset();
 		cudaSetDevice(device_map[thr_id]);
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64 * throughput));
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64UL * throughput));
 		cuda_check_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}
 	uint32_t endiandata[20];
-	for (int k=0; k < 20; k++)
+	for (int k=0; k < 19; k++)
 		be32enc(&endiandata[k], pdata[k]);
 	skein512_cpu_setBlock_80((void*)endiandata);
@@ -81,24 +67,22 @@ extern "C" int scanhash_skein2(int thr_id, uint32_t *pdata,
 		int order = 0;
 		// Hash with CUDA
-		skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 0);
 		TRACE("80:");
 		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		TRACE("64:");
+
 		*hashes_done = pdata[19] - first_nonce + throughput;
 		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != UINT32_MAX)
 		{
-			uint32_t vhash64[8];
+			uint32_t _ALIGN(64) vhash64[8];
 			endiandata[19] = foundNonce;
 			skein2hash(vhash64, endiandata);
-			#define Htarg ptarget[7]
+			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
 			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
 				int res = 1;
 				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
 				*hashes_done = pdata[19] - first_nonce + throughput;
 				if (secNonce != 0) {
 					if (!opt_quiet)
 						applog(LOG_BLUE, "GPU #%d: found second nonce %08x !", device_map[thr_id], swab32(secNonce));
@@ -113,10 +97,13 @@ extern "C" int scanhash_skein2(int thr_id, uint32_t *pdata,
 			}
 		}
 		if (((uint64_t) throughput + pdata[19]) > max_nonce) {
 			break;
 		}
 		pdata[19] += throughput;
-	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+	} while (!work_restart[thr_id].restart);
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
--- a/util.cpp
+++ b/util.cpp
@@ -1797,9 +1797,6 @@ void print_hash_tests(void)
 	s3hash(&hash[0], &buf[0]);
 	printpfx("S3", hash);
 	wcoinhash(&hash[0], &buf[0]);
 	printpfx("whirl", hash);
 	whirlxHash(&hash[0], &buf[0]);
 	printpfx("whirlpoolx", hash);
--- a/x11/x11.cu
+++ b/x11/x11.cu
@@ -131,7 +131,7 @@ extern "C" void x11hash(void *output, const void *input)
 		uint32_t* debugbuf = NULL; \
 		cudaMallocHost(&debugbuf, 8*sizeof(uint32_t)); \
 		cudaMemcpy(debugbuf, d_hash[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost); \
-		printf("%s %08x %08x %08x %08x...\n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \
+		printf("X11 %s %08x %08x %08x %08x...\n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \
 			swab32(debugbuf[2]), swab32(debugbuf[3])); \
 		cudaFreeHost(debugbuf); \
 	} \
@@ -209,7 +209,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		TRACE("simd   :");
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		TRACE("echo X11 =>");
+		TRACE("echo => ");
 		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != UINT32_MAX)