Merge branch 'blake-dev' into blake

2025-03-10 04:21:12 +00:00 · 2014-09-09 22:02:19 +02:00 · 2014-09-09 22:02:19 +02:00 · 474ee97d6e
commit 474ee97d6e
parent 0914c7095c 429266346c
19 changed files with 974 additions and 289 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -16,7 +16,7 @@ bin_PROGRAMS	= ccminer
 ccminer_SOURCES		= elist.h miner.h compat.h \
 			  compat/inttypes.h compat/stdbool.h compat/unistd.h \
 			  compat/sys/time.h compat/getopt/getopt.h \
-			  cpu-miner.c util.c hefty1.c scrypt.c \
+			  cpu-miner.c util.c crc32.c hefty1.c scrypt.c \
 			  hashlog.cpp \
 			  heavy/heavy.cu \
 			  heavy/cuda_blake512.cu heavy/cuda_blake512.h \
@ -33,7 +33,7 @@ ccminer_SOURCES		= elist.h miner.h compat.h \
 			  quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \
 			  quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu quark/quarkcoin.cu quark/animecoin.cu \
 			  quark/cuda_quark_compactionTest.cu \
-			  cuda_nist5.cu blake32.cu \
+			  cuda_nist5.cu blake32.cu pentablake.cu \
 			  sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \
 			  sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
 			  sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \
--- a/README.txt
+++ b/README.txt
@ -1,5 +1,5 @@

-ccMiner release 1.4.1-tpruvot (Sep 06th 2014) - "Cached Blake"
+ccMiner release 1.4.2-tpruvot (Sep 09th 2014) - "Pentablake"
 ---------------------------------------------------------------

 ***************************************************************
@ -35,6 +35,7 @@ TalkCoin
 DarkCoin and other X11 coins
 NEOS blake (256 14-rounds)
 BlakeCoin (256 8-rounds)
+Pentablake (Blake 512 x5)

 where some of these coins have a VERY NOTABLE nVidia advantage
 over competing AMD (OpenCL) implementations.
@ -65,6 +66,7 @@ its command line interface and options.
                          blake       use to mine NEOS (Blake 256)
                          blakecoin   use to mine Old Blake 256
                          nist5       use to mine TalkCoin
+                          penta       use to mine Joincoin / Pentablake
                          fresh       use to mine Freshcoin
                          whirl       use to mine Whirlcoin
                          x11         use to mine DarkCoin
--- a/blake32.cu
+++ b/blake32.cu
@ -15,6 +15,9 @@ extern "C" {
 /* threads per block */
 #define TPB 128

+/* crc32.c */
+extern "C" uint32_t crc32_u32t(const uint32_t *buf, size_t size);
+
 extern "C" int blake256_rounds = 14;

 /* hash by cpu with blake 256 */
@ -41,8 +44,6 @@ extern bool opt_n_threads;
 extern bool opt_benchmark;
 extern int device_map[8];

-uint32_t crc32(const uint32_t *buf, size_t size);
-
 __constant__
 static uint32_t __align__(32) c_Target[8];

@ -331,7 +332,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt

 	blake256_cpu_setBlock_80(pdata, ptarget);
 #if USE_CACHE
-	crcsum = crc32(pdata, 64);
+	crcsum = crc32_u32t(pdata, 64);
 #endif

 	do {
@ -360,7 +361,8 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 					be32enc(&endiandata[19], extra_results[0]);
 					blake256hash(vhashcpu, endiandata, blakerounds);
 					if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) {
-						applog(LOG_NOTICE, "GPU found more than one result yippee!");
+						applog(LOG_NOTICE, "GPU found more than one result " CL_GRN "yippee!");
+						rc = 2;
 					} else {
 						extra_results[0] = MAXU;
 					}
@ -379,9 +381,14 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 			}
 		}

+		if ((uint64_t) pdata[19] + throughput > (uint64_t) max_nonce) {
+			pdata[19] = max_nonce - first_nonce + 1;
+			break;
+		}
+
 		pdata[19] += throughput;

-	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+	} while (!work_restart[thr_id].restart);

 exit_scan:
 	*hashes_done = pdata[19] - first_nonce + 1;
@ -394,66 +401,6 @@ exit_scan:
 	}
 #endif
 	// wait proper end of all threads
-	cudaDeviceSynchronize();
+	//cudaDeviceSynchronize();
 	return rc;
 }
-
-static uint32_t crc32_tab[] = {
-	0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
-	0xe963a535, 0x9e6495a3,	0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
-	0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2,
-	0xf3b97148, 0x84be41de,	0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
-	0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec,	0x14015c4f, 0x63066cd9,
-	0xfa0f3d63, 0x8d080df5,	0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
-	0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,	0x35b5a8fa, 0x42b2986c,
-	0xdbbbc9d6, 0xacbcf940,	0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
-	0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
-	0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
-	0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,	0x76dc4190, 0x01db7106,
-	0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
-	0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
-	0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
-	0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
-	0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
-	0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
-	0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
-	0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa,
-	0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
-	0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
-	0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
-	0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84,
-	0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
-	0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
-	0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
-	0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e,
-	0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
-	0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
-	0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
-	0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28,
-	0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
-	0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
-	0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
-	0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
-	0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
-	0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
-	0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
-	0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc,
-	0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
-	0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
-	0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
-	0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
-};
-
-uint32_t crc32(const uint32_t *buf, size_t size)
-{
-	const uint8_t *p;
-	uint32_t crc = 0;
-
-	p = (uint8_t *) buf;
-	crc = crc ^ ~0U;
-
-	while (size--)
-		crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8);
-
-	return crc ^ ~0U;
-}
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -241,6 +241,7 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
      <TreatWChar_tAsBuiltInType>false</TreatWChar_tAsBuiltInType>
      <Optimization Condition="'$(Configuration)'=='Release'">Full</Optimization>
    </ClCompile>
+    <ClCompile Include="crc32.c" />
    <ClCompile Include="fuguecoin.cpp" />
    <ClCompile Include="groestlcoin.cpp" />
    <ClCompile Include="hashlog.cpp" />
@ -404,6 +405,12 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
      <AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
      <FastMath>true</FastMath>
    </CudaCompile>
+    <CudaCompile Include="pentablake.cu">
+      <MaxRegCount>80</MaxRegCount>
+      <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options="-O2 -dlcm=cg" %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
+      <FastMath>true</FastMath>
+    </CudaCompile>
    <CudaCompile Include="quark\animecoin.cu">
      <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options=-O2 %(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -96,6 +96,9 @@
    <ClCompile Include="cpu-miner.c">
      <Filter>Source Files</Filter>
    </ClCompile>
+    <ClCompile Include="crc32.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
    <ClCompile Include="hefty1.c">
      <Filter>Source Files</Filter>
    </ClCompile>
@ -442,5 +445,8 @@
    <CudaCompile Include="blake32.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
+    <CudaCompile Include="pentablake.cu">
+      <Filter>Source Files\CUDA</Filter>
+    </CudaCompile>
  </ItemGroup>
 </Project>
--- a/configure.ac
+++ b/configure.ac
@ -1,4 +1,4 @@
-AC_INIT([ccminer], [2014.09.06])
+AC_INIT([ccminer], [2014.09.09])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@ -136,8 +136,9 @@ typedef enum {
 	ALGO_JACKPOT,
 	ALGO_MJOLLNIR,		/* Mjollnir hash */
 	ALGO_MYR_GR,
-	ALGO_QUARK,
 	ALGO_NIST5,
+	ALGO_PENTABLAKE,
+	ALGO_QUARK,
 	ALGO_WHC,
 	ALGO_X11,
 	ALGO_X13,
@ -159,6 +160,7 @@ static const char *algo_names[] = {
 	"mjollnir",
 	"myr-gr",
 	"nist5",
+	"penta",
 	"quark",
 	"whirl",
 	"x11",
@ -170,7 +172,6 @@ static const char *algo_names[] = {
 };

 bool opt_debug = false;
-bool opt_debug_rpc = false;
 bool opt_protocol = false;
 bool opt_benchmark = false;
 bool want_longpoll = true;
@ -179,7 +180,7 @@ bool want_stratum = true;
 bool have_stratum = false;
 static bool submit_old = false;
 bool use_syslog = false;
-bool use_colors = false;
+bool use_colors = true;
 static bool opt_background = false;
 bool opt_quiet = false;
 static int opt_retries = -1;
@ -242,6 +243,7 @@ Options:\n\
                        mjollnir  Mjollnircoin hash\n\
                        myr-gr    Myriad-Groestl hash\n\
                        nist5     NIST5 (TalkCoin) hash\n\
+                        penta     Pentablake hash (5x Blake 512)\n\
                        quark     Quark hash\n\
                        whirl     Whirlcoin (old whirlpool)\n\
                        x11       X11 (DarkCoin) hash\n\
@ -437,7 +439,7 @@ static int share_result(int result, const char *reason)
 				(result ? CL_GRN "yay!!!" : CL_RED "booooo")
 			:	(result ? "(yay!!!)" : "(booooo)"));

-	if (reason && !opt_quiet) {
+	if (reason) {
 		applog(LOG_WARNING, "reject reason: %s", reason);
 		if (strncmp(reason, "low difficulty share", 20) == 0) {
 			opt_difficulty = (opt_difficulty * 2.0) / 3.0;
@ -457,11 +459,14 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 	bool rc = false;

 	/* pass if the previous hash is not the current previous hash */
+	pthread_mutex_lock(&g_work_lock);
 	if (memcmp(work->data + 1, g_work.data + 1, 32)) {
+		pthread_mutex_unlock(&g_work_lock);
 		if (opt_debug)
 			applog(LOG_DEBUG, "DEBUG: stale work detected, discarding");
 		return true;
 	}
+	pthread_mutex_unlock(&g_work_lock);

 	if (have_stratum) {
 		uint32_t sent;
@ -544,10 +549,6 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 		json_decref(val);
 	}

-	if (opt_debug_rpc) {
-		applog(LOG_DEBUG, "submit: %s", s);
-	}
-
 	rc = true;

 out:
@ -786,13 +787,20 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 	memcpy(work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size);

 	/* Generate merkle root */
-	if (opt_algo == ALGO_HEAVY || opt_algo == ALGO_MJOLLNIR)
-		heavycoin_hash(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size);
-	else
-	if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_WHC || opt_algo == ALGO_BLAKECOIN)
-		SHA256((unsigned char*)sctx->job.coinbase, sctx->job.coinbase_size, (unsigned char*)merkle_root);
-	else
-		sha256d(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size);
+	switch (opt_algo) {
+		case ALGO_HEAVY:
+		case ALGO_MJOLLNIR:
+			heavycoin_hash(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size);
+			break;
+		case ALGO_FUGUE256:
+		case ALGO_GROESTL:
+		case ALGO_BLAKECOIN:
+		case ALGO_WHC:
+			SHA256((uint8_t*)sctx->job.coinbase, sctx->job.coinbase_size, (uint8_t*)merkle_root);
+			break;
+		default:
+			sha256d(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size);
+	}

 	for (i = 0; i < sctx->job.merkle_count; i++) {
 		memcpy(merkle_root + 32, sctx->job.merkle[i], 32);
@ -864,7 +872,9 @@ static void *miner_thread(void *userdata)
 	uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - (thr_id + 1);
 	unsigned char *scratchbuf = NULL;
 	bool work_done = false;
+	bool extrajob = false;
 	char s[16];
+	int rc = 0;

 	memset(&work, 0, sizeof(work)); // prevent work from being used uninitialized

@ -891,20 +901,31 @@ static void *miner_thread(void *userdata)
 		struct timeval tv_start, tv_end, diff;
 		int64_t max64;
 		uint64_t umax64;
-		int rc;

 		// &work.data[19]
 		int wcmplen = 76;
 		uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);

 		if (have_stratum) {
-			while (time(NULL) >= (g_work_time + opt_scantime) && !work_done)
-				usleep(500*1000);
-			work_done = false;
-			pthread_mutex_lock(&g_work_lock);
+			uint32_t sleeptime = 0;
+			while (!work_done && time(NULL) >= (g_work_time + opt_scantime)) {
+				usleep(100*1000);
+				if (sleeptime > 4) {
+					extrajob = true;
+					break;
+				}
+				sleeptime++;
+			}
+			if (sleeptime && opt_debug && !opt_quiet)
+				applog(LOG_DEBUG, "sleeptime: %u ms", sleeptime*100);
 			nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
-			if ((*nonceptr) >= end_nonce)
+			pthread_mutex_lock(&g_work_lock);
+			extrajob |= work_done;
+			if ((*nonceptr) >= end_nonce || extrajob) {
+				work_done = false;
+				extrajob = false;
 				stratum_gen_work(&stratum, &g_work);
+			}
 		} else {
 			int min_scantime = have_longpoll ? LP_SCANTIME : opt_scantime;
 			/* obtain new work from internal workio thread */
@ -920,34 +941,54 @@ static void *miner_thread(void *userdata)
 				g_work_time = time(NULL);
 			}
 		}
+#if 0
+		if (!opt_benchmark && g_work.xnonce2_len == 0) {
+			applog(LOG_ERR, "work data not read yet");
+			extrajob = true;
+			work_done = true;
+			sleep(1);
+			continue;
+		}
+#endif
+		if (rc > 1) {
+			/* if we found more than one on last loop */
+			/* todo: handle an array to get them directly */
+			pthread_mutex_unlock(&g_work_lock);
+			goto continue_scan;
+		}
+
+		if (memcmp(work.target, g_work.target, sizeof(work.target))) {
+			if (opt_debug) {
+				applog(LOG_DEBUG, "job %s target change:", g_work.job_id);
+				applog_hash((uint8_t*) work.target);
+				applog_compare_hash((uint8_t*) g_work.target, (uint8_t*) work.target);
+			}
+			memcpy(work.target, g_work.target, sizeof(work.target));
+			(*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr
+			/* on new target, ignoring nonce, clear sent data (hashlog) */
+			if (memcmp(work.target, g_work.target, sizeof(work.target))) {
+				hashlog_purge_job(work.job_id);
+			}
+		}
 		if (memcmp(work.data, g_work.data, wcmplen)) {
 			if (opt_debug) {
-				applog(LOG_DEBUG, "job %s work updated", g_work.job_id);
-				for (int n=0; n<wcmplen; n+=8) {
+				for (int n=0; n <= (wcmplen-8); n+=8) {
 					if (memcmp(work.data + n, g_work.data + n, 8)) {
-						applog(LOG_DEBUG, "diff detected at offset %d", n);
+						applog(LOG_DEBUG, "job %s work updated at offset %d:", g_work.job_id, n);
 						applog_hash((uint8_t*) work.data + n);
-						applog_hash((uint8_t*) g_work.data + n);
+						applog_compare_hash((uint8_t*) g_work.data + n, (uint8_t*) work.data + n);
 					}
 				}
 			}
 			memcpy(&work, &g_work, sizeof(struct work));
 			(*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr
-		} else if (memcmp(work.target, g_work.target, sizeof(work.target))) {
-			if (opt_debug) {
-				applog(LOG_DEBUG, "job %s target change", g_work.job_id);
-				applog_hash((uint8_t*) work.target);
-				applog_hash((uint8_t*) g_work.target);
-			}
-			memcpy(work.target, g_work.target, sizeof(work.target));
-			(*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr
 		} else
 			(*nonceptr)++; //??
-		pthread_mutex_unlock(&g_work_lock);
 		work_restart[thr_id].restart = 0;

 		if (opt_debug)
 			applog(LOG_WARNING, "job %s %08x", g_work.job_id, (*nonceptr));
+		pthread_mutex_unlock(&g_work_lock);

 		/* adjust max_nonce to meet target scan time */
 		if (have_stratum)
@ -959,15 +1000,18 @@ static void *miner_thread(void *userdata)
 		max64 *= (int64_t)thr_hashrates[thr_id];

 		if (max64 <= 0) {
+			/* should not be set too high,
+			   else you can miss multiple nounces */
 			switch (opt_algo) {
 			case ALGO_JACKPOT:
 				max64 = 0x1fffLL;
 				break;
 			case ALGO_BLAKECOIN:
 				max64 = 0x3ffffffLL;
+				break;
 			case ALGO_BLAKE:
 				/* based on the 750Ti hashrate (100kH) */
-				max64 = 0x3ffffffLL;
+				max64 = 0x1ffffffLL;
 				break;
 			default:
 				max64 = 0xfffffLL;
@ -1000,12 +1044,12 @@ static void *miner_thread(void *userdata)
 				stall |= (start_nonce > range.scanned[0] && start_nonce < range.scanned[1]);

 				if (stall) {
-					if (opt_algo)
+					if (opt_debug && !opt_quiet)
 						applog(LOG_DEBUG, "job done, wait for a new one...");
 					work_restart[thr_id].restart = 1;
 					hashlog_purge_old();
 					// wait a bit for a new job...
-					sleep(1);
+					usleep(500*1000);
 					(*nonceptr) = end_nonce + 1;
 					work_done = true;
 					continue;
@ -1023,6 +1067,7 @@ static void *miner_thread(void *userdata)
 		(*nonceptr) = start_nonce;

 		hashes_done = 0;
+continue_scan:
 		gettimeofday(&tv_start, NULL);

 		/* scan nonces for a proof-of-work hash */
@ -1089,6 +1134,11 @@ static void *miner_thread(void *userdata)
 			                      max_nonce, &hashes_done);
 			break;

+		case ALGO_PENTABLAKE:
+			rc = scanhash_pentablake(thr_id, work.data, work.target,
+			                      max_nonce, &hashes_done);
+			break;
+
 		case ALGO_WHC:
 			rc = scanhash_whc(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
@ -1133,8 +1183,11 @@ static void *miner_thread(void *userdata)
 		timeval_subtract(&diff, &tv_end, &tv_start);
 		if (diff.tv_usec || diff.tv_sec) {
 			pthread_mutex_lock(&stats_lock);
-			thr_hashrates[thr_id] =
-				hashes_done / (diff.tv_sec + 1e-6 * diff.tv_usec);
+			if (diff.tv_sec + 1e-6 * diff.tv_usec > 0.0) {
+				thr_hashrates[thr_id] = hashes_done / (diff.tv_sec + 1e-6 * diff.tv_usec);
+				if (rc > 1)
+					thr_hashrates[thr_id] = (rc * hashes_done) / (diff.tv_sec + 1e-6 * diff.tv_usec);
+			}
 			pthread_mutex_unlock(&stats_lock);
 		}
 		if (!opt_quiet) {
@ -1342,7 +1395,6 @@ static void *stratum_thread(void *userdata)
 			pthread_mutex_lock(&g_work_lock);
 			stratum_gen_work(&stratum, &g_work);
 			time(&g_work_time);
-			pthread_mutex_unlock(&g_work_lock);
 			if (stratum.job.clean) {
 				if (!opt_quiet)
 					applog(LOG_BLUE, "%s send a new %s block %d", short_url, algo_names[opt_algo],
@ -1353,6 +1405,7 @@ static void *stratum_thread(void *userdata)
 					applog(LOG_BLUE, "%s send job %d for block %d", short_url,
 						strtoul(stratum.job.job_id, NULL, 16), stratum.bloc_height);
 			}
+			pthread_mutex_unlock(&g_work_lock);
 		}
 		
 		if (!stratum_socket_full(&stratum, 120)) {
@ -1374,7 +1427,7 @@ out:
 	return NULL;
 }

-#define PROGRAM_VERSION "1.4.1"
+#define PROGRAM_VERSION "1.4.2"
 static void show_version_and_exit(void)
 {
 	printf("%s v%s\n"
@ -1438,12 +1491,11 @@ static void parse_arg (int key, char *arg)
 	case 'C':
 		use_colors = true;
 		break;
-	case 'q':
-		opt_quiet = true;
-		break;
 	case 'D':
 		opt_debug = true;
-		opt_debug_rpc = true;
+		break;
+	case 'q':
+		opt_quiet = true;
 		break;
 	case 'p':
 		free(rpc_pass);
--- a/crc32.c
+++ b/crc32.c
@ -0,0 +1,119 @@
+/*-
+ *  COPYRIGHT (C) 1986 Gary S. Brown.  You may use this program, or
+ *  code or tables extracted from it, as desired without restriction.
+ *
+ *  First, the polynomial itself and its table of feedback terms.  The
+ *  polynomial is
+ *  X^32+X^26+X^23+X^22+X^16+X^12+X^11+X^10+X^8+X^7+X^5+X^4+X^2+X^1+X^0
+ *
+ *  Note that we take it "backwards" and put the highest-order term in
+ *  the lowest-order bit.  The X^32 term is "implied"; the LSB is the
+ *  X^31 term, etc.  The X^0 term (usually shown as "+1") results in
+ *  the MSB being 1
+ *
+ *  Note that the usual hardware shift register implementation, which
+ *  is what we're using (we're merely optimizing it by doing eight-bit
+ *  chunks at a time) shifts bits into the lowest-order term.  In our
+ *  implementation, that means shifting towards the right.  Why do we
+ *  do it this way?  Because the calculated CRC must be transmitted in
+ *  order from highest-order term to lowest-order term.  UARTs transmit
+ *  characters in order from LSB to MSB.  By storing the CRC this way
+ *  we hand it to the UART in the order low-byte to high-byte; the UART
+ *  sends each low-bit to hight-bit; and the result is transmission bit
+ *  by bit from highest- to lowest-order term without requiring any bit
+ *  shuffling on our part.  Reception works similarly
+ *
+ *  The feedback terms table consists of 256, 32-bit entries.  Notes
+ *
+ *      The table can be generated at runtime if desired; code to do so
+ *      is shown later.  It might not be obvious, but the feedback
+ *      terms simply represent the results of eight shift/xor opera
+ *      tions for all combinations of data and CRC register values
+ *
+ *      The values must be right-shifted by eight bits by the "updcrc
+ *      logic; the shift must be unsigned (bring in zeroes).  On some
+ *      hardware you could probably optimize the shift in assembler by
+ *      using byte-swap instructions
+ *      polynomial $edb88320
+ *
+ *
+ * CRC32 code derived from work by Gary S. Brown.
+ */
+
+#include <stdlib.h>
+#include <stdint.h>
+
+static uint32_t crc32_tab[] = {
+	0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
+	0xe963a535, 0x9e6495a3,	0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
+	0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2,
+	0xf3b97148, 0x84be41de,	0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
+	0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec,	0x14015c4f, 0x63066cd9,
+	0xfa0f3d63, 0x8d080df5,	0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
+	0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,	0x35b5a8fa, 0x42b2986c,
+	0xdbbbc9d6, 0xacbcf940,	0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
+	0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
+	0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
+	0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,	0x76dc4190, 0x01db7106,
+	0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
+	0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
+	0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
+	0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
+	0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
+	0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
+	0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
+	0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa,
+	0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
+	0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
+	0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
+	0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84,
+	0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
+	0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
+	0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
+	0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e,
+	0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
+	0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
+	0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
+	0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28,
+	0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
+	0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
+	0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
+	0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
+	0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
+	0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
+	0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
+	0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc,
+	0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
+	0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
+	0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
+	0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
+};
+
+/* Real CRC32 Function */
+extern uint32_t crc32(uint32_t crc, const void *buf, size_t size)
+{
+	const uint8_t *p;
+
+	p = buf;
+	crc = crc ^ ~0U;
+
+	while (size--)
+		crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8);
+
+	return crc ^ ~0U;
+}
+
+/* CRC32 Function simplified for ccminer */
+extern uint32_t crc32_u32t(const uint32_t *buf, size_t size)
+{
+	const uint8_t *p;
+	uint32_t crc = 0;
+
+	p = (uint8_t *) buf;
+	crc = crc ^ ~0U;
+
+	while (size--)
+		crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8);
+
+	return crc ^ ~0U;
+}
--- a/cuda_helper.h
+++ b/cuda_helper.h
@ -12,6 +12,8 @@

 #include <stdint.h>

+extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
+
 extern __device__ __device_builtin__ void __syncthreads(void);

 #ifndef __CUDA_ARCH__
--- a/cuda_nist5.cu
+++ b/cuda_nist5.cu
@ -5,10 +5,12 @@ extern "C"
 #include "sph/sph_skein.h"
 #include "sph/sph_jh.h"
 #include "sph/sph_keccak.h"
-#include "miner.h"
-#include "cuda_helper.h"
 }

+#include "miner.h"
+
+#include "cuda_helper.h"
+
 // aus cpu-miner.c
 extern int device_map[8];

@ -74,9 +76,6 @@ extern "C" void nist5hash(void *state, const void *input)
    memcpy(state, hash, 32);
 }

-
-extern bool opt_benchmark;
-
 extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
    const uint32_t *ptarget, uint32_t max_nonce,
    unsigned long *hashes_done)
@ -84,7 +83,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
 	const uint32_t first_nonce = pdata[19];

 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x0000ff;
+		((uint32_t*)ptarget)[7] = 0x00FF;

 	const uint32_t Htarg = ptarget[7];

--- a/miner.h
+++ b/miner.h
@ -249,6 +249,10 @@ extern int scanhash_nist5(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);

+extern int scanhash_pentablake(int thr_id, uint32_t *pdata,
+	const uint32_t *ptarget, uint32_t max_nonce,
+	unsigned long *hashes_done);
+
 extern int scanhash_whc(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
@ -284,8 +288,8 @@ struct work_restart {
 	char			padding[128 - sizeof(unsigned long)];
 };

+extern bool opt_benchmark;
 extern bool opt_debug;
-extern bool opt_debug_rpc;
 extern bool opt_quiet;
 extern bool opt_protocol;
 extern int opt_timeout;
@ -417,6 +421,7 @@ size_t time2str(char* buf, time_t timer);
 char* atime2str(time_t timer);

 void applog_hash(unsigned char *hash);
+void applog_compare_hash(unsigned char *hash, unsigned char *hash2);

 void print_hash_tests(void);
 void animehash(void *state, const void *input);
@ -428,6 +433,7 @@ unsigned int jackpothash(void *state, const void *input);
 void groestlhash(void *state, const void *input);
 void myriadhash(void *state, const void *input);
 void nist5hash(void *state, const void *input);
+void pentablakehash(void *output, const void *input);
 void quarkhash(void *state, const void *input);
 void wcoinhash(void *state, const void *input);
 void x11hash(void *output, const void *input);
--- a/pentablake.cu
+++ b/pentablake.cu
@ -0,0 +1,600 @@
+/**
+ * Penta Blake-512 Cuda Kernel (Tested on SM 5.0)
+ *
+ * Tanguy Pruvot - Aug. 2014
+ */
+
+#include "miner.h"
+
+extern "C" {
+#include "sph/sph_blake.h"
+#include <stdint.h>
+#include <memory.h>
+}
+
+/* threads per block */
+#define TPB 192
+
+/* hash by cpu with blake 256 */
+extern "C" void pentablakehash(void *output, const void *input)
+{
+	unsigned char hash[128];
+	#define hashB hash + 64
+	sph_blake512_context ctx;
+
+	sph_blake512_init(&ctx);
+	sph_blake512(&ctx, input, 80);
+	sph_blake512_close(&ctx, hash);
+
+	sph_blake512(&ctx, hash, 64);
+	sph_blake512_close(&ctx, hashB);
+
+	sph_blake512(&ctx, hashB, 64);
+	sph_blake512_close(&ctx, hash);
+
+	sph_blake512(&ctx, hash, 64);
+	sph_blake512_close(&ctx, hashB);
+
+	sph_blake512(&ctx, hashB, 64);
+	sph_blake512_close(&ctx, hash);
+
+	memcpy(output, hash, 32);
+}
+
+#include "cuda_helper.h"
+
+#define MAXU 0xffffffffU
+
+// in cpu-miner.c
+extern bool opt_n_threads;
+extern bool opt_benchmark;
+extern int device_map[8];
+
+__constant__
+static uint32_t __align__(32) c_Target[8];
+
+__constant__
+static uint64_t __align__(32) c_data[32];
+
+static uint32_t *d_hash[8];
+static uint32_t *d_resNounce[8];
+static uint32_t *h_resNounce[8];
+static uint32_t extra_results[2] = { MAXU, MAXU };
+
+/* prefer uint32_t to prevent size conversions = speed +5/10 % */
+__constant__
+static uint32_t __align__(32) c_sigma[16][16];
+const uint32_t host_sigma[16][16] = {
+	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+	{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
+	{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
+	{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
+	{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
+	{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 },
+	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+	{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
+	{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
+	{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
+	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
+	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
+};
+
+__device__ __constant__
+static const uint64_t __align__(32) c_IV512[8] = {
+	0x6a09e667f3bcc908ULL,
+	0xbb67ae8584caa73bULL,
+	0x3c6ef372fe94f82bULL,
+	0xa54ff53a5f1d36f1ULL,
+	0x510e527fade682d1ULL,
+	0x9b05688c2b3e6c1fULL,
+	0x1f83d9abfb41bd6bULL,
+	0x5be0cd19137e2179ULL
+};
+
+__device__ __constant__
+const uint64_t c_u512[16] =
+{
+	0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
+	0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
+	0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL,
+	0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL,
+	0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL,
+	0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL,
+	0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL,
+	0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
+};
+
+#define G(a,b,c,d,x) { \
+	uint32_t idx1 = c_sigma[i][x]; \
+	uint32_t idx2 = c_sigma[i][x+1]; \
+	v[a] += (m[idx1] ^ c_u512[idx2]) + v[b]; \
+	v[d] = ROTR64(v[d] ^ v[a], 32); \
+	v[c] += v[d]; \
+	v[b] = ROTR64(v[b] ^ v[c], 25); \
+	v[a] += (m[idx2] ^ c_u512[idx1]) + v[b]; \
+	v[d] = ROTR64(v[d] ^ v[a], 16); \
+	v[c] += v[d]; \
+	v[b] = ROTR64(v[b] ^ v[c], 11); \
+}
+
+// Hash-Padding
+__device__ __constant__
+static const uint64_t d_constHashPadding[8] = {
+	0x0000000000000080ull,
+	0,
+	0,
+	0,
+	0,
+	0x0100000000000000ull,
+	0,
+	0x0002000000000000ull
+};
+
+#if 0
+
+__device__ __constant__
+static const uint64_t __align__(32) c_Padding[16] = {
+	0, 0, 0, 0,
+	0x80000000ULL, 0, 0, 0,
+	0, 0, 0, 0,
+	0, 1, 0, 640,
+};
+
+__device__ static
+void pentablake_compress(uint64_t *h, const uint64_t *block, const uint32_t T0)
+{
+	uint64_t v[16], m[16];
+
+	m[0] = block[0];
+	m[1] = block[1];
+	m[2] = block[2];
+	m[3] = block[3];
+
+	for (uint32_t i = 4; i < 16; i++) {
+		m[i] = (T0 == 0x200) ? block[i] : c_Padding[i];
+	}
+
+	//#pragma unroll 8
+	for(uint32_t i = 0; i < 8; i++)
+		v[i] = h[i];
+
+	v[ 8] = c_u512[0];
+	v[ 9] = c_u512[1];
+	v[10] = c_u512[2];
+	v[11] = c_u512[3];
+
+	v[12] = xor1(c_u512[4], T0);
+	v[13] = xor1(c_u512[5], T0);
+	v[14] = c_u512[6];
+	v[15] = c_u512[7];
+
+	for (uint32_t i = 0; i < 16; i++) {
+		/* column step */
+		G(0, 4, 0x8, 0xC, 0x0);
+		G(1, 5, 0x9, 0xD, 0x2);
+		G(2, 6, 0xA, 0xE, 0x4);
+		G(3, 7, 0xB, 0xF, 0x6);
+		/* diagonal step */
+		G(0, 5, 0xA, 0xF, 0x8);
+		G(1, 6, 0xB, 0xC, 0xA);
+		G(2, 7, 0x8, 0xD, 0xC);
+		G(3, 4, 0x9, 0xE, 0xE);
+	}
+
+	//#pragma unroll 16
+	for (uint32_t i = 0; i < 16; i++) {
+		uint32_t j = i % 8;
+		h[j] ^= v[i];
+	}
+}
+
+__global__
+void pentablake_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce)
+{
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		const uint32_t nounce = startNounce + thread;
+		uint64_t h[8];
+
+		#pragma unroll
+		for(int i=0; i<8; i++) {
+			h[i] = c_IV512[i];
+		}
+
+		uint64_t ending[4];
+		ending[0] = c_data[16];
+		ending[1] = c_data[17];
+		ending[2] = c_data[18];
+		ending[3] = nounce; /* our tested value */
+
+		pentablake_compress(h, ending, 640);
+
+		// -----------------------------------
+
+		for (int r = 0; r < 4; r++) {
+			uint64_t data[8];
+			for (int i = 0; i < 7; i++) {
+				data[i] = h[i];
+			}
+			pentablake_compress(h, data, 512); /* todo: use h,h when ok*/
+		}
+	}
+}
+#endif
+
+__device__ static
+void pentablake_compress(uint64_t *h, const uint64_t *block, const uint64_t T0)
+{
+	uint64_t v[16], m[16], i;
+
+	#pragma unroll 16
+	for(i = 0; i < 16; i++) {
+		m[i] = cuda_swab64(block[i]);
+	}
+
+	#pragma unroll 8
+	for (i = 0; i < 8; i++)
+		v[i] = h[i];
+
+	v[ 8] = c_u512[0];
+	v[ 9] = c_u512[1];
+	v[10] = c_u512[2];
+	v[11] = c_u512[3];
+	v[12] = c_u512[4] ^ T0;
+	v[13] = c_u512[5] ^ T0;
+	v[14] = c_u512[6];
+	v[15] = c_u512[7];
+
+	//#pragma unroll 16
+	for( i = 0; i < 16; i++)
+	{
+		/* column step */
+		G(0, 4, 0x8, 0xC, 0x0);
+		G(1, 5, 0x9, 0xD, 0x2);
+		G(2, 6, 0xA, 0xE, 0x4);
+		G(3, 7, 0xB, 0xF, 0x6);
+		/* diagonal step */
+		G(0, 5, 0xA, 0xF, 0x8);
+		G(1, 6, 0xB, 0xC, 0xA);
+		G(2, 7, 0x8, 0xD, 0xC);
+		G(3, 4, 0x9, 0xE, 0xE);
+	}
+
+	//#pragma unroll 16
+	for (i = 0; i < 16; i++) {
+		uint32_t idx = i % 8;
+		h[idx] ^= v[i];
+	}
+}
+
+__global__
+void pentablake_gpu_hash_80(int threads, const uint32_t startNounce, void *outputHash)
+{
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint64_t h[8];
+		uint64_t buf[16];
+		uint32_t nounce = startNounce + thread;
+
+		//#pragma unroll 8
+		for(int i=0; i<8; i++)
+			h[i] = c_IV512[i];
+
+		//#pragma unroll 16
+		for (int i=0; i < 16; i++)
+			buf[i] = c_data[i];
+
+		// The test Nonce
+		((uint32_t*)buf)[19] = cuda_swab32(nounce);
+
+		pentablake_compress(h, buf, 640ULL);
+
+#if __CUDA_ARCH__ < 300
+		uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
+		#pragma unroll 8
+		for (uint32_t i=0; i < 8; i++) {
+			outHash[2*i]   = cuda_swab32( _HIWORD(h[i]) );
+			outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
+		}
+#else
+		uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
+		for (uint32_t i=0; i < 8; i++) {
+			outHash[i] = cuda_swab64( h[i] );
+		}
+#endif
+
+	}
+}
+
+__host__
+void pentablake_cpu_hash_80(int thr_id, int threads, const uint32_t startNounce, uint32_t *d_outputHash, int order)
+{
+	const int threadsperblock = TPB;
+
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+	size_t shared_size = 0;
+
+	pentablake_gpu_hash_80 <<<grid, block, shared_size>>> (threads, startNounce, d_outputHash);
+
+	//MyStreamSynchronize(NULL, order, thr_id);
+	cudaDeviceSynchronize();
+}
+
+
+__global__
+void pentablake_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash)
+{
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+
+	if (thread < threads)
+	{
+		uint64_t *inpHash = &g_hash[thread<<3]; // hashPosition * 8
+		uint64_t buf[16]; // 128 Bytes
+		uint64_t h[8]; // State
+
+		#pragma unroll 8
+		for (int i=0; i<8; i++)
+			h[i] = c_IV512[i];
+
+		// Message for first round
+		#pragma unroll 8
+		for (int i=0; i < 8; ++i)
+			buf[i] = inpHash[i];
+
+		#pragma unroll 8
+		for (int i=0; i < 8; i++)
+			buf[i+8] = d_constHashPadding[i];
+
+		// Ending round
+		pentablake_compress(h, buf, 512);
+
+#if __CUDA_ARCH__ < 300
+		uint32_t *outHash = (uint32_t*)&g_hash[thread<<3];
+		#pragma unroll 8
+		for (int i=0; i < 8; i++) {
+			outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
+			outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
+		}
+#else
+		uint64_t *outHash = &g_hash[thread<<3];
+		for (int i=0; i < 8; i++) {
+			outHash[i] = cuda_swab64(h[i]);
+		}
+#endif
+	}
+}
+
+__host__
+void pentablake_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
+{
+	const int threadsperblock = TPB;
+
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+	size_t shared_size = 0;
+
+	pentablake_gpu_hash_64 <<<grid, block, shared_size>>> (threads, startNounce, (uint64_t*)d_outputHash);
+
+	//MyStreamSynchronize(NULL, order, thr_id);
+	cudaDeviceSynchronize();
+}
+
+#if 0
+
+__host__
+uint32_t pentablake_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce)
+{
+	const int threadsperblock = TPB;
+	uint32_t result = MAXU;
+
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+	size_t shared_size = 0;
+
+	/* Check error on Ctrl+C or kill to prevent segfaults on exit */
+	if (cudaMemset(d_resNounce[thr_id], 0xff, 2*sizeof(uint32_t)) != cudaSuccess)
+		return result;
+
+	pentablake_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_resNounce[thr_id]);
+	cudaDeviceSynchronize();
+	if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
+		cudaThreadSynchronize();
+		result = h_resNounce[thr_id][0];
+		extra_results[0] = h_resNounce[thr_id][1];
+	}
+	return result;
+}
+#endif
+
+__global__
+void pentablake_gpu_check_hash(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *resNounce)
+{
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint32_t nounce = startNounce + thread;
+		uint32_t *inpHash = &g_hash[thread<<4];
+		uint32_t h[8];
+
+		#pragma unroll 8
+		for (int i=0; i < 8; i++)
+			h[i] = inpHash[i];
+
+		for (int i = 7; i >= 0; i--) {
+			uint32_t hash = h[i]; // cuda_swab32(h[i]);
+			if (hash > c_Target[i]) {
+				return;
+			}
+			if (hash < c_Target[i]) {
+				break;
+			}
+		}
+
+		/* keep the smallest nounce, + extra one if found */
+		if (resNounce[0] > nounce) {
+			resNounce[1] = resNounce[0];
+			resNounce[0] = nounce;
+		}
+		else
+			resNounce[1] = nounce;
+	}
+}
+
+__host__ static
+uint32_t pentablake_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, int order)
+{
+	const int threadsperblock = TPB;
+	uint32_t result = MAXU;
+
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+	size_t shared_size = 0;
+
+	/* Check error on Ctrl+C or kill to prevent segfaults on exit */
+	if (cudaMemset(d_resNounce[thr_id], 0xff, 2*sizeof(uint32_t)) != cudaSuccess)
+		return result;
+
+	pentablake_gpu_check_hash <<<grid, block, shared_size>>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]);
+
+	CUDA_SAFE_CALL(cudaDeviceSynchronize());
+	if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
+		cudaThreadSynchronize();
+		result = h_resNounce[thr_id][0];
+		extra_results[0] = h_resNounce[thr_id][1];
+	}
+	return result;
+}
+
+
+__host__
+void pentablake_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget)
+{
+	uint8_t data[128];
+	memcpy((void*) data, (void*) pdata, 80);
+	memset(data+80, 0, 48);
+
+	// to swab...
+	data[80] = 0x80;
+	data[111] = 1;
+	data[126] = 0x02;
+	data[127] = 0x80;
+
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sigma, host_sigma, sizeof(host_sigma), 0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Target, ptarget, 32, 0, cudaMemcpyHostToDevice));
+}
+
+extern "C" int scanhash_pentablake(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
+	uint32_t max_nonce, unsigned long *hashes_done)
+{
+	const uint32_t first_nonce = pdata[19];
+	static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+	uint32_t throughput = min(128 * 2560, max_nonce - first_nonce);
+	uint32_t endiandata[20];
+	int rc = 0;
+
+	if (extra_results[0] != MAXU) {
+		// possible extra result found in previous call
+		if (first_nonce <= extra_results[0] && max_nonce >= extra_results[0]) {
+			pdata[19] = extra_results[0];
+			*hashes_done = pdata[19] - first_nonce + 1;
+			extra_results[0] = MAXU;
+			rc = 1;
+			goto exit_scan;
+		}
+	}
+
+	if (opt_benchmark)
+		((uint32_t*)ptarget)[7] = 0x000F;
+
+	if (!init[thr_id]) {
+		if (opt_n_threads > 1) {
+			CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
+		}
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64 * throughput));
+		CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], 2*sizeof(uint32_t)));
+		CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], 2*sizeof(uint32_t)));
+
+		init[thr_id] = true;
+	}
+
+	for (int k=0; k < 20; k++)
+		be32enc(&endiandata[k], pdata[k]);
+
+	pentablake_cpu_setBlock_80(endiandata, ptarget);
+
+	do {
+		int order = 0;
+
+		// GPU HASH
+		pentablake_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+
+		pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+
+		uint32_t foundNonce = pentablake_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+
+		if (foundNonce != MAXU)
+		{
+			uint32_t vhashcpu[8];
+			uint32_t Htarg = ptarget[7];
+
+			be32enc(&endiandata[19], foundNonce);
+
+			pentablakehash(vhashcpu, endiandata);
+
+			if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget))
+			{
+				pdata[19] = foundNonce;
+				rc = 1;
+
+				// Rare but possible if the throughput is big
+				be32enc(&endiandata[19], extra_results[0]);
+				pentablakehash(vhashcpu, endiandata);
+				if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) {
+					applog(LOG_NOTICE, "GPU found more than one result yippee!");
+					rc = 2;
+				} else {
+					extra_results[0] = MAXU;
+				}
+
+				goto exit_scan;
+			}
+			else if (vhashcpu[7] > Htarg) {
+				applog(LOG_WARNING, "GPU #%d: result for nounce %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[7], Htarg);
+			}
+			else if (vhashcpu[6] > ptarget[6]) {
+				applog(LOG_WARNING, "GPU #%d: hash[6] for nounce %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[6], ptarget[6]);
+			}
+			else {
+				applog(LOG_WARNING, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce);
+			}
+		}
+
+		pdata[19] += throughput;
+
+	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+
+exit_scan:
+	*hashes_done = pdata[19] - first_nonce + 1;
+#if 0
+	/* reset the device to allow multiple instances
+	 * could be made in cpu-miner... check later if required */
+	if (opt_n_threads == 1) {
+		CUDA_SAFE_CALL(cudaDeviceReset());
+		init[thr_id] = false;
+	}
+#endif
+
+	cudaDeviceSynchronize();
+	return rc;
+}
--- a/quark/cuda_quark_blake512.cu
+++ b/quark/cuda_quark_blake512.cu
@ -50,59 +50,60 @@ const uint64_t c_u512[16] =
  0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
 };

-#define G(a,b,c,d,e)          \
-    v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\
-    v[d] = ROTR( v[d] ^ v[a],32);        \
-    v[c] += v[d];           \
-    v[b] = ROTR( v[b] ^ v[c],25);        \
-    v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b];  \
-    v[d] = ROTR( v[d] ^ v[a],16);        \
-    v[c] += v[d];           \
-    v[b] = ROTR( v[b] ^ v[c],11);
-
+#define G(a,b,c,d,x) { \
+	uint32_t idx1 = sigma[i][x]; \
+	uint32_t idx2 = sigma[i][x+1]; \
+	v[a] += (m[idx1] ^ u512[idx2]) + v[b]; \
+	v[d] = ROTR( v[d] ^ v[a], 32); \
+	v[c] += v[d]; \
+	v[b] = ROTR( v[b] ^ v[c], 25); \
+	v[a] += (m[idx2] ^ u512[idx1]) + v[b]; \
+	v[d] = ROTR( v[d] ^ v[a], 16); \
+	v[c] += v[d]; \
+	v[b] = ROTR( v[b] ^ v[c], 11); \
+}

 __device__ static
-void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits )
+void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int T0)
 {
    uint64_t v[16], m[16], i;

-#pragma unroll 16
-    for( i = 0; i < 16; ++i ) {
-        m[i] = cuda_swab64(block[i]);
-    }
+	#pragma unroll 16
+	for( i = 0; i < 16; i++) {
+		m[i] = cuda_swab64(block[i]);
+	}

-#pragma unroll 8
-    for( i = 0; i < 8; ++i )  v[i] = h[i];
+	#pragma unroll 8
+	for (i = 0; i < 8; i++)
+		v[i] = h[i];

-    v[ 8] = u512[0];
-    v[ 9] = u512[1];
-    v[10] = u512[2];
-    v[11] = u512[3];
-    v[12] = u512[4];
-    v[13] = u512[5];
-    v[14] = u512[6];
-    v[15] = u512[7];
+	v[ 8] = u512[0];
+	v[ 9] = u512[1];
+	v[10] = u512[2];
+	v[11] = u512[3];
+	v[12] = u512[4] ^ T0;
+	v[13] = u512[5] ^ T0;
+	v[14] = u512[6];
+	v[15] = u512[7];

-    v[12] ^= bits;
-    v[13] ^= bits;
+	//#pragma unroll 16
+	for( i = 0; i < 16; ++i )
+	{
+		/* column step */
+		G( 0, 4, 8, 12, 0 );
+		G( 1, 5, 9, 13, 2 );
+		G( 2, 6, 10, 14, 4 );
+		G( 3, 7, 11, 15, 6 );
+		/* diagonal step */
+		G( 0, 5, 10, 15, 8 );
+		G( 1, 6, 11, 12, 10 );
+		G( 2, 7, 8, 13, 12 );
+		G( 3, 4, 9, 14, 14 );
+	}

-//#pragma unroll 16
-    for( i = 0; i < 16; ++i )
-    {
-        /* column step */
-        G( 0, 4, 8, 12, 0 );
-        G( 1, 5, 9, 13, 2 );
-        G( 2, 6, 10, 14, 4 );
-        G( 3, 7, 11, 15, 6 );
-        /* diagonal step */
-        G( 0, 5, 10, 15, 8 );
-        G( 1, 6, 11, 12, 10 );
-        G( 2, 7, 8, 13, 12 );
-        G( 3, 4, 9, 14, 14 );
-    }
-
-#pragma unroll 16
-    for( i = 0; i < 16; ++i )  h[i % 8] ^= v[i];
+	#pragma unroll 16
+	for( i = 0; i < 16; ++i )
+		h[i % 8] ^= v[i];
 }

 __device__ __constant__
@ -114,7 +115,8 @@ static const uint64_t d_constMem[8] = {
 	0x510e527fade682d1ULL,
 	0x9b05688c2b3e6c1fULL,
 	0x1f83d9abfb41bd6bULL,
-	0x5be0cd19137e2179ULL };
+	0x5be0cd19137e2179ULL
+};

 // Hash-Padding
 __device__ __constant__
@ -126,7 +128,8 @@ static const uint64_t d_constHashPadding[8] = {
 	0,
 	0x0100000000000000ull,
 	0,
-	0x0002000000000000ull };
+	0x0002000000000000ull
+};

 __global__ __launch_bounds__(256, 4)
 void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash)
@ -145,48 +148,42 @@ void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_n
 	if (thread < threads)
 #endif
 	{
-		uint8_t i;
-		// bestimme den aktuellen Zähler
 		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);

 		int hashPosition = nounce - startNounce;
 		uint64_t *inpHash = &g_hash[hashPosition<<3]; // hashPosition * 8

-		// 128 Byte für die Message
+		// 128 Bytes
 		uint64_t buf[16];

-		// State vorbereiten
+		// State
 		uint64_t h[8];
 		#pragma unroll 8
-		for (i=0;i<8;i++)
+		for (int i=0;i<8;i++)
 			h[i] = d_constMem[i];

-		// Message für die erste Runde in Register holen
+		// Message for first round
 		#pragma unroll 8
-		for (i=0; i < 8; ++i)
+		for (int i=0; i < 8; ++i)
 			buf[i] = inpHash[i];

 		#pragma unroll 8
-		for (i=0; i < 8; i++)
+		for (int i=0; i < 8; i++)
 			buf[i+8] = d_constHashPadding[i];

-		// die einzige Hashing-Runde
+		// Ending round
 		quark_blake512_compress( h, buf, c_sigma, c_u512, 512 );

-#if __CUDA_ARCH__ >= 130
-		// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind
+#if __CUDA_ARCH__ <= 350
 		uint32_t *outHash = (uint32_t*)&g_hash[8 * hashPosition];
 		#pragma unroll 8
-		for (i=0; i < 8; ++i) {
+		for (int i=0; i < 8; i++) {
 			outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
 			outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
 		}
 #else
-		// in dieser Version passieren auch ein paar 64 Bit Shifts
 		uint64_t *outHash = &g_hash[8 * hashPosition];
-		#pragma unroll 8
-		for (i=0; i < 8; ++i)
-		{
+		for (int i=0; i < 8; i++) {
 			outHash[i] = cuda_swab64(h[i]);
 		}
 #endif
@ -198,45 +195,38 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		// State vorbereiten
 		uint64_t h[8];
-		// 128 Byte für die Message
 		uint64_t buf[16];
-		uint8_t i;
-		// bestimme den aktuellen Zähler
 		uint32_t nounce = startNounce + thread;

 		#pragma unroll 8
-		for(i=0;i<8;i++)
+		for(int i=0; i<8; i++)
 			h[i] = d_constMem[i];

 		// Message für die erste Runde in Register holen
 		#pragma unroll 16
-		for (i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i];
+		for (int i=0; i < 16; ++i)
+			buf[i] = c_PaddedMessage80[i];

-		// die Nounce durch die thread-spezifische ersetzen
-		buf[9] = REPLACE_HIWORD(buf[9], cuda_swab32(nounce));
+		// The test Nonce
+		((uint32_t*)buf)[19] = cuda_swab32(nounce);

-		// die einzige Hashing-Runde
 		quark_blake512_compress( h, buf, c_sigma, c_u512, 640 );

-		// Hash rauslassen
-#if __CUDA_ARCH__ >= 130
-		// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind
+#if __CUDA_ARCH__ <= 350
 		uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
 		#pragma unroll 8
-		for (i=0; i < 8; ++i) {
-			outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
+		for (uint32_t i=0; i < 8; i++) {
+			outHash[2*i]   = cuda_swab32( _HIWORD(h[i]) );
 			outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
 		}
 #else
-		// in dieser Version passieren auch ein paar 64 Bit Shifts
 		uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
-		#pragma unroll 8
-		for (i=0; i < 8; ++i) {
+		for (uint32_t i=0; i < 8; i++) {
 			outHash[i] = cuda_swab64( h[i] );
 		}
 #endif
+
 	}
 }

--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@ -6,12 +6,12 @@ extern "C"
 #include "sph/sph_skein.h"
 #include "sph/sph_jh.h"
 #include "sph/sph_keccak.h"
+}
+
 #include "miner.h"

 #include "cuda_helper.h"
-}

-// aus cpu-miner.c
 extern int device_map[8];

 // Speicher für Input/Output der verketteten Hashfunktionen
@ -70,76 +70,64 @@ extern "C" void quarkhash(void *state, const void *input)
    unsigned char hash[64];

    sph_blake512_init(&ctx_blake);
-    // ZBLAKE;
    sph_blake512 (&ctx_blake, input, 80);
    sph_blake512_close(&ctx_blake, (void*) hash);
    
    sph_bmw512_init(&ctx_bmw);
-    // ZBMW;
    sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
    sph_bmw512_close(&ctx_bmw, (void*) hash);

    if (hash[0] & 0x8)
    {
        sph_groestl512_init(&ctx_groestl);
-        // ZGROESTL;
        sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
        sph_groestl512_close(&ctx_groestl, (void*) hash);
    }
    else
    {
        sph_skein512_init(&ctx_skein);
-        // ZSKEIN;
        sph_skein512 (&ctx_skein, (const void*) hash, 64);
        sph_skein512_close(&ctx_skein, (void*) hash);
    }
    
    sph_groestl512_init(&ctx_groestl);
-    // ZGROESTL;
    sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
    sph_groestl512_close(&ctx_groestl, (void*) hash);

    sph_jh512_init(&ctx_jh);
-    // ZJH;
    sph_jh512 (&ctx_jh, (const void*) hash, 64);
    sph_jh512_close(&ctx_jh, (void*) hash);

    if (hash[0] & 0x8)
    {
        sph_blake512_init(&ctx_blake);
-        // ZBLAKE;
        sph_blake512 (&ctx_blake, (const void*) hash, 64);
        sph_blake512_close(&ctx_blake, (void*) hash);
    }
    else
    {
        sph_bmw512_init(&ctx_bmw);
-        // ZBMW;
        sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
        sph_bmw512_close(&ctx_bmw, (void*) hash);
    }

    sph_keccak512_init(&ctx_keccak);
-    // ZKECCAK;
    sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
    sph_keccak512_close(&ctx_keccak, (void*) hash);

    sph_skein512_init(&ctx_skein);
-    // SKEIN;
    sph_skein512 (&ctx_skein, (const void*) hash, 64);
    sph_skein512_close(&ctx_skein, (void*) hash);

    if (hash[0] & 0x8)
    {
        sph_keccak512_init(&ctx_keccak);
-        // ZKECCAK;
        sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
        sph_keccak512_close(&ctx_keccak, (void*) hash);
    }
    else
    {
        sph_jh512_init(&ctx_jh);
-        // ZJH;
        sph_jh512 (&ctx_jh, (const void*) hash, 64);
        sph_jh512_close(&ctx_jh, (void*) hash);
    }
@ -147,23 +135,17 @@ extern "C" void quarkhash(void *state, const void *input)
    memcpy(state, hash, 32);
 }

-
-extern bool opt_benchmark;
-
 extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
    const uint32_t *ptarget, uint32_t max_nonce,
    unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
+	const int throughput = 256*4096; // 100;
+	static bool init[8] = {0,0,0,0,0,0,0,0};

 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x0000ff;
+		((uint32_t*)ptarget)[7] = 0x00FF;

-	const uint32_t Htarg = ptarget[7];
-
-	const int throughput = 256*4096; // 100;
-
-	static bool init[8] = {0,0,0,0,0,0,0,0};
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
@ -252,11 +234,12 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
 		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
+			const uint32_t Htarg = ptarget[7];
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
 			quarkhash(vhash64, endiandata);

-			if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {
+			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {

 				pdata[19] = foundNonce;
 				*hashes_done = (foundNonce - first_nonce + 1)/2;
--- a/util.c
+++ b/util.c
@ -1020,7 +1020,7 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
 	int merkle_count, i;
 	json_t *merkle_arr;
 	unsigned char **merkle;
-	int ntime;
+	int ntime, hoffset;

 	job_id = json_string_value(json_array_get(params, 0));
 	prevhash = json_string_value(json_array_get(params, 1));
@ -1078,7 +1078,8 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
 	hex2bin(sctx->job.coinbase, coinb1, coinb1_size);
 	memcpy(sctx->job.coinbase + coinb1_size, sctx->xnonce1, sctx->xnonce1_size);

-	sctx->bloc_height = le16dec((uint8_t*) sctx->job.coinbase + 43);
+	hoffset = coinb1_size - 15; // 43;
+	sctx->bloc_height = le16dec((uint8_t*) sctx->job.coinbase + hoffset);
 	if (!sctx->job.job_id || strcmp(sctx->job.job_id, job_id))
 		memset(sctx->job.xnonce2, 0, sctx->xnonce2_size);
 	hex2bin(sctx->job.xnonce2 + sctx->xnonce2_size, coinb2, coinb2_size);
@ -1125,7 +1126,7 @@ static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params)
 	sctx->next_diff = diff;
 	pthread_mutex_unlock(&sctx->work_lock);

-	applog(LOG_INFO, "Stratum difficulty set to %g", diff);
+	applog(LOG_WARNING, "Stratum difficulty set to %g", diff);

 	return true;
 }
@ -1221,10 +1222,6 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s)
 	id = json_object_get(val, "id");
 	params = json_object_get(val, "params");

-	if (opt_debug_rpc) {
-		applog(LOG_DEBUG, "method: %s", s);
-	}
-
 	if (!strcasecmp(method, "mining.notify")) {
 		ret = stratum_notify(sctx, params);
 		goto out;
@ -1400,6 +1397,20 @@ static char* format_hash(char* buf, unsigned char *hash)
 	return buf;
 }

+/* to debug diff in data */
+extern void applog_compare_hash(unsigned char *hash, unsigned char *hash2)
+{
+	char s[256] = "";
+	int len = 0;
+	for (int i=0; i < 32; i += 4) {
+		char *color = memcmp(hash+i, hash2+i, 4) ? CL_RED : CL_GRY;
+		len += sprintf(s+len, "%s%02x%02x%02x%02x " CL_GRY, color,
+			hash[i], hash[i+1], hash[i+2], hash[i+3]);
+		s[len] = '\0';
+	}
+	applog(LOG_DEBUG, "%s", s);
+}
+
 extern void applog_hash(unsigned char *hash)
 {
 	char s[128] = {'\0'};
@ -1457,6 +1468,10 @@ void print_hash_tests(void)
 	nist5hash(&hash[0], &buf[0]);
 	printpfx("nist5", hash);

+	memset(hash, 0, sizeof hash);
+	pentablakehash(&hash[0], &buf[0]);
+	printpfx("pentablake", hash);
+
 	memset(hash, 0, sizeof hash);
 	quarkhash(&hash[0], &buf[0]);
 	printpfx("quark", hash);
--- a/x11/x11.cu
+++ b/x11/x11.cu
@ -21,10 +21,9 @@ extern "C"
 #include <memory.h>
 }

-// aus cpu-miner.c
+// in cpu-miner.c
 extern int device_map[8];

-// Speicher für Input/Output der verketteten Hashfunktionen
 static uint32_t *d_hash[8];

 extern void quark_blake512_cpu_init(int thr_id, int threads);
@ -140,22 +139,17 @@ extern "C" void x11hash(void *output, const void *input)
 }


-extern bool opt_benchmark;
-
 extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
    const uint32_t *ptarget, uint32_t max_nonce,
    unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
+	const int throughput = 256*256*8;
+	static bool init[8] = {0,0,0,0,0,0,0,0};

 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;

-	const uint32_t Htarg = ptarget[7];
-
-	const int throughput = 256*256*8;
-
-	static bool init[8] = {0,0,0,0,0,0,0,0};
 	if (!init[thr_id])
 	{
 		CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
@ -186,8 +180,10 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 	cuda_check_cpu_setTarget(ptarget);

 	do {
-		uint32_t foundNonce;
+		const uint32_t Htarg = ptarget[7];
+
 		int order = 0;
+		uint32_t foundNonce;

 		// Hash with CUDA
 		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
@ -204,7 +200,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,

 		// Scan nach Gewinner Hashes auf der GPU
 		foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		if  (foundNonce != 0xffffffff)
+		if (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
--- a/x15/x14.cu
+++ b/x15/x14.cu
@ -20,11 +20,11 @@ extern "C" {
 #include "sph/sph_hamsi.h"
 #include "sph/sph_fugue.h"
 #include "sph/sph_shabal.h"
+}

 #include "miner.h"

 #include "cuda_helper.h"
-}

 // from cpu-miner.c
 extern int device_map[8];
@ -167,8 +167,6 @@ extern "C" void x14hash(void *output, const void *input)
 }


-extern bool opt_benchmark;
-
 extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done)
--- a/x15/x15.cu
+++ b/x15/x15.cu
@ -21,14 +21,11 @@ extern "C" {
 #include "sph/sph_fugue.h"
 #include "sph/sph_shabal.h"
 #include "sph/sph_whirlpool.h"
+}

 #include "miner.h"

 #include "cuda_helper.h"
-}
-
-// to test gpu hash on a null buffer
-#define NULLTEST 0

 // from cpu-miner.c
 extern int device_map[8];
@ -92,8 +89,6 @@ extern void quark_compactTest_cpu_init(int thr_id, int threads);
 extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
 											uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order);

-extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-
 // X15 CPU Hash function
 extern "C" void x15hash(void *output, const void *input)
 {
@ -181,17 +176,6 @@ extern "C" void x15hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }

-#if NULLTEST
-static void print_hash(unsigned char *hash)
-{
-	for (int i=0; i < 32; i += 4) {
-		printf("%02x%02x%02x%02x ", hash[i], hash[i+1], hash[i+2], hash[i+3]);
-	}
-}
-#endif
-
-extern bool opt_benchmark;
-
 extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done)
@ -203,12 +187,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 	uint32_t Htarg = ptarget[7];

 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = Htarg = 0x0000ff;
-
-#if NULLTEST
-	for (int k=0; k < 20; k++)
-		pdata[k] = 0;
-#endif
+		((uint32_t*)ptarget)[7] = Htarg = 0x00FF;

 	if (!init[thr_id])
 	{
@ -259,12 +238,6 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 		x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

-#if NULLTEST
-		uint32_t buf[8]; memset(buf, 0, sizeof buf);
-		CUDA_SAFE_CALL(cudaMemcpy(buf, d_hash[thr_id], sizeof buf, cudaMemcpyDeviceToHost));
-		CUDA_SAFE_CALL(cudaThreadSynchronize());
-		print_hash((unsigned char*)buf); printf("\n");
-#endif
 		/* Scan with GPU */
 		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

--- a/x17/x17.cu
+++ b/x17/x17.cu
@ -26,17 +26,15 @@ extern "C"

 #include "sph/sph_sha2.h"
 #include "sph/sph_haval.h"
+}

 #include "miner.h"
-}
+#include "cuda_helper.h"

 static uint32_t *d_hash[8];

-
-// cpu-miner.c
+// in cpu-miner.c
 extern int device_map[8];
-extern bool opt_benchmark;
-

 extern void quark_blake512_cpu_init(int thr_id, int threads);
 extern void quark_blake512_cpu_setBlock_80(void *pdata);
@ -204,20 +202,12 @@ extern "C" int scanhash_x17(int thr_id, uint32_t *pdata,
 	unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
-
-	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x0000ff;
-
 	const int throughput = 256*256*8;
-
-		if (opt_benchmark)
-				((uint32_t*)ptarget)[7] = 0x0000ff;
-
 	static bool init[8] = {0,0,0,0,0,0,0,0};
 	uint32_t Htarg = ptarget[7];

 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = Htarg = 0x0000ff;
+		((uint32_t*)ptarget)[7] = Htarg = 0x00FF;

 	if (!init[thr_id])
 	{