Add zr5 algo (for SM 3.5+)

uint4 copy + keccak cleanup, groestl: small uint4 opt Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
10 years ago · a37e909db9
12 changed files with 1110 additions and 527 deletions
--- a/JHA/cuda_jha_keccak512.cu
+++ b/JHA/cuda_jha_keccak512.cu
--- a/Makefile.am
+++ b/Makefile.am
@ -41,7 +41,7 @@ ccminer_SOURCES	= elist.h miner.h compat.h \
 			  quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu \
 			  quark/quarkcoin.cu quark/animecoin.cu \
 			  quark/cuda_quark_compactionTest.cu  \
-			  cuda_nist5.cu pentablake.cu \
+			  cuda_nist5.cu pentablake.cu zr5.cu \
 			  sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \
 			  sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
 			  sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \
--- a/README.txt
+++ b/README.txt
@ -1,5 +1,5 @@
-ccMiner release 1.6.0-tpruvot (Mar 2015) - "Pluck & Whirlpoolx"
+ccMiner release 1.6.0-tpruvot (Mar 2015) - "ZR5, Pluck & WhirlX"
 ---------------------------------------------------------------
 ***************************************************************
@ -37,8 +37,9 @@ BlakeCoin (256 8-rounds)
 Keccak (Maxcoin)
 Deep, Doom and Qubit
 Pentablake (Blake 512 x5)
-S3 (OneCoin)
+1Coin Triple S
-Lyra2RE (new VertCoin algo)
+Vertcoin Lyra2RE
 Ziftrcoin (ZR5)
 where some of these coins have a VERY NOTABLE nVidia advantage
 over competing AMD (OpenCL Only) implementations.
@ -84,6 +85,7 @@ its command line interface and options.
                          x14         use to mine X14Coin
                          x15         use to mine Halcyon
                          x17         use to mine X17
                          zr5         use to mine ZiftrCoin
  -d, --devices         gives a comma separated list of CUDA device IDs
                        to operate on. Device IDs start counting from 0!
@ -183,12 +185,12 @@ features.
 >>> RELEASE HISTORY <<<
-  Mar. 2015       v1.6.0 (Note for CryptoMiningBlog: NOT YET RELEASED/FINISHED!)
+  Mar. 27th 2015  v1.6.0
                  Add the ZR5 Algo for Ziftrcoin
                  Import pluck (djm34) and whirlpoolx (alexis78) algos
                  Hashrate units based on hashing rate values (Hs/kHs/MHs/GHs)
                  Default config file (also help to debug without command line)
                  Various small fixes
                  More to come soon...
  Feb. 11th 2015  v1.5.3
                  Fix anime algo
--- a/ccminer.cpp
+++ b/ccminer.cpp
@ -108,6 +108,7 @@ enum sha_algos {
 	ALGO_X14,
 	ALGO_X15,
 	ALGO_X17,
 	ALGO_ZR5,
 };
 static const char *algo_names[] = {
@ -140,6 +141,7 @@ static const char *algo_names[] = {
 	"x14",
 	"x15",
 	"x17",
 	"zr5",
 };
 bool opt_debug = false;
@ -166,7 +168,7 @@ static const bool opt_time = true;
 static enum sha_algos opt_algo = ALGO_X11;
 int opt_n_threads = 0;
 int opt_affinity = -1;
-int opt_priority = 3;
+int opt_priority = 0;
 static double opt_difficulty = 1; // CH
 bool opt_trust_pool = false;
 uint16_t opt_vote = 9999;
@ -193,6 +195,7 @@ int api_thr_id = -1;
 bool stratum_need_reset = false;
 struct work_restart *work_restart = NULL;
 struct stratum_ctx stratum = { 0 };
 uint32_t zr5_pok = 0;
 pthread_mutex_t applog_lock;
 static pthread_mutex_t stats_lock;
@ -254,6 +257,7 @@ Options:\n\
 			x17         X17 (peoplecurrency)\n\
 			whirl       Whirlcoin (old whirlpool)\n\
 			whirlpoolx  Vanilla coin\n\
 			zr5         ZR5 (ZiftrCoin)\n\
  -d, --devices         Comma separated list of CUDA devices to use.\n\
                        Device IDs start counting from 0! Alternatively takes\n\
                        string names of your cards like gtx780ti or gt640#2\n\
@ -472,6 +476,10 @@ static bool work_decode(const json_t *val, struct work *work)
 	int adata_sz = ARRAY_SIZE(work->data), atarget_sz = ARRAY_SIZE(work->target);
 	int i;
 	if (opt_algo == ALGO_ZR5) {
 		data_size = 80; adata_sz = 20;
 	}
 	if (unlikely(!jobj_binary(val, "data", work->data, data_size))) {
 		applog(LOG_ERR, "JSON inval data");
 		return false;
@ -564,12 +572,12 @@ static int share_result(int result, const char *reason)
 	if (reason) {
 		applog(LOG_WARNING, "reject reason: %s", reason);
-		if (strncmp(reason, "low difficulty share", 20) == 0) {
+		if (strncasecmp(reason, "low difficulty", 14) == 0) {
 			opt_difficulty = (opt_difficulty * 2.0) / 3.0;
 			applog(LOG_WARNING, "factor reduced to : %0.2f", opt_difficulty);
 			return 0;
 		}
-		if (strncmp(reason, "Duplicate share", 15) == 0 && !check_dups) {
+		if (strncasecmp(reason, "duplicate", 9) == 0 && !check_dups) {
 			applog(LOG_WARNING, "enabling duplicates check feature");
 			check_dups = true;
 		}
@ -603,7 +611,11 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 		}
 	}
-	if (stale_work) {
+	if (opt_algo == ALGO_ZR5 && !stale_work) {
 		stale_work = (memcmp(&work->data[1], &g_work.data[1], 68));
 	}
 	if (!submit_old && stale_work) {
 		if (opt_debug)
 			applog(LOG_WARNING, "stale work detected, discarding");
 		return true;
@ -616,9 +628,16 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 		uint16_t nvote;
 		char *ntimestr, *noncestr, *xnonce2str, *nvotestr;
-		le32enc(&ntime, work->data[17]);
+		switch (opt_algo) {
-		le32enc(&nonce, work->data[19]);
+		case ALGO_ZR5:
-
+			check_dups = true;
 			be32enc(&ntime, work->data[17]);
 			be32enc(&nonce, work->data[19]);
 			break;
 		default:
 			le32enc(&ntime, work->data[17]);
 			le32enc(&nonce, work->data[19]);
 		}
 		noncestr = bin2hex((const uchar*)(&nonce), 4);
 		if (check_dups)
@ -666,14 +685,21 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 	} else {
 		int data_size = sizeof(work->data);
 		int adata_sz = ARRAY_SIZE(work->data);
 		/* build hex string */
 		char *str = NULL;
 		if (opt_algo == ALGO_ZR5) {
 			data_size = 80; adata_sz = 20;
 		}
 		if (opt_algo != ALGO_HEAVY && opt_algo != ALGO_MJOLLNIR) {
-			for (int i = 0; i < ARRAY_SIZE(work->data); i++)
+			for (int i = 0; i < adata_sz; i++)
 				le32enc(work->data + i, work->data[i]);
 		}
-		str = bin2hex((uchar*)work->data, sizeof(work->data));
+		str = bin2hex((uchar*)work->data, data_size);
 		if (unlikely(!str)) {
 			applog(LOG_ERR, "submit_upstream_work OOM");
 			return false;
@ -1098,10 +1124,18 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 		work->data[9 + i] = be32dec((uint32_t *)merkle_root + i);
 	work->data[17] = le32dec(sctx->job.ntime);
 	work->data[18] = le32dec(sctx->job.nbits);
-	if (opt_algo == ALGO_MJOLLNIR || opt_algo == ALGO_HEAVY)
+
-	{
+	switch (opt_algo) {
 	case ALGO_MJOLLNIR:
 	case ALGO_HEAVY:
 		// todo: check if 19 is enough
 		for (i = 0; i < 20; i++)
 			work->data[i] = be32dec((uint32_t *)&work->data[i]);
 		break;
 	case ALGO_ZR5:
 		for (i = 0; i < 19; i++)
 			work->data[i] = be32dec((uint32_t *)&work->data[i]);
 		break;
 	}
 	work->data[20] = 0x80000000;
@ -1227,6 +1261,7 @@ static void *miner_thread(void *userdata)
 		// &work.data[19]
 		int wcmplen = 76;
 		int wcmpoft = 0;
 		uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
 		if (have_stratum) {
@ -1284,7 +1319,14 @@ static void *miner_thread(void *userdata)
 					hashlog_purge_job(work.job_id);
 			}
 		}
-		if (memcmp(work.data, g_work.data, wcmplen)) {
+
 		if (opt_algo == ALGO_ZR5) {
 			// ignore pok/version header
 			wcmpoft = 1;
 			wcmplen -= 4;
 		}
 		if (memcmp(&work.data[wcmpoft], &g_work.data[wcmpoft], wcmplen)) {
 			#if 0
 			if (opt_debug) {
 				for (int n=0; n <= (wcmplen-8); n+=8) {
@ -1497,19 +1539,24 @@ static void *miner_thread(void *userdata)
 		case ALGO_X14:
 			rc = scanhash_x14(thr_id, work.data, work.target,
-				                  max_nonce, &hashes_done);
+				              max_nonce, &hashes_done);
 			break;
 		case ALGO_X15:
 			rc = scanhash_x15(thr_id, work.data, work.target,
-				                  max_nonce, &hashes_done);
+				              max_nonce, &hashes_done);
 			break;
 		case ALGO_X17:
 			rc = scanhash_x17(thr_id, work.data, work.target,
-				                  max_nonce, &hashes_done);
+				              max_nonce, &hashes_done);
 			break;
 		case ALGO_ZR5: {
 			rc = scanhash_zr5(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
 			break;
 		}
 		default:
 			/* should never happen */
 			goto out;
@ -1606,6 +1653,11 @@ static void *miner_thread(void *userdata)
 			if (rc > 1 && work.data[21]) {
 				work.data[19] = work.data[21];
 				work.data[21] = 0;
 				if (opt_algo == ALGO_ZR5) {
 					// todo: use + 4..6 index for pok to allow multiple nonces
 					work.data[0] = work.data[22]; // pok
 					work.data[22] = 0;
 				}
 				if (!submit_work(mythr, &work))
 					break;
 			}
@ -1675,10 +1727,10 @@ start:
 			submit_old = soval ? json_is_true(soval) : false;
 			pthread_mutex_lock(&g_work_lock);
 			if (work_decode(json_object_get(val, "result"), &g_work)) {
 				restart_threads();
 				if (!opt_quiet)
 					applog(LOG_BLUE, "%s detected new block", short_url);
 				g_work_time = time(NULL);
 				restart_threads();
 			}
 			pthread_mutex_unlock(&g_work_lock);
 			json_decref(val);
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -105,8 +105,9 @@
      <CInterleavedPTX>false</CInterleavedPTX>
      <MaxRegCount>80</MaxRegCount>
      <PtxAsOptionV>true</PtxAsOptionV>
-      <Keep>false</Keep>
+      <Keep>true</Keep>
-      <CodeGeneration>compute_30,sm_30;compute_50,sm_50</CodeGeneration>
+      <CodeGeneration>compute_50,sm_50</CodeGeneration>
      <GenerateLineInfo>true</GenerateLineInfo>
    </CudaCompile>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
@ -176,8 +177,8 @@
      <CInterleavedPTX>false</CInterleavedPTX>
      <MaxRegCount>80</MaxRegCount>
      <PtxAsOptionV>true</PtxAsOptionV>
-      <Keep>false</Keep>
+      <Keep>true</Keep>
-      <CodeGeneration>compute_50,sm_50;</CodeGeneration>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_50,sm_50;compute_52,sm_52</CodeGeneration>
      <AdditionalOptions>--ptxas-options="-O2" %(AdditionalOptions)</AdditionalOptions>
      <Defines>
      </Defines>
@ -221,7 +222,7 @@
      <MaxRegCount>80</MaxRegCount>
      <PtxAsOptionV>true</PtxAsOptionV>
      <Keep>false</Keep>
-      <CodeGeneration>compute_50,sm_50</CodeGeneration>
+      <CodeGeneration>compute_50,sm_50;compute_52,sm_52</CodeGeneration>
      <Include>
      </Include>
      <TargetMachinePlatform>64</TargetMachinePlatform>
@ -355,6 +356,7 @@
    </CudaCompile>
    <CudaCompile Include="cuda_nist5.cu">
    </CudaCompile>
    <CudaCompile Include="zr5.cu" />
    <CudaCompile Include="groestl_functions_quad.cu">
      <ExcludedFromBuild>true</ExcludedFromBuild>
    </CudaCompile>
@ -509,4 +511,4 @@
  <Target Name="AfterClean">
    <Delete Files="@(FilesToCopy->'$(OutDir)%(Filename)%(Extension)')" TreatErrorsAsWarnings="true" />
  </Target>
-</Project>
+</Project>
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -379,9 +379,6 @@
    <CudaCompile Include="cuda_groestlcoin.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
    <CudaCompile Include="groestl_functions_quad.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
    <CudaCompile Include="bitslice_transformations_quad.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
@ -571,6 +568,12 @@
    <CudaCompile Include="lyra2\lyra2RE.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
    <CudaCompile Include="zr5.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
    <CudaCompile Include="groestl_functions_quad.cu">
      <Filter>Source Files\CUDA\quark</Filter>
    </CudaCompile>
  </ItemGroup>
  <ItemGroup>
    <Image Include="res\ccminer.ico">
--- a/configure.ac
+++ b/configure.ac
@ -1,4 +1,4 @@
-AC_INIT([ccminer], [1.6-git])
+AC_INIT([ccminer], [1.6])
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpuminer-config.h
+++ b/cpuminer-config.h
@ -63,7 +63,7 @@
 #define HAVE_STRING_H 1
 /* Define to 1 if you have the <syslog.h> header file. */
-#define HAVE_SYSLOG_H 1
+/* #undef HAVE_SYSLOG_H */
 /* Define to 1 if you have the <sys/endian.h> header file. */
 /* #undef HAVE_SYS_ENDIAN_H */
@ -108,7 +108,7 @@
 /* #undef LIBCURL_FEATURE_SSPI */
 /* Defined if libcurl supports DICT */
-#define LIBCURL_PROTOCOL_DICT 1
+/* #undef LIBCURL_PROTOCOL_DICT */
 /* Defined if libcurl supports FILE */
 #define LIBCURL_PROTOCOL_FILE 1
@ -123,28 +123,28 @@
 #define LIBCURL_PROTOCOL_HTTP 1
 /* Defined if libcurl supports HTTPS */
-#define LIBCURL_PROTOCOL_HTTPS 1
+/* #undef LIBCURL_PROTOCOL_HTTPS */
 /* Defined if libcurl supports IMAP */
-#define LIBCURL_PROTOCOL_IMAP 1
+/* #undef LIBCURL_PROTOCOL_IMAP */
 /* Defined if libcurl supports LDAP */
-#define LIBCURL_PROTOCOL_LDAP 1
+/* #undef LIBCURL_PROTOCOL_LDAP */
 /* Defined if libcurl supports POP3 */
-#define LIBCURL_PROTOCOL_POP3 1
+/* #undef LIBCURL_PROTOCOL_POP3 */
 /* Defined if libcurl supports RTSP */
-#define LIBCURL_PROTOCOL_RTSP 1
+/* #undef LIBCURL_PROTOCOL_RTSP */
 /* Defined if libcurl supports SMTP */
-#define LIBCURL_PROTOCOL_SMTP 1
+/* #undef LIBCURL_PROTOCOL_SMTP */
 /* Defined if libcurl supports TELNET */
-#define LIBCURL_PROTOCOL_TELNET 1
+/* #undef LIBCURL_PROTOCOL_TELNET */
 /* Defined if libcurl supports TFTP */
-#define LIBCURL_PROTOCOL_TFTP 1
+/* #undef LIBCURL_PROTOCOL_TFTP */
 /* Define to 1 if your C compiler doesn't accept -c and -o together. */
 /* #undef NO_MINUS_C_MINUS_O */
@ -159,16 +159,16 @@
 #define PACKAGE_NAME "ccminer"
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "ccminer 1.6-git"
+#define PACKAGE_STRING "ccminer 1.6"
 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "ccminer"
 /* Define to the home page for this package. */
-#define PACKAGE_URL ""
+#define PACKAGE_URL "http://github.com/tpruvot/ccminer"
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "1.6-git"
+#define PACKAGE_VERSION "1.6"
 /* If using the C implementation of alloca, define if you know the
   direction of stack growth for your system; otherwise it will be
@ -191,10 +191,10 @@
 #define USE_XOP 1
 /* Version number of package */
-#define VERSION "1.6-git"
+#define VERSION "1.6"
 /* Define curl_free() as free() if our version of curl lacks curl_free. */
 /* #undef curl_free */
 /* Define to `unsigned int' if <sys/types.h> does not define. */
-/* #undef size_t */
+//#define size_t unsigned int
--- a/miner.h
+++ b/miner.h
@ -378,6 +378,10 @@ extern int scanhash_whirlpoolx(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
 extern int scanhash_zr5(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
 /* api related */
 void *api_thread(void *userdata);
 void api_set_throughput(int thr_id, uint32_t throughput);
@ -679,6 +683,7 @@ void x13hash(void *output, const void *input);
 void x14hash(void *output, const void *input);
 void x15hash(void *output, const void *input);
 void x17hash(void *output, const void *input);
 void zr5hash(void *output, const void *input);
 #ifdef __cplusplus
 }
--- a/quark/cuda_quark_groestl512.cu
+++ b/quark/cuda_quark_groestl512.cu
@ -52,11 +52,22 @@ void quark_groestl512_gpu_hash_64_quad(uint32_t threads, uint32_t startNounce, u
        uint32_t hash[16];
        from_bitslice_quad(state, hash);
-        if (thr == 0)
+
-        {
+        // uint4 = 4x4 uint32_t = 16 bytes
        if (thr == 0) {
            uint4 *phash = (uint4*) hash;
            uint4 *outpt = (uint4*) outpHash; /* var kept for hash align */
            outpt[0] = phash[0];
            outpt[1] = phash[1];
            outpt[2] = phash[2];
            outpt[3] = phash[3];
        }
 /*
        if (thr == 0) {
            #pragma unroll
            for(int k=0;k<16;k++) outpHash[k] = hash[k];
        }
 */
    }
 #endif
 }
--- a/util.cpp
+++ b/util.cpp
@ -1660,12 +1660,35 @@ extern void applog_hash(uchar *hash)
 #define printpfx(n,h) \
 	printf("%s%11s%s: %s\n", CL_GRN, n, CL_N, format_hash(s, h))
 /* 20-word (80-byte) sample header, every word stored through swab32().
  * NOTE(review): presumably a real ZiftrCoin block header kept as a ZR5 test
  * vector - confirm the origin. In the visible hunks only zrtest[19] is read
  * (as the max_nonce bound in do_gpu_tests); the memcpy() calls that would
  * load the whole table are commented out. */
 static uint32_t zrtest[20] = {
 	swab32(0x01806486),
 	swab32(0x00000000),
 	swab32(0x00000000),
 	swab32(0x00000000),
 	swab32(0x00000000),
 	swab32(0x00000000),
 	swab32(0x00000000),
 	swab32(0x00000000),
 	swab32(0x00000000),
 	swab32(0x2ab03251),
 	swab32(0x87d4f28b),
 	swab32(0x6e22f086),
 	swab32(0x4845ddd5),
 	swab32(0x0ac4e6aa),
 	swab32(0x22a1709f),
 	swab32(0xfb4275d9),
 	swab32(0x25f26636),
 	swab32(0x300eed54),
 	swab32(0xffff0f1e),
 	swab32(0x2a9e2300),
 };
 void do_gpu_tests(void)
 {
 #ifdef _DEBUG
 	unsigned long done;
 	char s[128] = { '\0' };
-	uchar buf[128];
+	uchar buf[160];
 	uint32_t tgt[8] = { 0 };
 	opt_tracegpu = true;
@ -1674,11 +1697,15 @@ void do_gpu_tests(void)
 	tgt[7] = 0xffff;
 	memset(buf, 0, sizeof buf);
-	scanhash_x11(0, (uint32_t*)buf, tgt, 1, &done);
+	//memcpy(buf, zrtest, 80);
 	scanhash_zr5(0, (uint32_t*)buf, tgt, zrtest[19]+1, &done);
 	//memset(buf, 0, sizeof buf);
 	//scanhash_x11(0, (uint32_t*)buf, tgt, 1, &done);
 	memset(buf, 0, sizeof buf);
 	// buf[0] = 1; buf[64] = 2; // for endian tests
-	scanhash_blake256(0, (uint32_t*)buf, tgt, 1, &done, 14);
+	//scanhash_blake256(0, (uint32_t*)buf, tgt, 1, &done, 14);
 	//memset(buf, 0, sizeof buf);
 	//scanhash_heavy(0, (uint32_t*)buf, tgt, 1, &done, 1, 84); // HEAVYCOIN_BLKHDR_SZ=84
@ -1688,6 +1715,7 @@ void do_gpu_tests(void)
 	opt_tracegpu = false;
 #endif
 }
 extern "C" void zr5hash_pok(void *output, uint32_t *pdata);
 void print_hash_tests(void)
 {
@ -1782,6 +1810,11 @@ void print_hash_tests(void)
 	x17hash(&hash[0], &buf[0]);
 	printpfx("X17", hash);
 	//memcpy(buf, zrtest, 80);
 	zr5hash(&hash[0], &buf[0]);
 	//zr5hash_pok(&hash[0], (uint32_t*) &buf[0]);
 	printpfx("ZR5", hash);
 	printf("\n");
 	do_gpu_tests();
--- a/zr5.cu
+++ b/zr5.cu
@ -0,0 +1,342 @@
 /* Ziftrcoin ZR5 CUDA Implementation, (c) tpruvot 2015 */
 extern "C" {
 #include "sph/sph_blake.h"
 #include "sph/sph_groestl.h"
 #include "sph/sph_skein.h"
 #include "sph/sph_jh.h"
 #include "sph/sph_keccak.h"
 }
 #include "miner.h"
 #include "cuda_helper.h"
 #include <stdio.h>
 #include <memory.h>
 /* Algorithm identifiers: these are the values stored in permut[][] and also
  * the indices of the { d_blake, d_groes, d_jh512, d_skein } buffer arrays
  * built inside the copy kernels. */
 #define ZR_BLAKE   0
 #define ZR_GROESTL 1
 #define ZR_JH512   2
 #define ZR_SKEIN   3
 /* Not referenced in this file's visible code.
  * NOTE(review): presumably flags that header word 0 carries PoK data - confirm. */
 #define POK_BOOL_MASK 0x00008000
 /* Upper 16 bits of header word 0 hold the PoK value; the rest is the version. */
 #define POK_DATA_MASK 0xFFFF0000
 /* Per-GPU buffers, allocated on the first scanhash_zr5() call for each thread id. */
 /* 64 bytes per nonce: keccak output, later word 0 and words 6..7 of the final hash */
 static uint32_t* d_hash[MAX_GPUS];
 /* 2 bytes per nonce, device side (handed to zr5_keccak512_cpu_hash_pok) */
 static uint16_t* d_pokh[MAX_GPUS];
 /* host copy of d_pokh, allocated with cudaMallocHost */
 static uint16_t* h_poks[MAX_GPUS];
 /* 64 bytes per nonce: in/out buffer of each of the four round algorithms */
 static uint32_t* d_blake[MAX_GPUS];
 static uint32_t* d_groes[MAX_GPUS];
 static uint32_t* d_jh512[MAX_GPUS];
 static uint32_t* d_skein[MAX_GPUS];
 /* Device copy of permut[], filled with cudaMemcpyToSymbol in scanhash_zr5(). */
 __constant__ uint8_t d_permut[24][4];
 /* The 24 orderings of the four algorithms; the row is selected with
  * (first 32-bit word of the keccak hash) % 24, the column is the round number. */
 static const uint8_t permut[24][4] = {
 	{0, 1, 2, 3},
 	{0, 1, 3, 2},
 	{0, 2, 1, 3},
 	{0, 2, 3, 1},
 	{0, 3, 1, 2},
 	{0, 3, 2, 1},
 	{1, 0, 2, 3},
 	{1, 0, 3, 2},
 	{1, 2, 0, 3},
 	{1, 2, 3, 0},
 	{1, 3, 0, 2},
 	{1, 3, 2, 0},
 	{2, 0, 1, 3},
 	{2, 0, 3, 1},
 	{2, 1, 0, 3},
 	{2, 1, 3, 0},
 	{2, 3, 0, 1},
 	{2, 3, 1, 0},
 	{3, 0, 1, 2},
 	{3, 0, 2, 1},
 	{3, 1, 0, 2},
 	{3, 1, 2, 0},
 	{3, 2, 0, 1},
 	{3, 2, 1, 0}
 };
 // CPU reference implementation of the ZR5 chain.
 // input  : 80 bytes are read
 // output : 32 bytes are written (first half of the last 512-bit digest)
 extern "C" void zr5hash(void *output, const void *input)
 {
 	uchar _ALIGN(64) hash[64];
 	uint32_t *words = (uint32_t *) hash;
 	// stage 1: keccak512 over the 80-byte header
 	sph_keccak512_context keccak;
 	sph_keccak512_init(&keccak);
 	sph_keccak512(&keccak, input, 80);
 	sph_keccak512_close(&keccak, (void*) words);
 	// the first keccak word picks one of the 24 algo orderings, once for all rounds
 	const uint8_t *sequence = permut[words[0] % ARRAY_SIZE(permut)];
 	// stage 2: four 512-bit hashes, each one fed with the previous 64-byte digest
 	for (int step = 0; step < 4; step++)
 	{
 		const uint8_t algo = sequence[step];
 		if (algo == ZR_BLAKE) {
 			sph_blake512_context blake;
 			sph_blake512_init(&blake);
 			sph_blake512(&blake, (const void*) words, 64);
 			sph_blake512_close(&blake, words);
 		} else if (algo == ZR_GROESTL) {
 			sph_groestl512_context groestl;
 			sph_groestl512_init(&groestl);
 			sph_groestl512(&groestl, (const void*) words, 64);
 			sph_groestl512_close(&groestl, words);
 		} else if (algo == ZR_JH512) {
 			sph_jh512_context jh;
 			sph_jh512_init(&jh);
 			sph_jh512(&jh, (const void*) words, 64);
 			sph_jh512_close(&jh, words);
 		} else if (algo == ZR_SKEIN) {
 			sph_skein512_context skein;
 			sph_skein512_init(&skein);
 			sph_skein512(&skein, (const void*) words, 64);
 			sph_skein512_close(&skein, words);
 		}
 		// any other value: nothing to do for this round
 	}
 	memcpy(output, words, 32);
 }
 // CPU ZR5 hash including the PoK step.
 // pdata  : 20-word header, modified in place (word 0 ends up as version | PoK bits)
 // output : 32 bytes, the hash of the header carrying the PoK bits
 extern "C" void zr5hash_pok(void *output, uint32_t *pdata)
 {
 	uint32_t _ALIGN(64) digest[8];
 	// first pass: hash with the PoK bits of word 0 cleared
 	pdata[0] &= (~POK_DATA_MASK);
 	zr5hash(digest, pdata);
 	// second pass: the upper 16 bits of the first digest word become the PoK
 	pdata[0] |= (digest[0] & POK_DATA_MASK);
 	zr5hash(digest, pdata);
 	memcpy(output, digest, 32);
 }
 /**
  * Stage each thread's 64-byte hash into the buffer of the algorithm that has
  * to run at round `rnd`.
  *
  * Launch layout: 1D grid, one thread per nonce, guarded by `thread < threads`.
  * The ordering row is always derived from word 0 of the thread's slot in
  * d_hash. For rnd == 0 the data comes from d_hash; for later rounds it comes
  * from the buffer of the algorithm that ran at round rnd - 1.
  * All buffers hold 16 words (64 bytes) per thread and are accessed as uint4,
  * so they are assumed to be 16-byte aligned (true for cudaMalloc pointers).
  */
 __global__
 void zr5_copy_round_data_gpu(uint32_t threads, uint32_t *d_hash, uint32_t* d_blake, uint32_t* d_groes, uint32_t* d_jh512, uint32_t* d_skein, int rnd)
 {
 	// copy 64 bytes hash in the right algo buffer
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		// 16 words per thread; widen BEFORE multiplying so the word index
 		// cannot wrap in 32-bit arithmetic for large thread counts
 		const uint64_t offset = (uint64_t) thread * 16;
 		uint32_t *phash = &d_hash[offset];
 		// algos hash order
 		uint32_t norder = phash[0] % ARRAY_SIZE(permut);
 		uint32_t algo = d_permut[norder][rnd];
 		uint32_t* buffers[4] = { d_blake, d_groes, d_jh512, d_skein };
 		if (rnd > 0) {
 			// source is the output buffer of the previous round's algorithm
 			int algosrc = d_permut[norder][rnd - 1];
 			phash = buffers[algosrc] + offset;
 		}
 		// uint4 = 4x4 uint32_t = 16 bytes
 		uint4 *psrc = (uint4*) phash;
 		uint4 *pdst = (uint4*) (buffers[algo] + offset);
 		pdst[0] = psrc[0];
 		pdst[1] = psrc[1];
 		pdst[2] = psrc[2];
 		pdst[3] = psrc[3];
 	}
 }
 // Host launcher for zr5_copy_round_data_gpu: one thread per nonce, 128 threads
 // per block, on the buffers owned by thr_id. The launch is not synchronized here.
 __host__
 void zr5_move_data_to_hash(int thr_id, uint32_t threads, int rnd)
 {
 	const uint32_t tpb = 128;
 	// ceil(threads / tpb) blocks so that the tail is covered
 	const uint32_t nblocks = (threads + tpb - 1) / tpb;
 	zr5_copy_round_data_gpu <<<dim3(nblocks), dim3(tpb)>>> (threads, d_hash[thr_id], d_blake[thr_id], d_groes[thr_id], d_jh512[thr_id], d_skein[thr_id], rnd);
 }
 /**
  * After the four algorithm rounds, copy the words of the final hash that are
  * needed afterwards back into d_hash: word 0 and words 6..7 of each thread's
  * 16-word slot. The other words of the slot are left untouched.
  *
  * Launch layout: 1D grid, one thread per nonce, guarded by `thread < threads`.
  * The source buffer is the one of the algorithm that ran last (column 3 of
  * the ordering row), the row being read from d_hash word 0 before that word
  * is overwritten.
  * d_pokh is accepted for interface compatibility but is not used by this kernel.
  */
 __global__
 void zr5_final_round_data_gpu(uint32_t threads, uint32_t* d_blake, uint32_t* d_groes, uint32_t* d_jh512, uint32_t* d_skein, uint32_t *d_hash, uint16_t *d_pokh)
 {
 	// after the 4 algos rounds, copy back hash to d_hash
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		// 64 / 4 = 16 words per thread; widen BEFORE multiplying so the
 		// word index cannot wrap in 32-bit arithmetic
 		const uint64_t offset = (uint64_t) thread * 16;
 		uint32_t *phash = &d_hash[offset];
 		uint16_t norder = phash[0] % ARRAY_SIZE(permut);
 		uint16_t algosrc = d_permut[norder][3];
 		uint32_t* buffers[4] = { d_blake, d_groes, d_jh512, d_skein };
 		// copy only hash[0] + hash[6..7]
 		uint2 *psrc = (uint2*) (buffers[algosrc] + offset);
 		uint2 *pdst = (uint2*) phash;
 		pdst[0].x = psrc[0].x;
 		pdst[3] = psrc[3];
 		//phash[7] = *(buffers[algosrc] + offset + 7);
 	}
 }
 // Host launcher for zr5_final_round_data_gpu: one thread per nonce, 128 threads
 // per block, on the buffers owned by thr_id. The launch is not synchronized here.
 __host__
 void zr5_final_round(int thr_id, uint32_t threads)
 {
 	const uint32_t tpb = 128;
 	// ceil(threads / tpb) blocks so that the tail is covered
 	const uint32_t nblocks = (threads + tpb - 1) / tpb;
 	zr5_final_round_data_gpu <<<dim3(nblocks), dim3(tpb)>>> (threads, d_blake[thr_id], d_groes[thr_id], d_jh512[thr_id], d_skein[thr_id], d_hash[thr_id], d_pokh[thr_id]);
 }
 /* Host-side launchers used by scanhash_zr5(); none of them is defined in this file. */
 /* Keccak512 stage. NOTE(review): presumably implemented in JHA/cuda_jha_keccak512.cu,
  * which this commit also touches - confirm. */
 extern void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads);
 extern void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen);
 extern void zr5_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
 extern void zr5_keccak512_cpu_hash_pok(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t* pdata, uint32_t *d_hash, uint16_t *d_poks);
 /* 64-byte in-place hash stages, one init/hash pair per round algorithm.
  * scanhash_zr5() always passes NULL as d_nonceVector. */
 extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void quark_jh512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 /* Per-thread-id flag, set once scanhash_zr5() has allocated the device buffers. */
 static bool init[MAX_GPUS] = { 0 };
 /**
  * Scan a nonce range with the ZR5 chain on the GPU mapped to thr_id.
  *
  * pdata       80-byte header as 20 words: word 0 = version + PoK bits, word 19 = nonce.
  *             pdata[19] is advanced while scanning. On success pdata[0] / pdata[19]
  *             hold the winning pok / nonce, and pdata[21] / pdata[22] receive a
  *             second valid nonce and its pok when one is found.
  * ptarget     8-word target; word 7 is overwritten with 0x0000ff when opt_benchmark is set.
  * max_nonce   scanning stops once pdata[19] reaches this value.
  * hashes_done receives the number of nonces covered by this call.
  *
  * Returns the number of CPU-validated nonces (1 or 2), or 0 when the range was
  * exhausted or a work restart was requested.
  *
  * Fixes compared to the previous version:
  *  - device buffers are sized from the intensity-based throughput only; they
  *    used to be sized from the throughput already clamped to the first call's
  *    nonce range, so a short first range under-allocated every buffer.
  *  - the second-nonce lookup is given the batch base nonce instead of the
  *    already overwritten pdata[19] (see the note at the call site).
  *  - allocation results are checked with CUDA_SAFE_CALL.
  */
 extern "C" int scanhash_zr5(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 	uint32_t max_nonce, unsigned long *hashes_done)
 {
 	uint32_t _ALIGN(64) tmpdata[20];
 	const uint32_t version = pdata[0] & (~POK_DATA_MASK);
 	const uint32_t first_nonce = pdata[19];
 	// upper bound used to size the buffers: independent of the nonce range
 	uint32_t max_throughput = device_intensity(thr_id, __func__, 1U << 18);
 	max_throughput = min(max_throughput, (1U << 20)-1024);
 	// number of nonces hashed per batch during this call
 	const uint32_t throughput = min(max_throughput, max_nonce - first_nonce);
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
 	memcpy(tmpdata, pdata, 80);
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
 		// hash buffer = keccak hash 64 required
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64 * max_throughput));
 		CUDA_SAFE_CALL(cudaMalloc(&d_pokh[thr_id], 2 * max_throughput));
 		CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_permut, permut, 24*4, 0, cudaMemcpyHostToDevice));
 		CUDA_SAFE_CALL(cudaMallocHost(&h_poks[thr_id], 2 * max_throughput));
 		// data buffers for the 4 rounds
 		CUDA_SAFE_CALL(cudaMalloc(&d_blake[thr_id], 64 * max_throughput));
 		CUDA_SAFE_CALL(cudaMalloc(&d_groes[thr_id], 64 * max_throughput));
 		CUDA_SAFE_CALL(cudaMalloc(&d_jh512[thr_id], 64 * max_throughput));
 		CUDA_SAFE_CALL(cudaMalloc(&d_skein[thr_id], 64 * max_throughput));
 		jackpot_keccak512_cpu_init(thr_id, max_throughput);
 		quark_blake512_cpu_init(thr_id, max_throughput);
 		quark_groestl512_cpu_init(thr_id, max_throughput);
 		quark_jh512_cpu_init(thr_id, max_throughput);
 		quark_skein512_cpu_init(thr_id, max_throughput);
 		cuda_check_cpu_init(thr_id, max_throughput);
 		CUDA_SAFE_CALL(cudaDeviceSynchronize());
 		init[thr_id] = true;
 	}
 	// the keccak block is uploaded with the PoK bits of word 0 cleared
 	tmpdata[0] = version;
 	jackpot_keccak512_cpu_setBlock((void*)tmpdata, 80);
 	cuda_check_cpu_setTarget(ptarget);
 	do {
 		int order = 0;
 		// Keccak512 Hash with CUDA
 		zr5_keccak512_cpu_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		// every round runs the 4 algos on their own buffer; only the buffer
 		// staged by zr5_move_data_to_hash carries the data used afterwards
 		for (int rnd=0; rnd<4; rnd++) {
 			zr5_move_data_to_hash(thr_id, throughput, rnd);
 			quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_blake[thr_id], order++);
 			quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_groes[thr_id], order++);
 			quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_jh512[thr_id], order++);
 			quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_skein[thr_id], order++);
 		}
 		// This generates all pok prefixes
 		zr5_final_round(thr_id, throughput);
 		// Keccak512 pok
 		zr5_keccak512_cpu_hash_pok(thr_id, throughput, pdata[19], pdata, d_hash[thr_id], d_pokh[thr_id]);
 		for (int rnd=0; rnd<4; rnd++) {
 			zr5_move_data_to_hash(thr_id, throughput, rnd);
 			quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_blake[thr_id], order++);
 			quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_groes[thr_id], order++);
 			quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_jh512[thr_id], order++);
 			quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_skein[thr_id], order++);
 		}
 		zr5_final_round(thr_id, throughput);
 		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != UINT32_MAX)
 		{
 			uint32_t vhash64[8];
 			uint32_t oldp0 = pdata[0];
 			uint32_t oldp19 = pdata[19]; // base nonce of this batch
 			uint32_t offset = foundNonce - pdata[19];
 			uint32_t pok = 0;
 			*hashes_done = pdata[19] - first_nonce + throughput;
 			cudaMemcpy(h_poks[thr_id], d_pokh[thr_id], 2 * throughput, cudaMemcpyDeviceToHost);
 			pok = version | (0x10000UL * h_poks[thr_id][offset]);
 			pdata[0] = pok; pdata[19] = foundNonce;
 			// re-check the candidate on the CPU before reporting it
 			zr5hash(vhash64, pdata);
 			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
 				int res = 1;
 				// pdata[19] now holds foundNonce, so the batch base (oldp19) must be
 				// passed here: the pok index below is computed relative to it.
 				// NOTE(review): assumes the helper returns base + thread index, like cuda_check_hash - confirm.
 				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, oldp19, d_hash[thr_id], 1);
 				// ignore a second nonce that falls outside this batch: h_poks only
 				// holds `throughput` valid entries
 				if (secNonce != 0 && (secNonce - oldp19) < throughput) {
 					offset = secNonce - oldp19;
 					pok = version | (0x10000UL * h_poks[thr_id][offset]);
 					memcpy(tmpdata, pdata, 80);
 					tmpdata[0] = pok; tmpdata[19] = secNonce;
 					zr5hash(vhash64, tmpdata);
 					if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
 						pdata[21] = secNonce;
 						pdata[22] = pok;
 						res++;
 					}
 				}
 				return res;
 			} else {
 				applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce);
 				// resume right after the rejected nonce with the original word 0
 				pdata[19]++;
 				pdata[0] = oldp0;
 			}
 		} else
 			pdata[19] += throughput;
 	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }