Drop Whirlpool and whirpoolx algos, no more used...

Vanillacoin is now PoS only
10 years ago · 298573b45d
10 changed files with 4 additions and 852 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -54,7 +54,6 @@ ccminer_SOURCES	= elist.h miner.h compat.h \
 			  x11/cuda_x11_luffa512_Cubehash.cu \
 			  x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \
 			  x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \
 			  x15/whirlpool.cu x15/whirlpoolx.cu x15/cuda_whirlpoolx.cu \
 			  x17/x17.cu x17/cuda_x17_haval512.cu x17/cuda_x17_sha512.cu \
 			  x11/s3.cu
--- a/README.txt
+++ b/README.txt
@ -89,7 +89,6 @@ its command line interface and options.
                          s3          use to mine 1coin
                          skein       use to mine Skeincoin
                          skein2      use to mine Woodcoin
                          whirlpoolx  use to mine Vanillacoin
                          x11         use to mine DarkCoin
                          x14         use to mine X14Coin
                          x15         use to mine Halcyon
@ -216,6 +215,9 @@ features.
 >>> RELEASE HISTORY <<<
                  Try to compute network difficulty on pools too
                  Drop Whirlpool and whirpoolx algos, no more used...
  May 15th 2015   v1.6.3
                  Import and adapt Neoscrypt from djm34 work (SM 5+ only)
                  Conditional mining options based on gpu temp, network diff and rate
--- a/ccminer.cpp
+++ b/ccminer.cpp
@ -109,8 +109,6 @@ enum sha_algos {
 	ALGO_SKEIN,
 	ALGO_SKEIN2,
 	ALGO_S3,
 	ALGO_WHIRLCOIN,
 	ALGO_WHIRLPOOLX,
 	ALGO_X11,
 	ALGO_X13,
 	ALGO_X14,
@ -147,8 +145,6 @@ static const char *algo_names[] = {
 	"skein",
 	"skein2",
 	"s3",
 	"whirl",
 	"whirlpoolx",
 	"x11",
 	"x13",
 	"x14",
@ -1250,7 +1246,6 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 		case ALGO_GROESTL:
 		case ALGO_KECCAK:
 		case ALGO_BLAKECOIN:
 		case ALGO_WHIRLCOIN:
 			SHA256((uchar*)sctx->job.coinbase, sctx->job.coinbase_size, (uchar*)merkle_root);
 			break;
 		default:
@ -1586,7 +1581,6 @@ static void *miner_thread(void *userdata)
 			switch (opt_algo) {
 			case ALGO_BLAKECOIN:
 			case ALGO_BLAKE:
 			case ALGO_WHIRLPOOLX:
 				minmax = 0x80000000U;
 				break;
 			case ALGO_KECCAK:
@ -1776,17 +1770,6 @@ static void *miner_thread(void *userdata)
 			                      max_nonce, &hashes_done);
 			break;
 		/* to be deleted */
 		case ALGO_WHIRLCOIN:
 			rc = scanhash_whc(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
 			break;
 		case ALGO_WHIRLPOOLX:
 			rc = scanhash_whirlpoolx(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
 			break;
 		case ALGO_X11:
 			rc = scanhash_x11(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
@ -2779,12 +2762,6 @@ int main(int argc, char *argv[])
 		}
 	}
 	// extra credits..
 	if (opt_algo == ALGO_WHIRLPOOLX) {
 		printf("  Whirlpoolx support by Alexis Provos.\n");
 		printf("VNL donation address: Vr5oCen8NrY6ekBWFaaWjCUFBH4dyiS57W\n\n");
 	}
 	if (!opt_benchmark && !strlen(rpc_url)) {
 		fprintf(stderr, "%s: no URL supplied\n", argv[0]);
 		show_usage_and_exit(1);
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -499,19 +499,15 @@
    </CudaCompile>
    <CudaCompile Include="x13\x13.cu">
    </CudaCompile>
    <CudaCompile Include="x15\cuda_whirlpoolx.cu" />
    <CudaCompile Include="x15\cuda_x14_shabal512.cu">
    </CudaCompile>
    <CudaCompile Include="x15\cuda_x15_whirlpool.cu">
    </CudaCompile>
    <CudaCompile Include="x15\whirlpoolx.cu" />
    <CudaCompile Include="x17\cuda_x17_haval512.cu">
    </CudaCompile>
    <CudaCompile Include="x17\cuda_x17_sha512.cu">
      <MaxRegCount>80</MaxRegCount>
    </CudaCompile>
    <CudaCompile Include="x15\whirlpool.cu">
    </CudaCompile>
    <CudaCompile Include="x15\x14.cu">
    </CudaCompile>
    <CudaCompile Include="x15\x15.cu">
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -541,21 +541,12 @@
    <CudaCompile Include="x15\cuda_x14_shabal512.cu">
      <Filter>Source Files\CUDA\x15</Filter>
    </CudaCompile>
    <CudaCompile Include="x15\whirlpool.cu">
      <Filter>Source Files\CUDA\x15</Filter>
    </CudaCompile>
    <CudaCompile Include="x15\whirlpoolx.cu">
      <Filter>Source Files\CUDA\x15</Filter>
    </CudaCompile>
    <CudaCompile Include="x15\x15.cu">
      <Filter>Source Files\CUDA\x15</Filter>
    </CudaCompile>
    <CudaCompile Include="x15\cuda_x15_whirlpool.cu">
      <Filter>Source Files\CUDA\x15</Filter>
    </CudaCompile>
    <CudaCompile Include="x15\cuda_whirlpoolx.cu">
      <Filter>Source Files\CUDA\x15</Filter>
    </CudaCompile>
    <CudaCompile Include="x17\cuda_x17_haval512.cu">
      <Filter>Source Files\CUDA\x17</Filter>
    </CudaCompile>
--- a/miner.h
+++ b/miner.h
@ -352,10 +352,6 @@ extern int scanhash_s3(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
 extern int scanhash_whc(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
 extern int scanhash_x11(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
@ -376,10 +372,6 @@ extern int scanhash_x17(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
 extern int scanhash_whirlpoolx(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
 extern int scanhash_zr5(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
@ -696,7 +688,6 @@ void scryptjane_hash(void* output, const void* input);
 void skeincoinhash(void *output, const void *input);
 void skein2hash(void *output, const void *input);
 void s3hash(void *output, const void *input);
 void whirlxHash(void *state, const void *input);
 void x11hash(void *output, const void *input);
 void x13hash(void *output, const void *input);
 void x14hash(void *output, const void *input);
--- a/util.cpp
+++ b/util.cpp
@ -1815,9 +1815,6 @@ void print_hash_tests(void)
 	s3hash(&hash[0], &buf[0]);
 	printpfx("S3", hash);
 	whirlxHash(&hash[0], &buf[0]);
 	printpfx("whirlpoolx", hash);
 	x11hash(&hash[0], &buf[0]);
 	printpfx("X11", hash);
--- a/x15/cuda_whirlpoolx.cu
+++ b/x15/cuda_whirlpoolx.cu
@ -1,585 +0,0 @@
 /*
 * Built on cbuchner1's implementation, actual hashing code
 * based on sphlib 3.0
 */
 #include <stdio.h>
 #include <memory.h>
 #define threadsPerBlock 1024
 #include "cuda_helper.h"
 __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
 __constant__ uint64_t c_xtra[8];
 __constant__ uint64_t c_tmp[72];
 __constant__ uint64_t pTarget[4];
 static uint32_t *h_wxnounce[MAX_GPUS] = { 0 };
 static uint32_t *d_WXNonce[MAX_GPUS] = { 0 };
 /**
 * Whirlpool CUDA kernel implementation.
 *
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2014 djm34 & tpruvot & SP & Provos Alexis
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
 * @author djm34
 * @author tpruvot
 * @author SP
 * @author Provos Alexis
 */
 __constant__ __align__(64) uint64_t mixTob0Tox[256];
 const uint64_t plain_T0[256]= {
 	0xD83078C018601818,0x2646AF05238C2323,0xB891F97EC63FC6C6,0xFBCD6F13E887E8E8,0xCB13A14C87268787,0x116D62A9B8DAB8B8,0x0902050801040101,0x0D9E6E424F214F4F,0x9B6CEEAD36D83636,
 	0xFF510459A6A2A6A6,0x0CB9BDDED26FD2D2,0x0EF706FBF5F3F5F5,0x96F280EF79F97979,0x30DECE5F6FA16F6F,0x6D3FEFFC917E9191,0xF8A407AA52555252,0x47C0FD27609D6060,0x35657689BCCABCBC,
 	0x372BCDAC9B569B9B,0x8A018C048E028E8E,0xD25B1571A3B6A3A3,0x6C183C600C300C0C,0x84F68AFF7BF17B7B,0x806AE1B535D43535,0xF53A69E81D741D1D,0xB3DD4753E0A7E0E0,0x21B3ACF6D77BD7D7,
 	0x9C99ED5EC22FC2C2,0x435C966D2EB82E2E,0x29967A624B314B4B,0x5DE121A3FEDFFEFE,0xD5AE168257415757,0xBD2A41A815541515,0xE8EEB69F77C17777,0x926EEBA537DC3737,0x9ED7567BE5B3E5E5,
 	0x1323D98C9F469F9F,0x23FD17D3F0E7F0F0,0x20947F6A4A354A4A,0x44A9959EDA4FDADA,0xA2B025FA587D5858,0xCF8FCA06C903C9C9,0x7C528D5529A42929,0x5A1422500A280A0A,0x507F4FE1B1FEB1B1,
 	0xC95D1A69A0BAA0A0,0x14D6DA7F6BB16B6B,0xD917AB5C852E8585,0x3C677381BDCEBDBD,0x8FBA34D25D695D5D,0x9020508010401010,0x07F503F3F4F7F4F4,0xDD8BC016CB0BCBCB,0xD37CC6ED3EF83E3E,
 	0x2D0A112805140505,0x78CEE61F67816767,0x97D55373E4B7E4E4,0x024EBB25279C2727,0x7382583241194141,0xA70B9D2C8B168B8B,0xF6530151A7A6A7A7,0xB2FA94CF7DE97D7D,0x4937FBDC956E9595,
 	0x56AD9F8ED847D8D8,0x70EB308BFBCBFBFB,0xCDC17123EE9FEEEE,0xBBF891C77CED7C7C,0x71CCE31766856666,0x7BA78EA6DD53DDDD,0xAF2E4BB8175C1717,0x458E460247014747,0x1A21DC849E429E9E,
 	0xD489C51ECA0FCACA,0x585A99752DB42D2D,0x2E637991BFC6BFBF,0x3F0E1B38071C0707,0xAC472301AD8EADAD,0xB0B42FEA5A755A5A,0xEF1BB56C83368383,0xB666FF8533CC3333,0x5CC6F23F63916363,
 	0x12040A1002080202,0x93493839AA92AAAA,0xDEE2A8AF71D97171,0xC68DCF0EC807C8C8,0xD1327DC819641919,0x3B92707249394949,0x5FAF9A86D943D9D9,0x31F91DC3F2EFF2F2,0xA8DB484BE3ABE3E3,
 	0xB9B62AE25B715B5B,0xBC0D9234881A8888,0x3E29C8A49A529A9A,0x0B4CBE2D26982626,0xBF64FA8D32C83232,0x597D4AE9B0FAB0B0,0xF2CF6A1BE983E9E9,0x771E33780F3C0F0F,0x33B7A6E6D573D5D5,
 	0xF41DBA74803A8080,0x27617C99BEC2BEBE,0xEB87DE26CD13CDCD,0x8968E4BD34D03434,0x3290757A483D4848,0x54E324ABFFDBFFFF,0x8DF48FF77AF57A7A,0x643DEAF4907A9090,0x9DBE3EC25F615F5F,
 	0x3D40A01D20802020,0x0FD0D56768BD6868,0xCA3472D01A681A1A,0xB7412C19AE82AEAE,0x7D755EC9B4EAB4B4,0xCEA8199A544D5454,0x7F3BE5EC93769393,0x2F44AA0D22882222,0x63C8E907648D6464,
 	0x2AFF12DBF1E3F1F1,0xCCE6A2BF73D17373,0x82245A9012481212,0x7A805D3A401D4040,0x4810284008200808,0x959BE856C32BC3C3,0xDFC57B33EC97ECEC,0x4DAB9096DB4BDBDB,0xC05F1F61A1BEA1A1,
 	0x9107831C8D0E8D8D,0xC87AC9F53DF43D3D,0x5B33F1CC97669797,0x0000000000000000,0xF983D436CF1BCFCF,0x6E5687452BAC2B2B,0xE1ECB39776C57676,0xE619B06482328282,0x28B1A9FED67FD6D6,
 	0xC33677D81B6C1B1B,0x74775BC1B5EEB5B5,0xBE432911AF86AFAF,0x1DD4DF776AB56A6A,0xEAA00DBA505D5050,0x578A4C1245094545,0x38FB18CBF3EBF3F3,0xAD60F09D30C03030,0xC4C3742BEF9BEFEF,
 	0xDA7EC3E53FFC3F3F,0xC7AA1C9255495555,0xDB591079A2B2A2A2,0xE9C96503EA8FEAEA,0x6ACAEC0F65896565,0x036968B9BAD2BABA,0x4A5E93652FBC2F2F,0x8E9DE74EC027C0C0,0x60A181BEDE5FDEDE,
 	0xFC386CE01C701C1C,0x46E72EBBFDD3FDFD,0x1F9A64524D294D4D,0x7639E0E492729292,0xFAEABC8F75C97575,0x360C1E3006180606,0xAE0998248A128A8A,0x4B7940F9B2F2B2B2,0x85D15963E6BFE6E6,
 	0x7E1C36700E380E0E,0xE73E63F81F7C1F1F,0x55C4F73762956262,0x3AB5A3EED477D4D4,0x814D3229A89AA8A8,0x5231F4C496629696,0x62EF3A9BF9C3F9F9,0xA397F666C533C5C5,0x104AB13525942525,
 	0xABB220F259795959,0xD015AE54842A8484,0xC5E4A7B772D57272,0xEC72DDD539E43939,0x1698615A4C2D4C4C,0x94BC3BCA5E655E5E,0x9FF085E778FD7878,0xE570D8DD38E03838,0x980586148C0A8C8C,
 	0x17BFB2C6D163D1D1,0xE4570B41A5AEA5A5,0xA1D94D43E2AFE2E2,0x4EC2F82F61996161,0x427B45F1B3F6B3B3,0x3442A51521842121,0x0825D6949C4A9C9C,0xEE3C66F01E781E1E,0x6186522243114343,
 	0xB193FC76C73BC7C7,0x4FE52BB3FCD7FCFC,0x2408142004100404,0xE3A208B251595151,0x252FC7BC995E9999,0x22DAC44F6DA96D6D,0x651A39680D340D0D,0x79E93583FACFFAFA,0x69A384B6DF5BDFDF,
 	0xA9FC9BD77EE57E7E,0x1948B43D24902424,0xFE76D7C53BEC3B3B,0x9A4B3D31AB96ABAB,0xF081D13ECE1FCECE,0x9922558811441111,0x8303890C8F068F8F,0x049C6B4A4E254E4E,0x667351D1B7E6B7B7,
 	0xE0CB600BEB8BEBEB,0xC178CCFD3CF03C3C,0xFD1FBF7C813E8181,0x4035FED4946A9494,0x1CF30CEBF7FBF7F7,0x186F67A1B9DEB9B9,0x8B265F98134C1313,0x51589C7D2CB02C2C,0x05BBB8D6D36BD3D3,
 	0x8CD35C6BE7BBE7E7,0x39DCCB576EA56E6E,0xAA95F36EC437C4C4,0x1B060F18030C0303,0xDCAC138A56455656,0x5E88491A440D4444,0xA0FE9EDF7FE17F7F,0x884F3721A99EA9A9,0x6754824D2AA82A2A,
 	0x0A6B6DB1BBD6BBBB,0x879FE246C123C1C1,0xF1A602A253515353,0x72A58BAEDC57DCDC,0x531627580B2C0B0B,0x0127D39C9D4E9D9D,0x2BD8C1476CAD6C6C,0xA462F59531C43131,0xF3E8B98774CD7474,
 	0x15F109E3F6FFF6F6,0x4C8C430A46054646,0xA5452609AC8AACAC,0xB50F973C891E8989,0xB42844A014501414,0xBADF425BE1A3E1E1,0xA62C4EB016581616,0xF774D2CD3AE83A3A,0x06D2D06F69B96969,
 	0x41122D4809240909,0xD7E0ADA770DD7070,0x6F7154D9B6E2B6B6,0x1EBDB7CED067D0D0,0xD6C77E3BED93EDED,0xE285DB2ECC17CCCC,0x6884572A42154242,0x2C2DC2B4985A9898,0xED550E49A4AAA4A4,
 	0x7550885D28A02828,0x86B831DA5C6D5C5C,0x6BED3F93F8C7F8F8,0xC211A44486228686
 };
 /**
 * Round constants.
 */
 __constant__ uint64_t InitVector_RC[10];
 const uint64_t plain_RC[10] = {
 	0x4F01B887E8C62318,0x52916F79F5D2A636,0x357B0CA38E9BBC60,0x57FE4B2EC2D7E01D,0xDA4AF09FE5377715,
 	0x856BA0B10A29C958,0x67053ECBF4105DBD,0xD8957DA78B4127E4,0x9E4717DD667CEEFB,0x33835AAD07BF2DCA
 };
 /* ====================================================================== */
 __device__ __forceinline__
 static uint64_t ROUND_ELT(const uint64_t* sharedMemory, const uint64_t* __restrict__ in, const int i0, const int i1, const int i2, const int i3, const int i4, const int i5, const int i6, const int i7)
 {
 	uint32_t* in32 = (uint32_t*)in;
 	return xor8(	sharedMemory[__byte_perm(in32[(i0 << 1)], 0, 0x4440)],
 			sharedMemory[__byte_perm(in32[(i1 << 1)], 0, 0x4441) + 256],
 			sharedMemory[__byte_perm(in32[(i2 << 1)], 0, 0x4442) + 512],
 			sharedMemory[__byte_perm(in32[(i3 << 1)], 0, 0x4443) + 768],
 			sharedMemory[__byte_perm(in32[(i4 << 1) + 1], 0, 0x4440) + 1024],
 			sharedMemory[__byte_perm(in32[(i5 << 1) + 1], 0, 0x4441) + 1280],
 			sharedMemory[__byte_perm(in32[(i6 << 1) + 1], 0, 0x4442) + 1536],
 			sharedMemory[__byte_perm(in32[(i7 << 1) + 1], 0, 0x4443) + 1792]);
 }
 #define TRANSFER(dst, src) { \
 	dst[0] = src ## 0; \
 	dst[1] = src ## 1; \
 	dst[2] = src ## 2; \
 	dst[3] = src ## 3; \
 	dst[4] = src ## 4; \
 	dst[5] = src ## 5; \
 	dst[6] = src ## 6; \
 	dst[7] = src ## 7; \
 }
 #define ROUND(table, in, out, c0, c1, c2, c3, c4, c5, c6, c7) { \
 	out ## 0 = xor1(ROUND_ELT(table, in, 0, 7, 6, 5, 4, 3, 2, 1), c0); \
 	out ## 1 = xor1(ROUND_ELT(table, in, 1, 0, 7, 6, 5, 4, 3, 2), c1); \
 	out ## 2 = xor1(ROUND_ELT(table, in, 2, 1, 0, 7, 6, 5, 4, 3), c2); \
 	out ## 3 = xor1(ROUND_ELT(table, in, 3, 2, 1, 0, 7, 6, 5, 4), c3); \
 	out ## 4 = xor1(ROUND_ELT(table, in, 4, 3, 2, 1, 0, 7, 6, 5), c4); \
 	out ## 5 = xor1(ROUND_ELT(table, in, 5, 4, 3, 2, 1, 0, 7, 6), c5); \
 	out ## 6 = xor1(ROUND_ELT(table, in, 6, 5, 4, 3, 2, 1, 0, 7), c6); \
 	out ## 7 = xor1(ROUND_ELT(table, in, 7, 6, 5, 4, 3, 2, 1, 0), c7); \
 }
 #define ROUND1(table, in, out, c) { \
 	out ## 0 = xor1(ROUND_ELT(table, in, 0, 7, 6, 5, 4, 3, 2, 1), c); \
 	out ## 1 = ROUND_ELT(table, in, 1, 0, 7, 6, 5, 4, 3, 2); \
 	out ## 2 = ROUND_ELT(table, in, 2, 1, 0, 7, 6, 5, 4, 3); \
 	out ## 3 = ROUND_ELT(table, in, 3, 2, 1, 0, 7, 6, 5, 4); \
 	out ## 4 = ROUND_ELT(table, in, 4, 3, 2, 1, 0, 7, 6, 5); \
 	out ## 5 = ROUND_ELT(table, in, 5, 4, 3, 2, 1, 0, 7, 6); \
 	out ## 6 = ROUND_ELT(table, in, 6, 5, 4, 3, 2, 1, 0, 7); \
 	out ## 7 = ROUND_ELT(table, in, 7, 6, 5, 4, 3, 2, 1, 0); \
 }
 #define ROUND_KSCHED(table, in, out, c) \
 	ROUND1(table, in, out, c) \
 	TRANSFER(in, out)
 #define ROUND_WENC(table, in, key, out) \
 	ROUND(table, in, out, key[0], key[1], key[2],key[3], key[4], key[5], key[6], key[7]) \
 	TRANSFER(in, out)
 static uint64_t* d_xtra[MAX_GPUS] = { 0 };
 static uint64_t* d_tmp[MAX_GPUS] = { 0 };
 __device__ __forceinline__
 static void whirlpoolx_getShared(uint64_t* sharedMemory)
 {
 	if (threadIdx.x < 256) {
 		sharedMemory[threadIdx.x] = mixTob0Tox[threadIdx.x];
 		sharedMemory[threadIdx.x+256]  = ROTL64(sharedMemory[threadIdx.x], 8);
 		sharedMemory[threadIdx.x+512]  = ROTL64(sharedMemory[threadIdx.x],16);
 		sharedMemory[threadIdx.x+768]  = ROTL64(sharedMemory[threadIdx.x],24);
 		sharedMemory[threadIdx.x+1024] = ROTL64(sharedMemory[threadIdx.x],32);
 		sharedMemory[threadIdx.x+1280] = ROTR64(sharedMemory[threadIdx.x],24);
 		sharedMemory[threadIdx.x+1536] = ROTR64(sharedMemory[threadIdx.x],16);
 		sharedMemory[threadIdx.x+1792] = ROTR64(sharedMemory[threadIdx.x], 8);
 	}
 	__syncthreads();
 }
 __global__
 void whirlpoolx_gpu_precompute(uint32_t threads, uint64_t* d_xtra, uint64_t* d_tmp)
 {
 	__shared__ uint64_t sharedMemory[2048];
 	whirlpoolx_getShared(sharedMemory);
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint64_t n[8];
 		uint64_t h[8] = { 0 };
 		#pragma unroll 8
 		for (int i=0; i<8; i++) {
 			n[i] = c_PaddedMessage80[i];  // read data
 		}
 		//#pragma unroll 10
 		for (unsigned int r=0; r < 10; r++) {
 			uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 			ROUND_KSCHED(sharedMemory, h, tmp, InitVector_RC[r]);
 			ROUND_WENC(sharedMemory, n, h, tmp);
 		}
 		#pragma unroll 8
 		for (int i=0; i < 8; i++) {
 			h[i] = xor1(n[i],c_PaddedMessage80[i]);
 		}
 		if(threadIdx.x==0)d_xtra[threadIdx.x]=h[1];
 		uint64_t atLastCalc=xor1(h[3],h[5]);
 		//////////////////////////////////
 		n[0] = c_PaddedMessage80[8];    //read data
 		n[1] = c_PaddedMessage80[9]; //whirlpool
 		n[2] = 0x0000000000000080; //whirlpool
 		n[3] = 0;
 		n[4] = 0;
 		n[5] = 0;
 		n[6] = 0;
 		n[7] = 0x8002000000000000;
 		n[0] = xor1(n[0],h[0]);
 		n[2] = xor1(n[2],h[2]);	n[3] = h[3];
 		n[4] = h[4];	n[5] = h[5];
 		n[6] = h[6];	n[7] = xor1(n[7],h[7]);
 		uint64_t tmp[8];
 		tmp[0] = xor1(ROUND_ELT(sharedMemory, h, 0, 7, 6, 5, 4, 3, 2, 1),InitVector_RC[0]);
 		tmp[1] = ROUND_ELT(sharedMemory, h, 1, 0, 7, 6, 5, 4, 3, 2);
 		tmp[2] = ROUND_ELT(sharedMemory, h, 2, 1, 0, 7, 6, 5, 4, 3);
 		tmp[3] = ROUND_ELT(sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4);
 		tmp[4] = ROUND_ELT(sharedMemory, h, 4, 3, 2, 1, 0, 7, 6, 5);
 		tmp[5] = ROUND_ELT(sharedMemory, h, 5, 4, 3, 2, 1, 0, 7, 6);
 		tmp[6] = ROUND_ELT(sharedMemory, h, 6, 5, 4, 3, 2, 1, 0, 7);
 		tmp[7] = ROUND_ELT(sharedMemory, h, 7, 6, 5, 4, 3, 2, 1, 0);
 		uint64_t tmp2[8];
 		uint32_t* n32 = (uint32_t*)n;
 		tmp2[0]=xor8(	sharedMemory[__byte_perm(n32[ 0], 0, 0x4440)]  		,sharedMemory[__byte_perm(n32[14], 0, 0x4441) + 256],
 				sharedMemory[__byte_perm(n32[12], 0, 0x4442) + 512]	,sharedMemory[__byte_perm(n32[10], 0, 0x4443) + 768],
 				sharedMemory[__byte_perm(n32[ 9], 0, 0x4440) + 1024]	,sharedMemory[__byte_perm(n32[ 7], 0, 0x4441) + 1280],
 				sharedMemory[__byte_perm(n32[ 5], 0, 0x4442) + 1536]	,tmp[0]);
 		tmp2[1]=xor8(	tmp[1]							,sharedMemory[__byte_perm(n32[ 0], 0, 0x4441) + 256],
 				sharedMemory[__byte_perm(n32[14], 0, 0x4442) +  512]	,sharedMemory[__byte_perm(n32[12], 0, 0x4443) + 768],
 				sharedMemory[__byte_perm(n32[11], 0, 0x4440) + 1024]	,sharedMemory[__byte_perm(n32[ 9], 0, 0x4441) + 1280],
 				sharedMemory[__byte_perm(n32[ 7], 0, 0x4442) + 1536]	,sharedMemory[__byte_perm(n32[ 5], 0, 0x4443) + 1792]);
 		tmp2[2]=xor8(	sharedMemory[__byte_perm(n32[ 4], 0, 0x4440)]  		,tmp[2]						    ,
 				sharedMemory[__byte_perm(n32[ 0], 0, 0x4442) +  512]	,sharedMemory[__byte_perm(n32[14], 0, 0x4443) + 768],
 				sharedMemory[__byte_perm(n32[13], 0, 0x4440) + 1024]	,sharedMemory[__byte_perm(n32[11], 0, 0x4441) + 1280],
 				sharedMemory[__byte_perm(n32[ 9], 0, 0x4442) + 1536]	,sharedMemory[__byte_perm(n32[ 7], 0, 0x4443) + 1792]);
 		tmp2[3]=xor8(	sharedMemory[__byte_perm(n32[ 6], 0, 0x4440)]  		,sharedMemory[__byte_perm(n32[ 4], 0, 0x4441) + 256],
 				tmp[3]							,sharedMemory[__byte_perm(n32[ 0], 0, 0x4443) + 768],
 				sharedMemory[__byte_perm(n32[15], 0, 0x4440) + 1024]	,sharedMemory[__byte_perm(n32[13], 0, 0x4441) + 1280],
 				sharedMemory[__byte_perm(n32[11], 0, 0x4442) + 1536]	,sharedMemory[__byte_perm(n32[ 9], 0, 0x4443) + 1792]);
 		tmp2[4]=xor8(	sharedMemory[__byte_perm(n32[ 8], 0, 0x4440)]  		,sharedMemory[__byte_perm(n32[ 6], 0, 0x4441) + 256]  ,
 				sharedMemory[__byte_perm(n32[ 4], 0, 0x4442) +  512]	,tmp[4]						      ,
 				sharedMemory[__byte_perm(n32[ 1], 0, 0x4440) + 1024]	,sharedMemory[__byte_perm(n32[15], 0, 0x4441) + 1280] ,
 				sharedMemory[__byte_perm(n32[13], 0, 0x4442) + 1536]	,sharedMemory[__byte_perm(n32[11], 0, 0x4443) + 1792]);
 		tmp2[5]=xor8(	sharedMemory[__byte_perm(n32[10], 0, 0x4440)]  		,sharedMemory[__byte_perm(n32[ 8], 0, 0x4441) + 256],
 				sharedMemory[__byte_perm(n32[ 6], 0, 0x4442) +  512]	,sharedMemory[__byte_perm(n32[ 4], 0, 0x4443) + 768],
 				tmp[5]							,sharedMemory[__byte_perm(n32[ 1], 0, 0x4441) + 1280],
 				sharedMemory[__byte_perm(n32[15], 0, 0x4442) + 1536]	,sharedMemory[__byte_perm(n32[13], 0, 0x4443) + 1792]);
 		tmp2[6]=xor8(	sharedMemory[__byte_perm(n32[12], 0, 0x4440)]  		,sharedMemory[__byte_perm(n32[10], 0, 0x4441) + 256],
 				sharedMemory[__byte_perm(n32[ 8], 0, 0x4442) +  512]	,sharedMemory[__byte_perm(n32[ 6], 0, 0x4443) + 768],
 				sharedMemory[__byte_perm(n32[ 5], 0, 0x4440) + 1024]	,tmp[6],
 				sharedMemory[__byte_perm(n32[ 1], 0, 0x4442) + 1536]	,sharedMemory[__byte_perm(n32[15], 0, 0x4443) + 1792]);
 		tmp2[7]=xor8(	sharedMemory[__byte_perm(n32[14], 0, 0x4440)]  		,sharedMemory[__byte_perm(n32[12], 0, 0x4441) + 256],
 				sharedMemory[__byte_perm(n32[10], 0, 0x4442) +  512]	,sharedMemory[__byte_perm(n32[ 8], 0, 0x4443) + 768],
 				sharedMemory[__byte_perm(n32[ 7], 0, 0x4440) + 1024]	,sharedMemory[__byte_perm(n32[ 5], 0, 0x4441) + 1280],
 				tmp[7]							,sharedMemory[__byte_perm(n32[ 1], 0, 0x4443) + 1792]);
 		n[1] ^= h[1];
 		tmp2[1]^=sharedMemory[__byte_perm(n32[2], 0, 0x4440)];
 		tmp2[2]^=sharedMemory[__byte_perm(n32[2], 0, 0x4441) + 256];
 		tmp2[3]^=sharedMemory[__byte_perm(n32[2], 0, 0x4442) + 512];
 		tmp2[4]^=sharedMemory[__byte_perm(n32[2], 0, 0x4443) + 768];
 		d_tmp[threadIdx.x]=tmp2[threadIdx.x];
 		uint64_t tmp3[8];
 		tmp3[0] = xor1(ROUND_ELT(sharedMemory, tmp, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[1]);
 		tmp3[1] = ROUND_ELT(sharedMemory, tmp, 1, 0, 7, 6, 5, 4, 3, 2);
 		tmp3[2] = ROUND_ELT(sharedMemory, tmp, 2, 1, 0, 7, 6, 5, 4, 3);
 		tmp3[3] = ROUND_ELT(sharedMemory, tmp, 3, 2, 1, 0, 7, 6, 5, 4);
 		tmp3[4] = ROUND_ELT(sharedMemory, tmp, 4, 3, 2, 1, 0, 7, 6, 5);
 		tmp3[5] = ROUND_ELT(sharedMemory, tmp, 5, 4, 3, 2, 1, 0, 7, 6);
 		tmp3[6] = ROUND_ELT(sharedMemory, tmp, 6, 5, 4, 3, 2, 1, 0, 7);
 		tmp3[7] = ROUND_ELT(sharedMemory, tmp, 7, 6, 5, 4, 3, 2, 1, 0);
 		n32 = (uint32_t*)tmp2;
 		uint64_t tmp4[8];
 		tmp4[0]=(	sharedMemory[__byte_perm(n32[ 9], 0, 0x4440) + 1024]	^sharedMemory[__byte_perm(n32[ 7], 0, 0x4441) + 1280]^
 			sharedMemory[__byte_perm(n32[ 5], 0, 0x4442) + 1536]	^sharedMemory[__byte_perm(n32[ 3], 0, 0x4443) + 1792]) ^tmp3[0];
 		tmp4[1]=(sharedMemory[__byte_perm(n32[ 2], 0, 0x4440)]		^sharedMemory[__byte_perm(n32[ 9], 0, 0x4441) + 1280]^
 			sharedMemory[__byte_perm(n32[ 7], 0, 0x4442) + 1536]	^sharedMemory[__byte_perm(n32[ 5], 0, 0x4443) + 1792]) ^tmp3[1];
 		tmp4[2]=(sharedMemory[__byte_perm(n32[ 4], 0, 0x4440)]  	^sharedMemory[__byte_perm(n32[ 2], 0, 0x4441) + 256]^
 			sharedMemory[__byte_perm(n32[ 9], 0, 0x4442) + 1536]	^sharedMemory[__byte_perm(n32[ 7], 0, 0x4443) + 1792]) ^tmp3[2];
 		tmp4[3]=(sharedMemory[__byte_perm(n32[ 6], 0, 0x4440)]  	^sharedMemory[__byte_perm(n32[ 4], 0, 0x4441) + 256]^
 			sharedMemory[__byte_perm(n32[ 2], 0, 0x4442) +  512]	^sharedMemory[__byte_perm(n32[ 9], 0, 0x4443) + 1792]) ^tmp3[3];
 		tmp4[4]=(sharedMemory[__byte_perm(n32[ 8], 0, 0x4440)]  	^sharedMemory[__byte_perm(n32[ 6], 0, 0x4441) + 256]^
 			sharedMemory[__byte_perm(n32[ 4], 0, 0x4442) +  512]	^sharedMemory[__byte_perm(n32[ 2], 0, 0x4443) + 768]) ^tmp3[4];
 		tmp4[5]=(sharedMemory[__byte_perm(n32[ 8], 0, 0x4441) + 256]	^sharedMemory[__byte_perm(n32[ 6], 0, 0x4442) +  512]^
 			sharedMemory[__byte_perm(n32[ 4], 0, 0x4443) + 768]	^sharedMemory[__byte_perm(n32[ 3], 0, 0x4440) + 1024]) ^tmp3[5];
 		tmp4[6]=(sharedMemory[__byte_perm(n32[ 8], 0, 0x4442) + 512]	^sharedMemory[__byte_perm(n32[ 6], 0, 0x4443) + 768]^
 			sharedMemory[__byte_perm(n32[ 5], 0, 0x4440) + 1024]	^sharedMemory[__byte_perm(n32[ 3], 0, 0x4441) + 1280]) ^tmp3[6];
 		tmp4[7]=(sharedMemory[__byte_perm(n32[ 8], 0, 0x4443) + 768]	^sharedMemory[__byte_perm(n32[ 7], 0, 0x4440) + 1024]^
 			sharedMemory[__byte_perm(n32[ 5], 0, 0x4441) + 1280]	^sharedMemory[__byte_perm(n32[ 3], 0, 0x4442) + 1536]) ^tmp3[7];
 		d_tmp[threadIdx.x+16]=tmp4[threadIdx.x];
 		uint64_t tmp5[8];
 		tmp5[0] = xor1(ROUND_ELT(sharedMemory, tmp3, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[2]);
 		tmp5[1] = ROUND_ELT(sharedMemory, tmp3, 1, 0, 7, 6, 5, 4, 3, 2);
 		tmp5[2] = ROUND_ELT(sharedMemory, tmp3, 2, 1, 0, 7, 6, 5, 4, 3);
 		tmp5[3] = ROUND_ELT(sharedMemory, tmp3, 3, 2, 1, 0, 7, 6, 5, 4);
 		tmp5[4] = ROUND_ELT(sharedMemory, tmp3, 4, 3, 2, 1, 0, 7, 6, 5);
 		tmp5[5] = ROUND_ELT(sharedMemory, tmp3, 5, 4, 3, 2, 1, 0, 7, 6);
 		tmp5[6] = ROUND_ELT(sharedMemory, tmp3, 6, 5, 4, 3, 2, 1, 0, 7);
 		tmp5[7] = ROUND_ELT(sharedMemory, tmp3, 7, 6, 5, 4, 3, 2, 1, 0);
 		d_tmp[threadIdx.x+8]=tmp5[threadIdx.x];
 		uint64_t tmp6[8];
 		tmp6[0] = xor1(ROUND_ELT(sharedMemory, tmp5, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[3]);
 		tmp6[1] = ROUND_ELT(sharedMemory, tmp5, 1, 0, 7, 6, 5, 4, 3, 2);
 		tmp6[2] = ROUND_ELT(sharedMemory, tmp5, 2, 1, 0, 7, 6, 5, 4, 3);
 		tmp6[3] = ROUND_ELT(sharedMemory, tmp5, 3, 2, 1, 0, 7, 6, 5, 4);
 		tmp6[4] = ROUND_ELT(sharedMemory, tmp5, 4, 3, 2, 1, 0, 7, 6, 5);
 		tmp6[5] = ROUND_ELT(sharedMemory, tmp5, 5, 4, 3, 2, 1, 0, 7, 6);
 		tmp6[6] = ROUND_ELT(sharedMemory, tmp5, 6, 5, 4, 3, 2, 1, 0, 7);
 		tmp6[7] = ROUND_ELT(sharedMemory, tmp5, 7, 6, 5, 4, 3, 2, 1, 0);
 		d_tmp[threadIdx.x+24]=tmp6[threadIdx.x];
 		uint64_t tmp7[8];
 		tmp7[0] = xor1(ROUND_ELT(sharedMemory, tmp6, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[4]);
 		tmp7[1] = ROUND_ELT(sharedMemory, tmp6, 1, 0, 7, 6, 5, 4, 3, 2);
 		tmp7[2] = ROUND_ELT(sharedMemory, tmp6, 2, 1, 0, 7, 6, 5, 4, 3);
 		tmp7[3] = ROUND_ELT(sharedMemory, tmp6, 3, 2, 1, 0, 7, 6, 5, 4);
 		tmp7[4] = ROUND_ELT(sharedMemory, tmp6, 4, 3, 2, 1, 0, 7, 6, 5);
 		tmp7[5] = ROUND_ELT(sharedMemory, tmp6, 5, 4, 3, 2, 1, 0, 7, 6);
 		tmp7[6] = ROUND_ELT(sharedMemory, tmp6, 6, 5, 4, 3, 2, 1, 0, 7);
 		tmp7[7] = ROUND_ELT(sharedMemory, tmp6, 7, 6, 5, 4, 3, 2, 1, 0);
 		d_tmp[threadIdx.x+32]=tmp7[threadIdx.x];
 		uint64_t tmp8[8];
 		tmp8[0] = xor1(ROUND_ELT(sharedMemory, tmp7, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[5]);
 		tmp8[1] = ROUND_ELT(sharedMemory, tmp7, 1, 0, 7, 6, 5, 4, 3, 2);
 		tmp8[2] = ROUND_ELT(sharedMemory, tmp7, 2, 1, 0, 7, 6, 5, 4, 3);
 		tmp8[3] = ROUND_ELT(sharedMemory, tmp7, 3, 2, 1, 0, 7, 6, 5, 4);
 		tmp8[4] = ROUND_ELT(sharedMemory, tmp7, 4, 3, 2, 1, 0, 7, 6, 5);
 		tmp8[5] = ROUND_ELT(sharedMemory, tmp7, 5, 4, 3, 2, 1, 0, 7, 6);
 		tmp8[6] = ROUND_ELT(sharedMemory, tmp7, 6, 5, 4, 3, 2, 1, 0, 7);
 		tmp8[7] = ROUND_ELT(sharedMemory, tmp7, 7, 6, 5, 4, 3, 2, 1, 0);
 		d_tmp[threadIdx.x+40]=tmp8[threadIdx.x];
 		uint64_t tmp9[8];
 		tmp9[0] = xor1(ROUND_ELT(sharedMemory, tmp8, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[6]);
 		tmp9[1] = ROUND_ELT(sharedMemory, tmp8, 1, 0, 7, 6, 5, 4, 3, 2);
 		tmp9[2] = ROUND_ELT(sharedMemory, tmp8, 2, 1, 0, 7, 6, 5, 4, 3);
 		tmp9[3] = ROUND_ELT(sharedMemory, tmp8, 3, 2, 1, 0, 7, 6, 5, 4);
 		tmp9[4] = ROUND_ELT(sharedMemory, tmp8, 4, 3, 2, 1, 0, 7, 6, 5);
 		tmp9[5] = ROUND_ELT(sharedMemory, tmp8, 5, 4, 3, 2, 1, 0, 7, 6);
 		tmp9[6] = ROUND_ELT(sharedMemory, tmp8, 6, 5, 4, 3, 2, 1, 0, 7);
 		tmp9[7] = ROUND_ELT(sharedMemory, tmp8, 7, 6, 5, 4, 3, 2, 1, 0);
 		d_tmp[threadIdx.x+48]=tmp9[threadIdx.x];
 		uint64_t tmp10[8];
 		tmp10[0] = xor1(ROUND_ELT(sharedMemory, tmp9, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[7]);
 		tmp10[1] = ROUND_ELT(sharedMemory, tmp9, 1, 0, 7, 6, 5, 4, 3, 2);
 		tmp10[2] = ROUND_ELT(sharedMemory, tmp9, 2, 1, 0, 7, 6, 5, 4, 3);
 		tmp10[3] = ROUND_ELT(sharedMemory, tmp9, 3, 2, 1, 0, 7, 6, 5, 4);
 		tmp10[4] = ROUND_ELT(sharedMemory, tmp9, 4, 3, 2, 1, 0, 7, 6, 5);
 		tmp10[5] = ROUND_ELT(sharedMemory, tmp9, 5, 4, 3, 2, 1, 0, 7, 6);
 		tmp10[6] = ROUND_ELT(sharedMemory, tmp9, 6, 5, 4, 3, 2, 1, 0, 7);
 		tmp10[7] = ROUND_ELT(sharedMemory, tmp9, 7, 6, 5, 4, 3, 2, 1, 0);
 		d_tmp[threadIdx.x+56]=tmp10[threadIdx.x];
 		uint64_t tmp11[8];
 		tmp11[0] = xor1(ROUND_ELT(sharedMemory, tmp10, 0, 7, 6, 5, 4, 3, 2, 1), InitVector_RC[8]);
 		tmp11[1] = ROUND_ELT(sharedMemory, tmp10, 1, 0, 7, 6, 5, 4, 3, 2);
 		tmp11[2] = ROUND_ELT(sharedMemory, tmp10, 2, 1, 0, 7, 6, 5, 4, 3);
 		tmp11[3] = ROUND_ELT(sharedMemory, tmp10, 3, 2, 1, 0, 7, 6, 5, 4);
 		tmp11[4] = ROUND_ELT(sharedMemory, tmp10, 4, 3, 2, 1, 0, 7, 6, 5);
 		tmp11[5] = ROUND_ELT(sharedMemory, tmp10, 5, 4, 3, 2, 1, 0, 7, 6);
 		tmp11[6] = ROUND_ELT(sharedMemory, tmp10, 6, 5, 4, 3, 2, 1, 0, 7);
 		tmp11[7] = ROUND_ELT(sharedMemory, tmp10, 7, 6, 5, 4, 3, 2, 1, 0);
 		d_tmp[threadIdx.x+64]=tmp11[threadIdx.x];
 		if(threadIdx.x==1){
 			tmp[0]=ROUND_ELT(sharedMemory,tmp11, 3, 2, 1, 0, 7, 6, 5, 4);
 			tmp[1]=ROUND_ELT(sharedMemory,tmp11, 5, 4, 3, 2, 1, 0, 7, 6);
 			tmp[4] = xor3(tmp[0],tmp[1],atLastCalc);
 			d_xtra[threadIdx.x]=tmp[4];
 		}
 	}
 }
 __global__ __launch_bounds__(threadsPerBlock,2)
 void whirlpoolx_gpu_hash(uint32_t threads, uint32_t startNounce, uint32_t *resNounce)
 {
 	__shared__ uint64_t sharedMemory[2048];
 	whirlpoolx_getShared(sharedMemory);
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint64_t n[8];
 		uint64_t tmp[8];
 		uint32_t nounce = startNounce + thread;
 		n[1] = xor1(REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nounce)),c_xtra[0]);
 		uint32_t* n32 = (uint32_t*)&n[0];
 		n[0]=sharedMemory[__byte_perm(n32[3], 0, 0x4443) + 1792];
 		n[5]=sharedMemory[__byte_perm(n32[3], 0, 0x4440) + 1024];
 		n[6]=sharedMemory[__byte_perm(n32[3], 0, 0x4441) + 1280];
 		n[7]=sharedMemory[__byte_perm(n32[3], 0, 0x4442) + 1536];
 		n[0]=xor1(c_tmp[0],n[0]);
 		n[1]=c_tmp[1];
 		n[2]=c_tmp[2];
 		n[3]=c_tmp[3];
 		n[4]=c_tmp[4];
 		n[5]=xor1(c_tmp[5],n[5]);
 		n[6]=xor1(c_tmp[6],n[6]);
 		n[7]=xor1(c_tmp[7],n[7]);
 		tmp[0]=xor3(sharedMemory[__byte_perm(n32[10],0,0x4443)+768],sharedMemory[__byte_perm(n32[12],0,0x4442)+512],sharedMemory[__byte_perm(n32[14],0,0x4441)+256]);
 		tmp[1]=xor3(sharedMemory[__byte_perm(n32[11],0,0x4440)+1024],sharedMemory[__byte_perm(n32[12],0,0x4443)+768],sharedMemory[__byte_perm(n32[14],0,0x4442)+512]);
 		tmp[2]=xor3(sharedMemory[__byte_perm(n32[11],0,0x4441)+1280],sharedMemory[__byte_perm(n32[13],0,0x4440)+1024],sharedMemory[__byte_perm(n32[14],0,0x4443)+768]);
 		tmp[3]=xor3(sharedMemory[__byte_perm(n32[11],0,0x4442)+1536],sharedMemory[__byte_perm(n32[13],0,0x4441)+1280],sharedMemory[__byte_perm(n32[15],0,0x4440)+1024]);
 		tmp[4]=xor3(sharedMemory[__byte_perm(n32[11],0,0x4443)+1792],sharedMemory[__byte_perm(n32[13],0,0x4442)+1536],sharedMemory[__byte_perm(n32[15],0,0x4441)+1280]);
 		tmp[5]=xor3(sharedMemory[__byte_perm(n32[10],0,0x4440)],sharedMemory[__byte_perm(n32[13],0,0x4443)+1792],sharedMemory[__byte_perm(n32[15],0,0x4442)+1536]);
 		tmp[6]=xor3(sharedMemory[__byte_perm(n32[12],0,0x4440)],sharedMemory[__byte_perm(n32[10],0,0x4441)+256],sharedMemory[__byte_perm(n32[15],0,0x4443)+1792]);
 		tmp[7]=xor3(sharedMemory[__byte_perm(n32[14],0,0x4440)],sharedMemory[__byte_perm(n32[12],0,0x4441)+256],sharedMemory[__byte_perm(n32[10],0,0x4442)+ 512]);
 		tmp[0]=xor3(sharedMemory[__byte_perm(n32[ 0], 0, 0x4440)],tmp[0],c_tmp[0+16]);
 		tmp[1]=xor3(sharedMemory[__byte_perm(n32[ 0], 0, 0x4441) + 256],tmp[1],c_tmp[1+16]);
 		tmp[2]=xor3(sharedMemory[__byte_perm(n32[ 0], 0, 0x4442) +  512],tmp[2],c_tmp[2+16]);
 		tmp[3]=xor3(sharedMemory[__byte_perm(n32[ 0], 0, 0x4443) + 768],tmp[3],c_tmp[3+16]);
 		tmp[4]=xor3(sharedMemory[__byte_perm(n32[ 1], 0, 0x4440) + 1024],tmp[4],c_tmp[4+16]);
 		tmp[5]=xor3(sharedMemory[__byte_perm(n32[ 1], 0, 0x4441) + 1280],tmp[5],c_tmp[5+16]);
 		tmp[6]=xor3(sharedMemory[__byte_perm(n32[ 1], 0, 0x4442) + 1536],tmp[6],c_tmp[6+16]);
 		tmp[7]=xor3(sharedMemory[__byte_perm(n32[ 1], 0, 0x4443) + 1792],tmp[7],c_tmp[7+16]);
 		n[0]=tmp[0];
 		n[1]=tmp[1];
 		n[2]=tmp[2];
 		n[3]=tmp[3];
 		n[4]=tmp[4];
 		n[5]=tmp[5];
 		n[6]=tmp[6];
 		n[7]=tmp[7];
 		tmp[0] = xor1(ROUND_ELT(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1), c_tmp[0+8]);
 		tmp[1] = xor1(ROUND_ELT(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2), c_tmp[1+8]);
 		tmp[2] = xor1(ROUND_ELT(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3), c_tmp[2+8]);
 		tmp[3] = xor1(ROUND_ELT(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4), c_tmp[3+8]);
 		tmp[4] = xor1(ROUND_ELT(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5), c_tmp[4+8]);
 		tmp[5] = xor1(ROUND_ELT(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6), c_tmp[5+8]);
 		tmp[6] = xor1(ROUND_ELT(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7), c_tmp[6+8]);
 		tmp[7] = xor1(ROUND_ELT(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0), c_tmp[7+8]);
 		n[0] = xor1(ROUND_ELT(sharedMemory, tmp, 0, 7, 6, 5, 4, 3, 2, 1), c_tmp[0+24]);
 		n[1] = xor1(ROUND_ELT(sharedMemory, tmp, 1, 0, 7, 6, 5, 4, 3, 2), c_tmp[1+24]);
 		n[2] = xor1(ROUND_ELT(sharedMemory, tmp, 2, 1, 0, 7, 6, 5, 4, 3), c_tmp[2+24]);
 		n[3] = xor1(ROUND_ELT(sharedMemory, tmp, 3, 2, 1, 0, 7, 6, 5, 4), c_tmp[3+24]);
 		n[4] = xor1(ROUND_ELT(sharedMemory, tmp, 4, 3, 2, 1, 0, 7, 6, 5), c_tmp[4+24]);
 		n[5] = xor1(ROUND_ELT(sharedMemory, tmp, 5, 4, 3, 2, 1, 0, 7, 6), c_tmp[5+24]);
 		n[6] = xor1(ROUND_ELT(sharedMemory, tmp, 6, 5, 4, 3, 2, 1, 0, 7), c_tmp[6+24]);
 		n[7] = xor1(ROUND_ELT(sharedMemory, tmp, 7, 6, 5, 4, 3, 2, 1, 0), c_tmp[7+24]);
 		tmp[0] = xor1(ROUND_ELT(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1), c_tmp[0+32]);
 		tmp[1] = xor1(ROUND_ELT(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2), c_tmp[1+32]);
 		tmp[2] = xor1(ROUND_ELT(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3), c_tmp[2+32]);
 		tmp[3] = xor1(ROUND_ELT(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4), c_tmp[3+32]);
 		tmp[4] = xor1(ROUND_ELT(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5), c_tmp[4+32]);
 		tmp[5] = xor1(ROUND_ELT(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6), c_tmp[5+32]);
 		tmp[6] = xor1(ROUND_ELT(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7), c_tmp[6+32]);
 		tmp[7] = xor1(ROUND_ELT(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0), c_tmp[7+32]);
 		n[0] = xor1(ROUND_ELT(sharedMemory, tmp, 0, 7, 6, 5, 4, 3, 2, 1), c_tmp[0+40]);
 		n[1] = xor1(ROUND_ELT(sharedMemory, tmp, 1, 0, 7, 6, 5, 4, 3, 2), c_tmp[1+40]);
 		n[2] = xor1(ROUND_ELT(sharedMemory, tmp, 2, 1, 0, 7, 6, 5, 4, 3), c_tmp[2+40]);
 		n[3] = xor1(ROUND_ELT(sharedMemory, tmp, 3, 2, 1, 0, 7, 6, 5, 4), c_tmp[3+40]);
 		n[4] = xor1(ROUND_ELT(sharedMemory, tmp, 4, 3, 2, 1, 0, 7, 6, 5), c_tmp[4+40]);
 		n[5] = xor1(ROUND_ELT(sharedMemory, tmp, 5, 4, 3, 2, 1, 0, 7, 6), c_tmp[5+40]);
 		n[6] = xor1(ROUND_ELT(sharedMemory, tmp, 6, 5, 4, 3, 2, 1, 0, 7), c_tmp[6+40]);
 		n[7] = xor1(ROUND_ELT(sharedMemory, tmp, 7, 6, 5, 4, 3, 2, 1, 0), c_tmp[7+40]);
 		tmp[0] = xor1(ROUND_ELT(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1), c_tmp[0+48]);
 		tmp[1] = xor1(ROUND_ELT(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2), c_tmp[1+48]);
 		tmp[2] = xor1(ROUND_ELT(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3), c_tmp[2+48]);
 		tmp[3] = xor1(ROUND_ELT(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4), c_tmp[3+48]);
 		tmp[4] = xor1(ROUND_ELT(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5), c_tmp[4+48]);
 		tmp[5] = xor1(ROUND_ELT(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6), c_tmp[5+48]);
 		tmp[6] = xor1(ROUND_ELT(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7), c_tmp[6+48]);
 		tmp[7] = xor1(ROUND_ELT(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0), c_tmp[7+48]);
 		n[0] = xor1(ROUND_ELT(sharedMemory, tmp, 0, 7, 6, 5, 4, 3, 2, 1), c_tmp[0+56]);
 		n[1] = xor1(ROUND_ELT(sharedMemory, tmp, 1, 0, 7, 6, 5, 4, 3, 2), c_tmp[1+56]);
 		n[2] = xor1(ROUND_ELT(sharedMemory, tmp, 2, 1, 0, 7, 6, 5, 4, 3), c_tmp[2+56]);
 		n[3] = xor1(ROUND_ELT(sharedMemory, tmp, 3, 2, 1, 0, 7, 6, 5, 4), c_tmp[3+56]);
 		n[4] = xor1(ROUND_ELT(sharedMemory, tmp, 4, 3, 2, 1, 0, 7, 6, 5), c_tmp[4+56]);
 		n[5] = xor1(ROUND_ELT(sharedMemory, tmp, 5, 4, 3, 2, 1, 0, 7, 6), c_tmp[5+56]);
 		n[6] = xor1(ROUND_ELT(sharedMemory, tmp, 6, 5, 4, 3, 2, 1, 0, 7), c_tmp[6+56]);
 		n[7] = xor1(ROUND_ELT(sharedMemory, tmp, 7, 6, 5, 4, 3, 2, 1, 0), c_tmp[7+56]);
 		tmp[0] = xor1(ROUND_ELT(sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1), c_tmp[0+64]);
 		tmp[1] = xor1(ROUND_ELT(sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2), c_tmp[1+64]);
 		tmp[2] = xor1(ROUND_ELT(sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3), c_tmp[2+64]);
 		tmp[3] = xor1(ROUND_ELT(sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4), c_tmp[3+64]);
 		tmp[4] = xor1(ROUND_ELT(sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5), c_tmp[4+64]);
 		tmp[5] = xor1(ROUND_ELT(sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6), c_tmp[5+64]);
 		tmp[6] = xor1(ROUND_ELT(sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7), c_tmp[6+64]);
 		tmp[7] = xor1(ROUND_ELT(sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0), c_tmp[7+64]);
 		if (xor3(c_xtra[1], ROUND_ELT(sharedMemory, tmp, 3, 2, 1, 0, 7, 6, 5, 4), ROUND_ELT(sharedMemory, tmp, 5, 4, 3, 2, 1, 0, 7, 6)) <= pTarget[3]) {
 			atomicMin(&resNounce[0], nounce);
 		}
 	}
 }
 __host__
 extern void whirlpoolx_cpu_init(int thr_id, uint32_t threads)
 {
 	cudaMemcpyToSymbol(InitVector_RC, plain_RC, sizeof(plain_RC), 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(mixTob0Tox, plain_T0, sizeof(plain_T0), 0, cudaMemcpyHostToDevice);
 	cudaMalloc(&d_WXNonce[thr_id], sizeof(uint32_t));
 	cudaMallocHost(&h_wxnounce[thr_id], sizeof(uint32_t));
 	cudaMalloc(&d_xtra[thr_id], 8 * sizeof(uint64_t));
 	CUDA_SAFE_CALL(cudaMalloc(&d_tmp[thr_id], 8 * 9 * sizeof(uint64_t))); // d_tmp[threadIdx.x+64] (7+64)
 }
 __host__
 void whirlpoolx_setBlock_80(void *pdata, const void *ptarget)
 {
 	uint64_t PaddedMessage[16];
 	memcpy(PaddedMessage, pdata, 80);
 	memset((uint8_t*)&PaddedMessage+80, 0, 48);
 	((uint8_t*)PaddedMessage)[80] = 0x80; /* ending */
 	cudaMemcpyToSymbol(pTarget, ptarget, 4*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
 }
 __host__
 void whirlpoolx_precompute(int thr_id)
 {
 	dim3 grid(1);
 	dim3 block(256);
 	whirlpoolx_gpu_precompute <<<grid, block>>>(8, d_xtra[thr_id], d_tmp[thr_id]);
 	cudaThreadSynchronize();
 	cudaMemcpyToSymbol(c_xtra, d_xtra[thr_id], 8 * sizeof(uint64_t), 0, cudaMemcpyDeviceToDevice);
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_tmp, d_tmp[thr_id], 8 * 9 * sizeof(uint64_t), 0, cudaMemcpyDeviceToDevice));
 }
 __host__
 uint32_t whirlpoolx_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce)
 {
 	dim3 grid((threads + threadsPerBlock-1) / threadsPerBlock);
 	dim3 block(threadsPerBlock);
 	cudaMemset(d_WXNonce[thr_id], 0xff, sizeof(uint32_t));
 	whirlpoolx_gpu_hash<<<grid, block>>>(threads, startNounce, d_WXNonce[thr_id]);
 	cudaThreadSynchronize();
 	cudaMemcpy(h_wxnounce[thr_id], d_WXNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	return *(h_wxnounce[thr_id]);
 }
--- a/x15/whirlpool.cu
+++ b/x15/whirlpool.cu
@ -1,119 +0,0 @@
 /*
 * whirlpool routine (djm)
 */
 extern "C"
 {
 #include "sph/sph_whirlpool.h"
 #include "miner.h"
 }
 #include "cuda_helper.h"
 static uint32_t *d_hash[MAX_GPUS];
 extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int mode);
 extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void whirlpool512_setBlock_80(void *pdata, const void *ptarget);
 extern void whirlpool512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
 extern uint32_t whirlpool512_cpu_finalhash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 // CPU Hash function
 extern "C" void wcoinhash(void *state, const void *input)
 {
 	sph_whirlpool_context ctx_whirlpool;
 	unsigned char hash[128]; // uint32_t hashA[16], hashB[16];
 	#define hashB hash+64
 	memset(hash, 0, sizeof hash);
 	// shavite 1
 	sph_whirlpool1_init(&ctx_whirlpool);
 	sph_whirlpool1(&ctx_whirlpool, input, 80);
 	sph_whirlpool1_close(&ctx_whirlpool, hash);
 	sph_whirlpool1_init(&ctx_whirlpool);
 	sph_whirlpool1(&ctx_whirlpool, hash, 64);
 	sph_whirlpool1_close(&ctx_whirlpool, hashB);
 	sph_whirlpool1_init(&ctx_whirlpool);
 	sph_whirlpool1(&ctx_whirlpool, hashB, 64);
 	sph_whirlpool1_close(&ctx_whirlpool, hash);
 	sph_whirlpool1_init(&ctx_whirlpool);
 	sph_whirlpool1(&ctx_whirlpool, hash, 64);
 	sph_whirlpool1_close(&ctx_whirlpool, hash);
 	memcpy(state, hash, 32);
 }
 static bool init[MAX_GPUS] = { 0 };
 extern "C" int scanhash_whc(int thr_id, uint32_t *pdata,
    const uint32_t *ptarget, uint32_t max_nonce,
    unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
 	uint32_t endiandata[20];
 	uint32_t throughput =  device_intensity(thr_id, __func__, 1U << 19); // 19=256*256*8;
 	throughput = min(throughput, max_nonce - first_nonce);
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
 	if (!init[thr_id]) {
 		cudaSetDevice(device_map[thr_id]);
 		// Konstanten kopieren, Speicher belegen
 		cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
 		x15_whirlpool_cpu_init(thr_id, throughput, 1 /* old whirlpool */);
 		init[thr_id] = true;
 	}
 	for (int k=0; k < 20; k++) {
 		be32enc(&endiandata[k], pdata[k]);
 	}
 	whirlpool512_setBlock_80((void*)endiandata, ptarget);
 	do {
 		uint32_t foundNonce;
 		int order = 0;
 		whirlpool512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		foundNonce = whirlpool512_cpu_finalhash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		if (foundNonce != UINT32_MAX)
 		{
 			const uint32_t Htarg = ptarget[7];
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
 			wcoinhash(vhash64, endiandata);
 			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
 				int res = 1;
 				*hashes_done = pdata[19] - first_nonce + throughput;
 				#if 0
 				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
 				if (secNonce != 0) {
 					pdata[21] = secNonce;
 					res++;
 				}
 				#endif
 				pdata[19] = foundNonce;
 				return res;
 			} else {
 				applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce);
 			}
 		}
 		pdata[19] += throughput;
 	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
 	*hashes_done = pdata[19] - first_nonce;
 	return 0;
 }
--- a/x15/whirlpoolx.cu
+++ b/x15/whirlpoolx.cu
@ -1,97 +0,0 @@
 /*
 * whirlpool routine (djm)
 * whirlpoolx routine (provos alexis, tpruvot)
 */
 extern "C" {
 #include "sph/sph_whirlpool.h"
 }
 #include "miner.h"
 #include "cuda_helper.h"
 static uint32_t *d_hash[MAX_GPUS];
 extern void whirlpoolx_cpu_init(int thr_id, uint32_t threads);
 extern void whirlpoolx_setBlock_80(void *pdata, const void *ptarget);
 extern uint32_t whirlpoolx_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce);
 extern void whirlpoolx_precompute(int thr_id);
 // CPU Hash function
 extern "C" void whirlxHash(void *state, const void *input)
 {
 	sph_whirlpool_context ctx_whirlpool;
 	unsigned char hash[64];
 	unsigned char hash_xored[32];
 	sph_whirlpool_init(&ctx_whirlpool);
 	sph_whirlpool(&ctx_whirlpool, input, 80);
 	sph_whirlpool_close(&ctx_whirlpool, hash);
 	// compress the 48 first bytes of the hash to 32
 	for (int i = 0; i < 32; i++) {
 		hash_xored[i] = hash[i] ^ hash[i + 16];
 	}
 	memcpy(state, hash_xored, 32);
 }
 static bool init[MAX_GPUS] = { 0 };
 extern "C" int scanhash_whirlpoolx(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 	uint32_t max_nonce, unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
 	uint32_t endiandata[20];
 	int intensity = is_windows() ? 20 : 22;
 	uint32_t throughput = device_intensity(thr_id, __func__, 1U << intensity);
 	throughput = min(throughput, max_nonce - first_nonce);
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
 	if (!init[thr_id]) {
 		cudaSetDevice(device_map[thr_id]);
 		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 64 * throughput), 0);
 		whirlpoolx_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}
 	for (int k=0; k < 20; k++) {
 		be32enc(&endiandata[k], pdata[k]);
 	}
 	whirlpoolx_setBlock_80((void*)endiandata, ptarget);
 	whirlpoolx_precompute(thr_id);
 	do {
 		uint32_t foundNonce = whirlpoolx_cpu_hash(thr_id, throughput, pdata[19]);
 		if (foundNonce != UINT32_MAX)
 		{
 			const uint32_t Htarg = ptarget[7];
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
 			whirlxHash(vhash64, endiandata);
 			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
 				*hashes_done = pdata[19] - first_nonce + throughput;
 				pdata[19] = foundNonce;
 				return 1;
 			} else {
 				applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce);
 			}
 		}
 		pdata[19] += throughput;
 		if (((uint64_t)pdata[19]+throughput) >= max_nonce) {
 			break;
 		}
 	} while (!work_restart[thr_id].restart);
 	*(hashes_done) = pdata[19] - first_nonce + 1;
 	return 0;
 }