Add skein algo (Skeincoin, Myriad, Unat...)

SKEIN512 + SHA256 Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
10 years ago · f86784ee56
10 changed files with 720 additions and 12 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -41,7 +41,7 @@ ccminer_SOURCES	= elist.h miner.h compat.h \
 			  quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu \
 			  quark/quarkcoin.cu quark/animecoin.cu \
 			  quark/cuda_quark_compactionTest.cu  \
-			  cuda_nist5.cu pentablake.cu zr5.cu \
+			  cuda_nist5.cu pentablake.cu skein.cu zr5.cu \
 			  sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \
 			  sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
 			  sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \
@ -117,3 +117,7 @@ quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
 JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu
 	$(NVCC) $(nvcc_FLAGS) -I cudpp-2.1/include --maxrregcount=80 -o $@ -c $<
 skein.o: skein.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<
--- a/README.txt
+++ b/README.txt
@ -34,10 +34,11 @@ TalkCoin
 DarkCoin and other X11 coins
 Saffroncoin blake (256 14-rounds)
 BlakeCoin (256 8-rounds)
 Keccak (Maxcoin)
 Deep, Doom and Qubit
 Keccak (Maxcoin)
 Pentablake (Blake 512 x5)
 1Coin Triple S
 Skein (Skein + SHA)
 Vertcoin Lyra2RE
 Ziftrcoin (ZR5)
@ -79,6 +80,7 @@ its command line interface and options.
                          quark       use to mine Quarkcoin
                          qubit       use to mine Qubit Algo
                          s3          use to mine 1coin
                          skein       use to mine Skeincoin
                          whirl       use to mine Whirlcoin
                          whirlpoolx  use to mine Vanillacoin
                          x11         use to mine DarkCoin
@ -187,6 +189,7 @@ features.
  Mar. 27th 2015  v1.6.0
                  Add the ZR5 Algo for Ziftcoin
                  Implement Skeincoin algo (skein + sha)
                  Import pluck (djm34) and whirlpoolx (alexis78) algos
                  Hashrate units based on hashing rate values (Hs/kHs/MHs/GHs)
                  Default config file (also help to debug without command line)
--- a/ccminer.cpp
+++ b/ccminer.cpp
@ -100,6 +100,7 @@ enum sha_algos {
 	ALGO_PLUCK,
 	ALGO_QUARK,
 	ALGO_QUBIT,
 	ALGO_SKEIN,
 	ALGO_S3,
 	ALGO_WHIRLCOIN,
 	ALGO_WHIRLPOOLX,
@ -133,6 +134,7 @@ static const char *algo_names[] = {
 	"pluck",
 	"quark",
 	"qubit",
 	"skein",
 	"s3",
 	"whirl",
 	"whirlpoolx",
@ -249,6 +251,7 @@ Options:\n\
 			pluck       SupCoin\n\
 			quark       Quark\n\
 			qubit       Qubit\n\
 			skein       Skein SHA2 (Skeincoin)\n\
 			s3          S3 (1Coin)\n\
 			x11         X11 (DarkCoin)\n\
 			x13         X13 (MaruCoin)\n\
@ -1512,6 +1515,11 @@ static void *miner_thread(void *userdata)
 			                      max_nonce, &hashes_done);
 			break;
 		case ALGO_SKEIN:
 			rc = scanhash_skeincoin(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
 			break;
 		case ALGO_S3:
 			rc = scanhash_s3(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -440,6 +440,9 @@
    </CudaCompile>
    <CudaCompile Include="lyra2\lyra2RE.cu" />
    <CudaCompile Include="lyra2\cuda_lyra2.cu" />
    <CudaCompile Include="skein.cu">
      <MaxRegCount>64</MaxRegCount>
    </CudaCompile>
    <CudaCompile Include="x11\cuda_x11_aes.cu">
      <ExcludedFromBuild>true</ExcludedFromBuild>
    </CudaCompile>
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -574,6 +574,9 @@
    <CudaCompile Include="groestl_functions_quad.cu">
      <Filter>Source Files\CUDA\quark</Filter>
    </CudaCompile>
    <CudaCompile Include="skein.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
  </ItemGroup>
  <ItemGroup>
    <Image Include="res\ccminer.ico">
--- a/miner.h
+++ b/miner.h
@ -346,6 +346,10 @@ extern int scanhash_scrypt(int thr_id, uint32_t *pdata,
 	unsigned char *scratchbuf, const uint32_t *ptarget,
 	uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_skeincoin(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
 extern int scanhash_s3(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
@ -675,6 +679,7 @@ void pentablakehash(void *output, const void *input);
 void pluckhash(uint32_t *hash, const uint32_t *data, uchar *hashbuffer, const int N);
 void quarkhash(void *state, const void *input);
 void qubithash(void *state, const void *input);
 void skeincoinhash(void *output, const void *input);
 void s3hash(void *output, const void *input);
 void wcoinhash(void *state, const void *input);
 void whirlxHash(void *state, const void *input);
--- a/myriadgroestl.cpp
+++ b/myriadgroestl.cpp
@ -65,12 +65,12 @@ extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptar
 	do {
 		// GPU
-		uint32_t foundNounce = 0xFFFFFFFF;
+		uint32_t foundNounce = UINT32_MAX;
 		const uint32_t Htarg = ptarget[7];
 		myriadgroestl_cpu_hash(thr_id, throughput, pdata[19], outputHash, &foundNounce);
-		if(foundNounce < 0xffffffff)
+		if (foundNounce < UINT32_MAX)
 		{
 			uint32_t tmpHash[8];
 			endiandata[19] = SWAP32(foundNounce);
--- a/quark/cuda_skein512.cu
+++ b/quark/cuda_skein512.cu
@ -4,6 +4,8 @@
 #include "cuda_helper.h"
 static __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
 // Take a look at: https://www.schneier.com/skein1.3.pdf
 #define SHL(x, n)			((x) << (n))
@ -377,7 +379,7 @@ void quark_skein512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t
 		// 1. Runde -> etype = 480, ptr = 64, bcount = 0, data = msg
 		#pragma unroll 8
-		for(int i=0; i<8; i++)
+		for (int i = 0; i < 8; i++)
 			p[i] = vectorize(inpHash[i]);
 		t0 = vectorize(64); // ptr
@ -540,7 +542,7 @@ void quark_skein512_gpu_hash_64_v30(uint32_t threads, uint32_t startNounce, uint
 		TFBIG_4o(17);
 		TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18);
-		// fertig
+		// output
 		uint64_t *outpHash = &g_hash[8 * hashPosition];
 		#pragma unroll 8
@ -549,6 +551,176 @@ void quark_skein512_gpu_hash_64_v30(uint32_t threads, uint32_t startNounce, uint
 	}
 }
 __global__ __launch_bounds__(128,6)
 void skein512_gpu_hash_close(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint2 t0 = vectorize(8); // extra
 		uint2 t1 = vectorize(0xFF00000000000000ull); // etype
 		uint2 t2 = vectorize(0xB000000000000050ull);
 		uint64_t *state = &g_hash[8 * thread];
 		uint2 h0 = vectorize(state[0]);
 		uint2 h1 = vectorize(state[1]);
 		uint2 h2 = vectorize(state[2]);
 		uint2 h3 = vectorize(state[3]);
 		uint2 h4 = vectorize(state[4]);
 		uint2 h5 = vectorize(state[5]);
 		uint2 h6 = vectorize(state[6]);
 		uint2 h7 = vectorize(state[7]);
 		uint2 h8;
 		TFBIG_KINIT_UI2(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
 		uint2 p[8] = { 0 };
 		//#pragma unroll 8
 		//for (int i = 0; i<8; i++)
 		//	p[i] = make_uint2(0, 0);
 		TFBIG_4e_UI2(0);
 		TFBIG_4o_UI2(1);
 		TFBIG_4e_UI2(2);
 		TFBIG_4o_UI2(3);
 		TFBIG_4e_UI2(4);
 		TFBIG_4o_UI2(5);
 		TFBIG_4e_UI2(6);
 		TFBIG_4o_UI2(7);
 		TFBIG_4e_UI2(8);
 		TFBIG_4o_UI2(9);
 		TFBIG_4e_UI2(10);
 		TFBIG_4o_UI2(11);
 		TFBIG_4e_UI2(12);
 		TFBIG_4o_UI2(13);
 		TFBIG_4e_UI2(14);
 		TFBIG_4o_UI2(15);
 		TFBIG_4e_UI2(16);
 		TFBIG_4o_UI2(17);
 		TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18);
 		uint64_t *outpHash = state;
 		#pragma unroll 8
 		for (int i = 0; i < 8; i++)
 			outpHash[i] = devectorize(p[i]);
 	}
 }
 __global__ __launch_bounds__(128,5)
 void skein512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *output64, int swap)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		// Skein
 		uint2 h0, h1, h2, h3, h4, h5, h6, h7, h8;
 		uint2 t0, t1, t2;
 		// Init
 		h0 = vectorize(0x4903ADFF749C51CEull);
 		h1 = vectorize(0x0D95DE399746DF03ull);
 		h2 = vectorize(0x8FD1934127C79BCEull);
 		h3 = vectorize(0x9A255629FF352CB1ull);
 		h4 = vectorize(0x5DB62599DF6CA7B0ull);
 		h5 = vectorize(0xEABE394CA9D5C3F4ull);
 		h6 = vectorize(0x991112C71A75B523ull);
 		h7 = vectorize(0xAE18A40B660FCC33ull);
 		// 1st step -> etype = 0xE0, ptr = 64, bcount = 0, extra = 0
 		t0 = vectorize(64); // ptr
 		//t1 = vectorize(0xE0ull << 55); // etype
 		t1 = vectorize(0x7000000000000000ull);
 		TFBIG_KINIT_UI2(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
 		uint2 p[8];
 		#pragma unroll 8
 		for (int i = 0; i<8; i++)
 			p[i] = vectorize(c_PaddedMessage80[i]);
 		TFBIG_4e_UI2(0);
 		TFBIG_4o_UI2(1);
 		TFBIG_4e_UI2(2);
 		TFBIG_4o_UI2(3);
 		TFBIG_4e_UI2(4);
 		TFBIG_4o_UI2(5);
 		TFBIG_4e_UI2(6);
 		TFBIG_4o_UI2(7);
 		TFBIG_4e_UI2(8);
 		TFBIG_4o_UI2(9);
 		TFBIG_4e_UI2(10);
 		TFBIG_4o_UI2(11);
 		TFBIG_4e_UI2(12);
 		TFBIG_4o_UI2(13);
 		TFBIG_4e_UI2(14);
 		TFBIG_4o_UI2(15);
 		TFBIG_4e_UI2(16);
 		TFBIG_4o_UI2(17);
 		TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18);
 		h0 = vectorize(c_PaddedMessage80[0]) ^ p[0];
 		h1 = vectorize(c_PaddedMessage80[1]) ^ p[1];
 		h2 = vectorize(c_PaddedMessage80[2]) ^ p[2];
 		h3 = vectorize(c_PaddedMessage80[3]) ^ p[3];
 		h4 = vectorize(c_PaddedMessage80[4]) ^ p[4];
 		h5 = vectorize(c_PaddedMessage80[5]) ^ p[5];
 		h6 = vectorize(c_PaddedMessage80[6]) ^ p[6];
 		h7 = vectorize(c_PaddedMessage80[7]) ^ p[7];
 		uint32_t nonce = swap ? cuda_swab32(startNounce + thread) : startNounce + thread;
 		uint2 nounce2 = make_uint2(_LOWORD(c_PaddedMessage80[9]), nonce);
 		// skein_big_close -> etype = 0x160, ptr = 16, bcount = 1, extra = 16
 		p[0] = vectorize(c_PaddedMessage80[8]);
 		p[1] = nounce2;
 		#pragma unroll
 		for (int i = 2; i < 8; i++)
 			p[i] = make_uint2(0,0);
 		t0 = vectorize(0x50ull); // SPH_T64(bcount << 6) + (sph_u64)(extra);
 		t1 = vectorize(0xB000000000000000ull); // (bcount >> 58) + ((sph_u64)(etype) << 55);
 		TFBIG_KINIT_UI2(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
 		TFBIG_4e_UI2(0);
 		TFBIG_4o_UI2(1);
 		TFBIG_4e_UI2(2);
 		TFBIG_4o_UI2(3);
 		TFBIG_4e_UI2(4);
 		TFBIG_4o_UI2(5);
 		TFBIG_4e_UI2(6);
 		TFBIG_4o_UI2(7);
 		TFBIG_4e_UI2(8);
 		TFBIG_4o_UI2(9);
 		TFBIG_4e_UI2(10);
 		TFBIG_4o_UI2(11);
 		TFBIG_4e_UI2(12);
 		TFBIG_4o_UI2(13);
 		TFBIG_4e_UI2(14);
 		TFBIG_4o_UI2(15);
 		TFBIG_4e_UI2(16);
 		TFBIG_4o_UI2(17);
 		TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18);
 		h0 = vectorize(c_PaddedMessage80[8]) ^ p[0];
 		h1 = nounce2 ^ p[1];
 		h2 = p[2];
 		h3 = p[3];
 		h4 = p[4];
 		h5 = p[5];
 		h6 = p[6];
 		h7 = p[7];
 		// skein_big_close 2nd loop -> etype = 0x1fe, ptr = 8, bcount = 0
 		uint64_t *outpHash = &output64[thread * 8];
 		outpHash[0] = devectorize(h0);
 		outpHash[1] = devectorize(h1);
 		outpHash[2] = devectorize(h2);
 		outpHash[3] = devectorize(h3);
 		outpHash[4] = devectorize(h4);
 		outpHash[5] = devectorize(h5);
 		outpHash[6] = devectorize(h6);
 		outpHash[7] = devectorize(h7);
 	}
 }
 __host__
 void quark_skein512_cpu_init(int thr_id, uint32_t threads)
 {
@ -559,7 +731,6 @@ void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNoun
 {
 	const uint32_t threadsperblock = 256;
 	// berechne wie viele Thread Blocks wir brauchen
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
@ -569,6 +740,31 @@ void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNoun
 	else
 		quark_skein512_gpu_hash_64_v30 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 	// Strategisches Sleep Kommando zur Senkung der CPU Last
 	MyStreamSynchronize(NULL, order, thr_id);
 }
 /* skeincoin */
 __host__
 void skein512_cpu_setBlock_80(void *pdata)
 {
 	uint32_t PaddedMessage[32] = { 0 };
 	memcpy(&PaddedMessage[0], pdata, 80);
 	CUDA_SAFE_CALL(
 		cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, sizeof(PaddedMessage), 0, cudaMemcpyHostToDevice)
 	);
 }
 __host__
 void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap)
 {
 	const uint32_t threadsperblock = 128;
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 	// hash function is cut in 2 parts
 	skein512_gpu_hash_80 <<< grid, block >>> (threads, startNounce, (uint64_t*)d_hash, swap);
 	skein512_gpu_hash_close <<< grid, block >>> (threads, startNounce, (uint64_t*)d_hash);
 }
--- a/skein.cu
+++ b/skein.cu
@ -0,0 +1,480 @@
 /**
 * SKEIN512 80 + SHA256 64
 * by tpruvot@github - 2015
 */
 extern "C" {
 #include "sph/sph_skein.h"
 }
 #include "miner.h"
 #include "cuda_helper.h"
 #include <openssl/sha.h>
 static uint32_t *d_hash[MAX_GPUS];
 extern void skein512_cpu_setBlock_80(void *pdata);
 extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);
 static __device__ __constant__ uint32_t sha256_hashTable[] = {
 	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
 };
 static __device__ __constant__ uint32_t sha256_constantTable[64] = {
 	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
 };
 static __device__ __constant__ uint32_t sha256_endingTable[] = {
 	0x80000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
 	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000200,
 	0x80000000, 0x01400000, 0x00205000, 0x00005088, 0x22000800, 0x22550014, 0x05089742, 0xa0000020,
 	0x5a880000, 0x005c9400, 0x0016d49d, 0xfa801f00, 0xd33225d0, 0x11675959, 0xf6e6bfda, 0xb30c1549,
 	0x08b2b050, 0x9d7c4c27, 0x0ce2a393, 0x88e6e1ea, 0xa52b4335, 0x67a16f49, 0xd732016f, 0x4eeb2e91,
 	0x5dbf55e5, 0x8eee2335, 0xe2bc5ec2, 0xa83f4394, 0x45ad78f7, 0x36f3d0cd, 0xd99c05e8, 0xb0511dc7,
 	0x69bc7ac4, 0xbd11375b, 0xe3ba71e5, 0x3b209ff2, 0x18feee17, 0xe25ad9e7, 0x13375046, 0x0515089d,
 	0x4f0d0f04, 0x2627484e, 0x310128d2, 0xc668b434, 0x420841cc, 0x62d311b8, 0xe59ba771, 0x85a7a484
 };
 /* Elementary functions used by SHA256 */
 #define SWAB32(x)     cuda_swab32(x)
 //#define ROTR32(x,n)   SPH_ROTR32(x,n)
 #define R(x, n)       ((x) >> (n))
 #define Ch(x, y, z)   ((x & (y ^ z)) ^ z)
 #define Maj(x, y, z)  ((x & (y | z)) | (y & z))
 #define S0(x)         (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22))
 #define S1(x)         (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25))
 #define s0(x)         (ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3))
 #define s1(x)         (ROTR32(x, 17) ^ ROTR32(x, 19) ^ R(x, 10))
 #define ADVANCED_SHA2
 #ifndef ADVANCED_SHA2
 /* SHA256 round function */
 #define RND(a, b, c, d, e, f, g, h, k) \
 	do { \
 		t0 = h + S1(e) + Ch(e, f, g) + k; \
 		t1 = S0(a) + Maj(a, b, c); \
 		d += t0; \
 		h  = t0 + t1; \
 	} while (0)
 /* Adjusted round function for rotating state */
 #define RNDr(S, W, i) \
 	RND(S[(64 - i) & 7], S[(65 - i) & 7], \
 	    S[(66 - i) & 7], S[(67 - i) & 7], \
 	    S[(68 - i) & 7], S[(69 - i) & 7], \
 	    S[(70 - i) & 7], S[(71 - i) & 7], \
 	    W[i] + sha256_constantTable[i])
 static __constant__ uint32_t sha256_ending[16] = {
 	0x80000000UL, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x200UL
 };
 __device__
 void sha256_transform_gpu(uint32_t *state, uint32_t *message)
 {
 	uint32_t S[8];
 	uint32_t W[64];
 	uint32_t t0, t1;
 	/* Initialize work variables. */
 	for (int i = 0; i < 8; i++) {
 		S[i] = state[i];
 	}
 	for (int i = 0; i < 16; i++) {
 		W[i] = message[i];
 	}
 	for (int i = 16; i < 64; i += 2) {
 		W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
 		W[i + 1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15];
 	}
 	/* 3. Mix. */
 	RNDr(S, W, 0);
 	RNDr(S, W, 1);
 	RNDr(S, W, 2);
 	RNDr(S, W, 3);
 	RNDr(S, W, 4);
 	RNDr(S, W, 5);
 	RNDr(S, W, 6);
 	RNDr(S, W, 7);
 	RNDr(S, W, 8);
 	RNDr(S, W, 9);
 	RNDr(S, W, 10);
 	RNDr(S, W, 11);
 	RNDr(S, W, 12);
 	RNDr(S, W, 13);
 	RNDr(S, W, 14);
 	RNDr(S, W, 15);
 	RNDr(S, W, 16);
 	RNDr(S, W, 17);
 	RNDr(S, W, 18);
 	RNDr(S, W, 19);
 	RNDr(S, W, 20);
 	RNDr(S, W, 21);
 	RNDr(S, W, 22);
 	RNDr(S, W, 23);
 	RNDr(S, W, 24);
 	RNDr(S, W, 25);
 	RNDr(S, W, 26);
 	RNDr(S, W, 27);
 	RNDr(S, W, 28);
 	RNDr(S, W, 29);
 	RNDr(S, W, 30);
 	RNDr(S, W, 31);
 	RNDr(S, W, 32);
 	RNDr(S, W, 33);
 	RNDr(S, W, 34);
 	RNDr(S, W, 35);
 	RNDr(S, W, 36);
 	RNDr(S, W, 37);
 	RNDr(S, W, 38);
 	RNDr(S, W, 39);
 	RNDr(S, W, 40);
 	RNDr(S, W, 41);
 	RNDr(S, W, 42);
 	RNDr(S, W, 43);
 	RNDr(S, W, 44);
 	RNDr(S, W, 45);
 	RNDr(S, W, 46);
 	RNDr(S, W, 47);
 	RNDr(S, W, 48);
 	RNDr(S, W, 49);
 	RNDr(S, W, 50);
 	RNDr(S, W, 51);
 	RNDr(S, W, 52);
 	RNDr(S, W, 53);
 	RNDr(S, W, 54);
 	RNDr(S, W, 55);
 	RNDr(S, W, 56);
 	RNDr(S, W, 57);
 	RNDr(S, W, 58);
 	RNDr(S, W, 59);
 	RNDr(S, W, 60);
 	RNDr(S, W, 61);
 	RNDr(S, W, 62);
 	RNDr(S, W, 63);
 	for (int i = 0; i < 8; i++)
 		state[i] += S[i];
 }
 #endif
 #ifdef ADVANCED_SHA2
 __device__
 void skeincoin_gpu_sha256(uint32_t *message)
 {
 	uint32_t W1[16];
 	uint32_t W2[16];
 	uint32_t regs[8];
 	uint32_t hash[8];
 	// Init with Hash-Table
 	#pragma unroll 8
 	for (int k=0; k < 8; k++) {
 		hash[k] = regs[k] = sha256_hashTable[k];
 	}
 	#pragma unroll 16
 	for (int k = 0; k<16; k++)
 		W1[k] = SWAB32(message[k]);
 	// Progress W1
 	#pragma unroll 16
 	for (int j = 0; j<16; j++)
 	{
 		uint32_t T1, T2;
 		T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j] + W1[j];
 		T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
 		#pragma unroll 7
 		for (int k = 6; k >= 0; k--) regs[k + 1] = regs[k];
 		regs[0] = T1 + T2;
 		regs[4] += T1;
 	}
 	// Progress W2...W3
 	////// PART 1
 	#pragma unroll 2
 	for (int j = 0; j<2; j++)
 		W2[j] = s1(W1[14 + j]) + W1[9 + j] + s0(W1[1 + j]) + W1[j];
 	#pragma unroll 5
 	for (int j = 2; j<7; j++)
 		W2[j] = s1(W2[j - 2]) + W1[9 + j] + s0(W1[1 + j]) + W1[j];
 	#pragma unroll 8
 	for (int j = 7; j<15; j++)
 		W2[j] = s1(W2[j - 2]) + W2[j - 7] + s0(W1[1 + j]) + W1[j];
 	W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
 	// Round function
 	#pragma unroll 16
 	for (int j = 0; j<16; j++)
 	{
 		uint32_t T1, T2;
 		T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 16] + W2[j];
 		T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
 		#pragma unroll 7
 		for (int l = 6; l >= 0; l--) regs[l + 1] = regs[l];
 		regs[0] = T1 + T2;
 		regs[4] += T1;
 	}
 	////// PART 2
 	#pragma unroll 2
 	for (int j = 0; j<2; j++)
 		W1[j] = s1(W2[14 + j]) + W2[9 + j] + s0(W2[1 + j]) + W2[j];
 	#pragma unroll 5
 	for (int j = 2; j<7; j++)
 		W1[j] = s1(W1[j - 2]) + W2[9 + j] + s0(W2[1 + j]) + W2[j];
 	#pragma unroll 8
 	for (int j = 7; j<15; j++)
 		W1[j] = s1(W1[j - 2]) + W1[j - 7] + s0(W2[1 + j]) + W2[j];
 	W1[15] = s1(W1[13]) + W1[8] + s0(W1[0]) + W2[15];
 	// Round function
 	#pragma unroll 16
 	for (int j = 0; j<16; j++)
 	{
 		uint32_t T1, T2;
 		T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 32] + W1[j];
 		T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
 		#pragma unroll 7
 		for (int l = 6; l >= 0; l--) regs[l + 1] = regs[l];
 		regs[0] = T1 + T2;
 		regs[4] += T1;
 	}
 	////// PART 3
 	#pragma unroll 2
 	for (int j = 0; j<2; j++)
 		W2[j] = s1(W1[14 + j]) + W1[9 + j] + s0(W1[1 + j]) + W1[j];
 	#pragma unroll 5
 	for (int j = 2; j<7; j++)
 		W2[j] = s1(W2[j - 2]) + W1[9 + j] + s0(W1[1 + j]) + W1[j];
 	#pragma unroll 8
 	for (int j = 7; j<15; j++)
 		W2[j] = s1(W2[j - 2]) + W2[j - 7] + s0(W1[1 + j]) + W1[j];
 	W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
 	// Round function
 	#pragma unroll 16
 	for (int j = 0; j<16; j++)
 	{
 		uint32_t T1, T2;
 		T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 48] + W2[j];
 		T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
 		#pragma unroll 7
 		for (int l = 6; l >= 0; l--) regs[l + 1] = regs[l];
 		regs[0] = T1 + T2;
 		regs[4] += T1;
 	}
 	#pragma unroll 8
 	for (int k = 0; k<8; k++)
 		hash[k] += regs[k];
 #if 1
 	/////
 	///// Second Pass (ending)
 	/////
 	#pragma unroll 8
 	for (int k = 0; k<8; k++)
 		regs[k] = hash[k];
 	// Progress W1
 	#pragma unroll 64
 	for (int j = 0; j<64; j++)
 	{
 		uint32_t T1, T2;
 		T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j] + sha256_endingTable[j];
 		T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
 		#pragma unroll 7
 		for (int k = 6; k >= 0; k--) regs[k + 1] = regs[k];
 		regs[0] = T1 + T2;
 		regs[4] += T1;
 	}
 	#pragma unroll 8
 	for (int k = 0; k<8; k++)
 		hash[k] += regs[k];
 	// Final Hash
 	#pragma unroll 8
 	for (int k = 0; k<8; k++)
 		message[k] = SWAB32(hash[k]);
 #else
 	// sha256_transform only, require an additional sha256_transform_gpu() call
 	#pragma unroll 8
 	for (int k = 0; k<8; k++)
 		message[k] = hash[k];
 #endif
 }
 #endif
 __global__
 void sha2_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *hashBuffer)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint32_t *hash = &hashBuffer[thread << 4];
 #ifdef ADVANCED_SHA2
 		skeincoin_gpu_sha256(hash);
 #else
 		uint32_t state[16];
 		uint32_t msg[16];
 		#pragma unroll
 		for (int i = 0; i < 8; i++)
 			state[i] = sha256_hashTable[i];
 		#pragma unroll
 		for (int i = 0; i < 16; i++)
 			msg[i] = SWAB32(hash[i]);
 		sha256_transform_gpu(state, msg);
 		sha256_transform_gpu(state, sha256_ending);
 		#pragma unroll
 		for (int i = 0; i < 8; i++)
 			hash[i] = SWAB32(state[i]);
 #endif
 	}
 }
 __host__
 void sha2_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHashes, int order)
 {
 	uint32_t threadsperblock = 128;
 	dim3 block(threadsperblock);
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	//cudaMemset(d_outputHashes, 0, 64 * threads);
 	sha2_gpu_hash_64 <<< grid, block >>>(threads, startNounce, d_outputHashes);
 	MyStreamSynchronize(NULL, 0, thr_id);
 }
 extern "C" void skeincoinhash(void *output, const void *input)
 {
 	sph_skein512_context ctx_skein;
 	SHA256_CTX sha256;
 	uint32_t hash[16];
 	sph_skein512_init(&ctx_skein);
 	sph_skein512(&ctx_skein, input, 80);
 	sph_skein512_close(&ctx_skein, hash);
 	SHA256_Init(&sha256);
 	SHA256_Update(&sha256, (unsigned char *)hash, 64);
 	SHA256_Final((unsigned char *)hash, &sha256);
 	memcpy(output, hash, 32);
 }
 static __inline uint32_t swab32_if(uint32_t val, bool iftrue) {
 	return iftrue ? swab32(val) : val;
 }
 static bool init[MAX_GPUS] = { 0 };
 extern "C" int scanhash_skeincoin(int thr_id, uint32_t *pdata,
    const uint32_t *ptarget, uint32_t max_nonce,
    unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
 	const int swap = 1;
 	uint32_t throughput =  device_intensity(thr_id, __func__, 1 << 19); // 256*256*8
 	throughput = min(throughput,  (max_nonce - first_nonce));
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0FFF;
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64 * throughput));
 		cuda_check_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}
 	uint32_t endiandata[20];
 	for (int k=0; k < 20; k++)
 		be32enc(&endiandata[k], pdata[k]);
 	skein512_cpu_setBlock_80((void*)endiandata);
 	cuda_check_cpu_setTarget(ptarget);
 	do {
 		int order = 0;
 		*hashes_done = pdata[19] - first_nonce + throughput;
 		// Hash with CUDA
 		skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], swap);
 		sha2_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != UINT32_MAX)
 		{
 			uint32_t vhash64[8];
 			endiandata[19] = swab32_if(foundNonce, swap);
 			skeincoinhash(vhash64, endiandata);
 			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
 				int res = 1;
 				uint8_t num = res;
 				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], num);
 				while (secNonce != 0 && res < 6)
 				{
 					endiandata[19] = swab32_if(secNonce, swap);
 					skeincoinhash(vhash64, endiandata);
 					if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
 						pdata[19+res] = swab32_if(secNonce, !swap);
 						res++;
 					}
 					num++;
 					secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], num);
 				}
 				if (res > 1 && opt_debug)
 					applog(LOG_BLUE, "GPU #%d: %d/%d valid nonces !!!", device_map[thr_id], res, (int)num);
 				pdata[19] = swab32_if(foundNonce, !swap);
 				return res;
 			}
 			else {
 				applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce);
 				pdata[19]++;
 			}
 		} else
 			pdata[19] += throughput;
 	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
--- a/util.cpp
+++ b/util.cpp
@ -1701,9 +1701,12 @@ void do_gpu_tests(void)
 	scanhash_zr5(0, (uint32_t*)buf, tgt, zrtest[19]+1, &done);
 	//memset(buf, 0, sizeof buf);
-	//scanhash_x11(0, (uint32_t*)buf, tgt, 1, &done);
+	//scanhash_skeincoin(0, (uint32_t*)buf, tgt, 1, &done);
 	memset(buf, 0, sizeof buf);
 	scanhash_x11(0, (uint32_t*)buf, tgt, 1, &done);
 	//memset(buf, 0, sizeof buf);
 	// buf[0] = 1; buf[64] = 2; // for endian tests
 	//scanhash_blake256(0, (uint32_t*)buf, tgt, 1, &done, 14);
@ -1786,6 +1789,9 @@ void print_hash_tests(void)
 	qubithash(&hash[0], &buf[0]);
 	printpfx("qubit", hash);
 	skeincoinhash(&hash[0], &buf[0]);
 	printpfx("skein", hash);
 	s3hash(&hash[0], &buf[0]);
 	printpfx("S3", hash);