scrypt: strip keccak/blake 256 remains

10 years ago · 22c28ccbef
9 changed files with 15 additions and 2323 deletions
--- a/scrypt-jane.cpp
+++ b/scrypt-jane.cpp
@ -240,13 +240,12 @@ static void scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac)
 *  - mikaelh
 */
 static void scrypt_pbkdf2_1(const uint8_t *password, size_t password_len,
-	const uint8_t *salt, size_t salt_len, uint8_t *out, size_t bytes)
+	const uint8_t *salt, size_t salt_len, uint8_t *out, uint64_t bytes)
 {
 	scrypt_hmac_state hmac_pw, hmac_pw_salt, work;
 	scrypt_hash_digest ti, u;
 	uint8_t be[4];
-	uint32_t i, /*j,*/ blocks;
+	uint32_t i, blocks;
 //	uint64_t c;
 	/* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */
@ -266,7 +265,7 @@ static void scrypt_pbkdf2_1(const uint8_t *password, size_t password_len,
 		scrypt_hmac_finish(&work, ti);
 		memcpy(u, ti, sizeof(u));
-		memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : bytes);
+		memcpy(out, ti, (size_t) (bytes > SCRYPT_HASH_DIGEST_SIZE ? SCRYPT_HASH_DIGEST_SIZE : bytes));
 		out += SCRYPT_HASH_DIGEST_SIZE;
 		bytes -= SCRYPT_HASH_DIGEST_SIZE;
 	}
@ -631,7 +630,7 @@ int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, u
 static void scrypt_jane_hash_1_1(const uchar *password, size_t password_len, const uchar*salt, size_t salt_len, uint32_t N,
-	uchar *out, size_t bytes, uint8_t *X, uint8_t *Y, uint8_t *V)
+	uchar *out, uint32_t bytes, uint8_t *X, uint8_t *Y, uint8_t *V)
 {
 	uint32_t chunk_bytes, i;
 	const uint32_t p = SCRYPT_P;
@ -650,7 +649,7 @@ static void scrypt_jane_hash_1_1(const uchar *password, size_t password_len, con
 		scrypt_ROMix_1((scrypt_mix_word_t *)(X + (chunk_bytes * i)), (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V, N);
 	/* 3: Out = PBKDF2(password, X) */
-	scrypt_pbkdf2_1(password, password_len, X, chunk_bytes * p, out, bytes);
+	scrypt_pbkdf2_1(password, password_len, X, chunk_bytes * p, out, (size_t) bytes);
 #ifdef SCRYPT_PREVENT_STATE_LEAK
 	/* This is an unnecessary security feature - mikaelh */
@ -661,7 +660,7 @@ static void scrypt_jane_hash_1_1(const uchar *password, size_t password_len, con
 /* for cpu hash test */
 void scryptjane_hash(void* output, const void* input)
 {
-	uint64_t Nsize = 1ULL << (opt_nfactor + 1);
+	uint32_t Nsize = 1UL << (opt_nfactor + 1);
 	uint64_t chunk_bytes;
 	uint8_t *X, *Y;
 	scrypt_aligned_alloc YX, V;
@ -670,12 +669,12 @@ void scryptjane_hash(void* output, const void* input)
 	V  = scrypt_alloc(Nsize * chunk_bytes);
 	YX = scrypt_alloc((SCRYPT_P + 1) * chunk_bytes);
-	memset(V.ptr, 0, Nsize * chunk_bytes);
+	memset(V.ptr, 0, (size_t) (Nsize * chunk_bytes));
 	Y = YX.ptr;
 	X = Y + chunk_bytes;
-	scrypt_jane_hash_1_1((uchar*)input, 80, (uchar*)input, 80, Nsize, (uchar*)output, 32, X, Y, V.ptr);
+	scrypt_jane_hash_1_1((uchar*)input, 80, (uchar*)input, 80, (uint32_t) Nsize, (uchar*)output, 32, X, Y, V.ptr);
 	scrypt_free(&V);
 	scrypt_free(&YX);
--- a/scrypt.cpp
+++ b/scrypt.cpp
@ -994,12 +994,12 @@ static void xor_salsa8(uint32_t * const B, const uint32_t * const C)
 */
 static void scrypt_core(uint32_t *X, uint32_t *V, uint32_t N)
 {
-	for (int i = 0; i < N; i++) {
+	for (uint32_t i = 0; i < N; i++) {
 		memcpy(&V[i * 32], X, 128);
 		xor_salsa8(&X[0], &X[16]);
 		xor_salsa8(&X[16], &X[0]);
 	}
-	for (int i = 0; i < N; i++) {
+	for (uint32_t i = 0; i < N; i++) {
 		uint32_t j = 32 * (X[16] & (N - 1));
 		for (uint8_t k = 0; k < 32; k++)
 			X[k] ^= V[j + k];
--- a/scrypt/keccak.cu
+++ b/scrypt/keccak.cu
@ -4,21 +4,16 @@
 // The keccak512 (SHA-3) is used in the PBKDF2 for scrypt-jane coins
 // in place of the SHA2 based PBKDF2 used in scrypt coins.
 //
-// The keccak256 is used exclusively in Maxcoin and clones. This module
+// NOTE: compile this .cu module for compute_20,sm_20 with --maxrregcount=64
 // holds the generic "default" implementation when no architecture
 // specific implementation is available in the kernel.
 //
 // NOTE: compile this .cu module for compute_10,sm_10 with --maxrregcount=64
 //
 #include <map>
 #include <stdint.h>
 #include "salsa_kernel.h"
 #include "cuda_runtime.h"
 #include "miner.h"
 #include "cuda_helper.h"
 #include "keccak.h"
 #include "salsa_kernel.h"
 // define some error checking macros
 #undef checkCudaErrors
@ -45,7 +40,9 @@ extern std::map<int, uint32_t *> context_odata[2];
 extern std::map<int, cudaStream_t> context_streams[2];
 extern std::map<int, uint32_t *> context_hash[2];
 #ifndef ROTL64
 #define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - b)))
 #endif
 // CB
 #define U32TO64_LE(p) \
@ -375,11 +372,6 @@ __device__ void pbkdf2_statecopy8(pbkdf2_hmac_state *d, pbkdf2_hmac_state *s) {
 // ---------------------------- END PBKDF2 functions ------------------------------------
 static __device__ uint32_t cuda_swab32(uint32_t x) {
 	return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u)
 		  | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu));
 }
 __global__ __launch_bounds__(128)
 void cuda_pre_keccak512(uint32_t *g_idata, uint32_t nonce)
 {
@ -486,352 +478,3 @@ extern "C" void post_keccak512(int thr_id, int stream, uint32_t nonce, int throu
 	cuda_post_keccak512<<<grid, block, 0, context_streams[stream][thr_id]>>>(context_odata[stream][thr_id], context_hash[stream][thr_id], nonce);
 }
 //
 // Maxcoin related Keccak implementation (Keccak256)
 //
 #include <stdint.h>
 #include <map>
 extern std::map<int, int> context_blocks;
 extern std::map<int, int> context_wpb;
 extern std::map<int, KernelInterface *> context_kernel;
 __constant__ uint64_t ptarget64[4];
 #define ROL(a, offset) ((((uint64_t)a) << ((offset) % 64)) ^ (((uint64_t)a) >> (64-((offset) % 64))))
 #define ROL_mult8(a, offset) ROL(a, offset)
 __constant__ uint64_t KeccakF_RoundConstants[24];
 static uint64_t host_KeccakF_RoundConstants[24] = {
 	(uint64_t)0x0000000000000001ULL,
 	(uint64_t)0x0000000000008082ULL,
 	(uint64_t)0x800000000000808aULL,
 	(uint64_t)0x8000000080008000ULL,
 	(uint64_t)0x000000000000808bULL,
 	(uint64_t)0x0000000080000001ULL,
 	(uint64_t)0x8000000080008081ULL,
 	(uint64_t)0x8000000000008009ULL,
 	(uint64_t)0x000000000000008aULL,
 	(uint64_t)0x0000000000000088ULL,
 	(uint64_t)0x0000000080008009ULL,
 	(uint64_t)0x000000008000000aULL,
 	(uint64_t)0x000000008000808bULL,
 	(uint64_t)0x800000000000008bULL,
 	(uint64_t)0x8000000000008089ULL,
 	(uint64_t)0x8000000000008003ULL,
 	(uint64_t)0x8000000000008002ULL,
 	(uint64_t)0x8000000000000080ULL,
 	(uint64_t)0x000000000000800aULL,
 	(uint64_t)0x800000008000000aULL,
 	(uint64_t)0x8000000080008081ULL,
 	(uint64_t)0x8000000000008080ULL,
 	(uint64_t)0x0000000080000001ULL,
 	(uint64_t)0x8000000080008008ULL
 };
 __constant__ uint64_t pdata64[10];
 __global__
 void crypto_hash(uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool validate)
 {
 	uint64_t Aba, Abe, Abi, Abo, Abu;
 	uint64_t Aga, Age, Agi, Ago, Agu;
 	uint64_t Aka, Ake, Aki, Ako, Aku;
 	uint64_t Ama, Ame, Ami, Amo, Amu;
 	uint64_t Asa, Ase, Asi, Aso, Asu;
 	uint64_t BCa, BCe, BCi, BCo, BCu;
 	uint64_t Da, De, Di, Do, Du;
 	uint64_t Eba, Ebe, Ebi, Ebo, Ebu;
 	uint64_t Ega, Ege, Egi, Ego, Egu;
 	uint64_t Eka, Eke, Eki, Eko, Eku;
 	uint64_t Ema, Eme, Emi, Emo, Emu;
 	uint64_t Esa, Ese, Esi, Eso, Esu;
 	//copyFromState(A, state)
 	Aba = pdata64[0];
 	Abe = pdata64[1];
 	Abi = pdata64[2];
 	Abo = pdata64[3];
 	Abu = pdata64[4];
 	Aga = pdata64[5];
 	Age = pdata64[6];
 	Agi = pdata64[7];
 	Ago = pdata64[8];
 	Agu = (pdata64[9] & 0x00000000FFFFFFFFULL) | (((uint64_t)cuda_swab32(nonce + ((blockIdx.x * blockDim.x) + threadIdx.x))) << 32);
 	Aka = 0x0000000000000001ULL;
 	Ake = 0;
 	Aki = 0;
 	Ako = 0;
 	Aku = 0;
 	Ama = 0;
 	Ame = 0x8000000000000000ULL;
 	Ami = 0;
 	Amo = 0;
 	Amu = 0;
 	Asa = 0;
 	Ase = 0;
 	Asi = 0;
 	Aso = 0;
 	Asu = 0;
 #pragma unroll 12
 	for( int laneCount = 0; laneCount < 24; laneCount += 2 )
 	{
 		//    prepareTheta
 		BCa = Aba^Aga^Aka^Ama^Asa;
 		BCe = Abe^Age^Ake^Ame^Ase;
 		BCi = Abi^Agi^Aki^Ami^Asi;
 		BCo = Abo^Ago^Ako^Amo^Aso;
 		BCu = Abu^Agu^Aku^Amu^Asu;
 		//thetaRhoPiChiIotaPrepareTheta(round  , A, E)
 		Da = BCu^ROL(BCe, 1);
 		De = BCa^ROL(BCi, 1);
 		Di = BCe^ROL(BCo, 1);
 		Do = BCi^ROL(BCu, 1);
 		Du = BCo^ROL(BCa, 1);
 		Aba ^= Da;
 		BCa = Aba;
 		Age ^= De;
 		BCe = ROL(Age, 44);
 		Aki ^= Di;
 		BCi = ROL(Aki, 43);
 		Amo ^= Do;
 		BCo = ROL(Amo, 21);
 		Asu ^= Du;
 		BCu = ROL(Asu, 14);
 		Eba =   BCa ^((~BCe)&  BCi );
 		Eba ^= (uint64_t)KeccakF_RoundConstants[laneCount];
 		Ebe =   BCe ^((~BCi)&  BCo );
 		Ebi =   BCi ^((~BCo)&  BCu );
 		Ebo =   BCo ^((~BCu)&  BCa );
 		Ebu =   BCu ^((~BCa)&  BCe );
 		Abo ^= Do;
 		BCa = ROL(Abo, 28);
 		Agu ^= Du;
 		BCe = ROL(Agu, 20);
 		Aka ^= Da;
 		BCi = ROL(Aka,  3);
 		Ame ^= De;
 		BCo = ROL(Ame, 45);
 		Asi ^= Di;
 		BCu = ROL(Asi, 61);
 		Ega =   BCa ^((~BCe)&  BCi );
 		Ege =   BCe ^((~BCi)&  BCo );
 		Egi =   BCi ^((~BCo)&  BCu );
 		Ego =   BCo ^((~BCu)&  BCa );
 		Egu =   BCu ^((~BCa)&  BCe );
 		Abe ^= De;
 		BCa = ROL(Abe,  1);
 		Agi ^= Di;
 		BCe = ROL(Agi,  6);
 		Ako ^= Do;
 		BCi = ROL(Ako, 25);
 		Amu ^= Du;
 		BCo = ROL_mult8(Amu,  8);
 		Asa ^= Da;
 		BCu = ROL(Asa, 18);
 		Eka =   BCa ^((~BCe)&  BCi );
 		Eke =   BCe ^((~BCi)&  BCo );
 		Eki =   BCi ^((~BCo)&  BCu );
 		Eko =   BCo ^((~BCu)&  BCa );
 		Eku =   BCu ^((~BCa)&  BCe );
 		Abu ^= Du;
 		BCa = ROL(Abu, 27);
 		Aga ^= Da;
 		BCe = ROL(Aga, 36);
 		Ake ^= De;
 		BCi = ROL(Ake, 10);
 		Ami ^= Di;
 		BCo = ROL(Ami, 15);
 		Aso ^= Do;
 		BCu = ROL_mult8(Aso, 56);
 		Ema =   BCa ^((~BCe)&  BCi );
 		Eme =   BCe ^((~BCi)&  BCo );
 		Emi =   BCi ^((~BCo)&  BCu );
 		Emo =   BCo ^((~BCu)&  BCa );
 		Emu =   BCu ^((~BCa)&  BCe );
 		Abi ^= Di;
 		BCa = ROL(Abi, 62);
 		Ago ^= Do;
 		BCe = ROL(Ago, 55);
 		Aku ^= Du;
 		BCi = ROL(Aku, 39);
 		Ama ^= Da;
 		BCo = ROL(Ama, 41);
 		Ase ^= De;
 		BCu = ROL(Ase,  2);
 		Esa =   BCa ^((~BCe)&  BCi );
 		Ese =   BCe ^((~BCi)&  BCo );
 		Esi =   BCi ^((~BCo)&  BCu );
 		Eso =   BCo ^((~BCu)&  BCa );
 		Esu =   BCu ^((~BCa)&  BCe );
 		//    prepareTheta
 		BCa = Eba^Ega^Eka^Ema^Esa;
 		BCe = Ebe^Ege^Eke^Eme^Ese;
 		BCi = Ebi^Egi^Eki^Emi^Esi;
 		BCo = Ebo^Ego^Eko^Emo^Eso;
 		BCu = Ebu^Egu^Eku^Emu^Esu;
 		//thetaRhoPiChiIotaPrepareTheta(round+1, E, A)
 		Da = BCu^ROL(BCe, 1);
 		De = BCa^ROL(BCi, 1);
 		Di = BCe^ROL(BCo, 1);
 		Do = BCi^ROL(BCu, 1);
 		Du = BCo^ROL(BCa, 1);
 		Eba ^= Da;
 		BCa = Eba;
 		Ege ^= De;
 		BCe = ROL(Ege, 44);
 		Eki ^= Di;
 		BCi = ROL(Eki, 43);
 		Emo ^= Do;
 		BCo = ROL(Emo, 21);
 		Esu ^= Du;
 		BCu = ROL(Esu, 14);
 		Aba =   BCa ^((~BCe)&  BCi );
 		Aba ^= (uint64_t)KeccakF_RoundConstants[laneCount+1];
 		Abe =   BCe ^((~BCi)&  BCo );
 		Abi =   BCi ^((~BCo)&  BCu );
 		Abo =   BCo ^((~BCu)&  BCa );
 		Abu =   BCu ^((~BCa)&  BCe );
 		Ebo ^= Do;
 		BCa = ROL(Ebo, 28);
 		Egu ^= Du;
 		BCe = ROL(Egu, 20);
 		Eka ^= Da;
 		BCi = ROL(Eka, 3);
 		Eme ^= De;
 		BCo = ROL(Eme, 45);
 		Esi ^= Di;
 		BCu = ROL(Esi, 61);
 		Aga =   BCa ^((~BCe)&  BCi );
 		Age =   BCe ^((~BCi)&  BCo );
 		Agi =   BCi ^((~BCo)&  BCu );
 		Ago =   BCo ^((~BCu)&  BCa );
 		Agu =   BCu ^((~BCa)&  BCe );
 		Ebe ^= De;
 		BCa = ROL(Ebe, 1);
 		Egi ^= Di;
 		BCe = ROL(Egi, 6);
 		Eko ^= Do;
 		BCi = ROL(Eko, 25);
 		Emu ^= Du;
 		BCo = ROL_mult8(Emu, 8);
 		Esa ^= Da;
 		BCu = ROL(Esa, 18);
 		Aka =   BCa ^((~BCe)&  BCi );
 		Ake =   BCe ^((~BCi)&  BCo );
 		Aki =   BCi ^((~BCo)&  BCu );
 		Ako =   BCo ^((~BCu)&  BCa );
 		Aku =   BCu ^((~BCa)&  BCe );
 		Ebu ^= Du;
 		BCa = ROL(Ebu, 27);
 		Ega ^= Da;
 		BCe = ROL(Ega, 36);
 		Eke ^= De;
 		BCi = ROL(Eke, 10);
 		Emi ^= Di;
 		BCo = ROL(Emi, 15);
 		Eso ^= Do;
 		BCu = ROL_mult8(Eso, 56);
 		Ama =   BCa ^((~BCe)&  BCi );
 		Ame =   BCe ^((~BCi)&  BCo );
 		Ami =   BCi ^((~BCo)&  BCu );
 		Amo =   BCo ^((~BCu)&  BCa );
 		Amu =   BCu ^((~BCa)&  BCe );
 		Ebi ^= Di;
 		BCa = ROL(Ebi, 62);
 		Ego ^= Do;
 		BCe = ROL(Ego, 55);
 		Eku ^= Du;
 		BCi = ROL(Eku, 39);
 		Ema ^= Da;
 		BCo = ROL(Ema, 41);
 		Ese ^= De;
 		BCu = ROL(Ese, 2);
 		Asa =   BCa ^((~BCe)&  BCi );
 		Ase =   BCe ^((~BCi)&  BCo );
 		Asi =   BCi ^((~BCo)&  BCu );
 		Aso =   BCo ^((~BCu)&  BCa );
 		Asu =   BCu ^((~BCa)&  BCe );
 	}
 	if (validate) {
 		g_out += 4 * ((blockIdx.x * blockDim.x) + threadIdx.x);
 		g_out[3] = Abo;
 		g_out[2] = Abi;
 		g_out[1] = Abe;
 		g_out[0] = Aba;
 	}
 	// the likelyhood of meeting the hashing target is so low, that we're not guarding this
 	// with atomic writes, locks or similar...
 	uint64_t *g_good64 = (uint64_t*)g_good;
 	if (Abo <=  ptarget64[3]) {
 		if (Abo < g_good64[3]) {
 			g_good64[3] = Abo;
 			g_good64[2] = Abi;
 			g_good64[1] = Abe;
 			g_good64[0] = Aba;
 			g_good[8] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x);
 		}
 	}
 }
 static std::map<int, uint32_t *> context_good[2];
 bool default_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8])
 {
 	static bool init[MAX_GPUS] = { 0 };
 	if (!init[thr_id])
 	{
 		checkCudaErrors(cudaMemcpyToSymbol(KeccakF_RoundConstants, host_KeccakF_RoundConstants, sizeof(host_KeccakF_RoundConstants), 0, cudaMemcpyHostToDevice));
 		// allocate pinned host memory for good hashes
 		uint32_t *tmp;
 		checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[0][thr_id] = tmp;
 		checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[1][thr_id] = tmp;
 		init[thr_id] = true;
 	}
 	checkCudaErrors(cudaMemcpyToSymbol(pdata64, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
 	checkCudaErrors(cudaMemcpyToSymbol(ptarget64, host_ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
 	return context_good[0][thr_id] && context_good[1][thr_id];
 }
 void default_do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h)
 {
 	checkCudaErrors(cudaMemsetAsync(context_good[stream][thr_id], 0xff, 9 * sizeof(uint32_t), context_streams[stream][thr_id]));
 	crypto_hash<<<grid, threads, 0, context_streams[stream][thr_id]>>>((uint64_t*)context_hash[stream][thr_id], nonce, context_good[stream][thr_id], do_d2h);
 	// copy hashes from device memory to host (ALL hashes, lots of data...)
 	if (do_d2h && hash != NULL) {
 		size_t mem_size = throughput * sizeof(uint32_t) * 8;
 		checkCudaErrors(cudaMemcpyAsync(hash, context_hash[stream][thr_id], mem_size,
 						cudaMemcpyDeviceToHost, context_streams[stream][thr_id]));
 	}
 	else if (hash != NULL) {
 		// asynchronous copy of winning nonce (just 4 bytes...)
 		checkCudaErrors(cudaMemcpyAsync(hash, context_good[stream][thr_id]+8, sizeof(uint32_t),
 						cudaMemcpyDeviceToHost, context_streams[stream][thr_id]));
 	}
 }
--- a/scrypt/nv_kernel.cu
+++ b/scrypt/nv_kernel.cu
@ -708,784 +708,3 @@ void nv_scrypt_core_kernelB_LG(uint32_t *g_odata, int begin, int end, unsigned i
 	__transposed_write_BC(B, C, (uint4*)(g_odata), 1);
 }
 //
 // Maxcoin related Keccak implementation (Keccak256)
 //
 // from salsa_kernel.cu
 extern std::map<int, int> context_blocks;
 extern std::map<int, int> context_wpb;
 extern std::map<int, KernelInterface *> context_kernel;
 extern std::map<int, cudaStream_t> context_streams[2];
 extern std::map<int, uint32_t *> context_hash[2];
 __constant__ uint64_t ptarget64[4];
 #define ROL(a, offset) ((((uint64_t)a) << ((offset) % 64)) ^ (((uint64_t)a) >> (64-((offset) % 64))))
 #define ROL_mult8(a, offset) ROL(a, offset)
 __constant__ uint64_t KeccakF_RoundConstants[24];
 static uint64_t host_KeccakF_RoundConstants[24] = {
 	(uint64_t)0x0000000000000001ULL,
 	(uint64_t)0x0000000000008082ULL,
 	(uint64_t)0x800000000000808aULL,
 	(uint64_t)0x8000000080008000ULL,
 	(uint64_t)0x000000000000808bULL,
 	(uint64_t)0x0000000080000001ULL,
 	(uint64_t)0x8000000080008081ULL,
 	(uint64_t)0x8000000000008009ULL,
 	(uint64_t)0x000000000000008aULL,
 	(uint64_t)0x0000000000000088ULL,
 	(uint64_t)0x0000000080008009ULL,
 	(uint64_t)0x000000008000000aULL,
 	(uint64_t)0x000000008000808bULL,
 	(uint64_t)0x800000000000008bULL,
 	(uint64_t)0x8000000000008089ULL,
 	(uint64_t)0x8000000000008003ULL,
 	(uint64_t)0x8000000000008002ULL,
 	(uint64_t)0x8000000000000080ULL,
 	(uint64_t)0x000000000000800aULL,
 	(uint64_t)0x800000008000000aULL,
 	(uint64_t)0x8000000080008081ULL,
 	(uint64_t)0x8000000000008080ULL,
 	(uint64_t)0x0000000080000001ULL,
 	(uint64_t)0x8000000080008008ULL
 };
 __constant__ uint64_t pdata64[10];
 static __device__ uint32_t cuda_swab32(uint32_t x)
 {
 	return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u)
 		  | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu));
 }
 __global__
 void kepler_crypto_hash( uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool validate )
 {
 	uint64_t Aba, Abe, Abi, Abo, Abu;
 	uint64_t Aga, Age, Agi, Ago, Agu;
 	uint64_t Aka, Ake, Aki, Ako, Aku;
 	uint64_t Ama, Ame, Ami, Amo, Amu;
 	uint64_t Asa, Ase, Asi, Aso, Asu;
 	uint64_t BCa, BCe, BCi, BCo, BCu;
 	uint64_t Da, De, Di, Do, Du;
 	uint64_t Eba, Ebe, Ebi, Ebo, Ebu;
 	uint64_t Ega, Ege, Egi, Ego, Egu;
 	uint64_t Eka, Eke, Eki, Eko, Eku;
 	uint64_t Ema, Eme, Emi, Emo, Emu;
 	uint64_t Esa, Ese, Esi, Eso, Esu;
 	//copyFromState(A, state)
 	Aba = pdata64[0];
 	Abe = pdata64[1];
 	Abi = pdata64[2];
 	Abo = pdata64[3];
 	Abu = pdata64[4];
 	Aga = pdata64[5];
 	Age = pdata64[6];
 	Agi = pdata64[7];
 	Ago = pdata64[8];
 	Agu = (pdata64[9] & 0x00000000FFFFFFFFULL) | (((uint64_t)cuda_swab32(nonce + ((blockIdx.x * blockDim.x) + threadIdx.x))) << 32);
 	Aka = 0x0000000000000001ULL;
 	Ake = 0;
 	Aki = 0;
 	Ako = 0;
 	Aku = 0;
 	Ama = 0;
 	Ame = 0x8000000000000000ULL;
 	Ami = 0;
 	Amo = 0;
 	Amu = 0;
 	Asa = 0;
 	Ase = 0;
 	Asi = 0;
 	Aso = 0;
 	Asu = 0;
 #pragma unroll 12
 	for( int laneCount = 0; laneCount < 24; laneCount += 2 )
 	{
 		//    prepareTheta
 		BCa = Aba^Aga^Aka^Ama^Asa;
 		BCe = Abe^Age^Ake^Ame^Ase;
 		BCi = Abi^Agi^Aki^Ami^Asi;
 		BCo = Abo^Ago^Ako^Amo^Aso;
 		BCu = Abu^Agu^Aku^Amu^Asu;
 		//thetaRhoPiChiIotaPrepareTheta(round  , A, E)
 		Da = BCu^ROL(BCe, 1);
 		De = BCa^ROL(BCi, 1);
 		Di = BCe^ROL(BCo, 1);
 		Do = BCi^ROL(BCu, 1);
 		Du = BCo^ROL(BCa, 1);
 		Aba ^= Da;
 		BCa = Aba;
 		Age ^= De;
 		BCe = ROL(Age, 44);
 		Aki ^= Di;
 		BCi = ROL(Aki, 43);
 		Amo ^= Do;
 		BCo = ROL(Amo, 21);
 		Asu ^= Du;
 		BCu = ROL(Asu, 14);
 		Eba =   BCa ^((~BCe)&  BCi );
 		Eba ^= (uint64_t)KeccakF_RoundConstants[laneCount];
 		Ebe =   BCe ^((~BCi)&  BCo );
 		Ebi =   BCi ^((~BCo)&  BCu );
 		Ebo =   BCo ^((~BCu)&  BCa );
 		Ebu =   BCu ^((~BCa)&  BCe );
 		Abo ^= Do;
 		BCa = ROL(Abo, 28);
 		Agu ^= Du;
 		BCe = ROL(Agu, 20);
 		Aka ^= Da;
 		BCi = ROL(Aka,  3);
 		Ame ^= De;
 		BCo = ROL(Ame, 45);
 		Asi ^= Di;
 		BCu = ROL(Asi, 61);
 		Ega =   BCa ^((~BCe)&  BCi );
 		Ege =   BCe ^((~BCi)&  BCo );
 		Egi =   BCi ^((~BCo)&  BCu );
 		Ego =   BCo ^((~BCu)&  BCa );
 		Egu =   BCu ^((~BCa)&  BCe );
 		Abe ^= De;
 		BCa = ROL(Abe,  1);
 		Agi ^= Di;
 		BCe = ROL(Agi,  6);
 		Ako ^= Do;
 		BCi = ROL(Ako, 25);
 		Amu ^= Du;
 		BCo = ROL_mult8(Amu,  8);
 		Asa ^= Da;
 		BCu = ROL(Asa, 18);
 		Eka =   BCa ^((~BCe)&  BCi );
 		Eke =   BCe ^((~BCi)&  BCo );
 		Eki =   BCi ^((~BCo)&  BCu );
 		Eko =   BCo ^((~BCu)&  BCa );
 		Eku =   BCu ^((~BCa)&  BCe );
 		Abu ^= Du;
 		BCa = ROL(Abu, 27);
 		Aga ^= Da;
 		BCe = ROL(Aga, 36);
 		Ake ^= De;
 		BCi = ROL(Ake, 10);
 		Ami ^= Di;
 		BCo = ROL(Ami, 15);
 		Aso ^= Do;
 		BCu = ROL_mult8(Aso, 56);
 		Ema =   BCa ^((~BCe)&  BCi );
 		Eme =   BCe ^((~BCi)&  BCo );
 		Emi =   BCi ^((~BCo)&  BCu );
 		Emo =   BCo ^((~BCu)&  BCa );
 		Emu =   BCu ^((~BCa)&  BCe );
 		Abi ^= Di;
 		BCa = ROL(Abi, 62);
 		Ago ^= Do;
 		BCe = ROL(Ago, 55);
 		Aku ^= Du;
 		BCi = ROL(Aku, 39);
 		Ama ^= Da;
 		BCo = ROL(Ama, 41);
 		Ase ^= De;
 		BCu = ROL(Ase,  2);
 		Esa =   BCa ^((~BCe)&  BCi );
 		Ese =   BCe ^((~BCi)&  BCo );
 		Esi =   BCi ^((~BCo)&  BCu );
 		Eso =   BCo ^((~BCu)&  BCa );
 		Esu =   BCu ^((~BCa)&  BCe );
 		//    prepareTheta
 		BCa = Eba^Ega^Eka^Ema^Esa;
 		BCe = Ebe^Ege^Eke^Eme^Ese;
 		BCi = Ebi^Egi^Eki^Emi^Esi;
 		BCo = Ebo^Ego^Eko^Emo^Eso;
 		BCu = Ebu^Egu^Eku^Emu^Esu;
 		//thetaRhoPiChiIotaPrepareTheta(round+1, E, A)
 		Da = BCu^ROL(BCe, 1);
 		De = BCa^ROL(BCi, 1);
 		Di = BCe^ROL(BCo, 1);
 		Do = BCi^ROL(BCu, 1);
 		Du = BCo^ROL(BCa, 1);
 		Eba ^= Da;
 		BCa = Eba;
 		Ege ^= De;
 		BCe = ROL(Ege, 44);
 		Eki ^= Di;
 		BCi = ROL(Eki, 43);
 		Emo ^= Do;
 		BCo = ROL(Emo, 21);
 		Esu ^= Du;
 		BCu = ROL(Esu, 14);
 		Aba =   BCa ^((~BCe)&  BCi );
 		Aba ^= (uint64_t)KeccakF_RoundConstants[laneCount+1];
 		Abe =   BCe ^((~BCi)&  BCo );
 		Abi =   BCi ^((~BCo)&  BCu );
 		Abo =   BCo ^((~BCu)&  BCa );
 		Abu =   BCu ^((~BCa)&  BCe );
 		Ebo ^= Do;
 		BCa = ROL(Ebo, 28);
 		Egu ^= Du;
 		BCe = ROL(Egu, 20);
 		Eka ^= Da;
 		BCi = ROL(Eka, 3);
 		Eme ^= De;
 		BCo = ROL(Eme, 45);
 		Esi ^= Di;
 		BCu = ROL(Esi, 61);
 		Aga =   BCa ^((~BCe)&  BCi );
 		Age =   BCe ^((~BCi)&  BCo );
 		Agi =   BCi ^((~BCo)&  BCu );
 		Ago =   BCo ^((~BCu)&  BCa );
 		Agu =   BCu ^((~BCa)&  BCe );
 		Ebe ^= De;
 		BCa = ROL(Ebe, 1);
 		Egi ^= Di;
 		BCe = ROL(Egi, 6);
 		Eko ^= Do;
 		BCi = ROL(Eko, 25);
 		Emu ^= Du;
 		BCo = ROL_mult8(Emu, 8);
 		Esa ^= Da;
 		BCu = ROL(Esa, 18);
 		Aka =   BCa ^((~BCe)&  BCi );
 		Ake =   BCe ^((~BCi)&  BCo );
 		Aki =   BCi ^((~BCo)&  BCu );
 		Ako =   BCo ^((~BCu)&  BCa );
 		Aku =   BCu ^((~BCa)&  BCe );
 		Ebu ^= Du;
 		BCa = ROL(Ebu, 27);
 		Ega ^= Da;
 		BCe = ROL(Ega, 36);
 		Eke ^= De;
 		BCi = ROL(Eke, 10);
 		Emi ^= Di;
 		BCo = ROL(Emi, 15);
 		Eso ^= Do;
 		BCu = ROL_mult8(Eso, 56);
 		Ama =   BCa ^((~BCe)&  BCi );
 		Ame =   BCe ^((~BCi)&  BCo );
 		Ami =   BCi ^((~BCo)&  BCu );
 		Amo =   BCo ^((~BCu)&  BCa );
 		Amu =   BCu ^((~BCa)&  BCe );
 		Ebi ^= Di;
 		BCa = ROL(Ebi, 62);
 		Ego ^= Do;
 		BCe = ROL(Ego, 55);
 		Eku ^= Du;
 		BCi = ROL(Eku, 39);
 		Ema ^= Da;
 		BCo = ROL(Ema, 41);
 		Ese ^= De;
 		BCu = ROL(Ese, 2);
 		Asa =   BCa ^((~BCe)&  BCi );
 		Ase =   BCe ^((~BCi)&  BCo );
 		Asi =   BCi ^((~BCo)&  BCu );
 		Aso =   BCo ^((~BCu)&  BCa );
 		Asu =   BCu ^((~BCa)&  BCe );
 	}
 	if (validate) {
 		g_out += 4 * ((blockIdx.x * blockDim.x) + threadIdx.x);
 		g_out[3] = Abo;
 		g_out[2] = Abi;
 		g_out[1] = Abe;
 		g_out[0] = Aba;
 	}
 	// the likelyhood of meeting the hashing target is so low, that we're not guarding this
 	// with atomic writes, locks or similar...
 	uint64_t *g_good64 = (uint64_t*)g_good;
 	if (Abo <=  ptarget64[3]) {
 		if (Abo < g_good64[3]) {
 			g_good64[3] = Abo;
 			g_good64[2] = Abi;
 			g_good64[1] = Abe;
 			g_good64[0] = Aba;
 			g_good[8] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x);
 		}
 	}
 }
 static std::map<int, uint32_t *> context_good[2];
 bool NVKernel::prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8])
 {
 	static bool init[MAX_GPUS] = { 0 };
 	if (!init[thr_id])
 	{
 		checkCudaErrors(cudaMemcpyToSymbol(KeccakF_RoundConstants, host_KeccakF_RoundConstants, sizeof(host_KeccakF_RoundConstants), 0, cudaMemcpyHostToDevice));
 		// allocate pinned host memory for good hashes
 		uint32_t *tmp;
 		checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[0][thr_id] = tmp;
 		checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[1][thr_id] = tmp;
 		init[thr_id] = true;
 	}
 	checkCudaErrors(cudaMemcpyToSymbol(pdata64, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
 	checkCudaErrors(cudaMemcpyToSymbol(ptarget64, host_ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
 	return context_good[0][thr_id] && context_good[1][thr_id];
 }
 void NVKernel::do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h)
 {
 	checkCudaErrors(cudaMemsetAsync(context_good[stream][thr_id], 0xff, 9 * sizeof(uint32_t), context_streams[stream][thr_id]));
 	kepler_crypto_hash<<<grid, threads, 0, context_streams[stream][thr_id]>>>((uint64_t*)context_hash[stream][thr_id], nonce, context_good[stream][thr_id], do_d2h);
 	// copy hashes from device memory to host (ALL hashes, lots of data...)
 	if (do_d2h && hash != NULL) {
 		size_t mem_size = throughput * sizeof(uint32_t) * 8;
 		checkCudaErrors(cudaMemcpyAsync(hash, context_hash[stream][thr_id], mem_size,
 						cudaMemcpyDeviceToHost, context_streams[stream][thr_id]));
 	}
 	else if (hash != NULL) {
 		// asynchronous copy of winning nonce (just 4 bytes...)
 		checkCudaErrors(cudaMemcpyAsync(hash, context_good[stream][thr_id]+8, sizeof(uint32_t),
 						cudaMemcpyDeviceToHost, context_streams[stream][thr_id]));
 	}
 }
 //
 // Blakecoin related Keccak implementation (Keccak256)
 //
 typedef uint32_t sph_u32;
 #define SPH_C32(x) ((sph_u32)(x))
 #define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
 #if __CUDA_ARCH__ < 350
 	// Kepler (Compute 3.0)
 	#define SPH_ROTL32(a, b) ((a)<<(b))|((a)>>(32-(b)))
 #else
 	// Kepler (Compute 3.5)
 	#define SPH_ROTL32(a, b) __funnelshift_l( a, a, b );
 #endif
 #define SPH_ROTR32(x, n)   SPH_ROTL32(x, (32 - (n)))
 __constant__ uint32_t pdata[20];
 #ifdef _MSC_VER
 #pragma warning (disable: 4146)
 #endif
 static __device__ sph_u32 cuda_sph_bswap32(sph_u32 x)
 {
 	return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u)
 		  | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu));
 }
 /**
 * Encode a 32-bit value into the provided buffer (big endian convention).
 *
 * @param dst   the destination buffer
 * @param val   the 32-bit value to encode
 */
 static __device__ void
 cuda_sph_enc32be(void *dst, sph_u32 val)
 {
 	*(sph_u32 *)dst = cuda_sph_bswap32(val);
 }
 #define Z00   0
 #define Z01   1
 #define Z02   2
 #define Z03   3
 #define Z04   4
 #define Z05   5
 #define Z06   6
 #define Z07   7
 #define Z08   8
 #define Z09   9
 #define Z0A   A
 #define Z0B   B
 #define Z0C   C
 #define Z0D   D
 #define Z0E   E
 #define Z0F   F
 #define Z10   E
 #define Z11   A
 #define Z12   4
 #define Z13   8
 #define Z14   9
 #define Z15   F
 #define Z16   D
 #define Z17   6
 #define Z18   1
 #define Z19   C
 #define Z1A   0
 #define Z1B   2
 #define Z1C   B
 #define Z1D   7
 #define Z1E   5
 #define Z1F   3
 #define Z20   B
 #define Z21   8
 #define Z22   C
 #define Z23   0
 #define Z24   5
 #define Z25   2
 #define Z26   F
 #define Z27   D
 #define Z28   A
 #define Z29   E
 #define Z2A   3
 #define Z2B   6
 #define Z2C   7
 #define Z2D   1
 #define Z2E   9
 #define Z2F   4
 #define Z30   7
 #define Z31   9
 #define Z32   3
 #define Z33   1
 #define Z34   D
 #define Z35   C
 #define Z36   B
 #define Z37   E
 #define Z38   2
 #define Z39   6
 #define Z3A   5
 #define Z3B   A
 #define Z3C   4
 #define Z3D   0
 #define Z3E   F
 #define Z3F   8
 #define Z40   9
 #define Z41   0
 #define Z42   5
 #define Z43   7
 #define Z44   2
 #define Z45   4
 #define Z46   A
 #define Z47   F
 #define Z48   E
 #define Z49   1
 #define Z4A   B
 #define Z4B   C
 #define Z4C   6
 #define Z4D   8
 #define Z4E   3
 #define Z4F   D
 #define Z50   2
 #define Z51   C
 #define Z52   6
 #define Z53   A
 #define Z54   0
 #define Z55   B
 #define Z56   8
 #define Z57   3
 #define Z58   4
 #define Z59   D
 #define Z5A   7
 #define Z5B   5
 #define Z5C   F
 #define Z5D   E
 #define Z5E   1
 #define Z5F   9
 #define Z60   C
 #define Z61   5
 #define Z62   1
 #define Z63   F
 #define Z64   E
 #define Z65   D
 #define Z66   4
 #define Z67   A
 #define Z68   0
 #define Z69   7
 #define Z6A   6
 #define Z6B   3
 #define Z6C   9
 #define Z6D   2
 #define Z6E   8
 #define Z6F   B
 #define Z70   D
 #define Z71   B
 #define Z72   7
 #define Z73   E
 #define Z74   C
 #define Z75   1
 #define Z76   3
 #define Z77   9
 #define Z78   5
 #define Z79   0
 #define Z7A   F
 #define Z7B   4
 #define Z7C   8
 #define Z7D   6
 #define Z7E   2
 #define Z7F   A
 #define Z80   6
 #define Z81   F
 #define Z82   E
 #define Z83   9
 #define Z84   B
 #define Z85   3
 #define Z86   0
 #define Z87   8
 #define Z88   C
 #define Z89   2
 #define Z8A   D
 #define Z8B   7
 #define Z8C   1
 #define Z8D   4
 #define Z8E   A
 #define Z8F   5
 #define Z90   A
 #define Z91   2
 #define Z92   8
 #define Z93   4
 #define Z94   7
 #define Z95   6
 #define Z96   1
 #define Z97   5
 #define Z98   F
 #define Z99   B
 #define Z9A   9
 #define Z9B   E
 #define Z9C   3
 #define Z9D   C
 #define Z9E   D
 #define Z9F   0
 #define Mx(r, i)    Mx_(Z ## r ## i)
 #define Mx_(n)      Mx__(n)
 #define Mx__(n)     M ## n
 #define CSx(r, i)   CSx_(Z ## r ## i)
 #define CSx_(n)     CSx__(n)
 #define CSx__(n)    CS ## n
 #define CS0   SPH_C32(0x243F6A88)
 #define CS1   SPH_C32(0x85A308D3)
 #define CS2   SPH_C32(0x13198A2E)
 #define CS3   SPH_C32(0x03707344)
 #define CS4   SPH_C32(0xA4093822)
 #define CS5   SPH_C32(0x299F31D0)
 #define CS6   SPH_C32(0x082EFA98)
 #define CS7   SPH_C32(0xEC4E6C89)
 #define CS8   SPH_C32(0x452821E6)
 #define CS9   SPH_C32(0x38D01377)
 #define CSA   SPH_C32(0xBE5466CF)
 #define CSB   SPH_C32(0x34E90C6C)
 #define CSC   SPH_C32(0xC0AC29B7)
 #define CSD   SPH_C32(0xC97C50DD)
 #define CSE   SPH_C32(0x3F84D5B5)
 #define CSF   SPH_C32(0xB5470917)
 #define GS(m0, m1, c0, c1, a, b, c, d)   do { \
 		a = SPH_T32(a + b + (m0 ^ c1)); \
 		d = SPH_ROTR32(d ^ a, 16); \
 		c = SPH_T32(c + d); \
 		b = SPH_ROTR32(b ^ c, 12); \
 		a = SPH_T32(a + b + (m1 ^ c0)); \
 		d = SPH_ROTR32(d ^ a, 8); \
 		c = SPH_T32(c + d); \
 		b = SPH_ROTR32(b ^ c, 7); \
 	} while (0)
 #define ROUND_S(r)   do { \
 		GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
 		GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
 		GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
 		GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
 		GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
 		GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
 		GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
 		GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
 	} while (0)
 #define COMPRESS32   do { \
 		sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \
 		sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \
 		sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \
 		sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \
 		V0 = H0; \
 		V1 = H1; \
 		V2 = H2; \
 		V3 = H3; \
 		V4 = H4; \
 		V5 = H5; \
 		V6 = H6; \
 		V7 = H7; \
 		V8 = S0 ^ CS0; \
 		V9 = S1 ^ CS1; \
 		VA = S2 ^ CS2; \
 		VB = S3 ^ CS3; \
 		VC = T0 ^ CS4; \
 		VD = T0 ^ CS5; \
 		VE = T1 ^ CS6; \
 		VF = T1 ^ CS7; \
 		M0 = input[0]; \
 		M1 = input[1]; \
 		M2 = input[2]; \
 		M3 = input[3]; \
 		M4 = input[4]; \
 		M5 = input[5]; \
 		M6 = input[6]; \
 		M7 = input[7]; \
 		M8 = input[8]; \
 		M9 = input[9]; \
 		MA = input[10]; \
 		MB = input[11]; \
 		MC = input[12]; \
 		MD = input[13]; \
 		ME = input[14]; \
 		MF = input[15]; \
 		ROUND_S(0); \
 		ROUND_S(1); \
 		ROUND_S(2); \
 		ROUND_S(3); \
 		ROUND_S(4); \
 		ROUND_S(5); \
 		ROUND_S(6); \
 		ROUND_S(7); \
 		H0 ^= S0 ^ V0 ^ V8; \
 		H1 ^= S1 ^ V1 ^ V9; \
 		H2 ^= S2 ^ V2 ^ VA; \
 		H3 ^= S3 ^ V3 ^ VB; \
 		H4 ^= S0 ^ V4 ^ VC; \
 		H5 ^= S1 ^ V5 ^ VD; \
 		H6 ^= S2 ^ V6 ^ VE; \
 		H7 ^= S3 ^ V7 ^ VF; \
 	} while (0)
 __global__
 void kepler_blake256_hash( uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool validate)
 {
 	uint32_t input[16];
 	uint64_t output[4];
 	#pragma unroll
 	for (int i=0; i < 16; ++i) input[i] = pdata[i];
 	sph_u32 H0 = 0x6A09E667;
 	sph_u32 H1 = 0xBB67AE85;
 	sph_u32 H2 = 0x3C6EF372;
 	sph_u32 H3 = 0xA54FF53A;
 	sph_u32 H4 = 0x510E527F;
 	sph_u32 H5 = 0x9B05688C;
 	sph_u32 H6 = 0x1F83D9AB;
 	sph_u32 H7 = 0x5BE0CD19;
 	sph_u32 S0 = 0;
 	sph_u32 S1 = 0;
 	sph_u32 S2 = 0;
 	sph_u32 S3 = 0;
 	sph_u32 T0 = 0;
 	sph_u32 T1 = 0;
 	T0 = SPH_T32(T0 + 512);
 	COMPRESS32;
 	#pragma unroll
 	for (int i=0; i < 3; ++i) input[i] = pdata[16+i];
 	input[3] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x);
 	input[4] = 0x80000000;
 	#pragma unroll 8
 	for (int i=5; i < 13; ++i) input[i] = 0;
 	input[13] = 0x00000001;
 	input[14] = T1;
 	input[15] = T0 + 128;
 	T0 = SPH_T32(T0 + 128);
 	COMPRESS32;
 	cuda_sph_enc32be((unsigned char*)output + 4*6, H6);
 	cuda_sph_enc32be((unsigned char*)output + 4*7, H7);
 	if (validate || output[3] <=  ptarget64[3])
 	{
 		// this data is only needed when we actually need to save the hashes
 		cuda_sph_enc32be((unsigned char*)output + 4*0, H0);
 		cuda_sph_enc32be((unsigned char*)output + 4*1, H1);
 		cuda_sph_enc32be((unsigned char*)output + 4*2, H2);
 		cuda_sph_enc32be((unsigned char*)output + 4*3, H3);
 		cuda_sph_enc32be((unsigned char*)output + 4*4, H4);
 		cuda_sph_enc32be((unsigned char*)output + 4*5, H5);
 	}
 	if (validate)
 	{
 		g_out += 4 * ((blockIdx.x * blockDim.x) + threadIdx.x);
 		#pragma unroll
 		for (int i=0; i < 4; ++i) g_out[i] = output[i];
 	}
 	if (output[3] <=  ptarget64[3]) {
 		uint64_t *g_good64 = (uint64_t*)g_good;
 		if (output[3] < g_good64[3]) {
 			g_good64[3] = output[3];
 			g_good64[2] = output[2];
 			g_good64[1] = output[1];
 			g_good64[0] = output[0];
 			g_good[8] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x);
 		}
 	}
 }
 bool NVKernel::prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8])
 {
 	static bool init[MAX_GPUS] = { 0 };
 	if (!init[thr_id])
 	{
 		// allocate pinned host memory for good hashes
 		uint32_t *tmp;
 		checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[0][thr_id] = tmp;
 		checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[1][thr_id] = tmp;
 		init[thr_id] = true;
 	}
 	checkCudaErrors(cudaMemcpyToSymbol(pdata, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
 	checkCudaErrors(cudaMemcpyToSymbol(ptarget64, host_ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
 	return context_good[0][thr_id] && context_good[1][thr_id];
 }
 void NVKernel::do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h)
 {
 	checkCudaErrors(cudaMemsetAsync(context_good[stream][thr_id], 0xff, 9 * sizeof(uint32_t), context_streams[stream][thr_id]));
 	kepler_blake256_hash<<<grid, threads, 0, context_streams[stream][thr_id]>>>((uint64_t*)context_hash[stream][thr_id], nonce, context_good[stream][thr_id], do_d2h);
 	// copy hashes from device memory to host (ALL hashes, lots of data...)
 	if (do_d2h && hash != NULL) {
 		size_t mem_size = throughput * sizeof(uint32_t) * 8;
 		checkCudaErrors(cudaMemcpyAsync(hash, context_hash[stream][thr_id], mem_size,
 						cudaMemcpyDeviceToHost, context_streams[stream][thr_id]));
 	}
 	else if (hash != NULL) {
 		// asynchronous copy of winning nonce (just 4 bytes...)
 		checkCudaErrors(cudaMemcpyAsync(hash, context_good[stream][thr_id]+8, sizeof(uint32_t),
 						cudaMemcpyDeviceToHost, context_streams[stream][thr_id]));
 	}
 }
--- a/scrypt/nv_kernel.h
+++ b/scrypt/nv_kernel.h
@ -25,12 +25,6 @@ public:
 	virtual bool support_lookup_gap() { return true; }
 	virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeFourByte; }
 	virtual cudaFuncCache cache_config() { return cudaFuncCachePreferL1; }
 	virtual bool prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);
 	virtual void do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false);
 	virtual bool prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8]);
 	virtual void do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false);
 };
 #endif // #ifndef NV_KERNEL_H
--- a/scrypt/nv_kernel2.cu
+++ b/scrypt/nv_kernel2.cu
--- a/scrypt/nv_kernel2.h
+++ b/scrypt/nv_kernel2.h
@ -25,12 +25,6 @@ public:
 	virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeFourByte; }
 	virtual cudaFuncCache cache_config() { return cudaFuncCachePreferL1; }
 	virtual bool prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);
 	virtual void do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false);
 	virtual bool prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8]);
 	virtual void do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false);
 };
 #endif // #ifndef NV2_KERNEL_H
--- a/scrypt/salsa_kernel.cu
+++ b/scrypt/salsa_kernel.cu
@ -821,44 +821,6 @@ void cuda_scrypt_core(int thr_id, int stream, unsigned int N)
 	);
 }
 bool cuda_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8])
 {
 	return context_kernel[thr_id]->prepare_keccak256(thr_id, host_pdata, ptarget);
 }
 #if 0
 void cuda_do_keccak256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h)
 {
 	unsigned int GRID_BLOCKS = context_blocks[thr_id];
 	unsigned int WARPS_PER_BLOCK = context_wpb[thr_id];
 	unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu();
 	// setup execution parameters
 	dim3  grid(WU_PER_LAUNCH/WU_PER_BLOCK, 1, 1);
 	dim3  threads(THREADS_PER_WU*WU_PER_BLOCK, 1, 1);
 	context_kernel[thr_id]->do_keccak256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h);
 }
 #endif
 bool cuda_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8])
 {
 	return context_kernel[thr_id]->prepare_blake256(thr_id, host_pdata, ptarget);
 }
 #if 0
 void cuda_do_blake256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h)
 {
 	unsigned int GRID_BLOCKS = context_blocks[thr_id];
 	unsigned int WARPS_PER_BLOCK = context_wpb[thr_id];
 	unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu();
 	// setup execution parameters
 	dim3  grid(WU_PER_LAUNCH/WU_PER_BLOCK, 1, 1);
 	dim3  threads(THREADS_PER_WU*WU_PER_BLOCK, 1, 1);
 	context_kernel[thr_id]->do_blake256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h);
 }
 #endif
 void cuda_scrypt_DtoH(int thr_id, uint32_t *X, int stream, bool postSHA)
 {
 	unsigned int GRID_BLOCKS = context_blocks[thr_id];
--- a/scrypt/salsa_kernel.h
+++ b/scrypt/salsa_kernel.h
@ -58,20 +58,6 @@ extern void cuda_scrypt_DtoH(int thr_id, uint32_t *X, int stream, bool postSHA);
 extern bool cuda_scrypt_sync(int thr_id, int stream);
 extern void cuda_scrypt_flush(int thr_id, int stream);
 extern bool cuda_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);
 extern void cuda_do_keccak256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h);
 extern bool cuda_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);
 extern void cuda_do_blake256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h);
 extern bool default_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);
 extern bool default_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);
 #ifdef __NVCC__
 extern void default_do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h);
 extern void default_do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h);
 #endif
 // If we're in C++ mode, we're either compiling .cu files or scrypt.cpp
 #ifdef __NVCC__
@ -101,20 +87,6 @@ public:
 	virtual bool support_lookup_gap() { return false; }
 	virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeDefault; }
 	virtual cudaFuncCache cache_config() { return cudaFuncCachePreferNone; }
 	virtual bool prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]) {
 		return default_prepare_keccak256(thr_id, host_pdata, ptarget);
 	}
 	virtual void do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false) {
 		default_do_keccak256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h);
 	}
 	virtual bool prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]) {
 		return default_prepare_blake256(thr_id, host_pdata, ptarget);
 	}
 	virtual void do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false) {
 		default_do_blake256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h);
 	}
 };
 // Not performing error checking is actually bad, but...