scrypt: strip keccak/blake 256 remains

2025-03-09 12:01:09 +00:00 · 2015-04-21 17:43:12 +02:00 · 2015-04-21 17:43:12 +02:00 · 22c28ccbef
commit 22c28ccbef
parent a0c8bd8be4
9 changed files with 15 additions and 2323 deletions
--- a/scrypt-jane.cpp
+++ b/scrypt-jane.cpp
@@ -240,13 +240,12 @@ static void scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac)
 *  - mikaelh
 */
 static void scrypt_pbkdf2_1(const uint8_t *password, size_t password_len,
-	const uint8_t *salt, size_t salt_len, uint8_t *out, size_t bytes)
+	const uint8_t *salt, size_t salt_len, uint8_t *out, uint64_t bytes)
 {
 	scrypt_hmac_state hmac_pw, hmac_pw_salt, work;
 	scrypt_hash_digest ti, u;
 	uint8_t be[4];
-	uint32_t i, /*j,*/ blocks;
-//	uint64_t c;
+	uint32_t i, blocks;

 	/* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */

@@ -266,7 +265,7 @@ static void scrypt_pbkdf2_1(const uint8_t *password, size_t password_len,
 		scrypt_hmac_finish(&work, ti);
 		memcpy(u, ti, sizeof(u));

-		memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : bytes);
+		memcpy(out, ti, (size_t) (bytes > SCRYPT_HASH_DIGEST_SIZE ? SCRYPT_HASH_DIGEST_SIZE : bytes));
 		out += SCRYPT_HASH_DIGEST_SIZE;
 		bytes -= SCRYPT_HASH_DIGEST_SIZE;
 	}
@@ -631,7 +630,7 @@ int scanhash_scrypt_jane(int thr_id, uint32_t *pdata, const uint32_t *ptarget, u


 static void scrypt_jane_hash_1_1(const uchar *password, size_t password_len, const uchar*salt, size_t salt_len, uint32_t N,
-	uchar *out, size_t bytes, uint8_t *X, uint8_t *Y, uint8_t *V)
+	uchar *out, uint32_t bytes, uint8_t *X, uint8_t *Y, uint8_t *V)
 {
 	uint32_t chunk_bytes, i;
 	const uint32_t p = SCRYPT_P;
@@ -650,7 +649,7 @@ static void scrypt_jane_hash_1_1(const uchar *password, size_t password_len, con
 		scrypt_ROMix_1((scrypt_mix_word_t *)(X + (chunk_bytes * i)), (scrypt_mix_word_t *)Y, (scrypt_mix_word_t *)V, N);

 	/* 3: Out = PBKDF2(password, X) */
-	scrypt_pbkdf2_1(password, password_len, X, chunk_bytes * p, out, bytes);
+	scrypt_pbkdf2_1(password, password_len, X, chunk_bytes * p, out, (size_t) bytes);

 #ifdef SCRYPT_PREVENT_STATE_LEAK
 	/* This is an unnecessary security feature - mikaelh */
@@ -661,7 +660,7 @@ static void scrypt_jane_hash_1_1(const uchar *password, size_t password_len, con
 /* for cpu hash test */
 void scryptjane_hash(void* output, const void* input)
 {
-	uint64_t Nsize = 1ULL << (opt_nfactor + 1);
+	uint32_t Nsize = 1UL << (opt_nfactor + 1);
 	uint64_t chunk_bytes;
 	uint8_t *X, *Y;
 	scrypt_aligned_alloc YX, V;
@@ -670,12 +669,12 @@ void scryptjane_hash(void* output, const void* input)
 	V  = scrypt_alloc(Nsize * chunk_bytes);
 	YX = scrypt_alloc((SCRYPT_P + 1) * chunk_bytes);

-	memset(V.ptr, 0, Nsize * chunk_bytes);
+	memset(V.ptr, 0, (size_t) (Nsize * chunk_bytes));

 	Y = YX.ptr;
 	X = Y + chunk_bytes;

-	scrypt_jane_hash_1_1((uchar*)input, 80, (uchar*)input, 80, Nsize, (uchar*)output, 32, X, Y, V.ptr);
+	scrypt_jane_hash_1_1((uchar*)input, 80, (uchar*)input, 80, (uint32_t) Nsize, (uchar*)output, 32, X, Y, V.ptr);

 	scrypt_free(&V);
 	scrypt_free(&YX);
--- a/scrypt.cpp
+++ b/scrypt.cpp
@@ -994,12 +994,12 @@ static void xor_salsa8(uint32_t * const B, const uint32_t * const C)
 */
 static void scrypt_core(uint32_t *X, uint32_t *V, uint32_t N)
 {
-	for (int i = 0; i < N; i++) {
+	for (uint32_t i = 0; i < N; i++) {
 		memcpy(&V[i * 32], X, 128);
 		xor_salsa8(&X[0], &X[16]);
 		xor_salsa8(&X[16], &X[0]);
 	}
-	for (int i = 0; i < N; i++) {
+	for (uint32_t i = 0; i < N; i++) {
 		uint32_t j = 32 * (X[16] & (N - 1));
 		for (uint8_t k = 0; k < 32; k++)
 			X[k] ^= V[j + k];
--- a/scrypt/keccak.cu
+++ b/scrypt/keccak.cu
@@ -4,21 +4,16 @@
 // The keccak512 (SHA-3) is used in the PBKDF2 for scrypt-jane coins
 // in place of the SHA2 based PBKDF2 used in scrypt coins.
 //
-// The keccak256 is used exclusively in Maxcoin and clones. This module
-// holds the generic "default" implementation when no architecture
-// specific implementation is available in the kernel.
-//
-// NOTE: compile this .cu module for compute_10,sm_10 with --maxrregcount=64
+// NOTE: compile this .cu module for compute_20,sm_20 with --maxrregcount=64
 //

 #include <map>
-#include <stdint.h>

-#include "salsa_kernel.h"
-#include "cuda_runtime.h"
 #include "miner.h"
+#include "cuda_helper.h"

 #include "keccak.h"
+#include "salsa_kernel.h"

 // define some error checking macros
 #undef checkCudaErrors
@@ -45,7 +40,9 @@ extern std::map<int, uint32_t *> context_odata[2];
 extern std::map<int, cudaStream_t> context_streams[2];
 extern std::map<int, uint32_t *> context_hash[2];

+#ifndef ROTL64
 #define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - b)))
+#endif

 // CB
 #define U32TO64_LE(p) \
@@ -375,11 +372,6 @@ __device__ void pbkdf2_statecopy8(pbkdf2_hmac_state *d, pbkdf2_hmac_state *s) {

 // ---------------------------- END PBKDF2 functions ------------------------------------

-static __device__ uint32_t cuda_swab32(uint32_t x) {
-	return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u)
-		  | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu));
-}
-
 __global__ __launch_bounds__(128)
 void cuda_pre_keccak512(uint32_t *g_idata, uint32_t nonce)
 {
@@ -486,352 +478,3 @@ extern "C" void post_keccak512(int thr_id, int stream, uint32_t nonce, int throu

 	cuda_post_keccak512<<<grid, block, 0, context_streams[stream][thr_id]>>>(context_odata[stream][thr_id], context_hash[stream][thr_id], nonce);
 }
-
-
-//
-// Maxcoin related Keccak implementation (Keccak256)
-//
-
-#include <stdint.h>
-
-#include <map>
-extern std::map<int, int> context_blocks;
-extern std::map<int, int> context_wpb;
-extern std::map<int, KernelInterface *> context_kernel;
-
-__constant__ uint64_t ptarget64[4];
-
-#define ROL(a, offset) ((((uint64_t)a) << ((offset) % 64)) ^ (((uint64_t)a) >> (64-((offset) % 64))))
-#define ROL_mult8(a, offset) ROL(a, offset)
-
-__constant__ uint64_t KeccakF_RoundConstants[24];
-
-static uint64_t host_KeccakF_RoundConstants[24] = {
-	(uint64_t)0x0000000000000001ULL,
-	(uint64_t)0x0000000000008082ULL,
-	(uint64_t)0x800000000000808aULL,
-	(uint64_t)0x8000000080008000ULL,
-	(uint64_t)0x000000000000808bULL,
-	(uint64_t)0x0000000080000001ULL,
-	(uint64_t)0x8000000080008081ULL,
-	(uint64_t)0x8000000000008009ULL,
-	(uint64_t)0x000000000000008aULL,
-	(uint64_t)0x0000000000000088ULL,
-	(uint64_t)0x0000000080008009ULL,
-	(uint64_t)0x000000008000000aULL,
-	(uint64_t)0x000000008000808bULL,
-	(uint64_t)0x800000000000008bULL,
-	(uint64_t)0x8000000000008089ULL,
-	(uint64_t)0x8000000000008003ULL,
-	(uint64_t)0x8000000000008002ULL,
-	(uint64_t)0x8000000000000080ULL,
-	(uint64_t)0x000000000000800aULL,
-	(uint64_t)0x800000008000000aULL,
-	(uint64_t)0x8000000080008081ULL,
-	(uint64_t)0x8000000000008080ULL,
-	(uint64_t)0x0000000080000001ULL,
-	(uint64_t)0x8000000080008008ULL
-};
-
-__constant__ uint64_t pdata64[10];
-
-__global__
-void crypto_hash(uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool validate)
-{
-	uint64_t Aba, Abe, Abi, Abo, Abu;
-	uint64_t Aga, Age, Agi, Ago, Agu;
-	uint64_t Aka, Ake, Aki, Ako, Aku;
-	uint64_t Ama, Ame, Ami, Amo, Amu;
-	uint64_t Asa, Ase, Asi, Aso, Asu;
-	uint64_t BCa, BCe, BCi, BCo, BCu;
-	uint64_t Da, De, Di, Do, Du;
-	uint64_t Eba, Ebe, Ebi, Ebo, Ebu;
-	uint64_t Ega, Ege, Egi, Ego, Egu;
-	uint64_t Eka, Eke, Eki, Eko, Eku;
-	uint64_t Ema, Eme, Emi, Emo, Emu;
-	uint64_t Esa, Ese, Esi, Eso, Esu;
-
-	//copyFromState(A, state)
-	Aba = pdata64[0];
-	Abe = pdata64[1];
-	Abi = pdata64[2];
-	Abo = pdata64[3];
-	Abu = pdata64[4];
-	Aga = pdata64[5];
-	Age = pdata64[6];
-	Agi = pdata64[7];
-	Ago = pdata64[8];
-	Agu = (pdata64[9] & 0x00000000FFFFFFFFULL) | (((uint64_t)cuda_swab32(nonce + ((blockIdx.x * blockDim.x) + threadIdx.x))) << 32);
-	Aka = 0x0000000000000001ULL;
-	Ake = 0;
-	Aki = 0;
-	Ako = 0;
-	Aku = 0;
-	Ama = 0;
-	Ame = 0x8000000000000000ULL;
-	Ami = 0;
-	Amo = 0;
-	Amu = 0;
-	Asa = 0;
-	Ase = 0;
-	Asi = 0;
-	Aso = 0;
-	Asu = 0;
-
-#pragma unroll 12
-	for( int laneCount = 0; laneCount < 24; laneCount += 2 )
-	{
-		//    prepareTheta
-		BCa = Aba^Aga^Aka^Ama^Asa;
-		BCe = Abe^Age^Ake^Ame^Ase;
-		BCi = Abi^Agi^Aki^Ami^Asi;
-		BCo = Abo^Ago^Ako^Amo^Aso;
-		BCu = Abu^Agu^Aku^Amu^Asu;
-
-		//thetaRhoPiChiIotaPrepareTheta(round  , A, E)
-		Da = BCu^ROL(BCe, 1);
-		De = BCa^ROL(BCi, 1);
-		Di = BCe^ROL(BCo, 1);
-		Do = BCi^ROL(BCu, 1);
-		Du = BCo^ROL(BCa, 1);
-
-		Aba ^= Da;
-		BCa = Aba;
-		Age ^= De;
-		BCe = ROL(Age, 44);
-		Aki ^= Di;
-		BCi = ROL(Aki, 43);
-		Amo ^= Do;
-		BCo = ROL(Amo, 21);
-		Asu ^= Du;
-		BCu = ROL(Asu, 14);
-		Eba =   BCa ^((~BCe)&  BCi );
-		Eba ^= (uint64_t)KeccakF_RoundConstants[laneCount];
-		Ebe =   BCe ^((~BCi)&  BCo );
-		Ebi =   BCi ^((~BCo)&  BCu );
-		Ebo =   BCo ^((~BCu)&  BCa );
-		Ebu =   BCu ^((~BCa)&  BCe );
-
-		Abo ^= Do;
-		BCa = ROL(Abo, 28);
-		Agu ^= Du;
-		BCe = ROL(Agu, 20);
-		Aka ^= Da;
-		BCi = ROL(Aka,  3);
-		Ame ^= De;
-		BCo = ROL(Ame, 45);
-		Asi ^= Di;
-		BCu = ROL(Asi, 61);
-		Ega =   BCa ^((~BCe)&  BCi );
-		Ege =   BCe ^((~BCi)&  BCo );
-		Egi =   BCi ^((~BCo)&  BCu );
-		Ego =   BCo ^((~BCu)&  BCa );
-		Egu =   BCu ^((~BCa)&  BCe );
-
-		Abe ^= De;
-		BCa = ROL(Abe,  1);
-		Agi ^= Di;
-		BCe = ROL(Agi,  6);
-		Ako ^= Do;
-		BCi = ROL(Ako, 25);
-		Amu ^= Du;
-		BCo = ROL_mult8(Amu,  8);
-		Asa ^= Da;
-		BCu = ROL(Asa, 18);
-		Eka =   BCa ^((~BCe)&  BCi );
-		Eke =   BCe ^((~BCi)&  BCo );
-		Eki =   BCi ^((~BCo)&  BCu );
-		Eko =   BCo ^((~BCu)&  BCa );
-		Eku =   BCu ^((~BCa)&  BCe );
-
-		Abu ^= Du;
-		BCa = ROL(Abu, 27);
-		Aga ^= Da;
-		BCe = ROL(Aga, 36);
-		Ake ^= De;
-		BCi = ROL(Ake, 10);
-		Ami ^= Di;
-		BCo = ROL(Ami, 15);
-		Aso ^= Do;
-		BCu = ROL_mult8(Aso, 56);
-		Ema =   BCa ^((~BCe)&  BCi );
-		Eme =   BCe ^((~BCi)&  BCo );
-		Emi =   BCi ^((~BCo)&  BCu );
-		Emo =   BCo ^((~BCu)&  BCa );
-		Emu =   BCu ^((~BCa)&  BCe );
-
-		Abi ^= Di;
-		BCa = ROL(Abi, 62);
-		Ago ^= Do;
-		BCe = ROL(Ago, 55);
-		Aku ^= Du;
-		BCi = ROL(Aku, 39);
-		Ama ^= Da;
-		BCo = ROL(Ama, 41);
-		Ase ^= De;
-		BCu = ROL(Ase,  2);
-		Esa =   BCa ^((~BCe)&  BCi );
-		Ese =   BCe ^((~BCi)&  BCo );
-		Esi =   BCi ^((~BCo)&  BCu );
-		Eso =   BCo ^((~BCu)&  BCa );
-		Esu =   BCu ^((~BCa)&  BCe );
-
-		//    prepareTheta
-		BCa = Eba^Ega^Eka^Ema^Esa;
-		BCe = Ebe^Ege^Eke^Eme^Ese;
-		BCi = Ebi^Egi^Eki^Emi^Esi;
-		BCo = Ebo^Ego^Eko^Emo^Eso;
-		BCu = Ebu^Egu^Eku^Emu^Esu;
-
-		//thetaRhoPiChiIotaPrepareTheta(round+1, E, A)
-		Da = BCu^ROL(BCe, 1);
-		De = BCa^ROL(BCi, 1);
-		Di = BCe^ROL(BCo, 1);
-		Do = BCi^ROL(BCu, 1);
-		Du = BCo^ROL(BCa, 1);
-
-		Eba ^= Da;
-		BCa = Eba;
-		Ege ^= De;
-		BCe = ROL(Ege, 44);
-		Eki ^= Di;
-		BCi = ROL(Eki, 43);
-		Emo ^= Do;
-		BCo = ROL(Emo, 21);
-		Esu ^= Du;
-		BCu = ROL(Esu, 14);
-		Aba =   BCa ^((~BCe)&  BCi );
-		Aba ^= (uint64_t)KeccakF_RoundConstants[laneCount+1];
-		Abe =   BCe ^((~BCi)&  BCo );
-		Abi =   BCi ^((~BCo)&  BCu );
-		Abo =   BCo ^((~BCu)&  BCa );
-		Abu =   BCu ^((~BCa)&  BCe );
-
-		Ebo ^= Do;
-		BCa = ROL(Ebo, 28);
-		Egu ^= Du;
-		BCe = ROL(Egu, 20);
-		Eka ^= Da;
-		BCi = ROL(Eka, 3);
-		Eme ^= De;
-		BCo = ROL(Eme, 45);
-		Esi ^= Di;
-		BCu = ROL(Esi, 61);
-		Aga =   BCa ^((~BCe)&  BCi );
-		Age =   BCe ^((~BCi)&  BCo );
-		Agi =   BCi ^((~BCo)&  BCu );
-		Ago =   BCo ^((~BCu)&  BCa );
-		Agu =   BCu ^((~BCa)&  BCe );
-
-		Ebe ^= De;
-		BCa = ROL(Ebe, 1);
-		Egi ^= Di;
-		BCe = ROL(Egi, 6);
-		Eko ^= Do;
-		BCi = ROL(Eko, 25);
-		Emu ^= Du;
-		BCo = ROL_mult8(Emu, 8);
-		Esa ^= Da;
-		BCu = ROL(Esa, 18);
-		Aka =   BCa ^((~BCe)&  BCi );
-		Ake =   BCe ^((~BCi)&  BCo );
-		Aki =   BCi ^((~BCo)&  BCu );
-		Ako =   BCo ^((~BCu)&  BCa );
-		Aku =   BCu ^((~BCa)&  BCe );
-
-		Ebu ^= Du;
-		BCa = ROL(Ebu, 27);
-		Ega ^= Da;
-		BCe = ROL(Ega, 36);
-		Eke ^= De;
-		BCi = ROL(Eke, 10);
-		Emi ^= Di;
-		BCo = ROL(Emi, 15);
-		Eso ^= Do;
-		BCu = ROL_mult8(Eso, 56);
-		Ama =   BCa ^((~BCe)&  BCi );
-		Ame =   BCe ^((~BCi)&  BCo );
-		Ami =   BCi ^((~BCo)&  BCu );
-		Amo =   BCo ^((~BCu)&  BCa );
-		Amu =   BCu ^((~BCa)&  BCe );
-
-		Ebi ^= Di;
-		BCa = ROL(Ebi, 62);
-		Ego ^= Do;
-		BCe = ROL(Ego, 55);
-		Eku ^= Du;
-		BCi = ROL(Eku, 39);
-		Ema ^= Da;
-		BCo = ROL(Ema, 41);
-		Ese ^= De;
-		BCu = ROL(Ese, 2);
-		Asa =   BCa ^((~BCe)&  BCi );
-		Ase =   BCe ^((~BCi)&  BCo );
-		Asi =   BCi ^((~BCo)&  BCu );
-		Aso =   BCo ^((~BCu)&  BCa );
-		Asu =   BCu ^((~BCa)&  BCe );
-	}
-
-	if (validate) {
-		g_out += 4 * ((blockIdx.x * blockDim.x) + threadIdx.x);
-		g_out[3] = Abo;
-		g_out[2] = Abi;
-		g_out[1] = Abe;
-		g_out[0] = Aba;
-	}
-
-	// the likelyhood of meeting the hashing target is so low, that we're not guarding this
-	// with atomic writes, locks or similar...
-	uint64_t *g_good64 = (uint64_t*)g_good;
-	if (Abo <=  ptarget64[3]) {
-		if (Abo < g_good64[3]) {
-			g_good64[3] = Abo;
-			g_good64[2] = Abi;
-			g_good64[1] = Abe;
-			g_good64[0] = Aba;
-			g_good[8] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x);
-		}
-	}
-}
-
-static std::map<int, uint32_t *> context_good[2];
-
-bool default_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8])
-{
-	static bool init[MAX_GPUS] = { 0 };
-
-	if (!init[thr_id])
-	{
-		checkCudaErrors(cudaMemcpyToSymbol(KeccakF_RoundConstants, host_KeccakF_RoundConstants, sizeof(host_KeccakF_RoundConstants), 0, cudaMemcpyHostToDevice));
-
-		// allocate pinned host memory for good hashes
-		uint32_t *tmp;
-		checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[0][thr_id] = tmp;
-		checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[1][thr_id] = tmp;
-
-		init[thr_id] = true;
-	}
-	checkCudaErrors(cudaMemcpyToSymbol(pdata64, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
-	checkCudaErrors(cudaMemcpyToSymbol(ptarget64, host_ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
-
-	return context_good[0][thr_id] && context_good[1][thr_id];
-}
-
-void default_do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h)
-{
-	checkCudaErrors(cudaMemsetAsync(context_good[stream][thr_id], 0xff, 9 * sizeof(uint32_t), context_streams[stream][thr_id]));
-
-	crypto_hash<<<grid, threads, 0, context_streams[stream][thr_id]>>>((uint64_t*)context_hash[stream][thr_id], nonce, context_good[stream][thr_id], do_d2h);
-
-	// copy hashes from device memory to host (ALL hashes, lots of data...)
-	if (do_d2h && hash != NULL) {
-		size_t mem_size = throughput * sizeof(uint32_t) * 8;
-		checkCudaErrors(cudaMemcpyAsync(hash, context_hash[stream][thr_id], mem_size,
-						cudaMemcpyDeviceToHost, context_streams[stream][thr_id]));
-	}
-	else if (hash != NULL) {
-		// asynchronous copy of winning nonce (just 4 bytes...)
-		checkCudaErrors(cudaMemcpyAsync(hash, context_good[stream][thr_id]+8, sizeof(uint32_t),
-						cudaMemcpyDeviceToHost, context_streams[stream][thr_id]));
-	}
-}
--- a/scrypt/nv_kernel.cu
+++ b/scrypt/nv_kernel.cu
@@ -708,784 +708,3 @@ void nv_scrypt_core_kernelB_LG(uint32_t *g_odata, int begin, int end, unsigned i

 	__transposed_write_BC(B, C, (uint4*)(g_odata), 1);
 }
-
-
-
-//
-// Maxcoin related Keccak implementation (Keccak256)
-//
-
-// from salsa_kernel.cu
-extern std::map<int, int> context_blocks;
-extern std::map<int, int> context_wpb;
-extern std::map<int, KernelInterface *> context_kernel;
-extern std::map<int, cudaStream_t> context_streams[2];
-extern std::map<int, uint32_t *> context_hash[2];
-
-__constant__ uint64_t ptarget64[4];
-
-#define ROL(a, offset) ((((uint64_t)a) << ((offset) % 64)) ^ (((uint64_t)a) >> (64-((offset) % 64))))
-#define ROL_mult8(a, offset) ROL(a, offset)
-
-__constant__ uint64_t KeccakF_RoundConstants[24];
-static uint64_t host_KeccakF_RoundConstants[24] = {
-	(uint64_t)0x0000000000000001ULL,
-	(uint64_t)0x0000000000008082ULL,
-	(uint64_t)0x800000000000808aULL,
-	(uint64_t)0x8000000080008000ULL,
-	(uint64_t)0x000000000000808bULL,
-	(uint64_t)0x0000000080000001ULL,
-	(uint64_t)0x8000000080008081ULL,
-	(uint64_t)0x8000000000008009ULL,
-	(uint64_t)0x000000000000008aULL,
-	(uint64_t)0x0000000000000088ULL,
-	(uint64_t)0x0000000080008009ULL,
-	(uint64_t)0x000000008000000aULL,
-	(uint64_t)0x000000008000808bULL,
-	(uint64_t)0x800000000000008bULL,
-	(uint64_t)0x8000000000008089ULL,
-	(uint64_t)0x8000000000008003ULL,
-	(uint64_t)0x8000000000008002ULL,
-	(uint64_t)0x8000000000000080ULL,
-	(uint64_t)0x000000000000800aULL,
-	(uint64_t)0x800000008000000aULL,
-	(uint64_t)0x8000000080008081ULL,
-	(uint64_t)0x8000000000008080ULL,
-	(uint64_t)0x0000000080000001ULL,
-	(uint64_t)0x8000000080008008ULL
-};
-
-__constant__ uint64_t pdata64[10];
-
-static __device__ uint32_t cuda_swab32(uint32_t x)
-{
-	return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u)
-		  | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu));
-}
-
-__global__
-void kepler_crypto_hash( uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool validate )
-{
-	uint64_t Aba, Abe, Abi, Abo, Abu;
-	uint64_t Aga, Age, Agi, Ago, Agu;
-	uint64_t Aka, Ake, Aki, Ako, Aku;
-	uint64_t Ama, Ame, Ami, Amo, Amu;
-	uint64_t Asa, Ase, Asi, Aso, Asu;
-	uint64_t BCa, BCe, BCi, BCo, BCu;
-	uint64_t Da, De, Di, Do, Du;
-	uint64_t Eba, Ebe, Ebi, Ebo, Ebu;
-	uint64_t Ega, Ege, Egi, Ego, Egu;
-	uint64_t Eka, Eke, Eki, Eko, Eku;
-	uint64_t Ema, Eme, Emi, Emo, Emu;
-	uint64_t Esa, Ese, Esi, Eso, Esu;
-
-	//copyFromState(A, state)
-	Aba = pdata64[0];
-	Abe = pdata64[1];
-	Abi = pdata64[2];
-	Abo = pdata64[3];
-	Abu = pdata64[4];
-	Aga = pdata64[5];
-	Age = pdata64[6];
-	Agi = pdata64[7];
-	Ago = pdata64[8];
-	Agu = (pdata64[9] & 0x00000000FFFFFFFFULL) | (((uint64_t)cuda_swab32(nonce + ((blockIdx.x * blockDim.x) + threadIdx.x))) << 32);
-	Aka = 0x0000000000000001ULL;
-	Ake = 0;
-	Aki = 0;
-	Ako = 0;
-	Aku = 0;
-	Ama = 0;
-	Ame = 0x8000000000000000ULL;
-	Ami = 0;
-	Amo = 0;
-	Amu = 0;
-	Asa = 0;
-	Ase = 0;
-	Asi = 0;
-	Aso = 0;
-	Asu = 0;
-
-#pragma unroll 12
-	for( int laneCount = 0; laneCount < 24; laneCount += 2 )
-	{
-		//    prepareTheta
-		BCa = Aba^Aga^Aka^Ama^Asa;
-		BCe = Abe^Age^Ake^Ame^Ase;
-		BCi = Abi^Agi^Aki^Ami^Asi;
-		BCo = Abo^Ago^Ako^Amo^Aso;
-		BCu = Abu^Agu^Aku^Amu^Asu;
-
-		//thetaRhoPiChiIotaPrepareTheta(round  , A, E)
-		Da = BCu^ROL(BCe, 1);
-		De = BCa^ROL(BCi, 1);
-		Di = BCe^ROL(BCo, 1);
-		Do = BCi^ROL(BCu, 1);
-		Du = BCo^ROL(BCa, 1);
-
-		Aba ^= Da;
-		BCa = Aba;
-		Age ^= De;
-		BCe = ROL(Age, 44);
-		Aki ^= Di;
-		BCi = ROL(Aki, 43);
-		Amo ^= Do;
-		BCo = ROL(Amo, 21);
-		Asu ^= Du;
-		BCu = ROL(Asu, 14);
-		Eba =   BCa ^((~BCe)&  BCi );
-		Eba ^= (uint64_t)KeccakF_RoundConstants[laneCount];
-		Ebe =   BCe ^((~BCi)&  BCo );
-		Ebi =   BCi ^((~BCo)&  BCu );
-		Ebo =   BCo ^((~BCu)&  BCa );
-		Ebu =   BCu ^((~BCa)&  BCe );
-
-		Abo ^= Do;
-		BCa = ROL(Abo, 28);
-		Agu ^= Du;
-		BCe = ROL(Agu, 20);
-		Aka ^= Da;
-		BCi = ROL(Aka,  3);
-		Ame ^= De;
-		BCo = ROL(Ame, 45);
-		Asi ^= Di;
-		BCu = ROL(Asi, 61);
-		Ega =   BCa ^((~BCe)&  BCi );
-		Ege =   BCe ^((~BCi)&  BCo );
-		Egi =   BCi ^((~BCo)&  BCu );
-		Ego =   BCo ^((~BCu)&  BCa );
-		Egu =   BCu ^((~BCa)&  BCe );
-
-		Abe ^= De;
-		BCa = ROL(Abe,  1);
-		Agi ^= Di;
-		BCe = ROL(Agi,  6);
-		Ako ^= Do;
-		BCi = ROL(Ako, 25);
-		Amu ^= Du;
-		BCo = ROL_mult8(Amu,  8);
-		Asa ^= Da;
-		BCu = ROL(Asa, 18);
-		Eka =   BCa ^((~BCe)&  BCi );
-		Eke =   BCe ^((~BCi)&  BCo );
-		Eki =   BCi ^((~BCo)&  BCu );
-		Eko =   BCo ^((~BCu)&  BCa );
-		Eku =   BCu ^((~BCa)&  BCe );
-
-		Abu ^= Du;
-		BCa = ROL(Abu, 27);
-		Aga ^= Da;
-		BCe = ROL(Aga, 36);
-		Ake ^= De;
-		BCi = ROL(Ake, 10);
-		Ami ^= Di;
-		BCo = ROL(Ami, 15);
-		Aso ^= Do;
-		BCu = ROL_mult8(Aso, 56);
-		Ema =   BCa ^((~BCe)&  BCi );
-		Eme =   BCe ^((~BCi)&  BCo );
-		Emi =   BCi ^((~BCo)&  BCu );
-		Emo =   BCo ^((~BCu)&  BCa );
-		Emu =   BCu ^((~BCa)&  BCe );
-
-		Abi ^= Di;
-		BCa = ROL(Abi, 62);
-		Ago ^= Do;
-		BCe = ROL(Ago, 55);
-		Aku ^= Du;
-		BCi = ROL(Aku, 39);
-		Ama ^= Da;
-		BCo = ROL(Ama, 41);
-		Ase ^= De;
-		BCu = ROL(Ase,  2);
-		Esa =   BCa ^((~BCe)&  BCi );
-		Ese =   BCe ^((~BCi)&  BCo );
-		Esi =   BCi ^((~BCo)&  BCu );
-		Eso =   BCo ^((~BCu)&  BCa );
-		Esu =   BCu ^((~BCa)&  BCe );
-
-		//    prepareTheta
-		BCa = Eba^Ega^Eka^Ema^Esa;
-		BCe = Ebe^Ege^Eke^Eme^Ese;
-		BCi = Ebi^Egi^Eki^Emi^Esi;
-		BCo = Ebo^Ego^Eko^Emo^Eso;
-		BCu = Ebu^Egu^Eku^Emu^Esu;
-
-		//thetaRhoPiChiIotaPrepareTheta(round+1, E, A)
-		Da = BCu^ROL(BCe, 1);
-		De = BCa^ROL(BCi, 1);
-		Di = BCe^ROL(BCo, 1);
-		Do = BCi^ROL(BCu, 1);
-		Du = BCo^ROL(BCa, 1);
-
-		Eba ^= Da;
-		BCa = Eba;
-		Ege ^= De;
-		BCe = ROL(Ege, 44);
-		Eki ^= Di;
-		BCi = ROL(Eki, 43);
-		Emo ^= Do;
-		BCo = ROL(Emo, 21);
-		Esu ^= Du;
-		BCu = ROL(Esu, 14);
-		Aba =   BCa ^((~BCe)&  BCi );
-		Aba ^= (uint64_t)KeccakF_RoundConstants[laneCount+1];
-		Abe =   BCe ^((~BCi)&  BCo );
-		Abi =   BCi ^((~BCo)&  BCu );
-		Abo =   BCo ^((~BCu)&  BCa );
-		Abu =   BCu ^((~BCa)&  BCe );
-
-		Ebo ^= Do;
-		BCa = ROL(Ebo, 28);
-		Egu ^= Du;
-		BCe = ROL(Egu, 20);
-		Eka ^= Da;
-		BCi = ROL(Eka, 3);
-		Eme ^= De;
-		BCo = ROL(Eme, 45);
-		Esi ^= Di;
-		BCu = ROL(Esi, 61);
-		Aga =   BCa ^((~BCe)&  BCi );
-		Age =   BCe ^((~BCi)&  BCo );
-		Agi =   BCi ^((~BCo)&  BCu );
-		Ago =   BCo ^((~BCu)&  BCa );
-		Agu =   BCu ^((~BCa)&  BCe );
-
-		Ebe ^= De;
-		BCa = ROL(Ebe, 1);
-		Egi ^= Di;
-		BCe = ROL(Egi, 6);
-		Eko ^= Do;
-		BCi = ROL(Eko, 25);
-		Emu ^= Du;
-		BCo = ROL_mult8(Emu, 8);
-		Esa ^= Da;
-		BCu = ROL(Esa, 18);
-		Aka =   BCa ^((~BCe)&  BCi );
-		Ake =   BCe ^((~BCi)&  BCo );
-		Aki =   BCi ^((~BCo)&  BCu );
-		Ako =   BCo ^((~BCu)&  BCa );
-		Aku =   BCu ^((~BCa)&  BCe );
-
-		Ebu ^= Du;
-		BCa = ROL(Ebu, 27);
-		Ega ^= Da;
-		BCe = ROL(Ega, 36);
-		Eke ^= De;
-		BCi = ROL(Eke, 10);
-		Emi ^= Di;
-		BCo = ROL(Emi, 15);
-		Eso ^= Do;
-		BCu = ROL_mult8(Eso, 56);
-		Ama =   BCa ^((~BCe)&  BCi );
-		Ame =   BCe ^((~BCi)&  BCo );
-		Ami =   BCi ^((~BCo)&  BCu );
-		Amo =   BCo ^((~BCu)&  BCa );
-		Amu =   BCu ^((~BCa)&  BCe );
-
-		Ebi ^= Di;
-		BCa = ROL(Ebi, 62);
-		Ego ^= Do;
-		BCe = ROL(Ego, 55);
-		Eku ^= Du;
-		BCi = ROL(Eku, 39);
-		Ema ^= Da;
-		BCo = ROL(Ema, 41);
-		Ese ^= De;
-		BCu = ROL(Ese, 2);
-		Asa =   BCa ^((~BCe)&  BCi );
-		Ase =   BCe ^((~BCi)&  BCo );
-		Asi =   BCi ^((~BCo)&  BCu );
-		Aso =   BCo ^((~BCu)&  BCa );
-		Asu =   BCu ^((~BCa)&  BCe );
-	}
-
-	if (validate) {
-		g_out += 4 * ((blockIdx.x * blockDim.x) + threadIdx.x);
-		g_out[3] = Abo;
-		g_out[2] = Abi;
-		g_out[1] = Abe;
-		g_out[0] = Aba;
-	}
-
-	// the likelyhood of meeting the hashing target is so low, that we're not guarding this
-	// with atomic writes, locks or similar...
-	uint64_t *g_good64 = (uint64_t*)g_good;
-	if (Abo <=  ptarget64[3]) {
-		if (Abo < g_good64[3]) {
-			g_good64[3] = Abo;
-			g_good64[2] = Abi;
-			g_good64[1] = Abe;
-			g_good64[0] = Aba;
-			g_good[8] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x);
-		}
-	}
-}
-
-static std::map<int, uint32_t *> context_good[2];
-
-bool NVKernel::prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8])
-{
-	static bool init[MAX_GPUS] = { 0 };
-
-	if (!init[thr_id])
-	{
-		checkCudaErrors(cudaMemcpyToSymbol(KeccakF_RoundConstants, host_KeccakF_RoundConstants, sizeof(host_KeccakF_RoundConstants), 0, cudaMemcpyHostToDevice));
-
-		// allocate pinned host memory for good hashes
-		uint32_t *tmp;
-		checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[0][thr_id] = tmp;
-		checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[1][thr_id] = tmp;
-
-		init[thr_id] = true;
-	}
-	checkCudaErrors(cudaMemcpyToSymbol(pdata64, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
-	checkCudaErrors(cudaMemcpyToSymbol(ptarget64, host_ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
-
-	return context_good[0][thr_id] && context_good[1][thr_id];
-}
-
-void NVKernel::do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h)
-{
-	checkCudaErrors(cudaMemsetAsync(context_good[stream][thr_id], 0xff, 9 * sizeof(uint32_t), context_streams[stream][thr_id]));
-
-	kepler_crypto_hash<<<grid, threads, 0, context_streams[stream][thr_id]>>>((uint64_t*)context_hash[stream][thr_id], nonce, context_good[stream][thr_id], do_d2h);
-
-	// copy hashes from device memory to host (ALL hashes, lots of data...)
-	if (do_d2h && hash != NULL) {
-		size_t mem_size = throughput * sizeof(uint32_t) * 8;
-		checkCudaErrors(cudaMemcpyAsync(hash, context_hash[stream][thr_id], mem_size,
-						cudaMemcpyDeviceToHost, context_streams[stream][thr_id]));
-	}
-	else if (hash != NULL) {
-		// asynchronous copy of winning nonce (just 4 bytes...)
-		checkCudaErrors(cudaMemcpyAsync(hash, context_good[stream][thr_id]+8, sizeof(uint32_t),
-						cudaMemcpyDeviceToHost, context_streams[stream][thr_id]));
-	}
-}
-
-
-//
-// Blakecoin related Keccak implementation (Keccak256)
-//
-
-typedef uint32_t sph_u32;
-#define SPH_C32(x) ((sph_u32)(x))
-#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
-#if __CUDA_ARCH__ < 350
-	// Kepler (Compute 3.0)
-	#define SPH_ROTL32(a, b) ((a)<<(b))|((a)>>(32-(b)))
-#else
-	// Kepler (Compute 3.5)
-	#define SPH_ROTL32(a, b) __funnelshift_l( a, a, b );
-#endif
-#define SPH_ROTR32(x, n)   SPH_ROTL32(x, (32 - (n)))
-
-__constant__ uint32_t pdata[20];
-
-#ifdef _MSC_VER
-#pragma warning (disable: 4146)
-#endif
-
-static __device__ sph_u32 cuda_sph_bswap32(sph_u32 x)
-{
-	return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u)
-		  | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu));
-}
-
-/**
- * Encode a 32-bit value into the provided buffer (big endian convention).
- *
- * @param dst   the destination buffer
- * @param val   the 32-bit value to encode
- */
-static __device__ void
-cuda_sph_enc32be(void *dst, sph_u32 val)
-{
-	*(sph_u32 *)dst = cuda_sph_bswap32(val);
-}
-
-#define Z00   0
-#define Z01   1
-#define Z02   2
-#define Z03   3
-#define Z04   4
-#define Z05   5
-#define Z06   6
-#define Z07   7
-#define Z08   8
-#define Z09   9
-#define Z0A   A
-#define Z0B   B
-#define Z0C   C
-#define Z0D   D
-#define Z0E   E
-#define Z0F   F
-
-#define Z10   E
-#define Z11   A
-#define Z12   4
-#define Z13   8
-#define Z14   9
-#define Z15   F
-#define Z16   D
-#define Z17   6
-#define Z18   1
-#define Z19   C
-#define Z1A   0
-#define Z1B   2
-#define Z1C   B
-#define Z1D   7
-#define Z1E   5
-#define Z1F   3
-
-#define Z20   B
-#define Z21   8
-#define Z22   C
-#define Z23   0
-#define Z24   5
-#define Z25   2
-#define Z26   F
-#define Z27   D
-#define Z28   A
-#define Z29   E
-#define Z2A   3
-#define Z2B   6
-#define Z2C   7
-#define Z2D   1
-#define Z2E   9
-#define Z2F   4
-
-#define Z30   7
-#define Z31   9
-#define Z32   3
-#define Z33   1
-#define Z34   D
-#define Z35   C
-#define Z36   B
-#define Z37   E
-#define Z38   2
-#define Z39   6
-#define Z3A   5
-#define Z3B   A
-#define Z3C   4
-#define Z3D   0
-#define Z3E   F
-#define Z3F   8
-
-#define Z40   9
-#define Z41   0
-#define Z42   5
-#define Z43   7
-#define Z44   2
-#define Z45   4
-#define Z46   A
-#define Z47   F
-#define Z48   E
-#define Z49   1
-#define Z4A   B
-#define Z4B   C
-#define Z4C   6
-#define Z4D   8
-#define Z4E   3
-#define Z4F   D
-
-#define Z50   2
-#define Z51   C
-#define Z52   6
-#define Z53   A
-#define Z54   0
-#define Z55   B
-#define Z56   8
-#define Z57   3
-#define Z58   4
-#define Z59   D
-#define Z5A   7
-#define Z5B   5
-#define Z5C   F
-#define Z5D   E
-#define Z5E   1
-#define Z5F   9
-
-#define Z60   C
-#define Z61   5
-#define Z62   1
-#define Z63   F
-#define Z64   E
-#define Z65   D
-#define Z66   4
-#define Z67   A
-#define Z68   0
-#define Z69   7
-#define Z6A   6
-#define Z6B   3
-#define Z6C   9
-#define Z6D   2
-#define Z6E   8
-#define Z6F   B
-
-#define Z70   D
-#define Z71   B
-#define Z72   7
-#define Z73   E
-#define Z74   C
-#define Z75   1
-#define Z76   3
-#define Z77   9
-#define Z78   5
-#define Z79   0
-#define Z7A   F
-#define Z7B   4
-#define Z7C   8
-#define Z7D   6
-#define Z7E   2
-#define Z7F   A
-
-#define Z80   6
-#define Z81   F
-#define Z82   E
-#define Z83   9
-#define Z84   B
-#define Z85   3
-#define Z86   0
-#define Z87   8
-#define Z88   C
-#define Z89   2
-#define Z8A   D
-#define Z8B   7
-#define Z8C   1
-#define Z8D   4
-#define Z8E   A
-#define Z8F   5
-
-#define Z90   A
-#define Z91   2
-#define Z92   8
-#define Z93   4
-#define Z94   7
-#define Z95   6
-#define Z96   1
-#define Z97   5
-#define Z98   F
-#define Z99   B
-#define Z9A   9
-#define Z9B   E
-#define Z9C   3
-#define Z9D   C
-#define Z9E   D
-#define Z9F   0
-
-#define Mx(r, i)    Mx_(Z ## r ## i)
-#define Mx_(n)      Mx__(n)
-#define Mx__(n)     M ## n
-
-#define CSx(r, i)   CSx_(Z ## r ## i)
-#define CSx_(n)     CSx__(n)
-#define CSx__(n)    CS ## n
-
-#define CS0   SPH_C32(0x243F6A88)
-#define CS1   SPH_C32(0x85A308D3)
-#define CS2   SPH_C32(0x13198A2E)
-#define CS3   SPH_C32(0x03707344)
-#define CS4   SPH_C32(0xA4093822)
-#define CS5   SPH_C32(0x299F31D0)
-#define CS6   SPH_C32(0x082EFA98)
-#define CS7   SPH_C32(0xEC4E6C89)
-#define CS8   SPH_C32(0x452821E6)
-#define CS9   SPH_C32(0x38D01377)
-#define CSA   SPH_C32(0xBE5466CF)
-#define CSB   SPH_C32(0x34E90C6C)
-#define CSC   SPH_C32(0xC0AC29B7)
-#define CSD   SPH_C32(0xC97C50DD)
-#define CSE   SPH_C32(0x3F84D5B5)
-#define CSF   SPH_C32(0xB5470917)
-
-#define GS(m0, m1, c0, c1, a, b, c, d)   do { \
-		a = SPH_T32(a + b + (m0 ^ c1)); \
-		d = SPH_ROTR32(d ^ a, 16); \
-		c = SPH_T32(c + d); \
-		b = SPH_ROTR32(b ^ c, 12); \
-		a = SPH_T32(a + b + (m1 ^ c0)); \
-		d = SPH_ROTR32(d ^ a, 8); \
-		c = SPH_T32(c + d); \
-		b = SPH_ROTR32(b ^ c, 7); \
-	} while (0)
-
-#define ROUND_S(r)   do { \
-		GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \
-		GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \
-		GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \
-		GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \
-		GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \
-		GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \
-		GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \
-		GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \
-	} while (0)
-
-#define COMPRESS32   do { \
-		sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \
-		sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \
-		sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \
-		sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \
-		V0 = H0; \
-		V1 = H1; \
-		V2 = H2; \
-		V3 = H3; \
-		V4 = H4; \
-		V5 = H5; \
-		V6 = H6; \
-		V7 = H7; \
-		V8 = S0 ^ CS0; \
-		V9 = S1 ^ CS1; \
-		VA = S2 ^ CS2; \
-		VB = S3 ^ CS3; \
-		VC = T0 ^ CS4; \
-		VD = T0 ^ CS5; \
-		VE = T1 ^ CS6; \
-		VF = T1 ^ CS7; \
-		M0 = input[0]; \
-		M1 = input[1]; \
-		M2 = input[2]; \
-		M3 = input[3]; \
-		M4 = input[4]; \
-		M5 = input[5]; \
-		M6 = input[6]; \
-		M7 = input[7]; \
-		M8 = input[8]; \
-		M9 = input[9]; \
-		MA = input[10]; \
-		MB = input[11]; \
-		MC = input[12]; \
-		MD = input[13]; \
-		ME = input[14]; \
-		MF = input[15]; \
-		ROUND_S(0); \
-		ROUND_S(1); \
-		ROUND_S(2); \
-		ROUND_S(3); \
-		ROUND_S(4); \
-		ROUND_S(5); \
-		ROUND_S(6); \
-		ROUND_S(7); \
-		H0 ^= S0 ^ V0 ^ V8; \
-		H1 ^= S1 ^ V1 ^ V9; \
-		H2 ^= S2 ^ V2 ^ VA; \
-		H3 ^= S3 ^ V3 ^ VB; \
-		H4 ^= S0 ^ V4 ^ VC; \
-		H5 ^= S1 ^ V5 ^ VD; \
-		H6 ^= S2 ^ V6 ^ VE; \
-		H7 ^= S3 ^ V7 ^ VF; \
-	} while (0)
-
-
-__global__
-void kepler_blake256_hash( uint64_t *g_out, uint32_t nonce, uint32_t *g_good, bool validate)
-{
-	uint32_t input[16];
-	uint64_t output[4];
-
-	#pragma unroll
-	for (int i=0; i < 16; ++i) input[i] = pdata[i];
-
-	sph_u32 H0 = 0x6A09E667;
-	sph_u32 H1 = 0xBB67AE85;
-	sph_u32 H2 = 0x3C6EF372;
-	sph_u32 H3 = 0xA54FF53A;
-	sph_u32 H4 = 0x510E527F;
-	sph_u32 H5 = 0x9B05688C;
-	sph_u32 H6 = 0x1F83D9AB;
-	sph_u32 H7 = 0x5BE0CD19;
-	sph_u32 S0 = 0;
-	sph_u32 S1 = 0;
-	sph_u32 S2 = 0;
-	sph_u32 S3 = 0;
-	sph_u32 T0 = 0;
-	sph_u32 T1 = 0;
-	T0 = SPH_T32(T0 + 512);
-	COMPRESS32;
-
-	#pragma unroll
-	for (int i=0; i < 3; ++i) input[i] = pdata[16+i];
-
-	input[3] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x);
-	input[4] = 0x80000000;
-
-	#pragma unroll 8
-	for (int i=5; i < 13; ++i) input[i] = 0;
-
-	input[13] = 0x00000001;
-	input[14] = T1;
-	input[15] = T0 + 128;
-
-	T0 = SPH_T32(T0 + 128);
-	COMPRESS32;
-
-	cuda_sph_enc32be((unsigned char*)output + 4*6, H6);
-	cuda_sph_enc32be((unsigned char*)output + 4*7, H7);
-	if (validate || output[3] <=  ptarget64[3])
-	{
-		// this data is only needed when we actually need to save the hashes
-		cuda_sph_enc32be((unsigned char*)output + 4*0, H0);
-		cuda_sph_enc32be((unsigned char*)output + 4*1, H1);
-		cuda_sph_enc32be((unsigned char*)output + 4*2, H2);
-		cuda_sph_enc32be((unsigned char*)output + 4*3, H3);
-		cuda_sph_enc32be((unsigned char*)output + 4*4, H4);
-		cuda_sph_enc32be((unsigned char*)output + 4*5, H5);
-	}
-
-	if (validate)
-	{
-		g_out += 4 * ((blockIdx.x * blockDim.x) + threadIdx.x);
-		#pragma unroll
-		for (int i=0; i < 4; ++i) g_out[i] = output[i];
-	}
-
-	if (output[3] <=  ptarget64[3]) {
-		uint64_t *g_good64 = (uint64_t*)g_good;
-		if (output[3] < g_good64[3]) {
-			g_good64[3] = output[3];
-			g_good64[2] = output[2];
-			g_good64[1] = output[1];
-			g_good64[0] = output[0];
-			g_good[8] = nonce + ((blockIdx.x * blockDim.x) + threadIdx.x);
-		}
-	}
-}
-
-bool NVKernel::prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8])
-{
-	static bool init[MAX_GPUS] = { 0 };
-
-	if (!init[thr_id])
-	{
-		// allocate pinned host memory for good hashes
-		uint32_t *tmp;
-		checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[0][thr_id] = tmp;
-		checkCudaErrors(cudaMalloc((void **) &tmp, 9*sizeof(uint32_t))); context_good[1][thr_id] = tmp;
-
-		init[thr_id] = true;
-	}
-	checkCudaErrors(cudaMemcpyToSymbol(pdata, host_pdata, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
-	checkCudaErrors(cudaMemcpyToSymbol(ptarget64, host_ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
-
-	return context_good[0][thr_id] && context_good[1][thr_id];
-}
-
-void NVKernel::do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h)
-{
-	checkCudaErrors(cudaMemsetAsync(context_good[stream][thr_id], 0xff, 9 * sizeof(uint32_t), context_streams[stream][thr_id]));
-
-	kepler_blake256_hash<<<grid, threads, 0, context_streams[stream][thr_id]>>>((uint64_t*)context_hash[stream][thr_id], nonce, context_good[stream][thr_id], do_d2h);
-
-	// copy hashes from device memory to host (ALL hashes, lots of data...)
-	if (do_d2h && hash != NULL) {
-		size_t mem_size = throughput * sizeof(uint32_t) * 8;
-		checkCudaErrors(cudaMemcpyAsync(hash, context_hash[stream][thr_id], mem_size,
-						cudaMemcpyDeviceToHost, context_streams[stream][thr_id]));
-	}
-	else if (hash != NULL) {
-		// asynchronous copy of winning nonce (just 4 bytes...)
-		checkCudaErrors(cudaMemcpyAsync(hash, context_good[stream][thr_id]+8, sizeof(uint32_t),
-						cudaMemcpyDeviceToHost, context_streams[stream][thr_id]));
-	}
-}
--- a/scrypt/nv_kernel.h
+++ b/scrypt/nv_kernel.h
@ -25,12 +25,6 @@ public:
 	virtual bool support_lookup_gap() { return true; }
 	virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeFourByte; }
 	virtual cudaFuncCache cache_config() { return cudaFuncCachePreferL1; }
-
-	virtual bool prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);
-	virtual void do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false);
-
-	virtual bool prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8]);
-	virtual void do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false);
 };

 #endif // #ifndef NV_KERNEL_H
--- a/scrypt/nv_kernel2.cu
+++ b/scrypt/nv_kernel2.cu
--- a/scrypt/nv_kernel2.h
+++ b/scrypt/nv_kernel2.h
@ -25,12 +25,6 @@ public:

 	virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeFourByte; }
 	virtual cudaFuncCache cache_config() { return cudaFuncCachePreferL1; }
-
-	virtual bool prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);
-	virtual void do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false);
-
-	virtual bool prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t host_ptarget[8]);
-	virtual void do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false);
 };

 #endif // #ifndef NV2_KERNEL_H
--- a/scrypt/salsa_kernel.cu
+++ b/scrypt/salsa_kernel.cu
@ -821,44 +821,6 @@ void cuda_scrypt_core(int thr_id, int stream, unsigned int N)
 	);
 }

-bool cuda_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8])
-{
-	return context_kernel[thr_id]->prepare_keccak256(thr_id, host_pdata, ptarget);
-}
-#if 0
-void cuda_do_keccak256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h)
-{
-	unsigned int GRID_BLOCKS = context_blocks[thr_id];
-	unsigned int WARPS_PER_BLOCK = context_wpb[thr_id];
-	unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu();
-
-	// setup execution parameters
-	dim3  grid(WU_PER_LAUNCH/WU_PER_BLOCK, 1, 1);
-	dim3  threads(THREADS_PER_WU*WU_PER_BLOCK, 1, 1);
-
-	context_kernel[thr_id]->do_keccak256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h);
-}
-#endif
-bool cuda_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8])
-{
-	return context_kernel[thr_id]->prepare_blake256(thr_id, host_pdata, ptarget);
-}
-
-#if 0
-void cuda_do_blake256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h)
-{
-	unsigned int GRID_BLOCKS = context_blocks[thr_id];
-	unsigned int WARPS_PER_BLOCK = context_wpb[thr_id];
-	unsigned int THREADS_PER_WU = context_kernel[thr_id]->threads_per_wu();
-
-	// setup execution parameters
-	dim3  grid(WU_PER_LAUNCH/WU_PER_BLOCK, 1, 1);
-	dim3  threads(THREADS_PER_WU*WU_PER_BLOCK, 1, 1);
-
-	context_kernel[thr_id]->do_blake256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h);
-}
-#endif
-
 void cuda_scrypt_DtoH(int thr_id, uint32_t *X, int stream, bool postSHA)
 {
 	unsigned int GRID_BLOCKS = context_blocks[thr_id];
--- a/scrypt/salsa_kernel.h
+++ b/scrypt/salsa_kernel.h
@ -58,20 +58,6 @@ extern void cuda_scrypt_DtoH(int thr_id, uint32_t *X, int stream, bool postSHA);
 extern bool cuda_scrypt_sync(int thr_id, int stream);
 extern void cuda_scrypt_flush(int thr_id, int stream);

-extern bool cuda_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);
-extern void cuda_do_keccak256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h);
-
-extern bool cuda_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);
-extern void cuda_do_blake256(int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h);
-
-extern bool default_prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);
-extern bool default_prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]);
-
-#ifdef __NVCC__
-extern void default_do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h);
-extern void default_do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h);
-#endif
-
 // If we're in C++ mode, we're either compiling .cu files or scrypt.cpp

 #ifdef __NVCC__
@ -101,20 +87,6 @@ public:
 	virtual bool support_lookup_gap() { return false; }
 	virtual cudaSharedMemConfig shared_mem_config() { return cudaSharedMemBankSizeDefault; }
 	virtual cudaFuncCache cache_config() { return cudaFuncCachePreferNone; }
-
-	virtual bool prepare_keccak256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]) {
-		return default_prepare_keccak256(thr_id, host_pdata, ptarget);
-	}
-	virtual void do_keccak256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false) {
-		default_do_keccak256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h);
-	}
-
-	virtual bool prepare_blake256(int thr_id, const uint32_t host_pdata[20], const uint32_t ptarget[8]) {
-		return default_prepare_blake256(thr_id, host_pdata, ptarget);
-	}
-	virtual void do_blake256(dim3 grid, dim3 threads, int thr_id, int stream, uint32_t *hash, uint32_t nonce, int throughput, bool do_d2h = false) {
-		default_do_blake256(grid, threads, thr_id, stream, hash, nonce, throughput, do_d2h);
-	}
 };

 // Not performing error checking is actually bad, but...