import and adapt alexis optimised keccak256 for SM 5+

and increase default intensity for these recent cards
2025-08-28 06:42:24 +00:00 · 2017-12-04 15:58:07 +01:00 · 2017-12-04 15:58:07 +01:00 · cf886b5907
commit cf886b5907
parent d19e2a1e62
10 changed files with 730 additions and 300 deletions
--- a/Algo256/cuda_keccak256.cu
+++ b/Algo256/cuda_keccak256.cu
@ -1,309 +1,381 @@
-#include "miner.h"
+/**
 * KECCAK-256 CUDA optimised implementation, based on ccminer-alexis code
 */
 #include <miner.h>
 extern "C" {
 #include <stdint.h>
 #include <memory.h>
 }
-#include "cuda_helper.h"
+#include <cuda_helper.h>
 #include <cuda_vectors.h>
-static const uint64_t host_keccak_round_constants[24] = {
+#define TPB52 1024
-	0x0000000000000001ull, 0x0000000000008082ull,
+#define TPB50 384
-	0x800000000000808aull, 0x8000000080008000ull,
+#define NPT 2
-	0x000000000000808bull, 0x0000000080000001ull,
+#define NBN 2
-	0x8000000080008081ull, 0x8000000000008009ull,
+
-	0x000000000000008aull, 0x0000000000000088ull,
+static uint32_t *d_nonces[MAX_GPUS];
-	0x0000000080008009ull, 0x000000008000000aull,
+static uint32_t *h_nonces[MAX_GPUS];
-	0x000000008000808bull, 0x800000000000008bull,
+
-	0x8000000000008089ull, 0x8000000000008003ull,
+__constant__ uint2 c_message48[6];
-	0x8000000000008002ull, 0x8000000000000080ull,
+__constant__ uint2 c_mid[17];
-	0x000000000000800aull, 0x800000008000000aull,
+
-	0x8000000080008081ull, 0x8000000000008080ull,
+__constant__ uint2 keccak_round_constants[24] = {
-	0x0000000080000001ull, 0x8000000080008008ull
+	{ 0x00000001, 0x00000000 }, { 0x00008082, 0x00000000 },	{ 0x0000808a, 0x80000000 }, { 0x80008000, 0x80000000 },
 	{ 0x0000808b, 0x00000000 }, { 0x80000001, 0x00000000 },	{ 0x80008081, 0x80000000 }, { 0x00008009, 0x80000000 },
 	{ 0x0000008a, 0x00000000 }, { 0x00000088, 0x00000000 },	{ 0x80008009, 0x00000000 }, { 0x8000000a, 0x00000000 },
 	{ 0x8000808b, 0x00000000 }, { 0x0000008b, 0x80000000 },	{ 0x00008089, 0x80000000 }, { 0x00008003, 0x80000000 },
 	{ 0x00008002, 0x80000000 }, { 0x00000080, 0x80000000 },	{ 0x0000800a, 0x00000000 }, { 0x8000000a, 0x80000000 },
 	{ 0x80008081, 0x80000000 }, { 0x00008080, 0x80000000 },	{ 0x80000001, 0x00000000 }, { 0x80008008, 0x80000000 }
 };
 uint32_t *d_nounce[MAX_GPUS];
 uint32_t *d_KNonce[MAX_GPUS];
 __constant__ uint32_t pTarget[8];
 __constant__ uint64_t keccak_round_constants[24];
 __constant__ uint64_t c_PaddedMessage80[10]; // padded message (80 bytes + padding?)
 #if __CUDA_ARCH__ >= 350
 __device__ __forceinline__
-static void keccak_blockv35(uint2 *s, const uint64_t *keccak_round_constants)
+uint2 xor3x(const uint2 a,const uint2 b,const uint2 c) {
-{
+	uint2 result;
-	size_t i;
+#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050
-	uint2 t[5], u[5], v, w;
+	asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result.x) : "r"(a.x), "r"(b.x),"r"(c.x)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA
-
+	asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result.y) : "r"(a.y), "r"(b.y),"r"(c.y)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA
 	#pragma unroll
 	for (i = 0; i < 24; i++) {
 		/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
 		t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
 		t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
 		t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
 		t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
 		t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
 		/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
 		u[0] = t[4] ^ ROL2(t[1], 1);
 		u[1] = t[0] ^ ROL2(t[2], 1);
 		u[2] = t[1] ^ ROL2(t[3], 1);
 		u[3] = t[2] ^ ROL2(t[4], 1);
 		u[4] = t[3] ^ ROL2(t[0], 1);
 		/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
 		s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
 		s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
 		s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
 		s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
 		s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];
 		/* rho pi: b[..] = rotl(a[..], ..) */
 		v = s[1];
 		s[1] = ROL2(s[6], 44);
 		s[6] = ROL2(s[9], 20);
 		s[9] = ROL2(s[22], 61);
 		s[22] = ROL2(s[14], 39);
 		s[14] = ROL2(s[20], 18);
 		s[20] = ROL2(s[2], 62);
 		s[2] = ROL2(s[12], 43);
 		s[12] = ROL2(s[13], 25);
 		s[13] = ROL2(s[19], 8);
 		s[19] = ROL2(s[23], 56);
 		s[23] = ROL2(s[15], 41);
 		s[15] = ROL2(s[4], 27);
 		s[4] = ROL2(s[24], 14);
 		s[24] = ROL2(s[21], 2);
 		s[21] = ROL2(s[8], 55);
 		s[8] = ROL2(s[16], 45);
 		s[16] = ROL2(s[5], 36);
 		s[5] = ROL2(s[3], 28);
 		s[3] = ROL2(s[18], 21);
 		s[18] = ROL2(s[17], 15);
 		s[17] = ROL2(s[11], 10);
 		s[11] = ROL2(s[7], 6);
 		s[7] = ROL2(s[10], 3);
 		s[10] = ROL2(v, 1);
 		/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
 		v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w;
 		v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w;
 		v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w;
 		v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w;
 		v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w;
 		/* iota: a[0,0] ^= round constant */
 		s[0] ^= vectorize(keccak_round_constants[i]);
 	}
 }
 #else
-
+	result = a^b^c;
 __device__ __forceinline__
 static void keccak_blockv30(uint64_t *s, const uint64_t *keccak_round_constants)
 {
 	size_t i;
 	uint64_t t[5], u[5], v, w;
 	/* absorb input */
 	for (i = 0; i < 24; i++) {
 		/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
 		t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
 		t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
 		t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
 		t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
 		t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
 		/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
 		u[0] = t[4] ^ ROTL64(t[1], 1);
 		u[1] = t[0] ^ ROTL64(t[2], 1);
 		u[2] = t[1] ^ ROTL64(t[3], 1);
 		u[3] = t[2] ^ ROTL64(t[4], 1);
 		u[4] = t[3] ^ ROTL64(t[0], 1);
 		/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
 		s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
 		s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
 		s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
 		s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
 		s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];
 		/* rho pi: b[..] = rotl(a[..], ..) */
 		v = s[ 1];
 		s[ 1] = ROTL64(s[ 6], 44);
 		s[ 6] = ROTL64(s[ 9], 20);
 		s[ 9] = ROTL64(s[22], 61);
 		s[22] = ROTL64(s[14], 39);
 		s[14] = ROTL64(s[20], 18);
 		s[20] = ROTL64(s[ 2], 62);
 		s[ 2] = ROTL64(s[12], 43);
 		s[12] = ROTL64(s[13], 25);
 		s[13] = ROTL64(s[19],  8);
 		s[19] = ROTL64(s[23], 56);
 		s[23] = ROTL64(s[15], 41);
 		s[15] = ROTL64(s[ 4], 27);
 		s[ 4] = ROTL64(s[24], 14);
 		s[24] = ROTL64(s[21],  2);
 		s[21] = ROTL64(s[ 8], 55);
 		s[ 8] = ROTL64(s[16], 45);
 		s[16] = ROTL64(s[ 5], 36);
 		s[ 5] = ROTL64(s[ 3], 28);
 		s[ 3] = ROTL64(s[18], 21);
 		s[18] = ROTL64(s[17], 15);
 		s[17] = ROTL64(s[11], 10);
 		s[11] = ROTL64(s[ 7],  6);
 		s[ 7] = ROTL64(s[10],  3);
 		s[10] = ROTL64(    v,  1);
 		/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
 		v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w;
 		v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w;
 		v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w;
 		v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w;
 		v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w;
 		/* iota: a[0,0] ^= round constant */
 		s[0] ^= keccak_round_constants[i];
 	}
 }
 #endif
 __global__ __launch_bounds__(128,5)
 void keccak256_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint32_t nounce = startNounce + thread;
 #if __CUDA_ARCH__ >= 350
 		uint2 keccak_gpu_state[25];
 		#pragma unroll 25
 		for (int i=0; i<25; i++) {
 			if (i<9) keccak_gpu_state[i] = vectorize(c_PaddedMessage80[i]);
 			else     keccak_gpu_state[i] = make_uint2(0, 0);
 		}
 		keccak_gpu_state[9]= vectorize(c_PaddedMessage80[9]);
 		keccak_gpu_state[9].y = cuda_swab32(nounce);
 		keccak_gpu_state[10] = make_uint2(1, 0);
 		keccak_gpu_state[16] = make_uint2(0, 0x80000000);
 		keccak_blockv35(keccak_gpu_state,keccak_round_constants);
 		if (devectorize(keccak_gpu_state[3]) <= ((uint64_t*)pTarget)[3]) {resNounce[0] = nounce;}
 #else
 		uint64_t keccak_gpu_state[25];
 		#pragma unroll 25
 		for (int i=0; i<25; i++) {
 			if (i<9) keccak_gpu_state[i] = c_PaddedMessage80[i];
 			else     keccak_gpu_state[i] = 0;
 		}
 		keccak_gpu_state[9]  = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nounce));
 		keccak_gpu_state[10] = 0x0000000000000001;
 		keccak_gpu_state[16] = 0x8000000000000000;
 		keccak_blockv30(keccak_gpu_state, keccak_round_constants);
 		if (keccak_gpu_state[3] <= ((uint64_t*)pTarget)[3]) { resNounce[0] = nounce; }
 #endif
 	}
 }
 __host__
 uint32_t keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
 {
 	uint32_t result = UINT32_MAX;
 	cudaMemset(d_KNonce[thr_id], 0xff, sizeof(uint32_t));
 	const uint32_t threadsperblock = 128;
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 	size_t shared_size = 0;
 	keccak256_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash, d_KNonce[thr_id]);
 	MyStreamSynchronize(NULL, order, thr_id);
 	cudaMemcpy(d_nounce[thr_id], d_KNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	cudaThreadSynchronize();
 	result = *d_nounce[thr_id];
 	return result;
 }
-__global__ __launch_bounds__(256,3)
+__device__ __forceinline__
-void keccak256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash)
+uint2 chi(const uint2 a,const uint2 b,const uint2 c) { // keccak chi
-{
+	uint2 result;
-	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050
-	if (thread < threads)
+	asm ("lop3.b32 %0, %1, %2, %3, 0xD2;" : "=r"(result.x) : "r"(a.x), "r"(b.x),"r"(c.x)); //0xD2 = 0xF0 ^ ((~0xCC) & 0xAA)
-	{
+	asm ("lop3.b32 %0, %1, %2, %3, 0xD2;" : "=r"(result.y) : "r"(a.y), "r"(b.y),"r"(c.y)); //0xD2 = 0xF0 ^ ((~0xCC) & 0xAA)
 #if __CUDA_ARCH__ >= 350 /* tpr: to double check if faster on SM5+ */
 		uint2 keccak_gpu_state[25];
 		#pragma unroll 25
 		for (int i = 0; i<25; i++) {
 			if (i<4) keccak_gpu_state[i] = vectorize(outputHash[i*threads+thread]);
 			else     keccak_gpu_state[i] = make_uint2(0, 0);
 		}
 		keccak_gpu_state[4]  = make_uint2(1, 0);
 		keccak_gpu_state[16] = make_uint2(0, 0x80000000);
 		keccak_blockv35(keccak_gpu_state, keccak_round_constants);
 		#pragma unroll 4
 		for (int i=0; i<4; i++)
 			outputHash[i*threads+thread] = devectorize(keccak_gpu_state[i]);
 #else
-		uint64_t keccak_gpu_state[25];
+	result = a ^ (~b) & c;
 #endif
 	return result;
 }
 __device__ __forceinline__
 uint64_t xor5(uint64_t a, uint64_t b, uint64_t c, uint64_t d, uint64_t e)
 {
 	uint64_t result;
 	asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(d) ,"l"(e));
 	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(c));
 	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(b));
 	asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a));
 	return result;
 }
 #if __CUDA_ARCH__ <= 500
 __global__ __launch_bounds__(TPB50, 2)
 #else
 __global__ __launch_bounds__(TPB52, 1)
 #endif
 void keccak256_gpu_hash_80(uint32_t threads, uint32_t startNonce, uint32_t *resNounce, const uint2 highTarget)
 {
 	uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
 	uint2 s[25], t[5], v, w, u[5];
 #if __CUDA_ARCH__ > 500
 	uint64_t step     = gridDim.x * blockDim.x;
 	uint64_t maxNonce = startNonce + threads;
 	for(uint64_t nounce = startNonce + thread; nounce<maxNonce;nounce+=step) {
 #else
 	uint32_t nounce = startNonce+thread;
 	if(thread<threads) {
 #endif
 		s[ 9] = make_uint2(c_message48[0].x,cuda_swab32(nounce));
 		s[10] = keccak_round_constants[0];
 		t[ 4] = c_message48[1]^s[ 9];
 		/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
 		u[ 0] = t[4] ^ c_mid[ 0];
 		u[ 1] = c_mid[ 1] ^ ROL2(t[4],1);
 		u[ 2] = c_mid[ 2];
 		/* thetarho pi: b[..] = rotl(a[..] ^ d[...], ..)*/
 		s[ 7] = ROL2(s[10]^u[0], 3);
 		s[10] = c_mid[ 3];
 		    w = c_mid[ 4];
 		s[20] = c_mid[ 5];
 		s[ 6] = ROL2(s[ 9]^u[2],20);
 		s[ 9] = c_mid[ 6];
 		s[22] = c_mid[ 7];
 		s[14] = ROL2(u[0],18);
 		s[ 2] = c_mid[ 8];
 		s[12] = ROL2(u[1],25);
 		s[13] = c_mid[ 9];
 		s[19] = ROR8(u[1]);
 		s[23] = ROR2(u[0],23);
 		s[15] = c_mid[10];
 		s[ 4] = c_mid[11];
 		s[24] = c_mid[12];
 		s[21] = ROR2(c_message48[2]^u[1], 9);
 		s[ 8] = c_mid[13];
 		s[16] = ROR2(c_message48[3]^u[0],28);
 		s[ 5] = ROL2(c_message48[4]^u[1],28);
 		s[ 3] = ROL2(u[1],21);
 		s[18] = c_mid[14];
 		s[17] = c_mid[15];
 		s[11] = c_mid[16];
 		/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
 		v = c_message48[5]^u[0];
 		s[ 0] = chi(v,w,s[ 2]);
 		s[ 1] = chi(w,s[ 2],s[ 3]);
 		s[ 2] = chi(s[ 2],s[ 3],s[ 4]);
 		s[ 3] = chi(s[ 3],s[ 4],v);
 		s[ 4] = chi(s[ 4],v,w);
 		v = s[ 5]; w = s[ 6]; s[ 5] = chi(v,w,s[ 7]); s[ 6] = chi(w,s[ 7],s[ 8]); s[ 7] = chi(s[ 7],s[ 8],s[ 9]); s[ 8] = chi(s[ 8],s[ 9],v);s[ 9] = chi(s[ 9],v,w);
 		v = s[10]; w = s[11]; s[10] = chi(v,w,s[12]); s[11] = chi(w,s[12],s[13]); s[12] = chi(s[12],s[13],s[14]); s[13] = chi(s[13],s[14],v);s[14] = chi(s[14],v,w);
 		v = s[15]; w = s[16]; s[15] = chi(v,w,s[17]); s[16] = chi(w,s[17],s[18]); s[17] = chi(s[17],s[18],s[19]); s[18] = chi(s[18],s[19],v);s[19] = chi(s[19],v,w);
 		v = s[20]; w = s[21]; s[20] = chi(v,w,s[22]); s[21] = chi(w,s[22],s[23]); s[22] = chi(s[22],s[23],s[24]); s[23] = chi(s[23],s[24],v);s[24] = chi(s[24],v,w);
 		/* iota: a[0,0] ^= round constant */
 		s[ 0] ^=keccak_round_constants[ 0];
 		#if __CUDA_ARCH__ > 500
 			#pragma unroll 22
 		#else
 			#pragma unroll 4
 		#endif
 		for (int i = 1; i < 23; i++) {
 			#pragma unroll
 			for(int j=0;j<5;j++) {
 				t[ j] = vectorize(xor5(devectorize(s[ j]),devectorize(s[j+5]),devectorize(s[j+10]),devectorize(s[j+15]),devectorize(s[j+20])));
 			}
 			/*theta*/
 			#pragma unroll
 			for(int j=0;j<5;j++) {
 				u[j] = ROL2(t[j], 1);
 			}
 			s[ 4] = xor3x(s[ 4], t[3], u[0]);s[ 9] = xor3x(s[ 9], t[3], u[0]);s[14] = xor3x(s[14], t[3], u[0]);s[19] = xor3x(s[19], t[3], u[0]);s[24] = xor3x(s[24], t[3], u[0]);
 			s[ 0] = xor3x(s[ 0], t[4], u[1]);s[ 5] = xor3x(s[ 5], t[4], u[1]);s[10] = xor3x(s[10], t[4], u[1]);s[15] = xor3x(s[15], t[4], u[1]);s[20] = xor3x(s[20], t[4], u[1]);
 			s[ 1] = xor3x(s[ 1], t[0], u[2]);s[ 6] = xor3x(s[ 6], t[0], u[2]);s[11] = xor3x(s[11], t[0], u[2]);s[16] = xor3x(s[16], t[0], u[2]);s[21] = xor3x(s[21], t[0], u[2]);
 			s[ 2] = xor3x(s[ 2], t[1], u[3]);s[ 7] = xor3x(s[ 7], t[1], u[3]);s[12] = xor3x(s[12], t[1], u[3]);s[17] = xor3x(s[17], t[1], u[3]);s[22] = xor3x(s[22], t[1], u[3]);
 			s[ 3] = xor3x(s[ 3], t[2], u[4]);s[ 8] = xor3x(s[ 8], t[2], u[4]);s[13] = xor3x(s[13], t[2], u[4]);s[18] = xor3x(s[18], t[2], u[4]);s[23] = xor3x(s[23], t[2], u[4]);
 			/*rho pi: b[..] = rotl(a[..] ^ d[...], ..)*/
 			v = s[ 1];
 			s[ 1] = ROL2(s[ 6],44);	s[ 6] = ROL2(s[ 9],20);	s[ 9] = ROL2(s[22],61);	s[22] = ROL2(s[14],39);
 			s[14] = ROL2(s[20],18);	s[20] = ROL2(s[ 2],62);	s[ 2] = ROL2(s[12],43);	s[12] = ROL2(s[13],25);
 			s[13] = ROL8(s[19]);	s[19] = ROR8(s[23]);	s[23] = ROL2(s[15],41);	s[15] = ROL2(s[ 4],27);
 			s[ 4] = ROL2(s[24],14);	s[24] = ROL2(s[21], 2);	s[21] = ROL2(s[ 8],55);	s[ 8] = ROL2(s[16],45);
 			s[16] = ROL2(s[ 5],36);	s[ 5] = ROL2(s[ 3],28);	s[ 3] = ROL2(s[18],21);	s[18] = ROL2(s[17],15);
 			s[17] = ROL2(s[11],10);	s[11] = ROL2(s[ 7], 6);	s[ 7] = ROL2(s[10], 3);	s[10] = ROL2(v, 1);
 			/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
 			#pragma unroll
 			for(int j=0;j<25;j+=5) {
 				v=s[j];w=s[j + 1];s[j] = chi(s[j],s[j+1],s[j+2]);s[j+1] = chi(s[j+1],s[j+2],s[j+3]);s[j+2]=chi(s[j+2],s[j+3],s[j+4]);s[j+3]=chi(s[j+3],s[j+4],v);s[j+4]=chi(s[j+4],v,w);
 			}
 			/* iota: a[0,0] ^= round constant */
 			s[ 0] ^=keccak_round_constants[ i];
 		}
 		/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
 		#pragma unroll 5
 		for(int j=0;j<5;j++) {
 			t[ j] = xor3x(xor3x(s[j+0],s[j+5],s[j+10]), s[j+15], s[j+20]);
 		}
 		s[24] = xor3x(s[24],t[3],ROL2(t[0],1));
 		s[18] = xor3x(s[18],t[2],ROL2(t[4],1));
 		s[ 0] = xor3x(s[ 0],t[4],ROL2(t[1],1));
 		/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
 		s[24] = ROL2(s[24],14);
 		s[18] = ROL2(s[18],21);
 		if (devectorize(chi(s[18],s[24],s[ 0])) <= devectorize(highTarget)) {
 //		if(chi(s[18].x,s[24].x,s[0].x)<=highTarget.x) {
 //			if(chi(s[18].y,s[24].y,s[0].y)<=highTarget.y) {
 				const uint32_t tmp = atomicExch(&resNounce[0], nounce);
 				if (tmp != UINT32_MAX)
 					resNounce[1] = tmp;
 	//			return;
 //			}
 		}
 	}
 }
 __host__
 void keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t* resNonces, const uint2 highTarget)
 {
 	uint32_t tpb;
 	dim3 grid;
 	if (device_sm[device_map[thr_id]] <= 500) {
 		tpb = TPB50;
 		grid.x = (threads + tpb-1)/tpb;
 	} else {
 		tpb = TPB52;
 		grid.x = (threads + (NPT*tpb)-1)/(NPT*tpb);
 	}
 	const dim3 block(tpb);
 	keccak256_gpu_hash_80<<<grid, block>>>(threads, startNonce, d_nonces[thr_id], highTarget);
 //	cudaThreadSynchronize();
 	cudaMemcpy(h_nonces[thr_id], d_nonces[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	memcpy(resNonces, h_nonces[thr_id], NBN*sizeof(uint32_t));
 }
 #if __CUDA_ARCH__ <= 500
 __global__ __launch_bounds__(TPB50, 2)
 #else
 __global__ __launch_bounds__(TPB52, 1)
 #endif
 void keccak256_gpu_hash_32(uint32_t threads, uint2* outputHash)
 {
 	uint32_t thread   = blockDim.x * blockIdx.x + threadIdx.x;
 	uint2 s[25], t[5], v, w, u[5];
 	if(thread < threads) {
 		#pragma unroll 25
 		for (int i = 0; i<25; i++) {
-			if (i<4)
+			if (i<4) s[i] = __ldg(&outputHash[i*threads+thread]);
-				keccak_gpu_state[i] = outputHash[i*threads+thread];
+			else     s[i] = make_uint2(0, 0);
 			else
 				keccak_gpu_state[i] = 0;
 		}
-		keccak_gpu_state[4]  = 0x0000000000000001;
+		s[4]  = keccak_round_constants[ 0];
-		keccak_gpu_state[16] = 0x8000000000000000;
+		s[16] = make_uint2(0, 0x80000000);
-
+		#if __CUDA_ARCH__ > 500
-		keccak_blockv30(keccak_gpu_state, keccak_round_constants);
+			#pragma unroll
 		#else
 			#pragma unroll 4
-		for (int i = 0; i<4; i++)
+		#endif
-			outputHash[i*threads + thread] = keccak_gpu_state[i];
+		for (uint32_t i = 0; i < 23; i++) {
-#endif
+			/*theta*/
 			#pragma unroll 5
 			for(int j=0; j<5; j++) {
 				t[ j] = vectorize(xor5(devectorize(s[ j]),devectorize(s[j+5]),devectorize(s[j+10]),devectorize(s[j+15]),devectorize(s[j+20])));
 			}
 			/*theta*/
 			#pragma unroll 5
 			for(int j=0; j<5; j++) {
 				u[j] = ROL2(t[j], 1);
 			}
 			s[ 4] = xor3x(s[ 4], t[3], u[0]);s[ 9] = xor3x(s[ 9], t[3], u[0]);s[14] = xor3x(s[14], t[3], u[0]);s[19] = xor3x(s[19], t[3], u[0]);s[24] = xor3x(s[24], t[3], u[0]);
 			s[ 0] = xor3x(s[ 0], t[4], u[1]);s[ 5] = xor3x(s[ 5], t[4], u[1]);s[10] = xor3x(s[10], t[4], u[1]);s[15] = xor3x(s[15], t[4], u[1]);s[20] = xor3x(s[20], t[4], u[1]);
 			s[ 1] = xor3x(s[ 1], t[0], u[2]);s[ 6] = xor3x(s[ 6], t[0], u[2]);s[11] = xor3x(s[11], t[0], u[2]);s[16] = xor3x(s[16], t[0], u[2]);s[21] = xor3x(s[21], t[0], u[2]);
 			s[ 2] = xor3x(s[ 2], t[1], u[3]);s[ 7] = xor3x(s[ 7], t[1], u[3]);s[12] = xor3x(s[12], t[1], u[3]);s[17] = xor3x(s[17], t[1], u[3]);s[22] = xor3x(s[22], t[1], u[3]);
 			s[ 3] = xor3x(s[ 3], t[2], u[4]);s[ 8] = xor3x(s[ 8], t[2], u[4]);s[13] = xor3x(s[13], t[2], u[4]);s[18] = xor3x(s[18], t[2], u[4]);s[23] = xor3x(s[23], t[2], u[4]);
 			/*rho pi: b[..] = rotl(a[..] ^ d[...], ..)*/
 			v = s[ 1];
 			s[ 1] = ROL2(s[ 6],44); s[ 6] = ROL2(s[ 9],20); s[ 9] = ROL2(s[22],61); s[22] = ROL2(s[14],39);
 			s[14] = ROL2(s[20],18); s[20] = ROL2(s[ 2],62); s[ 2] = ROL2(s[12],43); s[12] = ROL2(s[13],25);
 			s[13] = ROL8(s[19]);    s[19] = ROR8(s[23]);    s[23] = ROL2(s[15],41); s[15] = ROL2(s[ 4],27);
 			s[ 4] = ROL2(s[24],14); s[24] = ROL2(s[21], 2); s[21] = ROL2(s[ 8],55); s[ 8] = ROL2(s[16],45);
 			s[16] = ROL2(s[ 5],36); s[ 5] = ROL2(s[ 3],28); s[ 3] = ROL2(s[18],21); s[18] = ROL2(s[17],15);
 			s[17] = ROL2(s[11],10); s[11] = ROL2(s[ 7], 6); s[ 7] = ROL2(s[10], 3); s[10] = ROL2(v, 1);
 			/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
 			#pragma unroll 5
 			for(int j=0; j<25; j+=5) {
 				v=s[j];w=s[j + 1]; s[j] = chi(v,w,s[j+2]); s[j+1] = chi(w,s[j+2],s[j+3]); s[j+2]=chi(s[j+2],s[j+3],s[j+4]); s[j+3]=chi(s[j+3],s[j+4],v); s[j+4]=chi(s[j+4],v,w);
 			}
 			/* iota: a[0,0] ^= round constant */
 			s[ 0] ^=keccak_round_constants[ i];
 		}
 		/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
 		#pragma unroll 5
 		for(int j=0;j<5;j++) {
 			t[ j] = xor3x(xor3x(s[j+0],s[j+5],s[j+10]), s[j+15], s[j+20]);
 		}
 		/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
 		#pragma unroll 5
 		for(int j=0;j<5;j++) {
 			u[j] = ROL2(t[j],1);
 		}
 		/* thetarho pi: b[..] = rotl(a[..] ^ d[...], ..) //There's no need to perform theta and -store- the result since it's unique for each a[..]*/
 		s[ 4] = xor3x(s[24], t[3], u[0]);
 		s[ 0] = xor3x(s[ 0], t[4], u[1]);
 		s[ 1] = xor3x(s[ 6], t[0], u[2]);
 		s[ 2] = xor3x(s[12], t[1], u[3]);
 		s[ 3] = xor3x(s[18], t[2], u[4]);
 		s[ 1] = ROR2(s[ 1],20);
 		s[ 2] = ROR2(s[ 2],21);
 		s[ 3] = ROL2(s[ 3],21);
 		s[ 4] = ROL2(s[ 4],14);
 		/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
 		outputHash[0*threads+thread] = chi(s[ 0],s[ 1],s[ 2]) ^ keccak_round_constants[23];
 		outputHash[1*threads+thread] = chi(s[ 1],s[ 2],s[ 3]);
 		outputHash[2*threads+thread] = chi(s[ 2],s[ 3],s[ 4]);
 		outputHash[3*threads+thread] = chi(s[ 3],s[ 4],s[ 0]);
 	}
 }
 __host__
-void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
+void keccak256_cpu_hash_32(const int thr_id,const uint32_t threads, uint2* d_hash)
 {
-	const uint32_t threadsperblock = 256;
+	uint32_t tpb = TPB52;
 	if (device_sm[device_map[thr_id]] == 500) tpb = TPB50;
 	const dim3 grid((threads + tpb-1)/tpb);
 	const dim3 block(tpb);
-	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	keccak256_gpu_hash_32 <<<grid, block>>> (threads, d_hash);
 	dim3 block(threadsperblock);
 	keccak256_gpu_hash_32 <<<grid, block>>> (threads, startNounce, d_outputHash);
 	MyStreamSynchronize(NULL, order, thr_id);
 }
 __host__
-void keccak256_setBlock_80(void *pdata,const void *pTargetIn)
+void keccak256_setBlock_80(uint64_t *endiandata)
 {
-	unsigned char PaddedMessage[80];
+	uint64_t midstate[17], s[25];
-	memcpy(PaddedMessage, pdata, 80);
+	uint64_t t[5], u[5];
-	CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, pTargetIn, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
+
-	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 10*sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
+	s[10] = 1; //(uint64_t)make_uint2(1, 0);
 	s[16] = ((uint64_t)1)<<63; //(uint64_t)make_uint2(0, 0x80000000);
 	t[0] = endiandata[0] ^ endiandata[5] ^ s[10];
 	t[1] = endiandata[1] ^ endiandata[6] ^ s[16];
 	t[2] = endiandata[2] ^ endiandata[7];
 	t[3] = endiandata[3] ^ endiandata[8];
 	midstate[ 0] = ROTL64(t[1], 1);         //u[0] -partial
 	       u[1] = t[ 0] ^ ROTL64(t[2], 1);  //u[1]
 	       u[2] = t[ 1] ^ ROTL64(t[3], 1);  //u[2]
 	midstate[ 1] = t[ 2];                   //u[3] -partial
 	midstate[ 2] = t[ 3] ^ ROTL64(t[0], 1); //u[4]
 	midstate[ 3] = ROTL64(endiandata[1]^u[1], 1); //v
 	midstate[ 4] = ROTL64(endiandata[6]^u[1], 44);
 	midstate[ 5] = ROTL64(endiandata[2]^u[2], 62);
 	midstate[ 6] = ROTL64(u[2], 61);
 	midstate[ 7] = ROTL64(midstate[2], 39);
 	midstate[ 8] = ROTL64(u[2], 43);
 	midstate[ 9] = ROTL64(midstate[2], 8);
 	midstate[10] = ROTL64(endiandata[4]^midstate[ 2],27);
 	midstate[11] = ROTL64(midstate[2], 14);
 	midstate[12] = ROTL64(u[1], 2);
 	midstate[13] = ROTL64(s[16] ^ u[1], 45);
 	midstate[14] = ROTL64(u[2],15);
 	midstate[15] = ROTL64(u[1],10);
 	midstate[16] = ROTL64(endiandata[7]^u[2], 6);
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_mid, midstate,17*sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
 	// pass only what's needed
 	uint64_t message48[6];
 	message48[0] = endiandata[9];
 	message48[1] = endiandata[4];
 	message48[2] = endiandata[8];
 	message48[3] = endiandata[5];
 	message48[4] = endiandata[3];
 	message48[5] = endiandata[0];
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_message48, message48, 6*sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
 }
 __host__
-void keccak256_cpu_init(int thr_id, uint32_t threads)
+void keccak256_cpu_init(int thr_id)
 {
-	CUDA_SAFE_CALL(cudaMemcpyToSymbol(keccak_round_constants, host_keccak_round_constants,
+	CUDA_SAFE_CALL(cudaMalloc(&d_nonces[thr_id], NBN*sizeof(uint32_t)));
-				sizeof(host_keccak_round_constants), 0, cudaMemcpyHostToDevice));
+	//CUDA_SAFE_CALL(cudaMallocHost(&h_nonces[thr_id], NBN*sizeof(uint32_t)));
-	CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], sizeof(uint32_t)));
+	h_nonces[thr_id] = (uint32_t*) malloc(NBN * sizeof(uint32_t));
-	CUDA_SAFE_CALL(cudaMallocHost(&d_nounce[thr_id], 1*sizeof(uint32_t)));
+	if(h_nonces[thr_id] == NULL) {
 		gpulog(LOG_ERR,thr_id,"Host memory allocation failed");
 		exit(EXIT_FAILURE);
 	}
 }
 __host__
 void keccak256_setOutput(int thr_id)
 {
 	CUDA_SAFE_CALL(cudaMemset(d_nonces[thr_id], 0xff, NBN*sizeof(uint32_t)));
 }
 __host__
 void keccak256_cpu_free(int thr_id)
 {
-	cudaFree(d_KNonce[thr_id]);
+	cudaFree(d_nonces[thr_id]);
-	cudaFreeHost(d_nounce[thr_id]);
+	//cudaFreeHost(h_nonces[thr_id]);
 	free(h_nonces[thr_id]);
 }
--- a/Algo256/cuda_keccak256_sm3.cu
+++ b/Algo256/cuda_keccak256_sm3.cu
@ -0,0 +1,309 @@
 #include "miner.h"
 extern "C" {
 #include <stdint.h>
 #include <memory.h>
 }
 #include "cuda_helper.h"
 static const uint64_t host_keccak_round_constants[24] = {
 	0x0000000000000001ull, 0x0000000000008082ull,
 	0x800000000000808aull, 0x8000000080008000ull,
 	0x000000000000808bull, 0x0000000080000001ull,
 	0x8000000080008081ull, 0x8000000000008009ull,
 	0x000000000000008aull, 0x0000000000000088ull,
 	0x0000000080008009ull, 0x000000008000000aull,
 	0x000000008000808bull, 0x800000000000008bull,
 	0x8000000000008089ull, 0x8000000000008003ull,
 	0x8000000000008002ull, 0x8000000000000080ull,
 	0x000000000000800aull, 0x800000008000000aull,
 	0x8000000080008081ull, 0x8000000000008080ull,
 	0x0000000080000001ull, 0x8000000080008008ull
 };
 static uint32_t *d_nounce[MAX_GPUS];
 static uint32_t *d_KNonce[MAX_GPUS];
 __constant__ uint32_t pTarget[8];
 __constant__ uint64_t keccak_round_constants[24];
 __constant__ uint64_t c_PaddedMessage80[10]; // padded message (80 bytes + padding?)
 #if __CUDA_ARCH__ >= 350
 __device__ __forceinline__
 static void keccak_blockv35(uint2 *s, const uint64_t *keccak_round_constants)
 {
 	size_t i;
 	uint2 t[5], u[5], v, w;
 	#pragma unroll
 	for (i = 0; i < 24; i++) {
 		/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
 		t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
 		t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
 		t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
 		t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
 		t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
 		/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
 		u[0] = t[4] ^ ROL2(t[1], 1);
 		u[1] = t[0] ^ ROL2(t[2], 1);
 		u[2] = t[1] ^ ROL2(t[3], 1);
 		u[3] = t[2] ^ ROL2(t[4], 1);
 		u[4] = t[3] ^ ROL2(t[0], 1);
 		/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
 		s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
 		s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
 		s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
 		s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
 		s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];
 		/* rho pi: b[..] = rotl(a[..], ..) */
 		v = s[1];
 		s[1] = ROL2(s[6], 44);
 		s[6] = ROL2(s[9], 20);
 		s[9] = ROL2(s[22], 61);
 		s[22] = ROL2(s[14], 39);
 		s[14] = ROL2(s[20], 18);
 		s[20] = ROL2(s[2], 62);
 		s[2] = ROL2(s[12], 43);
 		s[12] = ROL2(s[13], 25);
 		s[13] = ROL2(s[19], 8);
 		s[19] = ROL2(s[23], 56);
 		s[23] = ROL2(s[15], 41);
 		s[15] = ROL2(s[4], 27);
 		s[4] = ROL2(s[24], 14);
 		s[24] = ROL2(s[21], 2);
 		s[21] = ROL2(s[8], 55);
 		s[8] = ROL2(s[16], 45);
 		s[16] = ROL2(s[5], 36);
 		s[5] = ROL2(s[3], 28);
 		s[3] = ROL2(s[18], 21);
 		s[18] = ROL2(s[17], 15);
 		s[17] = ROL2(s[11], 10);
 		s[11] = ROL2(s[7], 6);
 		s[7] = ROL2(s[10], 3);
 		s[10] = ROL2(v, 1);
 		/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
 		v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w;
 		v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w;
 		v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w;
 		v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w;
 		v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w;
 		/* iota: a[0,0] ^= round constant */
 		s[0] ^= vectorize(keccak_round_constants[i]);
 	}
 }
 #else
 __device__ __forceinline__
 static void keccak_blockv30(uint64_t *s, const uint64_t *keccak_round_constants)
 {
 	size_t i;
 	uint64_t t[5], u[5], v, w;
 	/* absorb input */
 	for (i = 0; i < 24; i++) {
 		/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
 		t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
 		t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
 		t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
 		t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
 		t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
 		/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
 		u[0] = t[4] ^ ROTL64(t[1], 1);
 		u[1] = t[0] ^ ROTL64(t[2], 1);
 		u[2] = t[1] ^ ROTL64(t[3], 1);
 		u[3] = t[2] ^ ROTL64(t[4], 1);
 		u[4] = t[3] ^ ROTL64(t[0], 1);
 		/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
 		s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
 		s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
 		s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
 		s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
 		s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];
 		/* rho pi: b[..] = rotl(a[..], ..) */
 		v = s[ 1];
 		s[ 1] = ROTL64(s[ 6], 44);
 		s[ 6] = ROTL64(s[ 9], 20);
 		s[ 9] = ROTL64(s[22], 61);
 		s[22] = ROTL64(s[14], 39);
 		s[14] = ROTL64(s[20], 18);
 		s[20] = ROTL64(s[ 2], 62);
 		s[ 2] = ROTL64(s[12], 43);
 		s[12] = ROTL64(s[13], 25);
 		s[13] = ROTL64(s[19],  8);
 		s[19] = ROTL64(s[23], 56);
 		s[23] = ROTL64(s[15], 41);
 		s[15] = ROTL64(s[ 4], 27);
 		s[ 4] = ROTL64(s[24], 14);
 		s[24] = ROTL64(s[21],  2);
 		s[21] = ROTL64(s[ 8], 55);
 		s[ 8] = ROTL64(s[16], 45);
 		s[16] = ROTL64(s[ 5], 36);
 		s[ 5] = ROTL64(s[ 3], 28);
 		s[ 3] = ROTL64(s[18], 21);
 		s[18] = ROTL64(s[17], 15);
 		s[17] = ROTL64(s[11], 10);
 		s[11] = ROTL64(s[ 7],  6);
 		s[ 7] = ROTL64(s[10],  3);
 		s[10] = ROTL64(    v,  1);
 		/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
 		v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w;
 		v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w;
 		v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w;
 		v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w;
 		v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w;
 		/* iota: a[0,0] ^= round constant */
 		s[0] ^= keccak_round_constants[i];
 	}
 }
 #endif
 __global__ __launch_bounds__(128,5)
 void keccak256_sm3_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint32_t nounce = startNounce + thread;
 #if __CUDA_ARCH__ >= 350
 		uint2 keccak_gpu_state[25];
 		#pragma unroll 25
 		for (int i=0; i<25; i++) {
 			if (i<9) keccak_gpu_state[i] = vectorize(c_PaddedMessage80[i]);
 			else     keccak_gpu_state[i] = make_uint2(0, 0);
 		}
 		keccak_gpu_state[9]= vectorize(c_PaddedMessage80[9]);
 		keccak_gpu_state[9].y = cuda_swab32(nounce);
 		keccak_gpu_state[10] = make_uint2(1, 0);
 		keccak_gpu_state[16] = make_uint2(0, 0x80000000);
 		keccak_blockv35(keccak_gpu_state,keccak_round_constants);
 		if (devectorize(keccak_gpu_state[3]) <= ((uint64_t*)pTarget)[3]) {resNounce[0] = nounce;}
 #else
 		uint64_t keccak_gpu_state[25];
 		#pragma unroll 25
 		for (int i=0; i<25; i++) {
 			if (i<9) keccak_gpu_state[i] = c_PaddedMessage80[i];
 			else     keccak_gpu_state[i] = 0;
 		}
 		keccak_gpu_state[9]  = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nounce));
 		keccak_gpu_state[10] = 0x0000000000000001;
 		keccak_gpu_state[16] = 0x8000000000000000;
 		keccak_blockv30(keccak_gpu_state, keccak_round_constants);
 		if (keccak_gpu_state[3] <= ((uint64_t*)pTarget)[3]) { resNounce[0] = nounce; }
 #endif
 	}
 }
 __host__
 uint32_t keccak256_sm3_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
 {
 	uint32_t result = UINT32_MAX;
 	cudaMemset(d_KNonce[thr_id], 0xff, sizeof(uint32_t));
 	const uint32_t threadsperblock = 128;
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 	size_t shared_size = 0;
 	keccak256_sm3_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash, d_KNonce[thr_id]);
 	MyStreamSynchronize(NULL, order, thr_id);
 	cudaMemcpy(d_nounce[thr_id], d_KNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	cudaThreadSynchronize();
 	result = *d_nounce[thr_id];
 	return result;
 }
 __global__ __launch_bounds__(256,3)
 void keccak256_sm3_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 #if __CUDA_ARCH__ >= 350 /* tpr: to double check if faster on SM5+ */
 		uint2 keccak_gpu_state[25];
 		#pragma unroll 25
 		for (int i = 0; i<25; i++) {
 			if (i<4) keccak_gpu_state[i] = vectorize(outputHash[i*threads+thread]);
 			else     keccak_gpu_state[i] = make_uint2(0, 0);
 		}
 		keccak_gpu_state[4]  = make_uint2(1, 0);
 		keccak_gpu_state[16] = make_uint2(0, 0x80000000);
 		keccak_blockv35(keccak_gpu_state, keccak_round_constants);
 		#pragma unroll 4
 		for (int i=0; i<4; i++)
 			outputHash[i*threads+thread] = devectorize(keccak_gpu_state[i]);
 #else
 		uint64_t keccak_gpu_state[25];
 		#pragma unroll 25
 		for (int i = 0; i<25; i++) {
 			if (i<4)
 				keccak_gpu_state[i] = outputHash[i*threads+thread];
 			else
 				keccak_gpu_state[i] = 0;
 		}
 		keccak_gpu_state[4]  = 0x0000000000000001;
 		keccak_gpu_state[16] = 0x8000000000000000;
 		keccak_blockv30(keccak_gpu_state, keccak_round_constants);
 		#pragma unroll 4
 		for (int i = 0; i<4; i++)
 			outputHash[i*threads + thread] = keccak_gpu_state[i];
 #endif
 	}
 }
 __host__
 void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
 {
 	const uint32_t threadsperblock = 256;
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 	keccak256_sm3_gpu_hash_32 <<<grid, block>>> (threads, startNounce, d_outputHash);
 	MyStreamSynchronize(NULL, order, thr_id);
 }
 __host__
 void keccak256_sm3_setBlock_80(void *pdata,const void *pTargetIn)
 {
 	unsigned char PaddedMessage[80];
 	memcpy(PaddedMessage, pdata, 80);
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, pTargetIn, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 10*sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
 }
 __host__
 void keccak256_sm3_init(int thr_id, uint32_t threads)
 {
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(keccak_round_constants, host_keccak_round_constants,
 				sizeof(host_keccak_round_constants), 0, cudaMemcpyHostToDevice));
 	CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], sizeof(uint32_t)));
 	CUDA_SAFE_CALL(cudaMallocHost(&d_nounce[thr_id], 1*sizeof(uint32_t)));
 }
 __host__
 void keccak256_sm3_free(int thr_id)
 {
 	cudaFree(d_KNonce[thr_id]);
 	cudaFreeHost(d_nounce[thr_id]);
 }
--- a/Algo256/keccak256.cu
+++ b/Algo256/keccak256.cu
@ -16,10 +16,18 @@ extern "C"
 static uint32_t *d_hash[MAX_GPUS];
-extern void keccak256_cpu_init(int thr_id, uint32_t threads);
+// SM5+ cuda
 extern void keccak256_cpu_init(int thr_id);
 extern void keccak256_cpu_free(int thr_id);
-extern void keccak256_setBlock_80(void *pdata,const void *ptarget);
+extern void keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t* resNonces, const uint2 highTarget);
-extern uint32_t keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
+extern void keccak256_setBlock_80(uint64_t *endiandata);
 extern void keccak256_setOutput(int thr_id);
 // compat
 extern void keccak256_sm3_init(int thr_id, uint32_t threads);
 extern void keccak256_sm3_free(int thr_id);
 extern void keccak256_sm3_setBlock_80(void *pdata, const void *ptarget);
 extern uint32_t keccak256_sm3_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order);
 // CPU Hash
 extern "C" void keccak256_hash(void *state, const void *input)
@ -35,6 +43,7 @@ extern "C" void keccak256_hash(void *state, const void *input)
 }
 static bool init[MAX_GPUS] = { 0 };
 static bool use_compat_kernels[MAX_GPUS] = { 0 };
 extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 {
@ -42,6 +51,7 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
 	const int dev_id = device_map[thr_id];
 	uint32_t throughput = cuda_default_throughput(thr_id, 1U << 21); // 256*256*8*4
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
@ -50,17 +60,28 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no
 	if (!init[thr_id])
 	{
-		cudaSetDevice(device_map[thr_id]);
+		cudaSetDevice(dev_id);
 		if (opt_cudaschedule == -1 && gpu_threads == 1) {
 			cudaDeviceReset();
 			// reduce cpu usage
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
-		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
+		cuda_get_arch(thr_id);
 		use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
 		if(!use_compat_kernels[thr_id]) {
 			uint32_t intensity = 23;
 			if (strstr(device_name[dev_id], "GTX 1080")) intensity = 25;
 			throughput = cuda_default_throughput(thr_id, 1U << intensity);
 			keccak256_cpu_init(thr_id);
 		} else {
 			// really useful ?
 			CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], throughput * 64));
-		keccak256_cpu_init(thr_id, throughput);
+			keccak256_sm3_init(thr_id, throughput);
 		}
 		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 		init[thr_id] = true;
 	}
@ -69,13 +90,25 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no
 		be32enc(&endiandata[k], pdata[k]);
 	}
-	keccak256_setBlock_80((void*)endiandata, ptarget);
+	const uint2 highTarget = make_uint2(ptarget[6], ptarget[7]);
 	if(use_compat_kernels[thr_id])
 		keccak256_sm3_setBlock_80((void*)endiandata, ptarget);
 	else {
 		keccak256_setBlock_80((uint64_t*)endiandata);
 		keccak256_setOutput(thr_id);
 	}
 	do {
 		int order = 0;
 		*hashes_done = pdata[19] - first_nonce + throughput;
-		work->nonces[0] = keccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		if(use_compat_kernels[thr_id])
 			work->nonces[0] = keccak256_sm3_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		else {
 			keccak256_cpu_hash_80(thr_id, throughput, pdata[19], work->nonces, highTarget);
 		}
 		if (work->nonces[0] != UINT32_MAX && bench_algo < 0)
 		{
 			const uint32_t Htarg = ptarget[7];
@ -95,6 +128,7 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no
 				if (!opt_quiet)
 				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
 				pdata[19] = work->nonces[0] + 1;
 				//keccak256_setOutput(thr_id);
 				continue;
 			}
 		}
@ -120,9 +154,12 @@ extern "C" void free_keccak256(int thr_id)
 	cudaThreadSynchronize();
-	cudaFree(d_hash[thr_id]);
+	if(!use_compat_kernels[thr_id])
 		keccak256_cpu_free(thr_id);
 	else {
 		cudaFree(d_hash[thr_id]);
 		keccak256_sm3_free(thr_id);
 	}
 	cudaDeviceSynchronize();
 	init[thr_id] = false;
--- a/Makefile.am
+++ b/Makefile.am
@ -40,7 +40,8 @@ ccminer_SOURCES	= elist.h miner.h compat.h \
 		          lyra2/lyra2REv2.cu lyra2/cuda_lyra2v2.cu \
 			  lyra2/Lyra2Z.c lyra2/lyra2Z.cu lyra2/cuda_lyra2Z.cu \
 			  Algo256/cuda_bmw256.cu Algo256/cuda_cubehash256.cu \
-			  Algo256/cuda_blake256.cu Algo256/cuda_groestl256.cu Algo256/cuda_keccak256.cu Algo256/cuda_skein256.cu \
+			  Algo256/cuda_blake256.cu Algo256/cuda_groestl256.cu \
 			  Algo256/cuda_keccak256_sm3.cu Algo256/cuda_keccak256.cu Algo256/cuda_skein256.cu \
 			  Algo256/blake256.cu Algo256/decred.cu Algo256/vanilla.cu Algo256/keccak256.cu \
 			  Algo256/blake2s.cu sph/blake2s.c \
 			  Algo256/bmw.cu Algo256/cuda_bmw.cu \
--- a/README.txt
+++ b/README.txt
@ -1,5 +1,5 @@
-ccminer 2.2.3-dev (Nov. 2017) "polytimos algo"
+ccminer 2.2.3 (Dec. 2017)    "polytimos algo and keccakc (opt)"
 ---------------------------------------------------------------
 ***************************************************************
@ -93,6 +93,7 @@ its command line interface and options.
                          hsr         use to mine Hshare
                          jackpot     use to mine Sweepcoin
                          keccak      use to mine Maxcoin
                          keccakc     use to mine CreativeCoin
                          lbry        use to mine LBRY Credits
                          luffa       use to mine Joincoin
                          lyra2       use to mine CryptoCoin
@ -281,8 +282,10 @@ so we can more efficiently implement new algorithms using the latest hardware
 features.
 >>> RELEASE HISTORY <<<
-  Nov. 16th 2017  v2.2.3
+  Dec. 04th 2017  v2.2.3
                  Polytimos Algo
                  Handle keccakc variant (with refreshed sha256d merkle)
                  Optimised keccak for SM5+, based on alexis improvements
  Oct. 09th 2017  v2.2.2
                  Import and clean the hsr algo (x13 + custom hash)
--- a/ccminer.cpp
+++ b/ccminer.cpp
@ -255,7 +255,8 @@ Options:\n\
 			heavy       Heavycoin\n\
 			hmq1725     Doubloons / Espers\n\
 			jackpot     JHA v8\n\
-			keccak      Keccak-256 (Maxcoin)\n\
+			keccak      Deprecated Keccak-256\n\
 			keccakc     Keccak-256 (CreativeCoin)\n\
 			lbry        LBRY Credits (Sha/Ripemd)\n\
 			luffa       Joincoin\n\
 			lyra2       CryptoCoin\n\
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -475,7 +475,8 @@
    <CudaCompile Include="Algo256\cuda_cubehash256.cu" />
    <CudaCompile Include="Algo256\cuda_fugue256.cu" />
    <CudaCompile Include="Algo256\cuda_groestl256.cu" />
-    <CudaCompile Include="Algo256\cuda_keccak256.cu">
+    <CudaCompile Include="Algo256\cuda_keccak256.cu" />
    <CudaCompile Include="Algo256\cuda_keccak256_sm3.cu">
      <MaxRegCount>92</MaxRegCount>
    </CudaCompile>
    <CudaCompile Include="Algo256\cuda_skein256.cu" />
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -844,6 +844,9 @@
    <CudaCompile Include="Algo256\cuda_keccak256.cu">
      <Filter>Source Files\CUDA\Algo256</Filter>
    </CudaCompile>
    <CudaCompile Include="Algo256\cuda_keccak256_sm3.cu">
      <Filter>Source Files\CUDA\Algo256</Filter>
    </CudaCompile>
    <CudaCompile Include="Algo256\cuda_skein256.cu">
      <Filter>Source Files\CUDA\Algo256</Filter>
    </CudaCompile>
--- a/lyra2/lyra2RE.cu
+++ b/lyra2/lyra2RE.cu
@ -16,9 +16,10 @@ extern void blake256_cpu_init(int thr_id, uint32_t threads);
 extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
 extern void blake256_cpu_setBlock_80(uint32_t *pdata);
-extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
+extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
-extern void keccak256_cpu_init(int thr_id, uint32_t threads);
+extern void keccak256_sm3_init(int thr_id, uint32_t threads);
-extern void keccak256_cpu_free(int thr_id);
+extern void keccak256_sm3_free(int thr_id);
 extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
 extern void skein256_cpu_init(int thr_id, uint32_t threads);
@ -97,7 +98,7 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
 		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 		blake256_cpu_init(thr_id, throughput);
-		keccak256_cpu_init(thr_id, throughput);
+		keccak256_sm3_init(thr_id, throughput);
 		skein256_cpu_init(thr_id, throughput);
 		groestl256_cpu_init(thr_id, throughput);
@ -124,7 +125,7 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
 		int order = 0;
 		blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-		keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti);
 		skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
@ -186,7 +187,7 @@ extern "C" void free_lyra2(int thr_id)
 	cudaFree(d_hash[thr_id]);
 	cudaFree(d_matrix[thr_id]);
-	keccak256_cpu_free(thr_id);
+	keccak256_sm3_free(thr_id);
 	groestl256_cpu_free(thr_id);
 	init[thr_id] = false;
--- a/lyra2/lyra2REv2.cu
+++ b/lyra2/lyra2REv2.cu
@ -16,9 +16,11 @@ static uint64_t* d_matrix[MAX_GPUS];
 extern void blake256_cpu_init(int thr_id, uint32_t threads);
 extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
 extern void blake256_cpu_setBlock_80(uint32_t *pdata);
-extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
+
-extern void keccak256_cpu_init(int thr_id, uint32_t threads);
+extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
-extern void keccak256_cpu_free(int thr_id);
+extern void keccak256_sm3_init(int thr_id, uint32_t threads);
 extern void keccak256_sm3_free(int thr_id);
 extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
 extern void skein256_cpu_init(int thr_id, uint32_t threads);
 extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order);
@ -101,7 +103,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 		blake256_cpu_init(thr_id, throughput);
-		keccak256_cpu_init(thr_id,throughput);
+		keccak256_sm3_init(thr_id,throughput);
 		skein256_cpu_init(thr_id, throughput);
 		bmw256_cpu_init(thr_id, throughput);
@ -129,7 +131,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 		int order = 0;
 		blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-		keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		lyra2v2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
@ -194,7 +196,7 @@ extern "C" void free_lyra2v2(int thr_id)
 	cudaFree(d_matrix[thr_id]);
 	bmw256_cpu_free(thr_id);
-	keccak256_cpu_free(thr_id);
+	keccak256_sm3_free(thr_id);
 	init[thr_id] = false;