
handle new cryptonight variants: monero (v7), graft (v8), stellite (v3) and aeon (cryptolight)

Special thanks to klausT for his changes, and to ystarnaud who helped me adapt my kernel variants...

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Tanguy Pruvot, 6 years ago, commit 77c4b8724e (pull/5/head)
20 files changed (changed line count in parentheses):

  1. algos.h (31)
  2. ccminer.cpp (41)
  3. crypto/cn_aes.cuh (1)
  4. crypto/cn_blake.cuh (2)
  5. crypto/cn_groestl.cuh (9)
  6. crypto/cn_jh.cuh (20)
  7. crypto/cn_keccak.cuh (2)
  8. crypto/cn_skein.cuh (22)
  9. crypto/cryptolight-core.cu (86)
 10. crypto/cryptolight-cpu.cpp (39)
 11. crypto/cryptolight.cu (30)
 12. crypto/cryptolight.h (13)
 13. crypto/cryptonight-core.cu (236)
 14. crypto/cryptonight-cpu.cpp (62)
 15. crypto/cryptonight-extra.cu (175)
 16. crypto/cryptonight.cu (36)
 17. crypto/cryptonight.h (13)
 18. crypto/xmr-rpc.cpp (10)
 19. miner.h (14)
 20. util.cpp (10)

algos.h (31)

@@ -72,6 +72,9 @@ enum sha_algos {
 	ALGO_WHIRLPOOLX,
 	ALGO_WILDKECCAK,
 	ALGO_ZR5,
+	ALGO_MONERO,
+	ALGO_GRAFT,
+	ALGO_STELLITE,
 	ALGO_AUTO,
 	ALGO_COUNT
 };
@@ -146,6 +149,9 @@ static const char *algo_names[] = {
 	"whirlpoolx",
 	"wildkeccak",
 	"zr5",
+	"monero",
+	"graft",
+	"stellite",
 	"auto", /* reserved for multi algo */
 	""
 };
@@ -206,4 +212,29 @@ static inline int algo_to_int(char* arg)
 	return i;
 }

+static inline int get_cryptonight_algo(int fork)
+{
+	int algo = ALGO_COUNT;
+	switch (fork) {
+	case 8:
+		algo = ALGO_GRAFT;
+		break;
+	case 7:
+		algo = ALGO_MONERO;
+		break;
+	case 3:
+		algo = ALGO_STELLITE;
+		break;
+	default:
+		algo = ALGO_CRYPTONIGHT;
+		break;
+	}
+	return algo;
+}
+
 #endif

ccminer.cpp (41)

@@ -233,6 +233,8 @@ int opt_api_mcast_port = 4068;
 bool opt_stratum_stats = false;

+int cryptonight_fork = 1;
+
 static char const usage[] = "\
 Usage: " PROGRAM_NAME " [OPTIONS]\n\
 Options:\n\
@@ -245,7 +247,7 @@ Options:\n\
 			blakecoin   Fast Blake 256 (8 rounds)\n\
 			bmw         BMW 256\n\
 			cryptolight AEON cryptonight (MEM/2)\n\
-			cryptonight XMR cryptonight\n\
+			cryptonight XMR cryptonight v1 (old)\n\
 			c11/flax    X11 variant\n\
 			decred      Decred Blake256\n\
 			deep        Deepcoin\n\
@@ -253,6 +255,7 @@ Options:\n\
 			dmd-gr      Diamond-Groestl\n\
 			fresh       Freshcoin (shavite 80)\n\
 			fugue256    Fuguecoin\n\
+			graft       Cryptonight v8\n\
 			groestl     Groestlcoin\n"
 #ifdef WITH_HEAVY_ALGO
 "			heavy       Heavycoin\n"
@@ -267,6 +270,7 @@ Options:\n\
 			lyra2v2     VertCoin\n\
 			lyra2z      ZeroCoin (3rd impl)\n\
 			myr-gr      Myriad-Groestl\n\
+			monero      XMR cryptonight (v7)\n\
 			neoscrypt   FeatherCoin, Phoenix, UFO...\n\
 			nist5       NIST5 (TalkCoin)\n\
 			penta       Pentablake hash (5x Blake 512)\n\
@@ -284,6 +288,7 @@ Options:\n\
 			skein       Skein SHA2 (Skeincoin)\n\
 			skein2      Double Skein (Woodcoin)\n\
 			skunk       Skein Cube Fugue Streebog\n\
+			stellite    Cryptonight v3\n\
 			s3          S3 (1Coin)\n\
 			timetravel  Machinecoin permuted x8\n\
 			tribus      Denarius\n\
@@ -573,7 +578,10 @@ static bool get_blocktemplate(CURL *curl, struct work *work);
 void get_currentalgo(char* buf, int sz)
 {
-	snprintf(buf, sz, "%s", algo_names[opt_algo]);
+	int algo = opt_algo;
+	if (algo == ALGO_CRYPTONIGHT)
+		algo = get_cryptonight_algo(cryptonight_fork);
+	snprintf(buf, sz, "%s", algo_names[algo]);
 }

 void format_hashrate(double hashrate, char *output)
@@ -2372,11 +2380,16 @@ static void *miner_thread(void *userdata)
 		rc = scanhash_c11(thr_id, &work, max_nonce, &hashes_done);
 		break;
 	case ALGO_CRYPTOLIGHT:
-		rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done);
+		rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done, 1);
 		break;
 	case ALGO_CRYPTONIGHT:
-		rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done);
+	{
+		int cn_variant = 0;
+		if (cryptonight_fork > 1 && ((unsigned char*)work.data)[0] >= cryptonight_fork)
+			cn_variant = ((unsigned char*)work.data)[0] - cryptonight_fork + 1;
+		rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done, cn_variant);
 		break;
+	}
 	case ALGO_DECRED:
 		rc = scanhash_decred(thr_id, &work, max_nonce, &hashes_done);
 		break;
@@ -3138,6 +3151,26 @@ void parse_arg(int key, char *arg)
 			case ALGO_SCRYPT_JANE: opt_nfactor = 14; break;
 			}
 		}
+		// cryptonight variants
+		switch (opt_algo) {
+		case ALGO_MONERO:
+			opt_algo = ALGO_CRYPTONIGHT;
+			cryptonight_fork = 7;
+			break;
+		case ALGO_GRAFT:
+			opt_algo = ALGO_CRYPTONIGHT;
+			cryptonight_fork = 8;
+			break;
+		case ALGO_STELLITE:
+			opt_algo = ALGO_CRYPTONIGHT;
+			cryptonight_fork = 3;
+			break;
+		case ALGO_CRYPTONIGHT:
+			cryptonight_fork = 1;
+			break;
+		}
 		break;
 	case 'b':
 		p = strstr(arg, ":");
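A side note on the variant byte: at run time the miner reads the block's major version (the first byte of the work data) and rebases it on the coin's fork version, as in the miner_thread hunk above. A minimal stand-alone sketch of that rule follows; the helper name and the main() harness are illustrative additions, not part of the commit:

#include <stdio.h>

/* Mirrors the ALGO_CRYPTONIGHT case above: cryptonight_fork is the coin's
 * base block version (7 = monero, 8 = graft, 3 = stellite, 1 = generic). */
static int cn_variant_for(unsigned char blk_version, int cryptonight_fork)
{
    int cn_variant = 0;
    if (cryptonight_fork > 1 && blk_version >= cryptonight_fork)
        cn_variant = blk_version - cryptonight_fork + 1;
    return cn_variant;
}

int main(void)
{
    printf("%d\n", cn_variant_for(7, 7)); /* monero v7 block -> variant 1 */
    printf("%d\n", cn_variant_for(6, 7)); /* pre-fork block  -> variant 0 */
    printf("%d\n", cn_variant_for(4, 3)); /* stellite v4     -> variant 2 */
    return 0;
}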

crypto/cn_aes.cuh (1)

@@ -138,6 +138,7 @@ static const __device__ __align__(16) uint32_t d_t_fn[1024] = {
 */

 #define AS_U32(addr)   *((uint32_t*)(addr))
+#define AS_U64(addr)   *((uint64_t*)(addr))
 #define AS_UINT2(addr) *((uint2*)(addr))
 #define AS_UINT4(addr) *((uint4*)(addr))
 #define AS_UL2(addr)   *((ulonglong2*)(addr))

crypto/cn_blake.cuh (2)

@@ -164,7 +164,7 @@ void cn_blake_final(blake_state * __restrict__ S, uint8_t * __restrict__ digest)
 }

 __device__
-void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint8_t * __restrict__ out)
+void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint32_t * out)
 {
 	blake_state bs;
 	blake_state *S = (blake_state *)&bs;

crypto/cn_groestl.cuh (9)

@@ -274,13 +274,14 @@ void cn_groestl_final(groestlHashState* __restrict__ ctx, BitSequence* __restri
 	for (i = GROESTL_SIZE512-hashbytelen; i < GROESTL_SIZE512; i++,j++) {
 		output[j] = s[i];
 	}
+#if 0
 	for (i = 0; i < GROESTL_COLS512; i++) {
 		ctx->chaining[i] = 0;
 	}
 	for (i = 0; i < GROESTL_SIZE512; i++) {
 		ctx->buffer[i] = 0;
 	}
+#endif
 }

 __device__
@@ -336,12 +337,12 @@ void cn_groestl_init(groestlHashState* ctx)
 }

 __device__
-void cn_groestl(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval)
+void cn_groestl(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval)
 {
 	DataLength databitlen = len << 3;
 	groestlHashState context;

 	cn_groestl_init(&context);
-	cn_groestl_update(&context, data, databitlen);
-	cn_groestl_final(&context, hashval);
+	cn_groestl_update(&context, (BitSequence*) data, databitlen);
+	cn_groestl_final(&context, (BitSequence*) hashval);
 }

crypto/cn_jh.cuh (20)

@@ -198,8 +198,9 @@ void cn_jh_update(jhHashState * __restrict__ state, const uint8_t * __restrict__
 		databitlen = 0;
 	}

-	if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) ) {
-		memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ) ;
+	if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) )
+	{
+		memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) );
 		index = 64-(state->datasize_in_buffer >> 3);
 		databitlen = databitlen - (512 - state->datasize_in_buffer);
 		cn_jh_F8(state);
@@ -222,7 +223,7 @@ void cn_jh_update(jhHashState * __restrict__ state, const uint8_t * __restrict__
 /* pad the message, process the padded block(s), truncate the hash value H to obtain the message digest */
 __device__
-void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashval)
+void cn_jh_final(jhHashState * __restrict__ state, uint8_t * __restrict__ hashval)
 {
 	unsigned int i;
 	//uint32_t *bufptr = (uint32_t *)state->buffer;
@@ -244,7 +245,7 @@ void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashv
 	} else {
-		/*set the rest of the bytes in the buffer to 0*/
+		/* set the rest of the bytes in the buffer to 0 */
 		if ( (state->datasize_in_buffer & 7) == 0) {
 			for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) state->buffer[i] = 0;
 		} else {
@@ -268,7 +269,8 @@ void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashv
 		cn_jh_F8(state);
 	}

-	MEMCPY4(hashval, ((unsigned char*)state->x) + 64 + 32, 8);
+	memcpy(hashval, ((unsigned char*)state->x) + 64 + 32, 32);
+	//MEMCPY4(hashval, ((unsigned char*)state->x) + 64 + 32, 8);
 }

 __device__
@@ -277,12 +279,12 @@ void cn_jh_init(jhHashState *state, int hashbitlen)
 	state->databitlen = 0;
 	state->datasize_in_buffer = 0;
 	state->hashbitlen = hashbitlen;
-	//memcpy(state->x, d_JH256_H0, 128);
-	MEMCPY8(state->x, d_JH256_H0, 128 / 8);
+	memcpy(state->x, d_JH256_H0, 128);
+	//MEMCPY8(state->x, d_JH256_H0, 128 / 8);
 }

 __device__
-void cn_jh256(const uint8_t * __restrict__ data, DataLength len, uint32_t * __restrict__ hashval)
+void cn_jh(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval)
 {
 	const int hashbitlen = 256;
 	DataLength databitlen = len << 3;
@@ -290,5 +292,5 @@ void cn_jh256(const uint8_t * __restrict__ data, DataLength len, uint32_t * __re
 	cn_jh_init(&state, hashbitlen);
 	cn_jh_update(&state, data, databitlen);
-	cn_jh_final(&state, hashval);
+	cn_jh_final(&state, (uint8_t*) hashval);
 }

crypto/cn_keccak.cuh (2)

@@ -195,7 +195,7 @@ void cn_keccakf(uint64_t *s)
 }

 __device__ __forceinline__
-void cn_keccak(const uint8_t * __restrict__ in, uint8_t * __restrict__ md)
+void cn_keccak(const uint32_t * __restrict__ in, uint64_t * __restrict__ md)
 {
 	uint64_t st[25];

crypto/cn_skein.cuh (22)

@@ -4,19 +4,15 @@ typedef unsigned int uint_t; /* native unsigned integer */

 #define SKEIN_256_STATE_WORDS ( 4)
 #define SKEIN_512_STATE_WORDS ( 8)
-#define SKEIN1024_STATE_WORDS (16)

 #define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS)
 #define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS)
-#define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS)

 #define SKEIN_256_STATE_BITS (64*SKEIN_256_STATE_WORDS)
 #define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS)
-#define SKEIN1024_STATE_BITS (64*SKEIN1024_STATE_WORDS)

 #define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS)
 #define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS)
-#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS)

 #define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32))
 #define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
@@ -119,7 +115,7 @@ typedef struct {
 } skeinHashState;

 __device__
-void cn_skein256_init(skeinHashState *state, size_t hashBitLen)
+void cn_skein_init(skeinHashState *state, size_t hashBitLen)
 {
 	const uint64_t SKEIN_512_IV_256[] =
 	{
@@ -258,14 +254,12 @@ void cn_skein_block(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restr
 }

 __device__
-void cn_skein256_update(skeinHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen)
+void cn_skein_update(skeinHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen)
 {
 	if ((databitlen & 7) == 0) {
 		cn_skein_block(&state->u.ctx_512, data, databitlen >> 3);
 	}
 	else {
 		size_t bCnt = (databitlen >> 3) + 1;
 		uint8_t b,mask;
@@ -280,7 +274,7 @@ void cn_skein256_update(skeinHashState * __restrict__ state, const uint8_t * __r
 }

 __device__
-void cn_skein256_final(skeinHashState * __restrict__ state, uint32_t * __restrict__ hashVal)
+void cn_skein_final(skeinHashState * __restrict__ state, uint8_t * __restrict__ hashVal)
 {
 	uint64_t X[SKEIN_512_STATE_WORDS];
 	Skein_512_Ctxt_t *ctx = (Skein_512_Ctxt_t *)&state->u.ctx_512;
@@ -305,13 +299,13 @@ void cn_skein256_final(skeinHashState * __restrict__ state, uint32_t * __restric
 		((uint64_t *)ctx->b)[0] = (uint64_t)i;
 		Skein_Start_New_Type(ctx, OUT_FINAL);
 		cn_skein_processblock(ctx, ctx->b, 1, sizeof(uint64_t));
-		memcpy(hashVal + (i*SKEIN_512_BLOCK_BYTES/sizeof(uint32_t)), ctx->X, n);
+		memcpy(hashVal + (i*SKEIN_512_BLOCK_BYTES), ctx->X, n);
 		memcpy(ctx->X, X, sizeof(X)); // restore the counter mode key for next time
 	}
 }

 __device__
-void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * __restrict__ hashval)
+void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval)
 {
 	int hashbitlen = 256;
 	DataLength databitlen = len << 3;
@@ -319,7 +313,7 @@ void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * __re
 	state.statebits = 64*SKEIN_512_STATE_WORDS;

-	cn_skein256_init(&state, hashbitlen);
-	cn_skein256_update(&state, data, databitlen);
-	cn_skein256_final(&state, hashval);
+	cn_skein_init(&state, hashbitlen);
+	cn_skein_update(&state, data, databitlen);
+	cn_skein_final(&state, (uint8_t*) hashval);
 }

crypto/cryptolight-core.cu (86)

@@ -36,7 +36,7 @@ void cryptolight_core_gpu_phase1(int threads, uint32_t * long_state, uint32_t *
 	if(thread < threads)
 	{
-		const int oft = thread * 52 + sub + 16; // not aligned 16!
+		const int oft = thread * 50 + sub + 16; // not aligned 16!
 		const int long_oft = (thread << LONG_SHL_IDX) + sub;
 		uint32_t __align__(16) key[40];
 		uint32_t __align__(16) text[4];
@@ -57,8 +57,10 @@ void cryptolight_core_gpu_phase1(int threads, uint32_t * long_state, uint32_t *
 	}
 }

+// --------------------------------------------------------------------------------------------------------------
+
 __global__
-void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
+void cryptolight_old_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
 {
 	__shared__ uint32_t __align__(16) sharedMemory[1024];
@@ -209,6 +211,70 @@ void cryptolight_old_gpu_phase2(const int threads, const int bfactor, const int
 #endif // __CUDA_ARCH__ >= 300
 }

+__device__ __forceinline__ void store_variant1(uint32_t* long_state)
+{
+	uint4* Z = (uint4*) long_state;
+	const uint32_t tmp = (Z->z >> 24); // __byte_perm(src, 0, 0x7773);
+	const uint32_t index = (((tmp >> 3) & 6u) | (tmp & 1u)) << 1;
+	Z->z = (Z->z & 0x00ffffffu) | ((tmp ^ ((0x75310u >> index) & 0x30u)) << 24);
+}
+
+#define MUL_SUM_XOR_DST_1(a,c,dst,tweak) { \
+	uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], ((uint64_t *)dst)[0], &hi) + ((uint64_t *)c)[1]; \
+	hi += ((uint64_t *)c)[0]; \
+	((uint64_t *)c)[0] = ((uint64_t *)dst)[0] ^ hi; \
+	((uint64_t *)c)[1] = ((uint64_t *)dst)[1] ^ lo; \
+	((uint64_t *)dst)[0] = hi; \
+	((uint64_t *)dst)[1] = lo ^ tweak; }
+
+__global__
+void cryptolight_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
+	uint32_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
+	uint64_t * __restrict__ d_tweak)
+{
+	__shared__ __align__(16) uint32_t sharedMemory[1024];
+	cn_aes_gpu_init(sharedMemory);
+	__syncthreads();
+
+	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
+	if (thread < threads)
+	{
+		const uint32_t batchsize = ITER >> (2 + bfactor);
+		const uint32_t start = partidx * batchsize;
+		const uint32_t end = start + batchsize;
+		const uint32_t longptr = thread << LONG_SHL_IDX;
+		uint32_t * long_state = &d_long_state[longptr];
+		uint64_t tweak = d_tweak[thread];
+
+		void * ctx_a = (void*)(&d_ctx_a[thread << 2]);
+		void * ctx_b = (void*)(&d_ctx_b[thread << 2]);
+		uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4
+		uint4 B = AS_UINT4(ctx_b);
+		uint32_t* a = (uint32_t*)&A;
+		uint32_t* b = (uint32_t*)&B;
+
+		for (int i = start; i < end; i++)
+		{
+			uint32_t c[4];
+			uint32_t j = (A.x >> 2) & E2I_MASK2;
+			cn_aes_single_round(sharedMemory, &long_state[j], c, a);
+			XOR_BLOCKS_DST(c, b, &long_state[j]);
+			store_variant1(&long_state[j]);
+			MUL_SUM_XOR_DST_1(c, a, &long_state[(c[0] >> 2) & E2I_MASK2], tweak);
+
+			j = (A.x >> 2) & E2I_MASK2;
+			cn_aes_single_round(sharedMemory, &long_state[j], b, a);
+			XOR_BLOCKS_DST(b, c, &long_state[j]);
+			store_variant1(&long_state[j]);
+			MUL_SUM_XOR_DST_1(b, a, &long_state[(b[0] >> 2) & E2I_MASK2], tweak);
+		}
+
+		if (bfactor) {
+			AS_UINT4(ctx_a) = A;
+			AS_UINT4(ctx_b) = B;
+		}
+	}
+}
+
 __global__
 void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint32_t * ctx_state, uint32_t * ctx_key2)
 {
@@ -222,7 +288,7 @@ void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint3
 	if(thread < threads)
 	{
 		const int long_oft = (thread << LONG_SHL_IDX) + sub;
-		const int oft = thread * 52 + sub + 16;
+		const int oft = thread * 50 + sub + 16;
 		uint32_t __align__(16) key[40];
 		uint32_t __align__(16) text[4];
@@ -251,8 +317,8 @@ void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint3
 extern int device_bfactor[MAX_GPUS];

 __host__
-void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint64_t *d_ctx_state,
-	uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
+void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state,
+	uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak)
 {
 	dim3 grid(blocks);
 	dim3 block(threads);
@@ -265,17 +331,21 @@ void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_
 	int i, partcount = 1 << bfactor;
 	int dev_id = device_map[thr_id];

-	cryptolight_core_gpu_phase1 <<<grid, block8 >>>(blocks*threads, d_long_state, (uint32_t*)d_ctx_state, d_ctx_key1);
+	cryptolight_core_gpu_phase1 <<<grid, block8 >>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key1);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 	if(partcount > 1) usleep(bsleep);

 	for(i = 0; i < partcount; i++)
 	{
-		cryptolight_core_gpu_phase2 <<<grid, (device_sm[dev_id] >= 300 ? block4 : block)>>>(blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
+		dim3 b = device_sm[dev_id] >= 300 ? block4 : block;
+		if (variant == 0)
+			cryptolight_old_gpu_phase2 <<<grid, b>>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
+		else
+			cryptolight_gpu_phase2 <<<grid, b>>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 		if(partcount > 1) usleep(bsleep);
 	}

-	cryptolight_core_gpu_phase3 <<<grid, block8 >>>(blocks*threads, d_long_state, (uint32_t*)d_ctx_state, d_ctx_key2);
+	cryptolight_core_gpu_phase3 <<<grid, block8 >>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key2);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 }

crypto/cryptolight-cpu.cpp (39)

@@ -22,6 +22,16 @@ struct cryptonight_ctx {
 	oaes_ctx* aes_ctx;
 };

+static void cryptolight_store_variant(void* state, int variant) {
+	if (variant == 1) {
+		// use variant 1 like monero since june 2018
+		const uint8_t tmp = ((const uint8_t*)(state))[11];
+		const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
+		((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30);
+	}
+}
+
 static void do_blake_hash(const void* input, int len, void* output)
 {
 	uchar hash[32];
@@ -132,14 +142,14 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, ui
 	((uint64_t*) dst)[0] += ((uint64_t*) c)[0];
 }

-static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
+static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak) {
 	uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
 	hi += ((uint64_t*) c)[0];

 	((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
 	((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
 	((uint64_t*) dst)[0] = hi;
-	((uint64_t*) dst)[1] = lo;
+	((uint64_t*) dst)[1] = variant ? lo ^ tweak : lo;
 }

 static void copy_block(uint8_t* dst, const uint8_t* src) {
@@ -157,13 +167,18 @@ static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
 	((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
 }

-static void cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx)
+static int cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx, const int variant)
 {
 	size_t i, j;

+	if (variant && len < 43)
+		return 0;
+
 	keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len);
 	ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
 	memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);

+	const uint64_t tweak = variant ? *((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0;
+
 	oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
 	for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
 #undef RND
@@ -186,14 +201,16 @@ static void cryptolight_hash_ctx(void* output, const void* input, const int len,
 		j = e2i(ctx->a);
 		aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a);
 		xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]);
-		mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)]);
+		cryptolight_store_variant(&ctx->long_state[j], variant);
+		mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)], variant, tweak);

 		j = e2i(ctx->a);
 		aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a);
 		xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]);
-		mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)]);
+		cryptolight_store_variant(&ctx->long_state[j], variant);
+		mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)], variant, tweak);
 	}

 	memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
@@ -219,11 +236,19 @@ static void cryptolight_hash_ctx(void* output, const void* input, const int len,
 	if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo);

 	oaes_free((OAES_CTX **) &ctx->aes_ctx);
+	return 1;
 }

-void cryptolight_hash(void* output, const void* input, int len)
+int cryptolight_hash_variant(void* output, const void* input, int len, int variant)
 {
 	struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
-	cryptolight_hash_ctx(output, input, len, ctx);
+	int rc = cryptolight_hash_ctx(output, input, len, ctx, variant);
 	free(ctx);
+	return rc;
 }
+
+void cryptolight_hash(void* output, const void* input)
+{
+	cryptolight_hash_variant(output, input, 76, 1);
+}
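For reference, the variant-1 scratchpad tweak added above rewrites only byte 11 of each stored 16-byte block: bits 0, 4 and 5 of that byte select a 4-bit slice of the packed table 0x75310, and the slice (masked to 0x30) is XORed back into the byte. A stand-alone sketch of the same transform on a single byte; the helper name and the main() harness are illustrative, not part of the commit:

#include <stdint.h>
#include <stdio.h>

/* Same math as cryptolight_store_variant() / store_variant1() above. */
static uint8_t tweak_byte_v1(uint8_t tmp)
{
    const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
    return tmp ^ ((0x75310 >> index) & 0x30);
}

int main(void)
{
    for (unsigned b = 0; b < 256; b += 0x55) /* a few sample bytes */
        printf("0x%02x -> 0x%02x\n", b, tweak_byte_v1((uint8_t) b));
    return 0;
}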

crypto/cryptolight.cu (30)

@@ -7,16 +7,17 @@ static __thread uint32_t cn_blocks = 32;
 static __thread uint32_t cn_threads = 16;

 static uint32_t *d_long_state[MAX_GPUS];
-static uint64_t *d_ctx_state[MAX_GPUS];
+static uint32_t *d_ctx_state[MAX_GPUS];
 static uint32_t *d_ctx_key1[MAX_GPUS];
 static uint32_t *d_ctx_key2[MAX_GPUS];
 static uint32_t *d_ctx_text[MAX_GPUS];
+static uint64_t *d_ctx_tweak[MAX_GPUS];
 static uint32_t *d_ctx_a[MAX_GPUS];
 static uint32_t *d_ctx_b[MAX_GPUS];

 static bool init[MAX_GPUS] = { 0 };

-extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant)
 {
 	int res = 0;
 	uint32_t throughput = 0;
@@ -26,6 +27,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 	uint32_t *nonceptr = (uint32_t*) (&pdata[39]);
 	const uint32_t first_nonce = *nonceptr;
 	uint32_t nonce = first_nonce;
+	int dev_id = device_map[thr_id];

 	if(opt_benchmark) {
 		ptarget[7] = 0x00ff;
@@ -33,6 +35,10 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 	if(!init[thr_id])
 	{
+		if (!device_config[thr_id] && strcmp(device_name[dev_id], "TITAN V") == 0) {
+			device_config[thr_id] = strdup("80x32");
+		}
+
 		if (device_config[thr_id]) {
 			sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads);
 			throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
@@ -63,11 +69,11 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 		}

 		const size_t alloc = MEMORY * throughput;
-		cryptonight_extra_cpu_init(thr_id, throughput);
+		cryptonight_extra_init(thr_id);

 		cudaMalloc(&d_long_state[thr_id], alloc);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
-		cudaMalloc(&d_ctx_state[thr_id], 26 * sizeof(uint64_t) * throughput);
+		cudaMalloc(&d_ctx_state[thr_id], 25 * sizeof(uint64_t) * throughput);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 		cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
@@ -79,6 +85,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 		cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
+		cudaMalloc(&d_ctx_tweak[thr_id], sizeof(uint64_t) * throughput);

 		init[thr_id] = true;
 	}
@@ -90,10 +97,10 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 		const uint32_t Htarg = ptarget[7];
 		uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };

-		cryptonight_extra_cpu_setData(thr_id, pdata, ptarget);
-		cryptonight_extra_cpu_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
-		cryptolight_core_cpu_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
-		cryptonight_extra_cpu_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
+		cryptonight_extra_setData(thr_id, pdata, ptarget);
+		cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
+		cryptolight_core_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
+		cryptonight_extra_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);

 		*hashes_done = nonce - first_nonce + throughput;
@@ -104,7 +111,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 			uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39);
 			memcpy(tempdata, pdata, 76);
 			*tempnonceptr = resNonces[0];
-			cryptolight_hash(vhash, tempdata, 76);
+			cryptolight_hash_variant(vhash, tempdata, 76, variant);
 			if(vhash[7] <= Htarg && fulltest(vhash, ptarget))
 			{
 				res = 1;
@@ -114,7 +121,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 			if(resNonces[1] != UINT32_MAX)
 			{
 				*tempnonceptr = resNonces[1];
-				cryptolight_hash(vhash, tempdata, 76);
+				cryptolight_hash_variant(vhash, tempdata, 76, variant);
 				if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
 					res++;
 					work->nonces[1] = resNonces[1];
@@ -157,10 +164,11 @@ void free_cryptolight(int thr_id)
 	cudaFree(d_ctx_key1[thr_id]);
 	cudaFree(d_ctx_key2[thr_id]);
 	cudaFree(d_ctx_text[thr_id]);
+	cudaFree(d_ctx_tweak[thr_id]);
 	cudaFree(d_ctx_a[thr_id]);
 	cudaFree(d_ctx_b[thr_id]);

-	cryptonight_extra_cpu_free(thr_id);
+	cryptonight_extra_free(thr_id);

 	cudaDeviceSynchronize();

crypto/cryptolight.h (13)

@@ -134,10 +134,11 @@ static inline void exit_if_cudaerror(int thr_id, const char *src, int line)
 		exit(1);
 	}
 }
-void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
-void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn);
-void cryptonight_extra_cpu_init(int thr_id, uint32_t threads);
-void cryptonight_extra_cpu_free(int thr_id);
-void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
-void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *nonce, uint64_t *d_ctx_state);
+
+void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
+
+void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget);
+void cryptonight_extra_init(int thr_id/*, uint32_t threads*/);
+void cryptonight_extra_free(int thr_id);
+void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
+void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces, uint32_t *d_ctx_state);
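For orientation, these renamed entry points are driven in a fixed order each scan round; below is a condensed sketch of the sequence used by scanhash_cryptolight() in the cryptolight.cu diff above (error checks and the result-validation path omitted; this is not a drop-in function):

/* once per work update: upload block header and target */
cryptonight_extra_setData(thr_id, pdata, ptarget);
/* per round: keccak state, AES round keys and the variant-1 tweak */
cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id],
    d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id],
    variant, d_ctx_tweak[thr_id]);
/* memory-hard main loop, split into partcount launches by bfactor */
cryptolight_core_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id],
    d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id],
    d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
/* final hash (blake/groestl/jh/skein) and target test */
cryptonight_extra_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);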

crypto/cryptonight-core.cu (236)

@@ -2,47 +2,55 @@
 #include <stdint.h>
 #include <string.h>
 #include <sys/time.h>
+#ifndef _WIN32
 #include <unistd.h>
+#endif
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
+#undef __shfl
+#define __shfl(var, srcLane, width) __shfl_sync(0xFFFFFFFFu, var, srcLane, width)
+#endif

 #include "cryptonight.h"

-#define LONG_SHL32 19 // 1<<19
+#define LONG_SHL32 19 // 1<<19 (uint32_t* index)
 #define LONG_SHL64 18 // 1<<18 (uint64_t* index)
 #define LONG_LOOPS32 0x80000U
-#define LONG_LOOPS64 0x40000U

 #include "cn_aes.cuh"

 __global__
-//__launch_bounds__(128, 9) // 56 registers
-void cryptonight_core_gpu_phase1(const uint32_t threads, uint64_t * long_state, uint64_t * const ctx_state, uint32_t * ctx_key1)
+void cryptonight_gpu_phase1(const uint32_t threads, uint32_t * __restrict__ d_long_state,
+	uint32_t * __restrict__ ctx_state, uint32_t * __restrict__ ctx_key1)
 {
-	__shared__ __align__(16) uint32_t sharedMemory[1024];
-	cn_aes_gpu_init(sharedMemory);
-	__syncthreads();
+	__shared__ uint32_t sharedMemory[1024];

 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3;
-	const uint32_t sub = (threadIdx.x & 7) << 1; // 0 2 .. 14

 	if(thread < threads)
 	{
-		const uint32_t long_oft = (thread << LONG_SHL64) + sub;
-		const uint32_t* ctx_key = &ctx_key1[thread * 40U];
-		uint4 keys[10];
-		#pragma unroll 10 // load 160 bytes
-		for (int i = 0; i < 10; i ++)
-			keys[i] = AS_UINT4(&ctx_key[i*4]);
-
-		uint4 text = AS_UINT4(&ctx_state[thread * 26U + sub + 8U]);
-
-		for (uint32_t i = 0; i < LONG_LOOPS64; i += 16U) {
-			cn_aes_pseudo_round_mut_uint4(sharedMemory, text, keys);
-			AS_UINT4(&long_state[long_oft + i]) = text;
+		cn_aes_gpu_init(sharedMemory);
+		__syncthreads();
+
+		const uint32_t sub = (threadIdx.x & 0x7U) << 2;
+		uint32_t *longstate = &d_long_state[(thread << LONG_SHL32) + sub];
+		uint32_t __align__(8) key[40];
+		MEMCPY8(key, &ctx_key1[thread * 40U], 20);
+		uint32_t __align__(8) text[4];
+		MEMCPY8(text, &ctx_state[thread * 50U + sub + 16U], 2);
+
+		for(int i = 0; i < LONG_LOOPS32; i += 32)
+		{
+			cn_aes_pseudo_round_mut(sharedMemory, text, key);
+			MEMCPY8(&longstate[i], text, 2);
 		}
 	}
 }

+// --------------------------------------------------------------------------------------------------------------
+
 __device__ __forceinline__ ulonglong2 cuda_mul128(const uint64_t multiplier, const uint64_t multiplicand)
 {
 	ulonglong2 product;
@@ -59,8 +67,7 @@ static __forceinline__ __device__ ulonglong2 operator ^ (const ulonglong2 &a, co
 	return make_ulonglong2(a.x ^ b.x, a.y ^ b.y);
 }

-#undef MUL_SUM_XOR_DST
-__device__ __forceinline__ void MUL_SUM_XOR_DST(const uint64_t m, uint4 &a, void* far_dst)
+__device__ __forceinline__ void MUL_SUM_XOR_DST_0(const uint64_t m, uint4 &a, void* far_dst)
 {
 	ulonglong2 d = AS_UL2(far_dst);
 	ulonglong2 p = cuda_mul128(m, d.x);
@@ -73,8 +80,8 @@ __global__
 #if __CUDA_ARCH__ >= 500
 //__launch_bounds__(128,12) /* force 40 regs to allow -l ...x32 */
 #endif
-void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor, const uint32_t partidx,
-	uint64_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
+void cryptonight_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
+	uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b)
 {
 	__shared__ __align__(16) uint32_t sharedMemory[1024];
 	cn_aes_gpu_init(sharedMemory);
@@ -84,7 +91,7 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor,
 	if (thread < threads)
 	{
-		const uint32_t batchsize = ITER >> (2U + bfactor);
+		const uint32_t batchsize = ITER >> (2 + bfactor);
 		const uint32_t start = partidx * batchsize;
 		const uint32_t end = start + batchsize;
@@ -101,12 +108,12 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor,
 			uint32_t j = (A.x & E2I_MASK) >> 3;
 			cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C);
 			AS_UINT4(&long_state[j]) = C ^ B; // st.global.u32.v4
-			MUL_SUM_XOR_DST((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3]);
+			MUL_SUM_XOR_DST_0((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3]);

 			j = (A.x & E2I_MASK) >> 3;
 			cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B);
 			AS_UINT4(&long_state[j]) = C ^ B;
-			MUL_SUM_XOR_DST((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3]);
+			MUL_SUM_XOR_DST_0((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3]);
 		}

 		if (bfactor) {
@@ -116,71 +123,194 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor,
 	}
 }

+// --------------------------------------------------------------------------------------------------------------
+
+__device__ __forceinline__ void store_variant1(uint64_t* long_state, uint4 Z)
+{
+	const uint32_t tmp = (Z.z >> 24); // __byte_perm(src, 0, 0x7773);
+	const uint32_t index = (((tmp >> 3) & 6u) | (tmp & 1u)) << 1;
+	Z.z = (Z.z & 0x00ffffffu) | ((tmp ^ ((0x75310u >> index) & 0x30u)) << 24);
+	AS_UINT4(long_state) = Z;
+}
+
+__device__ __forceinline__ void store_variant2(uint64_t* long_state, uint4 Z)
+{
+	const uint32_t tmp = (Z.z >> 24); // __byte_perm(src, 0, 0x7773);
+	const uint32_t index = (((tmp >> 4) & 6u) | (tmp & 1u)) << 1;
+	Z.z = (Z.z & 0x00ffffffu) | ((tmp ^ ((0x75312u >> index) & 0x30u)) << 24);
+	AS_UINT4(long_state) = Z;
+}
+
+__device__ __forceinline__ void MUL_SUM_XOR_DST_1(const uint64_t m, uint4 &a, void* far_dst, uint64_t tweak)
+{
+	ulonglong2 d = AS_UL2(far_dst);
+	ulonglong2 p = cuda_mul128(m, d.x);
+	p += AS_UL2(&a);
+	AS_UL2(&a) = p ^ d;
+	p.y = p.y ^ tweak;
+	AS_UL2(far_dst) = p;
+}
+
+__global__
+void monero_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
+	uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
+	uint64_t * __restrict__ d_tweak)
+{
+	__shared__ __align__(16) uint32_t sharedMemory[1024];
+	cn_aes_gpu_init(sharedMemory);
+	__syncthreads();
+
+	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
+	if (thread < threads)
+	{
+		const uint32_t batchsize = ITER >> (2 + bfactor);
+		const uint32_t start = partidx * batchsize;
+		const uint32_t end = start + batchsize;
+		uint64_t tweak = d_tweak[thread];
+
+		void * ctx_a = (void*)(&d_ctx_a[thread << 2]);
+		void * ctx_b = (void*)(&d_ctx_b[thread << 2]);
+		uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4
+		uint4 B = AS_UINT4(ctx_b);
+
+		uint64_t * long_state = &d_long_state[thread << LONG_SHL64];
+		for (int i = start; i < end; i++) // end = 262144
+		{
+			uint4 C;
+			uint32_t j = (A.x & E2I_MASK) >> 3;
+			cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C);
+			store_variant1(&long_state[j], C ^ B); // st.global
+			MUL_SUM_XOR_DST_1((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3], tweak);
+
+			j = (A.x & E2I_MASK) >> 3;
+			cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B);
+			store_variant1(&long_state[j], C ^ B);
+			MUL_SUM_XOR_DST_1((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3], tweak);
+		}
+		if (bfactor) {
+			AS_UINT4(ctx_a) = A;
+			AS_UINT4(ctx_b) = B;
+		}
+	}
+}
+
+// --------------------------------------------------------------------------------------------------------------
+
+__global__
+void stellite_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
+	uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
+	uint64_t * __restrict__ d_tweak)
+{
+	__shared__ __align__(16) uint32_t sharedMemory[1024];
+	cn_aes_gpu_init(sharedMemory);
+	__syncthreads();
+
+	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
+	if (thread < threads)
+	{
+		const uint32_t batchsize = ITER >> (2 + bfactor);
+		const uint32_t start = partidx * batchsize;
+		const uint32_t end = start + batchsize;
+		uint64_t tweak = d_tweak[thread];
+
+		void * ctx_a = (void*)(&d_ctx_a[thread << 2]);
+		void * ctx_b = (void*)(&d_ctx_b[thread << 2]);
+		uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4
+		uint4 B = AS_UINT4(ctx_b);
+
+		uint64_t * long_state = &d_long_state[thread << LONG_SHL64];
+		for (int i = start; i < end; i++) // end = 262144
+		{
+			uint4 C;
+			uint32_t j = (A.x & E2I_MASK) >> 3;
+			cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C);
+			store_variant2(&long_state[j], C ^ B); // st.global
+			MUL_SUM_XOR_DST_1((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3], tweak);

+			j = (A.x & E2I_MASK) >> 3;
+			cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B);
+			store_variant2(&long_state[j], C ^ B);
+			MUL_SUM_XOR_DST_1((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3], tweak);
+		}
+		if (bfactor) {
+			AS_UINT4(ctx_a) = A;
+			AS_UINT4(ctx_b) = B;
+		}
+	}
+}
+
+// --------------------------------------------------------------------------------------------------------------
+
 __global__
-void cryptonight_core_gpu_phase3(const uint32_t threads, const uint64_t * long_state, uint64_t * ctx_state, uint32_t * __restrict__ ctx_key2)
+void cryptonight_gpu_phase3(const uint32_t threads, const uint32_t * __restrict__ d_long_state,
+	uint32_t * __restrict__ d_ctx_state, const uint32_t * __restrict__ d_ctx_key2)
 {
-	__shared__ __align__(16) uint32_t sharedMemory[1024];
+	__shared__ uint32_t sharedMemory[1024];
 	cn_aes_gpu_init(sharedMemory);
 	__syncthreads();

-	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3U;
-	const uint32_t sub = (threadIdx.x & 7U) << 1U;
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3;

 	if(thread < threads)
 	{
-		const uint32_t long_oft = (thread << LONG_SHL64) + sub;
-		const uint32_t st_oft = (thread * 26U) + sub + 8U;
-
-		uint4 key[10];
-		const uint32_t* ctx_key = &ctx_key2[thread * 40U];
-		#pragma unroll 10 // 160 bytes
-		for (int i = 0; i < 10; i++)
-			key[i] = AS_UINT4(&ctx_key[i*4U]);
-
-		uint4 text = AS_UINT4(&ctx_state[st_oft]);
-
-		for(uint32_t i = 0; i < LONG_LOOPS64; i += 16U)
-		{
-			uint4 st = AS_UINT4(&long_state[long_oft + i]);
-			text = text ^ st;
-			cn_aes_pseudo_round_mut_uint4(sharedMemory, text, key);
-		}
-
-		AS_UINT4(&ctx_state[st_oft]) = text;
+		const int sub = (threadIdx.x & 7) << 2;
+		const uint32_t *longstate = &d_long_state[(thread << LONG_SHL32) + sub];
+		uint32_t key[40], text[4];
+		MEMCPY8(key, d_ctx_key2 + thread * 40, 20);
+		MEMCPY8(text, d_ctx_state + thread * 50 + sub + 16, 2);
+
+		for(int i = 0; i < LONG_LOOPS32; i += 32)
+		{
+			#pragma unroll
+			for(int j = 0; j < 4; ++j)
+				text[j] ^= longstate[i + j];
+
+			cn_aes_pseudo_round_mut(sharedMemory, text, key);
+		}
+
+		MEMCPY8(d_ctx_state + thread * 50 + sub + 16, text, 2);
 	}
 }

+// --------------------------------------------------------------------------------------------------------------
+
 extern int device_bfactor[MAX_GPUS];

 __host__
-void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint64_t *d_long_state, uint64_t *d_ctx_state,
-	uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
+void cryptonight_core_cuda(int thr_id, uint32_t blocks, uint32_t threads, uint64_t *d_long_state, uint32_t *d_ctx_state,
+	uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak)
 {
 	dim3 grid(blocks);
 	dim3 block(threads);
-	//dim3 block2(threads << 1);
 	dim3 block4(threads << 2);
 	dim3 block8(threads << 3);

-	const uint32_t bfactor = (uint32_t) device_bfactor[thr_id];
-	const uint32_t partcount = 1 << bfactor;
+	const uint16_t bfactor = (uint16_t) device_bfactor[thr_id];
+	const uint32_t partcount = 1U << bfactor;
 	const uint32_t throughput = (uint32_t) (blocks*threads);
 	const int bsleep = bfactor ? 100 : 0;
 	const int dev_id = device_map[thr_id];

-	cryptonight_core_gpu_phase1 <<<grid, block8>>> (throughput, d_long_state, d_ctx_state, d_ctx_key1);
+	cryptonight_gpu_phase1 <<<grid, block8>>> (throughput, (uint32_t*) d_long_state, d_ctx_state, d_ctx_key1);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 	if(partcount > 1) usleep(bsleep);

 	for (uint32_t i = 0; i < partcount; i++)
 	{
 		dim3 b = device_sm[dev_id] >= 300 ? block4 : block;
-		cryptonight_core_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
+		if (variant == 0)
+			cryptonight_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
+		else if (variant == 1 || cryptonight_fork == 8)
+			monero_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
+		else if (variant == 2 && cryptonight_fork == 3)
+			stellite_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 		if(partcount > 1) usleep(bsleep);
 	}

-	cryptonight_core_gpu_phase3 <<<grid, block8>>> (throughput, d_long_state, d_ctx_state, d_ctx_key2);
+	//cudaDeviceSynchronize();
+	//exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
+	cryptonight_gpu_phase3 <<<grid, block8>>> (throughput, (uint32_t*) d_long_state, d_ctx_state, d_ctx_key2);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 }

crypto/cryptonight-cpu.cpp (62)

@ -12,6 +12,20 @@ extern "C" {
#include "cpu/c_keccak.h" #include "cpu/c_keccak.h"
} }
static void cryptonight_store_variant(void* state, int variant) {
if (variant == 1 || cryptonight_fork == 8) {
// monero, and graft ?
const uint8_t tmp = ((const uint8_t*)(state))[11];
const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30);
} else if (variant == 2 && cryptonight_fork == 3) {
// stellite
const uint8_t tmp = ((const uint8_t*)(state))[11];
const uint8_t index = (((tmp >> 4) & 6) | (tmp & 1)) << 1;
((uint8_t*)(state))[11] = tmp ^ ((0x75312 >> index) & 0x30);
}
}
struct cryptonight_ctx { struct cryptonight_ctx {
uint8_t long_state[MEMORY]; uint8_t long_state[MEMORY];
union cn_slow_hash_state state; union cn_slow_hash_state state;
@ -130,14 +144,14 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, ui
((uint64_t*) dst)[0] += ((uint64_t*) c)[0]; ((uint64_t*) dst)[0] += ((uint64_t*) c)[0];
} }
static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) { static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak1_2) {
uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1]; uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
hi += ((uint64_t*) c)[0]; hi += ((uint64_t*) c)[0];
((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi; ((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo; ((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
((uint64_t*) dst)[0] = hi; ((uint64_t*) dst)[0] = hi;
((uint64_t*) dst)[1] = lo; ((uint64_t*) dst)[1] = variant ? lo ^ tweak1_2 : lo;
} }
static void copy_block(uint8_t* dst, const uint8_t* src) { static void copy_block(uint8_t* dst, const uint8_t* src) {
@ -155,13 +169,18 @@ static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1]; ((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
} }
static void cryptonight_hash_ctx(void* output, const void* input, size_t len, struct cryptonight_ctx* ctx) static int cryptonight_hash_ctx(void* output, const void* input, const size_t len, struct cryptonight_ctx* ctx, const int variant)
{ {
size_t i, j; size_t i, j;
if (variant && len < 43)
return 0;
keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len); keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len);
ctx->aes_ctx = (oaes_ctx*) oaes_alloc(); ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
const uint64_t tweak1_2 = variant ? *((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0;
oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE); oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) { for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
#undef RND #undef RND
@ -184,14 +203,16 @@ static void cryptonight_hash_ctx(void* output, const void* input, size_t len, st
j = e2i(ctx->a) * AES_BLOCK_SIZE; j = e2i(ctx->a) * AES_BLOCK_SIZE;
aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a); aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a);
xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]); xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]);
cryptonight_store_variant(&ctx->long_state[j], variant);
mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE]); mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE], variant, tweak1_2);
j = e2i(ctx->a) * AES_BLOCK_SIZE; j = e2i(ctx->a) * AES_BLOCK_SIZE;
aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a); aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a);
xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]); xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]);
cryptonight_store_variant(&ctx->long_state[j], variant);
mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE]); mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE], variant, tweak1_2);
} }
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
@ -217,11 +238,38 @@ static void cryptonight_hash_ctx(void* output, const void* input, size_t len, st
if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo); if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo);
oaes_free((OAES_CTX **) &ctx->aes_ctx); oaes_free((OAES_CTX **) &ctx->aes_ctx);
return 1;
} }
void cryptonight_hash(void* output, const void* input, size_t len) int cryptonight_hash_variant(void* output, const void* input, size_t len, int variant)
{ {
struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx)); struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
cryptonight_hash_ctx(output, input, len, ctx); int rc = cryptonight_hash_ctx(output, input, len, ctx, variant);
free(ctx); free(ctx);
return rc;
}
void cryptonight_hash(void* output, const void* input)
{
cryptonight_fork = 1;
cryptonight_hash_variant(output, input, 76, 0);
}
void graft_hash(void* output, const void* input)
{
cryptonight_fork = 8;
cryptonight_hash_variant(output, input, 76, 1);
}
void monero_hash(void* output, const void* input)
{
cryptonight_fork = 7;
cryptonight_hash_variant(output, input, 76, 1);
} }
void stellite_hash(void* output, const void* input)
{
cryptonight_fork = 3;
cryptonight_hash_variant(output, input, 76, 2);
}
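The tweak1_2 value consumed by these wrappers is computed once per hash from the variant-1 nonce extension: the 8 input bytes at offsets 35..42 (hence the len < 43 guard above), XORed with word 24 of the keccak state. A minimal CPU-side sketch; the helper name is illustrative, and memcpy stands in for the unaligned pointer cast used in the commit:

#include <stdint.h>
#include <string.h>

/* Mirrors: tweak1_2 = *((uint64_t*)(input + 35)) ^ ctx->state.hs.w[24] */
static uint64_t make_tweak1_2(const uint8_t *input, size_t len, uint64_t keccak_w24)
{
    uint64_t t = 0;
    if (len < 43) return 0;            /* variant 1 needs >= 43 input bytes */
    memcpy(&t, input + 35, sizeof(t)); /* offset 35 is not 8-byte aligned   */
    return t ^ keccak_w24;
}

The GPU path in the cryptonight-extra.cu diff below assembles the same bytes from 32-bit words with __byte_perm(input[8], input[9], 0x6543) and the following word pair, since offset 35 falls in the middle of a word.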

crypto/cryptonight-extra.cu (175)

@ -7,15 +7,15 @@
#include <miner.h> #include <miner.h>
#include <cuda_helper.h> #include <cuda_helper.h>
#include "cryptonight.h"
typedef uint8_t BitSequence; #include "cryptonight.h"
typedef uint64_t DataLength;
static uint32_t *d_input[MAX_GPUS] = { 0 }; static uint32_t *d_input[MAX_GPUS];
static uint32_t *d_target[MAX_GPUS]; static uint32_t *d_target[MAX_GPUS];
static uint32_t *d_result[MAX_GPUS]; static uint32_t *d_result[MAX_GPUS];
typedef uint8_t BitSequence;
typedef uint32_t DataLength;
#include "cn_keccak.cuh" #include "cn_keccak.cuh"
#include "cn_blake.cuh" #include "cn_blake.cuh"
#include "cn_groestl.cuh" #include "cn_groestl.cuh"
@@ -44,13 +44,11 @@ __constant__ uint8_t d_sub_byte[16][16] = {
 __device__ __forceinline__
 void cryptonight_aes_set_key(uint32_t * __restrict__ key, const uint32_t * __restrict__ data)
 {
-	const uint32_t aes_gf[] = {
+	const uint32_t aes_gf[10] = {
 		0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
 	};
 
-	MEMSET4(key, 0, 40);
 	MEMCPY4(key, data, 8);
 #pragma unroll
 	for(int i = 8; i < 40; i++)
 	{
@@ -74,15 +72,14 @@ void cryptonight_aes_set_key(uint32_t * __restrict__ key, const uint32_t * __restrict__ data)
 }
 
 __global__
-void cryptonight_extra_gpu_prepare(const uint32_t threads, uint32_t * __restrict__ d_input, uint32_t startNonce,
-	uint64_t * d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
-	uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2)
+void cryptonight_extra_gpu_prepare(const uint32_t threads, const uint32_t * __restrict__ d_input, uint32_t startNonce,
+	uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
+	uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2, int variant, uint64_t * d_ctx_tweak)
 {
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if(thread < threads)
 	{
-		uint32_t ctx_state[50];
+		uint64_t ctx_state[25];
 		uint32_t ctx_a[4];
 		uint32_t ctx_b[4];
 		uint32_t ctx_key1[40];
@@ -90,92 +87,62 @@ void cryptonight_extra_gpu_prepare(const uint32_t threads, uint32_t * __restrict__ d_input, uint32_t startNonce,
 		uint32_t input[19];
 
 		MEMCPY4(input, d_input, 19);
-		*((uint32_t *)(((char *)input) + 39)) = startNonce + thread;
-		cn_keccak((uint8_t *)input, (uint8_t *)ctx_state);
-		cryptonight_aes_set_key(ctx_key1, ctx_state);
-		cryptonight_aes_set_key(ctx_key2, ctx_state + 8);
-		XOR_BLOCKS_DST(ctx_state, ctx_state + 8, ctx_a);
-		XOR_BLOCKS_DST(ctx_state + 4, ctx_state + 12, ctx_b);
-		MEMCPY8(&d_ctx_state[thread * 26], ctx_state, 25);
-		MEMCPY4(d_ctx_a + thread * 4, ctx_a, 4);
-		MEMCPY4(d_ctx_b + thread * 4, ctx_b, 4);
-		MEMCPY4(d_ctx_key1 + thread * 40, ctx_key1, 40);
-		MEMCPY4(d_ctx_key2 + thread * 40, ctx_key2, 40);
-	}
-}
-
-__global__
-void cryptonight_extra_gpu_keccak(uint32_t threads, uint32_t * d_ctx_state)
-{
-	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if(thread < threads)
-	{
-		uint64_t* ctx_state = (uint64_t*) (&d_ctx_state[thread * 52U]);
-		uint64_t state[25];
-		#pragma unroll
-		for(int i = 0; i < 25; i++)
-			state[i] = ctx_state[i];
-
-		cn_keccakf2(state);
-
-		// to reduce the final kernel stack frame, cut algos in 2 kernels
-		// ps: these 2 final kernels are not important for the overall xmr hashrate (< 1%)
-		switch (((uint8_t*)state)[0] & 0x03)
-		{
-		case 0: {
-			uint32_t hash[8];
-			cn_blake((uint8_t*)state, 200, (uint8_t*)hash);
-			((uint32_t*)ctx_state)[0] = 0;
-			((uint32_t*)ctx_state)[6] = hash[6];
-			((uint32_t*)ctx_state)[7] = hash[7];
-			break;
-		}
-		case 1: {
-			uint32_t hash[8];
-			cn_groestl((BitSequence*)state, 200, (BitSequence*)hash);
-			((uint32_t*)ctx_state)[0] = 0;
-			((uint32_t*)ctx_state)[6] = hash[6];
-			((uint32_t*)ctx_state)[7] = hash[7];
-			break;
-		}
-		default: {
-			#pragma unroll
-			for(int i = 0; i < 25; i++)
-				ctx_state[i] = state[i];
-		}
-		}
+		uint32_t nonce = startNonce + thread;
+		*(((uint8_t *)input) + 39) = nonce & 0xff;
+		*(((uint8_t *)input) + 40) = (nonce >> 8) & 0xff;
+		*(((uint8_t *)input) + 41) = (nonce >> 16) & 0xff;
+		*(((uint8_t *)input) + 42) = (nonce >> 24) & 0xff;
+
+		cn_keccak(input, ctx_state);
+		MEMCPY4(&d_ctx_state[thread * 50U], ctx_state, 50);
+
+		cryptonight_aes_set_key(ctx_key1, (uint32_t*)(&ctx_state[0]));
+		cryptonight_aes_set_key(ctx_key2, (uint32_t*)(&ctx_state[4]));
+		MEMCPY4(&d_ctx_key1[thread * 40U], ctx_key1, 40);
+		MEMCPY4(&d_ctx_key2[thread * 40U], ctx_key2, 40);
+
+		XOR_BLOCKS_DST(&ctx_state[0], &ctx_state[4], ctx_a);
+		XOR_BLOCKS_DST(&ctx_state[2], &ctx_state[6], ctx_b);
+		MEMCPY4(&d_ctx_a[thread * 4U], ctx_a, 4);
+		MEMCPY4(&d_ctx_b[thread * 4U], ctx_b, 4);
+
+		if (variant) {
+			uint2 tweak = AS_UINT2(&ctx_state[24]);
+			//tweak.x ^= (input[8] >> 24) | (input[9] << 8);
+			tweak.x ^= __byte_perm(input[8], input[ 9], 0x6543);
+			tweak.y ^= __byte_perm(input[9], input[10], 0x6543);
+			MEMCPY4(&d_ctx_tweak[thread], &tweak, 2);
+		}
 	}
 }
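
The variant branch above derives tweak1_2 the same way the CPU reference does: the last Keccak state word XORed with input bytes 35..42, the range that covers the nonce just written. A little-endian host-side restatement, where keccak1600() is a hypothetical stand-in for the absorb cn_keccak performs:

    uint64_t st[25];
    keccak1600((const uint8_t *)input, 76, st);        // hypothetical helper
    uint64_t tweak1_2;
    memcpy(&tweak1_2, (const uint8_t *)input + 35, 8); // bytes 35..42
    tweak1_2 ^= st[24];  // same value as AS_UINT2(&ctx_state[24]) in the kernel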
 __global__
-void cryptonight_extra_gpu_final(uint32_t threads, const uint32_t startNonce, uint64_t * __restrict__ d_ctx_state,
-	const uint32_t* d_target, uint32_t * resNonces)
+void cryptonight_extra_gpu_final(const uint32_t threads, uint32_t startNonce, const uint32_t * __restrict__ d_target,
+	uint32_t * __restrict__ resNonces, uint32_t * __restrict__ d_ctx_state)
 {
-	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
 	if(thread < threads)
 	{
-		uint64_t* const state = &d_ctx_state[thread * 26U];
+		uint32_t *ctx_state = &d_ctx_state[thread * 50U];
 		uint32_t hash[8];
-		switch(((uint8_t *)state)[0] & 0x03)
-		{
-		case 0: {
-			uint32_t* h32 = (uint32_t*)state;
-			hash[6] = h32[6];
-			hash[7] = h32[7];
-			break;
-		}
-		case 2: {
-			cn_jh256((uint8_t*)state, 200, hash);
-			break;
-		}
-		case 3: {
-			cn_skein((uint8_t*)state, 200, hash);
-			break;
-		}
-		}
+		uint32_t state[50];
+
+		#pragma unroll 25
+		for(int i = 0; i < 50; i+=2)
+			AS_UINT2(&state[i]) = AS_UINT2(&ctx_state[i]);
+
+		cn_keccakf2((uint64_t *)state);
+
+		int branch = ((uint8_t *)state)[0] & 0x03;
+		if(branch == 0)
+			cn_blake((const uint8_t *)state, 200, hash);
+		if(branch == 1)
+			cn_groestl((const uint8_t *)state, 200, hash);
+		if(branch == 2)
+			cn_jh((const uint8_t *)state, 200, hash);
+		if(branch == 3)
+			cn_skein((const uint8_t *)state, 200, hash);
+
 		if(hash[7] <= d_target[1] && hash[6] <= d_target[0])
 		{
@@ -188,55 +155,53 @@ void cryptonight_extra_gpu_final(uint32_t threads, const uint32_t startNonce, uint64_t * __restrict__ d_ctx_state,
 }
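
The two low bits of the re-permuted state select the 256-bit finalizer (0 = Blake, 1 = Groestl, 2 = JH, 3 = Skein), which replaces the old two-kernel split. The same dispatch in table form, sketched in plain C (a real CUDA build would need __device__ function pointers, and the cn_* signatures are assumed from the calls above):

    typedef void (*cn_final_t)(const uint8_t *, int, uint32_t *);
    const cn_final_t finalizers[4] = { cn_blake, cn_groestl, cn_jh, cn_skein };
    finalizers[((uint8_t *)state)[0] & 0x03]((const uint8_t *)state, 200, hash);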
 __host__
-void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *ptarget)
+void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget)
 {
 	uint32_t *pTargetIn = (uint32_t*) ptarget;
-	cudaMemcpy(d_input[thr_id], data, 19 * sizeof(uint32_t), cudaMemcpyHostToDevice);
-	cudaMemcpy(d_target[thr_id], &pTargetIn[6], 2*sizeof(uint32_t), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_input[thr_id], data, 20 * sizeof(uint32_t), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_target[thr_id], &pTargetIn[6], 2 * sizeof(uint32_t), cudaMemcpyHostToDevice);
+	cudaMemset(d_result[thr_id], 0xFF, 2 * sizeof(uint32_t));
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 }
 
 __host__
-void cryptonight_extra_cpu_init(int thr_id, uint32_t threads)
+void cryptonight_extra_init(int thr_id)
 {
-	cudaMalloc(&d_input[thr_id], 19 * sizeof(uint32_t));
-	cudaMalloc(&d_target[thr_id], 2*sizeof(uint32_t));
-	cudaMalloc(&d_result[thr_id], 2*sizeof(uint32_t));
+	cudaMalloc(&d_input[thr_id], 20 * sizeof(uint32_t));
+	cudaMalloc(&d_target[thr_id], 2 * sizeof(uint32_t));
+	cudaMalloc(&d_result[thr_id], 2 * sizeof(uint32_t));
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 }
 
 __host__
-void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
+void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak)
 {
 	uint32_t threadsperblock = 128;
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 
-	cryptonight_extra_gpu_prepare <<<grid, block >>> (threads, d_input[thr_id], startNonce, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1, d_ctx_key2);
+	cryptonight_extra_gpu_prepare <<<grid, block>>> (threads, d_input[thr_id], startNonce, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1, d_ctx_key2, variant, d_ctx_tweak);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 }
 
 __host__
-void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resnonce, uint64_t *d_ctx_state)
+void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces, uint32_t *d_ctx_state)
 {
 	uint32_t threadsperblock = 128;
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 
-	cudaMemset(d_result[thr_id], 0xFF, 2*sizeof(uint32_t));
-	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
-	cryptonight_extra_gpu_keccak <<<grid, block >>> (threads, (uint32_t*)d_ctx_state);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
-	cryptonight_extra_gpu_final <<<grid, block >>> (threads, startNonce, d_ctx_state, d_target[thr_id], d_result[thr_id]);
+	cryptonight_extra_gpu_final <<<grid, block>>> (threads, startNonce, d_target[thr_id], d_result[thr_id], d_ctx_state);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
-	cudaMemcpy(resnonce, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	cudaMemcpy(resNonces, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 }
 
 __host__
-void cryptonight_extra_cpu_free(int thr_id)
+void cryptonight_extra_free(int thr_id)
 {
 	if (d_input[thr_id]) {
 		cudaFree(d_input[thr_id]);
@@ -244,4 +209,4 @@ void cryptonight_extra_cpu_free(int thr_id)
 		cudaFree(d_result[thr_id]);
 		d_input[thr_id] = NULL;
 	}
 }

36
crypto/cryptonight.cu

@@ -12,16 +12,17 @@ static __thread bool gpu_init_shown = false;
 	gpulog(p, thr, fmt, ##__VA_ARGS__)
 
 static uint64_t *d_long_state[MAX_GPUS];
-static uint64_t *d_ctx_state[MAX_GPUS];
+static uint32_t *d_ctx_state[MAX_GPUS];
 static uint32_t *d_ctx_key1[MAX_GPUS];
 static uint32_t *d_ctx_key2[MAX_GPUS];
 static uint32_t *d_ctx_text[MAX_GPUS];
+static uint64_t *d_ctx_tweak[MAX_GPUS];
 static uint32_t *d_ctx_a[MAX_GPUS];
 static uint32_t *d_ctx_b[MAX_GPUS];
 static bool init[MAX_GPUS] = { 0 };
 
-extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant)
 {
 	int res = 0;
 	uint32_t throughput = 0;
@@ -49,6 +50,10 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 		gpulog_init(LOG_INFO, thr_id, "%s, %d MB available, %hd SMX", device_name[dev_id],
 			mem, device_mpcount[dev_id]);
 
+		if (!device_config[thr_id] && strcmp(device_name[dev_id], "TITAN V") == 0) {
+			device_config[thr_id] = strdup("80x24");
+		}
+
 		if (device_config[thr_id]) {
 			int res = sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads);
 			throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
@@ -70,7 +75,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 			exit(1);
 		}
 
-		cudaSetDevice(device_map[thr_id]);
+		cudaSetDevice(dev_id);
 		if (opt_cudaschedule == -1 && gpu_threads == 1) {
 			cudaDeviceReset();
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
@@ -79,11 +84,11 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 		}
 
 		const size_t alloc = MEMORY * throughput;
-		cryptonight_extra_cpu_init(thr_id, throughput);
+		cryptonight_extra_init(thr_id);
 
 		cudaMalloc(&d_long_state[thr_id], alloc);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
-		cudaMalloc(&d_ctx_state[thr_id], 208 * throughput); // 52*4 (200 is not aligned 16)
+		cudaMalloc(&d_ctx_state[thr_id], 50 * sizeof(uint32_t) * throughput);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 		cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
@@ -95,6 +100,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 		cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
+		cudaMalloc(&d_ctx_tweak[thr_id], sizeof(uint64_t) * throughput);
+		exit_if_cudaerror(thr_id, __FILE__, __LINE__);
 
 		gpu_init_shown = true;
 		init[thr_id] = true;
@@ -107,10 +114,10 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 	const uint32_t Htarg = ptarget[7];
 	uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
 
-	cryptonight_extra_cpu_setData(thr_id, pdata, ptarget);
-	cryptonight_extra_cpu_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
-	cryptonight_core_cuda(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
-	cryptonight_extra_cpu_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
+	cryptonight_extra_setData(thr_id, pdata, ptarget);
+	cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
+	cryptonight_core_cuda(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
+	cryptonight_extra_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
 
 	*hashes_done = nonce - first_nonce + throughput;
@@ -121,8 +128,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 		uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39);
 		memcpy(tempdata, pdata, 76);
 		*tempnonceptr = resNonces[0];
-		cryptonight_hash(vhash, tempdata, 76);
-		if(vhash[7] <= Htarg && fulltest(vhash, ptarget))
+		const int rc = cryptonight_hash_variant(vhash, tempdata, 76, variant);
+		if(rc && (vhash[7] <= Htarg) && fulltest(vhash, ptarget))
 		{
 			res = 1;
 			work->nonces[0] = resNonces[0];
@@ -131,8 +138,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 			if(resNonces[1] != UINT32_MAX)
 			{
 				*tempnonceptr = resNonces[1];
-				cryptonight_hash(vhash, tempdata, 76);
-				if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+				const int rc = cryptonight_hash_variant(vhash, tempdata, 76, variant);
+				if(rc && (vhash[7] <= Htarg) && fulltest(vhash, ptarget)) {
 					res++;
 					work->nonces[1] = resNonces[1];
 				} else {
@@ -174,10 +181,11 @@ void free_cryptonight(int thr_id)
 	cudaFree(d_ctx_key1[thr_id]);
 	cudaFree(d_ctx_key2[thr_id]);
 	cudaFree(d_ctx_text[thr_id]);
+	cudaFree(d_ctx_tweak[thr_id]);
 	cudaFree(d_ctx_a[thr_id]);
 	cudaFree(d_ctx_b[thr_id]);
 
-	cryptonight_extra_cpu_free(thr_id);
+	cryptonight_extra_free(thr_id);
 
 	cudaDeviceSynchronize();
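
For sizing intuition: every GPU thread owns a full MEMORY = 2 MiB scratchpad in d_long_state, so the launch config bounds VRAM use almost directly. A rough budget for the TITAN V default above, as a sketch:

    // "80x24" => 80 * 24 = 1920 threads; scratchpads alone take
    // 1920 * 2 MiB = 3.75 GiB, plus the small per-thread ctx buffers.
    size_t scratch_bytes = (size_t) MEMORY * 80 * 24;  // 4,026,531,840 bytes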

13
crypto/cryptonight.h

@@ -20,7 +20,6 @@ struct uint3 blockDim;
 #define __umul64hi(a,b) a*b
 #endif
 
-
 #define MEMORY (1U << 21) // 2 MiB / 2097152 B
 #define ITER (1U << 20) // 1048576
 #define E2I_MASK 0x1FFFF0u
@@ -136,10 +135,10 @@ static inline void exit_if_cudaerror(int thr_id, const char *src, int line)
 		exit(1);
 	}
 }
 
-void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint64_t *d_long_state, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
+void cryptonight_core_cuda(int thr_id, uint32_t blocks, uint32_t threads, uint64_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
 
-void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn);
-void cryptonight_extra_cpu_init(int thr_id, uint32_t threads);
-void cryptonight_extra_cpu_free(int thr_id);
-void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
-void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *nonce, uint64_t *d_ctx_state);
+void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget);
+void cryptonight_extra_init(int thr_id);
+void cryptonight_extra_free(int thr_id);
+void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
+void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resnonce, uint32_t *d_ctx_state);
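
These constants fit together: the scratchpad holds MEMORY = 2^21 bytes, i.e. 2^17 AES blocks of 16 bytes, and E2I_MASK keeps any 32-bit word inside it as a 16-byte-aligned byte offset. A one-line check, with x a hypothetical address word:

    uint32_t off = x & E2I_MASK;  // 0 <= off < (1u << 21) and (off % 16) == 0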

10
crypto/xmr-rpc.cpp

@@ -550,18 +550,24 @@ bool rpc2_stratum_submit(struct pool_infos *pool, struct work *work)
 	}
 
 	else if (opt_algo == ALGO_CRYPTOLIGHT) {
+		int variant = 1;
 		uint32_t nonce = work->nonces[idnonce];
 		noncestr = bin2hex((unsigned char*) &nonce, 4);
 		last_found_nonce = nonce;
-		cryptolight_hash(hash, data, 76);
+		//if (cryptonight_fork > 1 && ((unsigned char*)work->data)[0] >= cryptonight_fork)
+		//	variant = ((unsigned char*)work->data)[0] - cryptonight_fork + 1;
+		cryptolight_hash_variant(hash, data, 76, variant);
 		work_set_target_ratio(work, (uint32_t*) hash);
 	}
 
 	else if (opt_algo == ALGO_CRYPTONIGHT) {
+		int variant = 0;
 		uint32_t nonce = work->nonces[idnonce];
 		noncestr = bin2hex((unsigned char*) &nonce, 4);
 		last_found_nonce = nonce;
-		cryptonight_hash(hash, data, 76);
+		if (cryptonight_fork > 1 && ((unsigned char*)work->data)[0] >= cryptonight_fork)
+			variant = ((unsigned char*)work->data)[0] - cryptonight_fork + 1;
+		cryptonight_hash_variant(hash, data, 76, variant);
 		work_set_target_ratio(work, (uint32_t*) hash);
 	}
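
Written out, the mapping takes the blob's first byte (the block major version) and the per-algo fork height: with cryptonight_fork = 7 (monero), a v7 block yields variant 1, a v8 block variant 2, and anything older falls back to variant 0. A standalone restatement of the logic above:

    int block_major = ((unsigned char *)work->data)[0];
    int variant = (cryptonight_fork > 1 && block_major >= cryptonight_fork)
                ? block_major - cryptonight_fork + 1 : 0;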

14
miner.h

@@ -279,8 +279,8 @@ extern int scanhash_blake256(int thr_id, struct work* work, uint32_t max_nonce,
 extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
-extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
-extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
+extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant);
+extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant);
 extern int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_equihash(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@@ -575,6 +575,8 @@ extern uint32_t device_plimit[MAX_GPUS];
 extern uint32_t gpus_intensity[MAX_GPUS];
 extern int opt_cudaschedule;
 
+extern int cryptonight_fork;
+
 // cuda.cpp
 int cuda_num_devices();
 void cuda_devicenames();
@@ -898,8 +900,12 @@ void blake2b_hash(void *output, const void *input);
 void blake2s_hash(void *output, const void *input);
 void bmw_hash(void *state, const void *input);
 void c11hash(void *output, const void *input);
-void cryptolight_hash(void* output, const void* input, int len);
-void cryptonight_hash(void* output, const void* input, size_t len);
+int cryptolight_hash_variant(void* output, const void* input, int len, int variant);
+void cryptolight_hash(void* output, const void* input);
+int cryptonight_hash_variant(void* output, const void* input, size_t len, int variant);
+void cryptonight_hash(void* output, const void* input);
+void monero_hash(void* output, const void* input);
+void stellite_hash(void* output, const void* input);
 void decred_hash(void *state, const void *input);
 void deephash(void *state, const void *input);
 void luffa_hash(void *state, const void *input);

10
util.cpp

@@ -2193,10 +2193,10 @@ void print_hash_tests(void)
 	c11hash(&hash[0], &buf[0]);
 	printpfx("c11", hash);
 
-	cryptolight_hash(&hash[0], &buf[0], 76);
+	cryptolight_hash(&hash[0], &buf[0]);
 	printpfx("cryptolight", hash);
 
-	cryptonight_hash(&hash[0], &buf[0], 76);
+	cryptonight_hash(&hash[0], &buf[0]);
 	printpfx("cryptonight", hash);
 
 	memset(buf, 0, 180);
@@ -2246,6 +2246,9 @@ void print_hash_tests(void)
 	lyra2Z_hash(&hash[0], &buf[0]);
 	printpfx("lyra2z", hash);
 
+	monero_hash(&hash[0], &buf[0]);
+	printpfx("monero", hash);
+
 	myriadhash(&hash[0], &buf[0]);
 	printpfx("myriad", hash);
 
@@ -2297,6 +2300,9 @@ void print_hash_tests(void)
 	skunk_hash(&hash[0], &buf[0]);
 	printpfx("skunk", hash);
 
+	stellite_hash(&hash[0], &buf[0]);
+	printpfx("stellite", hash);
+
 	s3hash(&hash[0], &buf[0]);
 	printpfx("S3", hash);
