#include <unistd.h>
#include "cryptonight.h"
#define LONG_SHL_IDX 19U
#define LONG_LOOPS32 0x80000U
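// per-thread scratchpad: LONG_LOOPS32 = 2^19 uint32_t words = 2 MB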
#include "cn_aes.cuh"
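// Phase 1: expand the Keccak state into the 2 MB scratchpad with AES
// pseudo-rounds; 8 threads cooperate per hash, 16 bytes (sub) each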
__global__
//__launch_bounds__(128, 9) // 56 registers
void cryptonight_core_gpu_phase1(const uint32_t threads, uint32_t * long_state, uint32_t * const ctx_state, uint32_t * ctx_key1)
{
__shared__ __align__(16) uint32_t sharedMemory[1024];
cn_aes_gpu_init(sharedMemory);
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3;
const uint32_t sub = (threadIdx.x & 7) << 2; // 0 4 8 ... 28
if(thread < threads)
{
const uint32_t long_oft = (thread << LONG_SHL_IDX) + sub;
ulonglong2 text = AS_UL2(&ctx_state[thread * 52U + sub + 16U]);
const uint32_t* ctx_key = &ctx_key1[thread * 40U];
uint32_t __align__(16) key[40]; // 16-byte aligned for the uint4 copies below
#pragma unroll 10 // copy 160 bytes
for (uint32_t i = 0; i < 40U; i += 4U)
AS_UINT4(&key[i]) = AS_UINT4(&ctx_key[i]);
__syncthreads();
for(uint32_t i = 0; i < LONG_LOOPS32; i += 32U) {
cn_aes_pseudo_round_mut(sharedMemory, (uint32_t*) &text, key);
AS_UL2(&long_state[long_oft + i]) = text;
}
}
}
static __forceinline__ __device__ ulonglong2 operator ^ (const ulonglong2 &a, const ulonglong2 &b) {
return make_ulonglong2(a.x ^ b.x, a.y ^ b.y);
}
static __forceinline__ __device__ uint4 operator ^ (const uint4 &a, const uint4 &b) {
return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w);
}
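// full 64x64 -> 128-bit multiply: product.x = high 64 bits, product.y = low 64 bits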
__device__ __forceinline__ ulonglong2 cuda_mul128(const uint64_t multiplier, const uint64_t multiplicand)
{
ulonglong2 product;
product.x = __umul64hi(multiplier, multiplicand);
product.y = multiplier * multiplicand;
return product;
}
static __forceinline__ __device__ void operator += (ulonglong2 &a, const ulonglong2 b) {
a.x += b.x; a.y += b.y;
}
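// one CryptoNight mul/add/xor step on 16-byte blocks:
// p = m * dst[0] + a; a = p ^ old *far_dst; *far_dst = p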
__device__ __forceinline__ void MUL_SUM_XOR_DST(const uint64_t m, uint4 &a, void* far_dst)
{
ulonglong2 d = AS_UL2(far_dst);
ulonglong2 p = cuda_mul128(m, d.x);
p += AS_UL2(&a);
AS_UL2(&a) = p ^ d;
AS_UL2(far_dst) = p;
}
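// Phase 2: the memory-hard main loop; one thread per hash, two AES+MUL
// half-rounds per iteration bouncing through the scratchpad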
__global__
#if __CUDA_ARCH__ >= 500
//__launch_bounds__(128,12) /* force 40 regs to allow -l ...x32 */
#endif
void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor, const uint32_t partidx,
uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
{
__shared__ __align__(16) uint32_t sharedMemory[1024];
cn_aes_gpu_init(sharedMemory);
__syncthreads();
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
if (thread < threads)
{
const uint32_t batchsize = ITER >> (2U + bfactor);
const uint32_t start = partidx * batchsize;
const uint32_t end = start + batchsize;
const uint32_t longptr = thread << LONG_SHL_IDX;
uint32_t * long_state = &d_long_state[longptr];
void * ctx_a = (void*)(&d_ctx_a[thread << 2U]);
void * ctx_b = (void*)(&d_ctx_b[thread << 2U]);
uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4
uint4 B = AS_UINT4(ctx_b);
for (uint32_t i = start; i < end; i++) // end = 262144
{
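// two half-rounds: AES single round on the scratchpad cell indexed by A,
// xor-store, then MUL_SUM_XOR_DST on a second cell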
uint4 C;
uint32_t j = (A.x >> 2) & E2I_MASK2;
cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C);
AS_UINT4(&long_state[j]) = C ^ B; // st.global.u32.v4
MUL_SUM_XOR_DST((AS_UL2(&C)).x, A, &long_state[(C.x >> 2U) & E2I_MASK2]);
j = (A.x >> 2) & E2I_MASK2;
cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B);
AS_UINT4(&long_state[j]) = C ^ B;
MUL_SUM_XOR_DST((AS_UL2(&B)).x, A, &long_state[(B.x >> 2U) & E2I_MASK2]);
}
if (bfactor) {
AS_UINT4(ctx_a) = A;
AS_UINT4(ctx_b) = B;
}
}
}
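// Phase 3: fold the scratchpad back into the state: xor each stripe into
// text, then run an AES pseudo-round (mirror of phase 1)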
__global__
void cryptonight_core_gpu_phase3(const uint32_t threads, const uint32_t * __restrict__ long_state, uint32_t * ctx_state, uint32_t * __restrict__ ctx_key2)
{
__shared__ __align__(16) uint32_t sharedMemory[1024];
cn_aes_gpu_init(sharedMemory);
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3U;
const uint32_t sub = (threadIdx.x & 7U) << 2U;
if(thread < threads)
{
const uint32_t long_oft = (thread << LONG_SHL_IDX) + sub;
const uint32_t st_oft = thread * 52U + sub + 16U;
ulonglong2 text = AS_UL2(&ctx_state[st_oft]);
uint32_t __align__(16) key[40]; // 16-byte aligned for the AS_UL2 copies below
const uint32_t* ctx_key = &ctx_key2[thread * 40U];
#pragma unroll 10
for (uint32_t i = 0; i < 40U; i += 4U)
AS_UL2(&key[i]) = AS_UL2(&ctx_key[i]);
__syncthreads(); // shared AES tables must be visible before the loop
for(uint32_t i = 0; i < LONG_LOOPS32; i += 32U)
{
ulonglong2 st = AS_UL2(&long_state[long_oft + i]);
text = text ^ st;
cn_aes_pseudo_round_mut(sharedMemory, (uint32_t*) (&text), key);
}
AS_UL2(&ctx_state[st_oft]) = text;
}
}
extern int device_bfactor[MAX_GPUS];
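// bfactor > 0 splits phase 2 into 2^bfactor shorter launches, sleeping
// 100 us in between, to keep kernel runtimes under display watchdog limits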
__host__
void cryptonight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint64_t *d_ctx_state,
uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
{
dim3 grid(blocks);
dim3 block(threads);
dim3 block2(threads << 1);
dim3 block4(threads << 2);
dim3 block8(threads << 3);
const uint32_t bfactor = (uint32_t) device_bfactor[thr_id];
const uint32_t partcount = 1 << bfactor;
const uint32_t throughput = (uint32_t) (blocks*threads);
const int bsleep = bfactor ? 100 : 0;
const int dev_id = device_map[thr_id];
int i;
cryptonight_core_gpu_phase1 <<<grid, block8, 4096>>> (throughput, d_long_state, (uint32_t*) d_ctx_state, d_ctx_key1);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
if(partcount > 1) usleep(bsleep);
for(i = 0; i < partcount; i++)
{
dim3 b = device_sm[dev_id] >= 300 ? block4 : block;
cryptonight_core_gpu_phase2 <<<grid, b, 4096>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
if(partcount > 1) usleep(bsleep);
}
cryptonight_core_gpu_phase3 <<<grid, block8, 4096>>> (throughput, d_long_state, (uint32_t*) d_ctx_state, d_ctx_key2);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}