@@ -2,7 +2,7 @@
 #define N_COLS 4
 #define WPOLY 0x011b
 
-static __constant__ uint32_t d_t_fn[1024] = {
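+// __align__(16) keeps the table on a 16-byte boundary so cn_aes_gpu_init()
+// below can copy it with uint4 (128-bit) vector loads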
+static __constant__ __align__(16) uint32_t d_t_fn[1024] = {
 	0xa56363c6U, 0x847c7cf8U, 0x997777eeU, 0x8d7b7bf6U, 0x0df2f2ffU, 0xbd6b6bd6U, 0xb16f6fdeU, 0x54c5c591U,
 	0x50303060U, 0x03010102U, 0xa96767ceU, 0x7d2b2b56U, 0x19fefee7U, 0x62d7d7b5U, 0xe6abab4dU, 0x9a7676ecU,
 	0x45caca8fU, 0x9d82821fU, 0x40c9c989U, 0x877d7dfaU, 0x15fafaefU, 0xeb5959b2U, 0xc947478eU, 0x0bf0f0fbU,
@@ -136,10 +136,13 @@ static __constant__ uint32_t d_t_fn[1024] = {
 	0x82c34141U, 0x29b09999U, 0x5a772d2dU, 0x1e110f0fU, 0x7bcbb0b0U, 0xa8fc5454U, 0x6dd6bbbbU, 0x2c3a1616U
 };
 
-#define t_fn0(x) (sharedMemory[ (x)])
-#define t_fn1(x) (sharedMemory[256 | (x)])
-#define t_fn2(x) (sharedMemory[512 | (x)])
-#define t_fn3(x) (sharedMemory[768 | (x)])
+#define AS_UINT2(addr) *((uint2*)(addr))
+#define AS_UINT4(addr) *((uint4*)(addr))
+
+#define t_fn0(x) (sharedMemory[x])
+#define t_fn1(x) (sharedMemory[0x100U | (x)])
+#define t_fn2(x) (sharedMemory[0x200U | (x)])
+#define t_fn3(x) (sharedMemory[0x300U | (x)])
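+// sharedMemory packs the four AES T-tables (256 words each) at offsets
+// 0x000, 0x100, 0x200 and 0x300; every index is <= 0xff, so OR-ing the
+// table base in is equivalent to adding it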
 
 #define round(shared, out, x, k) \
 	out[0] = (k)[0] ^ (t_fn0(x[0] & 0xff) ^ t_fn1((x[1] >> 8) & 0xff) ^ t_fn2((x[2] >> 16) & 0xff) ^ t_fn3((x[3] >> 24) & 0xff)); \
@@ -148,37 +151,72 @@ static __constant__ uint32_t d_t_fn[1024] = {
 	out[3] = (k)[3] ^ (t_fn0(x[3] & 0xff) ^ t_fn1((x[0] >> 8) & 0xff) ^ t_fn2((x[1] >> 16) & 0xff) ^ t_fn3((x[2] >> 24) & 0xff));
 
 #define round_u4(shared, out, in, k) \
-	out[0] = (k)[0] ^ t_fn0(in[0].x) ^ t_fn1(in[1].y) ^ t_fn2(in[2].z) ^ t_fn3(in[3].w); \
-	out[1] = (k)[1] ^ t_fn0(in[1].x) ^ t_fn1(in[2].y) ^ t_fn2(in[3].z) ^ t_fn3(in[0].w); \
-	out[2] = (k)[2] ^ t_fn0(in[2].x) ^ t_fn1(in[3].y) ^ t_fn2(in[0].z) ^ t_fn3(in[1].w); \
-	out[3] = (k)[3] ^ t_fn0(in[3].x) ^ t_fn1(in[0].y) ^ t_fn2(in[1].z) ^ t_fn3(in[2].w);
+	((uint32_t*)out)[0] = (k)[0] ^ t_fn0(in[0].x) ^ t_fn1(in[1].y) ^ t_fn2(in[2].z) ^ t_fn3(in[3].w); \
+	((uint32_t*)out)[1] = (k)[1] ^ t_fn0(in[1].x) ^ t_fn1(in[2].y) ^ t_fn2(in[3].z) ^ t_fn3(in[0].w); \
+	((uint32_t*)out)[2] = (k)[2] ^ t_fn0(in[2].x) ^ t_fn1(in[3].y) ^ t_fn2(in[0].z) ^ t_fn3(in[1].w); \
+	((uint32_t*)out)[3] = (k)[3] ^ t_fn0(in[3].x) ^ t_fn1(in[0].y) ^ t_fn2(in[1].z) ^ t_fn3(in[2].w);
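+// the (uint32_t*) cast lets the macro write into any 16-byte buffer, e.g. the
+// uchar4[4] scratch that the #if 0 variant of cn_aes_pseudo_round_mut passes as out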
+
+#ifdef __INTELLISENSE__
+#define __byte_perm(a,b,c) a
+#endif
+
+#define OFF8_0(x) (x & 0xFFu)
+#define OFF8_1(x) __byte_perm(x, 0x01, 0x5541)
+#define OFF8_2(x) __byte_perm(x, 0x02, 0x5542)
+#define OFF8_3(x) __byte_perm(x, 0x03, 0x5543)
+
+#define SHARED_0(x) sharedMemory[OFF8_0(x)]
+#define SHARED_1(x) sharedMemory[OFF8_1(x)]
+#define SHARED_2(x) sharedMemory[OFF8_2(x)]
+#define SHARED_3(x) sharedMemory[OFF8_3(x)]
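+// each selector nibble of __byte_perm(a, b, s) picks one byte from the pool
+// {b: bytes 7..4, a: bytes 3..0}; 0x5541 places byte 1 of x in result byte 0,
+// the constant 0x01 in result byte 1 and zeroes above, i.e.
+// OFF8_1(x) == ((x >> 8) & 0xFFu) | 0x100u -- for example OFF8_1(0xAABBCCDDu) == 0x1CCu.
+// the extract-shift-OR of the t_fn path thus collapses into a single PRMT instruction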
 
 __device__ __forceinline__
-void cn_aes_single_round(uint32_t * const sharedMemory, uint32_t * const in32, uint32_t * out, uint32_t * const expandedKey)
+void cn_aes_single_round(uint32_t * const sharedMemory, uint32_t * const in, uint32_t * out, uint32_t* expandedKey)
 {
-	uchar4* in = (uchar4*) in32;
-	out[0] = expandedKey[0] ^ t_fn0(in[0].x) ^ t_fn1(in[1].y) ^ t_fn2(in[2].z) ^ t_fn3(in[3].w);
-	out[1] = expandedKey[1] ^ t_fn0(in[1].x) ^ t_fn1(in[2].y) ^ t_fn2(in[3].z) ^ t_fn3(in[0].w);
-	out[2] = expandedKey[2] ^ t_fn0(in[2].x) ^ t_fn1(in[3].y) ^ t_fn2(in[0].z) ^ t_fn3(in[1].w);
-	out[3] = expandedKey[3] ^ t_fn0(in[3].x) ^ t_fn1(in[0].y) ^ t_fn2(in[1].z) ^ t_fn3(in[2].w);
+	//round(sharedMemory, out, in, expandedKey);
+	asm("// aes_single_round");
+	out[0] = expandedKey[0] ^ SHARED_0(in[0]) ^ SHARED_1(in[1]) ^ SHARED_2(in[2]) ^ SHARED_3(in[3]);
+	out[1] = expandedKey[1] ^ SHARED_0(in[1]) ^ SHARED_1(in[2]) ^ SHARED_2(in[3]) ^ SHARED_3(in[0]);
+	out[2] = expandedKey[2] ^ SHARED_0(in[2]) ^ SHARED_1(in[3]) ^ SHARED_2(in[0]) ^ SHARED_3(in[1]);
+	out[3] = expandedKey[3] ^ SHARED_0(in[3]) ^ SHARED_1(in[0]) ^ SHARED_2(in[1]) ^ SHARED_3(in[2]);
 }
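+
+// round() rewritten with the SHARED_* byte_perm lookups; out and in must not
+// alias, since in[] is re-read after out[0] has been written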
+#define round_perm(shared, out, in, k) \
+	out[0] = (k)[0] ^ SHARED_0(in[0]) ^ SHARED_1(in[1]) ^ SHARED_2(in[2]) ^ SHARED_3(in[3]); \
+	out[1] = (k)[1] ^ SHARED_0(in[1]) ^ SHARED_1(in[2]) ^ SHARED_2(in[3]) ^ SHARED_3(in[0]); \
+	out[2] = (k)[2] ^ SHARED_0(in[2]) ^ SHARED_1(in[3]) ^ SHARED_2(in[0]) ^ SHARED_3(in[1]); \
+	out[3] = (k)[3] ^ SHARED_0(in[3]) ^ SHARED_1(in[0]) ^ SHARED_2(in[1]) ^ SHARED_3(in[2]);
 
 __device__ __forceinline__
 void cn_aes_pseudo_round_mut(const uint32_t * sharedMemory, uint32_t * val, const uint32_t * expandedKey)
 {
asm("// aes_pseudo_round_mut");
#if 0
uchar4 x[4];
uchar4* in = (uchar4*)val;
round_u4(sharedMemory, x, in, expandedKey);
round_u4(sharedMemory, in, x, expandedKey + (1 * N_COLS));
round_u4(sharedMemory, x, in, expandedKey + (2 * N_COLS));
round_u4(sharedMemory, in, x, expandedKey + (3 * N_COLS));
round_u4(sharedMemory, x, in, expandedKey + (4 * N_COLS));
round_u4(sharedMemory, in, x, expandedKey + (5 * N_COLS));
round_u4(sharedMemory, x, in, expandedKey + (6 * N_COLS));
round_u4(sharedMemory, in, x, expandedKey + (7 * N_COLS));
round_u4(sharedMemory, x, in, expandedKey + (8 * N_COLS));
round_u4(sharedMemory, val,x, expandedKey + (9 * N_COLS));
#else
 	uint32_t b[4];
-	uchar4* x = (uchar4*) b;
-	round(sharedMemory, b, val, expandedKey);
-	round_u4(sharedMemory, val, x, expandedKey + 1 * N_COLS);
-	round(sharedMemory, b, val, expandedKey + 2 * N_COLS);
-	round_u4(sharedMemory, val, x, expandedKey + 3 * N_COLS);
-	round(sharedMemory, b, val, expandedKey + 4 * N_COLS);
-	round_u4(sharedMemory, val, x, expandedKey + 5 * N_COLS);
-	round(sharedMemory, b, val, expandedKey + 6 * N_COLS);
-	round_u4(sharedMemory, val, x, expandedKey + 7 * N_COLS);
-	round(sharedMemory, b, val, expandedKey + 8 * N_COLS);
-	round_u4(sharedMemory, val, x, expandedKey + 9 * N_COLS);
+	round_perm(sharedMemory, b, val, expandedKey);
+	round_perm(sharedMemory, val, b, expandedKey + (1 * N_COLS));
+	round_perm(sharedMemory, b, val, expandedKey + (2 * N_COLS));
+	round_perm(sharedMemory, val, b, expandedKey + (3 * N_COLS));
+	round_perm(sharedMemory, b, val, expandedKey + (4 * N_COLS));
+	round_perm(sharedMemory, val, b, expandedKey + (5 * N_COLS));
+	round_perm(sharedMemory, b, val, expandedKey + (6 * N_COLS));
+	round_perm(sharedMemory, val, b, expandedKey + (7 * N_COLS));
+	round_perm(sharedMemory, b, val, expandedKey + (8 * N_COLS));
+	round_perm(sharedMemory, val, b, expandedKey + (9 * N_COLS));
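+	// ten identical pseudo-AES rounds (no distinct final round), ping-ponging
+	// between b and val; the even/odd alternation leaves the result back in val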
+#endif
 }
 
 __device__ __forceinline__
@@ -186,22 +224,31 @@ void cn_aes_gpu_init(uint32_t *sharedMemory)
 {
 	if(blockDim.x >= 32)
 	{
-		if(threadIdx.x < 32)
-		{
-			for(int i = 0; i < 1024; i += 32)
-			{
+		if(threadIdx.x < 32) {
+#if 0
+			#pragma unroll 32
+			for(uint32_t i = 0; i < 1024; i += 32)
 				sharedMemory[threadIdx.x + i] = d_t_fn[threadIdx.x + i];
-			}
+#else
+			#define thrX (threadIdx.x << 2) // each thread copies 4 consecutive words, so offsets stay 16-byte aligned
+			#pragma unroll 8
+			for (uint32_t i = 0; i < 1024; i += 128) // 32 threads x 4 words = 128 words per pass; 8 passes cover all 1024
+				AS_UINT4(&sharedMemory[i + thrX]) = AS_UINT4(&d_t_fn[i + thrX]);
+#endif
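+			// the vector path relies on the __align__(16) added to d_t_fn above:
+			// misaligned uint4 accesses are undefined on CUDA hardware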
 		}
 	} else {
-		if(threadIdx.x < 4)
-		{
-			for(int i = 0; i < 1024; i += 4)
-			{
+		if(threadIdx.x < 4) {
+#if 0
+			for (uint32_t i = 0; i < 1024; i += 4)
 				sharedMemory[threadIdx.x + i] = d_t_fn[threadIdx.x + i];
-			}
+#else
+			#define thrX (threadIdx.x << 2) // identical redefinition of thrX is legal; offsets stay 16-byte aligned
+			#pragma unroll 64
+			for (uint32_t i = 0; i < 1024; i += 16) // 4 threads x 4 words = 16 words per pass; 64 passes cover all 1024
+				AS_UINT4(&sharedMemory[i + thrX]) = AS_UINT4(&d_t_fn[i + thrX]);
+#endif
 		}
 	}
 }
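+
+// usage sketch, for context only -- the caller code below is assumed, not part
+// of this file: a kernel stages the tables into shared memory once per block,
+// synchronizes, then runs the round helpers:
+//   __shared__ uint32_t sharedMemory[1024];
+//   cn_aes_gpu_init(sharedMemory);
+//   __syncthreads();
+//   cn_aes_single_round(sharedMemory, in, out, expandedKey);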