diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 9e943b3..43a07c7 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -303,6 +303,9 @@ Source Files\crypto\xmr + + Source Files\crypto\xmr + Source Files\crypto\xmr @@ -542,6 +545,9 @@ Source Files\CUDA\xmr + + Source Files\CUDA\xmr + Source Files\CUDA\xmr diff --git a/crypto/cn_aes.cuh b/crypto/cn_aes.cuh index 831d909..1ccec9e 100644 --- a/crypto/cn_aes.cuh +++ b/crypto/cn_aes.cuh @@ -2,7 +2,7 @@ #define N_COLS 4 #define WPOLY 0x011b -static __constant__ uint32_t d_t_fn[1024] = { +static __constant__ __align__(16) uint32_t d_t_fn[1024] = { 0xa56363c6U, 0x847c7cf8U, 0x997777eeU, 0x8d7b7bf6U, 0x0df2f2ffU, 0xbd6b6bd6U, 0xb16f6fdeU, 0x54c5c591U, 0x50303060U, 0x03010102U, 0xa96767ceU, 0x7d2b2b56U, 0x19fefee7U, 0x62d7d7b5U, 0xe6abab4dU, 0x9a7676ecU, 0x45caca8fU, 0x9d82821fU, 0x40c9c989U, 0x877d7dfaU, 0x15fafaefU, 0xeb5959b2U, 0xc947478eU, 0x0bf0f0fbU, @@ -136,10 +136,13 @@ static __constant__ uint32_t d_t_fn[1024] = { 0x82c34141U, 0x29b09999U, 0x5a772d2dU, 0x1e110f0fU, 0x7bcbb0b0U, 0xa8fc5454U, 0x6dd6bbbbU, 0x2c3a1616U }; -#define t_fn0(x) (sharedMemory[ (x)]) -#define t_fn1(x) (sharedMemory[256 | (x)]) -#define t_fn2(x) (sharedMemory[512 | (x)]) -#define t_fn3(x) (sharedMemory[768 | (x)]) +#define AS_UINT2(addr) *((uint2*)(addr)) +#define AS_UINT4(addr) *((uint4*)(addr)) + +#define t_fn0(x) (sharedMemory[x]) +#define t_fn1(x) (sharedMemory[0x100U | (x)]) +#define t_fn2(x) (sharedMemory[0x200U | (x)]) +#define t_fn3(x) (sharedMemory[0x300U | (x)]) #define round(shared, out, x, k) \ out[0] = (k)[0] ^ (t_fn0(x[0] & 0xff) ^ t_fn1((x[1] >> 8) & 0xff) ^ t_fn2((x[2] >> 16) & 0xff) ^ t_fn3((x[3] >> 24) & 0xff)); \ @@ -148,60 +151,104 @@ static __constant__ uint32_t d_t_fn[1024] = { out[3] = (k)[3] ^ (t_fn0(x[3] & 0xff) ^ t_fn1((x[0] >> 8) & 0xff) ^ t_fn2((x[1] >> 16) & 0xff) ^ t_fn3((x[2] >> 24) & 0xff)); #define round_u4(shared, out, in, k) \ - out[0] = (k)[0] ^ t_fn0(in[0].x) ^ t_fn1(in[1].y) ^ t_fn2(in[2].z) ^ t_fn3(in[3].w); \ - out[1] = (k)[1] ^ t_fn0(in[1].x) ^ t_fn1(in[2].y) ^ t_fn2(in[3].z) ^ t_fn3(in[0].w); \ - out[2] = (k)[2] ^ t_fn0(in[2].x) ^ t_fn1(in[3].y) ^ t_fn2(in[0].z) ^ t_fn3(in[1].w); \ - out[3] = (k)[3] ^ t_fn0(in[3].x) ^ t_fn1(in[0].y) ^ t_fn2(in[1].z) ^ t_fn3(in[2].w); + ((uint32_t*)out)[0] = (k)[0] ^ t_fn0(in[0].x) ^ t_fn1(in[1].y) ^ t_fn2(in[2].z) ^ t_fn3(in[3].w); \ + ((uint32_t*)out)[1] = (k)[1] ^ t_fn0(in[1].x) ^ t_fn1(in[2].y) ^ t_fn2(in[3].z) ^ t_fn3(in[0].w); \ + ((uint32_t*)out)[2] = (k)[2] ^ t_fn0(in[2].x) ^ t_fn1(in[3].y) ^ t_fn2(in[0].z) ^ t_fn3(in[1].w); \ + ((uint32_t*)out)[3] = (k)[3] ^ t_fn0(in[3].x) ^ t_fn1(in[0].y) ^ t_fn2(in[1].z) ^ t_fn3(in[2].w); + +#ifdef __INTELLISENSE__ +#define __byte_perm(a,b,c) a +#endif + +#define OFF8_0(x) (x & 0xFFu) +#define OFF8_1(x) __byte_perm(x, 0x01, 0x5541) +#define OFF8_2(x) __byte_perm(x, 0x02, 0x5542) +#define OFF8_3(x) __byte_perm(x, 0x03, 0x5543) + +#define SHARED_0(x) sharedMemory[OFF8_0(x)] +#define SHARED_1(x) sharedMemory[OFF8_1(x)] +#define SHARED_2(x) sharedMemory[OFF8_2(x)] +#define SHARED_3(x) sharedMemory[OFF8_3(x)] __device__ __forceinline__ -void cn_aes_single_round(uint32_t * const sharedMemory, uint32_t * const in32, uint32_t * out, uint32_t * const expandedKey) +void cn_aes_single_round(uint32_t * const sharedMemory, uint32_t * const in, uint32_t * out, uint32_t* expandedKey) { - uchar4* in = (uchar4*) in32; - out[0] = expandedKey[0] ^ t_fn0(in[0].x) ^ t_fn1(in[1].y) ^ t_fn2(in[2].z) ^ t_fn3(in[3].w); - out[1] = expandedKey[1] ^ t_fn0(in[1].x) ^ t_fn1(in[2].y) ^ t_fn2(in[3].z) ^ t_fn3(in[0].w); - out[2] = expandedKey[2] ^ t_fn0(in[2].x) ^ t_fn1(in[3].y) ^ t_fn2(in[0].z) ^ t_fn3(in[1].w); - out[3] = expandedKey[3] ^ t_fn0(in[3].x) ^ t_fn1(in[0].y) ^ t_fn2(in[1].z) ^ t_fn3(in[2].w); - //round(sharedMemory, out, in32, expandedKey); + asm("// aes_single_round"); + out[0] = expandedKey[0] ^ SHARED_0(in[0]) ^ SHARED_1(in[1]) ^ SHARED_2(in[2]) ^ SHARED_3(in[3]); + out[1] = expandedKey[1] ^ SHARED_0(in[1]) ^ SHARED_1(in[2]) ^ SHARED_2(in[3]) ^ SHARED_3(in[0]); + out[2] = expandedKey[2] ^ SHARED_0(in[2]) ^ SHARED_1(in[3]) ^ SHARED_2(in[0]) ^ SHARED_3(in[1]); + out[3] = expandedKey[3] ^ SHARED_0(in[3]) ^ SHARED_1(in[0]) ^ SHARED_2(in[1]) ^ SHARED_3(in[2]); } + +#define round_perm(shared, out, in, k) \ + out[0] = (k)[0] ^ SHARED_0(in[0]) ^ SHARED_1(in[1]) ^ SHARED_2(in[2]) ^ SHARED_3(in[3]); \ + out[1] = (k)[1] ^ SHARED_0(in[1]) ^ SHARED_1(in[2]) ^ SHARED_2(in[3]) ^ SHARED_3(in[0]); \ + out[2] = (k)[2] ^ SHARED_0(in[2]) ^ SHARED_1(in[3]) ^ SHARED_2(in[0]) ^ SHARED_3(in[1]); \ + out[3] = (k)[3] ^ SHARED_0(in[3]) ^ SHARED_1(in[0]) ^ SHARED_2(in[1]) ^ SHARED_3(in[2]); + __device__ __forceinline__ void cn_aes_pseudo_round_mut(const uint32_t * sharedMemory, uint32_t * val, const uint32_t * expandedKey) { + asm("// aes_pseudo_round_mut"); +#if 0 + uchar4 x[4]; + uchar4* in = (uchar4*)val; + round_u4(sharedMemory, x, in, expandedKey); + round_u4(sharedMemory, in, x, expandedKey + (1 * N_COLS)); + round_u4(sharedMemory, x, in, expandedKey + (2 * N_COLS)); + round_u4(sharedMemory, in, x, expandedKey + (3 * N_COLS)); + round_u4(sharedMemory, x, in, expandedKey + (4 * N_COLS)); + round_u4(sharedMemory, in, x, expandedKey + (5 * N_COLS)); + round_u4(sharedMemory, x, in, expandedKey + (6 * N_COLS)); + round_u4(sharedMemory, in, x, expandedKey + (7 * N_COLS)); + round_u4(sharedMemory, x, in, expandedKey + (8 * N_COLS)); + round_u4(sharedMemory, val,x, expandedKey + (9 * N_COLS)); +#else uint32_t b[4]; - uchar4* x = (uchar4*) b; - round(sharedMemory, b, val, expandedKey); - round_u4(sharedMemory, val, x, expandedKey + 1 * N_COLS); - round(sharedMemory, b, val, expandedKey + 2 * N_COLS); - round_u4(sharedMemory, val, x, expandedKey + 3 * N_COLS); - round(sharedMemory, b, val, expandedKey + 4 * N_COLS); - round_u4(sharedMemory, val, x, expandedKey + 5 * N_COLS); - round(sharedMemory, b, val, expandedKey + 6 * N_COLS); - round_u4(sharedMemory, val, x, expandedKey + 7 * N_COLS); - round(sharedMemory, b, val, expandedKey + 8 * N_COLS); - round_u4(sharedMemory, val, x, expandedKey + 9 * N_COLS); + round_perm(sharedMemory, b, val, expandedKey); + round_perm(sharedMemory, val, b, expandedKey + (1 * N_COLS)); + round_perm(sharedMemory, b, val, expandedKey + (2 * N_COLS)); + round_perm(sharedMemory, val, b, expandedKey + (3 * N_COLS)); + round_perm(sharedMemory, b, val, expandedKey + (4 * N_COLS)); + round_perm(sharedMemory, val, b, expandedKey + (5 * N_COLS)); + round_perm(sharedMemory, b, val, expandedKey + (6 * N_COLS)); + round_perm(sharedMemory, val, b, expandedKey + (7 * N_COLS)); + round_perm(sharedMemory, b, val, expandedKey + (8 * N_COLS)); + round_perm(sharedMemory, val, b, expandedKey + (9 * N_COLS)); +#endif } __device__ __forceinline__ -void cn_aes_gpu_init(uint32_t *sharedMemory) +void cn_aes_gpu_init(uint32_t* sharedMemory) { if(blockDim.x >= 32) { - if(threadIdx.x < 32) - { - for(int i = 0; i < 1024; i += 32) - { + if(threadIdx.x < 32) { +#if 0 + #pragma unroll 32 + for(uint32_t i = 0; i < 1024; i += 32) sharedMemory[threadIdx.x + i] = d_t_fn[threadIdx.x + i]; - } +#else + #define thrX (threadIdx.x << 2) // ensure offsets aligned (16) to vector + #pragma unroll 8 + for (uint32_t i = 0; i < 1024; i += 128) // 32x32 = 1024, 8 * 128 also + AS_UINT4(&sharedMemory[i + thrX]) = AS_UINT4(&d_t_fn[i + thrX]); +#endif } } else { - if(threadIdx.x < 4) - { - for(int i = 0; i < 1024; i += 4) - { + if(threadIdx.x < 4) { +#if 0 + for (uint32_t i = 0; i < 1024; i += 4) sharedMemory[threadIdx.x + i] = d_t_fn[threadIdx.x + i]; - } +#else + #define thrX (threadIdx.x << 2) // ensure offsets aligned (16) to vector + #pragma unroll 64 + for (uint32_t i = 0; i < 1024; i += 16) + AS_UINT4(&sharedMemory[i + thrX]) = AS_UINT4(&d_t_fn[i + thrX]); +#endif } } -} +} \ No newline at end of file diff --git a/crypto/cryptolight-core.cu b/crypto/cryptolight-core.cu index b8ccc6d..26cb05c 100644 --- a/crypto/cryptolight-core.cu +++ b/crypto/cryptolight-core.cu @@ -11,8 +11,8 @@ #define LONG_SHL_IDX 18 #define LONG_LOOPS32 0x40000 -#ifdef WIN32 /* todo: --interactive */ -static __thread int cn_bfactor = 8; +#ifdef WIN32 +static __thread int cn_bfactor = 11; static __thread int cn_bsleep = 100; #else static __thread int cn_bfactor = 0; @@ -38,7 +38,7 @@ __device__ __forceinline__ uint64_t cuda_mul128(uint64_t multiplier, uint64_t mu __global__ void cryptolight_core_gpu_phase1(int threads, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state, uint32_t * __restrict__ ctx_key1) { - __shared__ uint32_t sharedMemory[1024]; + __shared__ uint32_t __align__(16) sharedMemory[1024]; cn_aes_gpu_init(sharedMemory); @@ -47,16 +47,23 @@ void cryptolight_core_gpu_phase1(int threads, uint32_t * __restrict__ long_state if(thread < threads) { - uint32_t key[40], text[4]; + const int oft = thread * 50 + sub + 16; // not aligned 16! + const int long_oft = (thread << LONG_SHL_IDX) + sub; + uint32_t __align__(16) key[40]; + uint32_t __align__(16) text[4]; - MEMCPY8(key, ctx_key1 + thread * 40, 20); - MEMCPY8(text, ctx_state + thread * 50 + sub + 16, 2); + // copy 160 bytes + #pragma unroll + for (int i = 0; i < 40; i += 4) + AS_UINT4(&key[i]) = AS_UINT4(ctx_key1 + thread * 40 + i); + + AS_UINT2(&text[0]) = AS_UINT2(&ctx_state[oft]); + AS_UINT2(&text[2]) = AS_UINT2(&ctx_state[oft + 2]); __syncthreads(); - for(int i = 0; i < LONG_LOOPS32; i += 32) - { + for(int i = 0; i < LONG_LOOPS32; i += 32) { cn_aes_pseudo_round_mut(sharedMemory, text, key); - MEMCPY8(&long_state[(thread << LONG_SHL_IDX) + sub + i], text, 2); + AS_UINT4(&long_state[long_oft + i]) = AS_UINT4(text); } } } @@ -64,7 +71,7 @@ void cryptolight_core_gpu_phase1(int threads, uint32_t * __restrict__ long_state __global__ void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b) { - __shared__ uint32_t sharedMemory[1024]; + __shared__ uint32_t __align__(16) sharedMemory[1024]; cn_aes_gpu_init(sharedMemory); @@ -176,21 +183,22 @@ void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int const int thread = blockDim.x * blockIdx.x + threadIdx.x; - if(thread < threads) + if (thread < threads) { const int batchsize = ITER >> (2 + bfactor); const int start = partidx * batchsize; const int end = start + batchsize; - const off_t longptr = (off_t) thread << LONG_SHL_IDX; + const int longptr = thread << LONG_SHL_IDX; uint32_t * long_state = &d_long_state[longptr]; - uint32_t * ctx_a = &d_ctx_a[thread * 4]; - uint32_t * ctx_b = &d_ctx_b[thread * 4]; - uint32_t a[4], b[4]; - MEMCPY8(a, ctx_a, 2); - MEMCPY8(b, ctx_b, 2); + uint64_t * ctx_a = (uint64_t*)(&d_ctx_a[thread * 4]); + uint64_t * ctx_b = (uint64_t*)(&d_ctx_b[thread * 4]); + uint4 A = AS_UINT4(ctx_a); + uint4 B = AS_UINT4(ctx_b); + uint32_t* a = (uint32_t*)&A; + uint32_t* b = (uint32_t*)&B; - for(int i = start; i < end; i++) // end = 262144 + for (int i = start; i < end; i++) // end = 262144 { uint32_t c[4]; uint32_t j = (a[0] >> 2) & E2I_MASK2; @@ -204,43 +212,50 @@ void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int MUL_SUM_XOR_DST(b, a, &long_state[(b[0] >> 2) & E2I_MASK2]); } - if(bfactor > 0) - { - MEMCPY8(ctx_a, a, 2); - MEMCPY8(ctx_b, b, 2); + if (bfactor > 0) { + AS_UINT4(ctx_a) = A; + AS_UINT4(ctx_b) = B; } } - #endif // __CUDA_ARCH__ >= 300 } __global__ -void cryptolight_core_gpu_phase3(int threads, const uint32_t * __restrict__ long_state, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2) +void cryptolight_core_gpu_phase3(int threads, const uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state, uint32_t * __restrict__ ctx_key2) { - __shared__ uint32_t sharedMemory[1024]; + __shared__ uint32_t __align__(16) sharedMemory[1024]; cn_aes_gpu_init(sharedMemory); - int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3; - int sub = (threadIdx.x & 7) << 2; + const int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3; + const int sub = (threadIdx.x & 7) << 2; if(thread < threads) { - uint32_t key[40], text[4]; - MEMCPY8(key, d_ctx_key2 + thread * 40, 20); - MEMCPY8(text, d_ctx_state + thread * 50 + sub + 16, 2); + const int long_oft = (thread << LONG_SHL_IDX) + sub; + const int oft = thread * 50 + sub + 16; + uint32_t __align__(16) key[40]; + uint32_t __align__(8) text[4]; + + #pragma unroll + for (int i = 0; i < 40; i += 4) + AS_UINT4(&key[i]) = AS_UINT4(ctx_key2 + thread * 40 + i); + + AS_UINT2(&text[0]) = AS_UINT2(&ctx_state[oft + 0]); + AS_UINT2(&text[2]) = AS_UINT2(&ctx_state[oft + 2]); __syncthreads(); for(int i = 0; i < LONG_LOOPS32; i += 32) { #pragma unroll for(int j = 0; j < 4; j++) - text[j] ^= long_state[(thread << LONG_SHL_IDX) + sub + i + j]; + text[j] ^= long_state[long_oft + i + j]; cn_aes_pseudo_round_mut(sharedMemory, text, key); } - MEMCPY8(d_ctx_state + thread * 50 + sub + 16, text, 2); + AS_UINT2(&ctx_state[oft + 0]) = AS_UINT2(&text[0]); + AS_UINT2(&ctx_state[oft + 2]) = AS_UINT2(&text[2]); } } diff --git a/crypto/cryptolight.h b/crypto/cryptolight.h index 415e9a8..830c66b 100644 --- a/crypto/cryptolight.h +++ b/crypto/cryptolight.h @@ -16,6 +16,7 @@ struct uint3 blockDim; #define __syncthreads() #define asm(x) #define __shfl(a,b,c) 1 +#define __umul64hi(a,b) a*b #endif #define MEMORY (1UL << 20) /* 1 MiB - 1048576 */ diff --git a/crypto/cryptonight-core.cu b/crypto/cryptonight-core.cu index e39b9bc..d43f10e 100644 --- a/crypto/cryptonight-core.cu +++ b/crypto/cryptonight-core.cu @@ -9,10 +9,11 @@ #include "cryptonight.h" #define LONG_SHL_IDX 19 +#define LONG_LOOPS32 0x80000 #ifdef WIN32 // to prevent ui freeze -static __thread int cn_bfactor = 8; +static __thread int cn_bfactor = 11; static __thread int cn_bsleep = 100; #else static __thread int cn_bfactor = 0; @@ -38,25 +39,32 @@ __device__ __forceinline__ uint64_t cuda_mul128(uint64_t multiplier, uint64_t mu __global__ void cryptonight_core_gpu_phase1(int threads, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state, uint32_t * __restrict__ ctx_key1) { - __shared__ uint32_t sharedMemory[1024]; + __shared__ uint32_t __align__(16) sharedMemory[1024]; cn_aes_gpu_init(sharedMemory); const int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3; - const int sub = (threadIdx.x & 7) << 2; + const int sub = (threadIdx.x & 7) << 2; // 0 4 8 ... 28 if(thread < threads) { - uint32_t key[40], text[4]; + const int oft = thread * 50 + sub + 16; // not aligned 16! + const int long_oft = (thread << LONG_SHL_IDX) + sub; + uint32_t __align__(16) key[40]; + uint32_t __align__(16) text[4]; + + AS_UINT2(&text[0]) = AS_UINT2(&ctx_state[oft]); + AS_UINT2(&text[2]) = AS_UINT2(&ctx_state[oft + 2]); - MEMCPY8(key, ctx_key1 + thread * 40, 20); - MEMCPY8(text, ctx_state + thread * 50 + sub + 16, 2); + // copy 160 bytes + #pragma unroll + for (int i = 0; i < 40; i += 4) + AS_UINT4(&key[i]) = AS_UINT4(ctx_key1 + thread * 40 + i); __syncthreads(); - for(int i = 0; i < 0x80000; i += 32) - { + for(int i = 0; i < LONG_LOOPS32; i += 32) { cn_aes_pseudo_round_mut(sharedMemory, text, key); - MEMCPY8(&long_state[(thread << 19) + sub + i], text, 2); + AS_UINT4(&long_state[long_oft + i]) = AS_UINT4(text); } } } @@ -64,7 +72,7 @@ void cryptonight_core_gpu_phase1(int threads, uint32_t * __restrict__ long_state __global__ void cryptonight_core_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b) { - __shared__ uint32_t sharedMemory[1024]; + __shared__ uint32_t __align__(16) sharedMemory[1024]; cn_aes_gpu_init(sharedMemory); @@ -165,32 +173,32 @@ void cryptonight_core_gpu_phase2(const int threads, const int bfactor, const int long_state[j + sub] = res; } - if(bfactor > 0) - { + if(bfactor > 0) { ctx_a[sub] = a; ctx_b[sub] = b; } } -#else // __CUDA_ARCH__ < 300 +#else const int thread = blockDim.x * blockIdx.x + threadIdx.x; - if(thread < threads) + if (thread < threads) { const int batchsize = ITER >> (2 + bfactor); const int start = partidx * batchsize; const int end = start + batchsize; - const off_t longptr = (off_t) thread << 19; + const off_t longptr = (off_t)thread << LONG_SHL_IDX; uint32_t * long_state = &d_long_state[longptr]; - uint32_t * ctx_a = &d_ctx_a[thread * 4]; - uint32_t * ctx_b = &d_ctx_b[thread * 4]; - uint32_t a[4], b[4]; - MEMCPY8(a, ctx_a, 2); - MEMCPY8(b, ctx_b, 2); + uint64_t * ctx_a = (uint64_t*)(&d_ctx_a[thread * 4]); + uint64_t * ctx_b = (uint64_t*)(&d_ctx_b[thread * 4]); + uint4 A = AS_UINT4(ctx_a); + uint4 B = AS_UINT4(ctx_b); + uint32_t* a = (uint32_t*)&A; + uint32_t* b = (uint32_t*)&B; - for(int i = start; i < end; i++) // end = 262144 + for (int i = start; i < end; i++) // end = 262144 { uint32_t c[4]; uint32_t j = (a[0] >> 2) & E2I_MASK2; @@ -204,43 +212,55 @@ void cryptonight_core_gpu_phase2(const int threads, const int bfactor, const int MUL_SUM_XOR_DST(b, a, &long_state[(b[0] >> 2) & E2I_MASK2]); } - if(bfactor > 0) - { - MEMCPY8(ctx_a, a, 2); - MEMCPY8(ctx_b, b, 2); + if (bfactor > 0) { + AS_UINT4(ctx_a) = A; + AS_UINT4(ctx_b) = B; } } - #endif // __CUDA_ARCH__ >= 300 } __global__ -void cryptonight_core_gpu_phase3(int threads, const uint32_t * __restrict__ long_state, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2) +void cryptonight_core_gpu_phase3(int threads, const uint32_t * __restrict__ long_state, uint32_t * ctx_state, uint32_t * __restrict__ ctx_key2) { - __shared__ uint32_t sharedMemory[1024]; + __shared__ uint32_t __align__(16) sharedMemory[1024]; cn_aes_gpu_init(sharedMemory); - int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3; - int sub = (threadIdx.x & 7) << 2; + const int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3; + const int sub = (threadIdx.x & 7) << 2; if(thread < threads) { - uint32_t key[40], text[4]; - MEMCPY8(key, d_ctx_key2 + thread * 40, 20); - MEMCPY8(text, d_ctx_state + thread * 50 + sub + 16, 2); + const int long_oft = (thread << LONG_SHL_IDX) + sub; + const int oft = thread * 50 + sub + 16; + + uint32_t __align__(16) key[40]; + uint32_t __align__(8) text[4]; + + // copy 160 bytes + #pragma unroll + for (int i = 0; i < 40; i += 4) + AS_UINT4(&key[i]) = AS_UINT4(ctx_key2 + thread * 40 + i); + + AS_UINT2(&text[0]) = AS_UINT2(&ctx_state[oft+0]); + AS_UINT2(&text[2]) = AS_UINT2(&ctx_state[oft+2]); __syncthreads(); - for(int i = 0; i < 0x80000; i += 32) + for(int i = 0; i < LONG_LOOPS32; i += 32) { + uint32_t __align__(16) st[4]; + AS_UINT4(st) = AS_UINT4(&long_state[long_oft + i]); + #pragma unroll - for(int j = 0; j < 4; ++j) - text[j] ^= long_state[(thread << 19) + sub + i + j]; + for(int j = 0; j < 4; j++) + text[j] ^= st[j]; cn_aes_pseudo_round_mut(sharedMemory, text, key); } - MEMCPY8(d_ctx_state + thread * 50 + sub + 16, text, 2); + AS_UINT2(&ctx_state[oft+0]) = AS_UINT2(&text[0]); + AS_UINT2(&ctx_state[oft+2]) = AS_UINT2(&text[2]); } } diff --git a/crypto/cryptonight.cu b/crypto/cryptonight.cu index db3d4ed..97249fe 100644 --- a/crypto/cryptonight.cu +++ b/crypto/cryptonight.cu @@ -67,7 +67,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ cudaMalloc(&d_long_state[thr_id], alloc); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); - cudaMalloc(&d_ctx_state[thr_id], 50 * sizeof(uint32_t) * throughput); + cudaMalloc(&d_ctx_state[thr_id], 25 * sizeof(uint64_t) * throughput); // 200 is aligned 8, not 16 exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); diff --git a/crypto/cryptonight.h b/crypto/cryptonight.h index e534bd9..5a4c0cf 100644 --- a/crypto/cryptonight.h +++ b/crypto/cryptonight.h @@ -16,6 +16,7 @@ struct uint3 blockDim; #define __syncthreads() #define asm(x) #define __shfl(a,b,c) 1 +#define __umul64hi(a,b) a*b #endif