From 5be6811dcffbafce70a28e162b6f4609e2e23f41 Mon Sep 17 00:00:00 2001 From: sp-hash Date: Thu, 6 Nov 2014 11:39:53 +0100 Subject: [PATCH] x11: echo and cubehash optimization echo : 40.056ms -> 39.241ms cube : 14.490ms -> 13.511ms cube hash change look like useless (__device__ code in generally inlined) but the reality proves that cuda documentation is wrong... tpruvot: fixed dos lines ending in echo, and used my style for cuda function attributes --- x11/cuda_x11_aes.cu | 104 ++++++++++++------------------------ x11/cuda_x11_cubehash512.cu | 24 +++++---- x11/cuda_x11_echo.cu | 47 ++++++++-------- 3 files changed, 71 insertions(+), 104 deletions(-) diff --git a/x11/cuda_x11_aes.cu b/x11/cuda_x11_aes.cu index fca1b05..1d1ae07 100644 --- a/x11/cuda_x11_aes.cu +++ b/x11/cuda_x11_aes.cu @@ -319,49 +319,32 @@ static void aes_round( uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) { - uint32_t idx0, idx1, idx2, idx3; - idx0 = __byte_perm(x0, 0, 0x4440); - idx1 = __byte_perm(x1, 0, 0x4441) + 256; - idx2 = __byte_perm(x2, 0, 0x4442) + 512; - idx3 = __byte_perm(x3, 0, 0x4443) + 768; y0 = xor4_32( - sharedMemory[idx0], - sharedMemory[idx1], - sharedMemory[idx2], - sharedMemory[idx3]); - y0 ^= k0; - - idx0 = __byte_perm(x1, 0, 0x4440); - idx1 = __byte_perm(x2, 0, 0x4441) + 256; - idx2 = __byte_perm(x3, 0, 0x4442) + 512; - idx3 = __byte_perm(x0, 0, 0x4443) + 768; + sharedMemory[__byte_perm(x0, 0, 0x4440)], + sharedMemory[__byte_perm(x1, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x2, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]); y1 = xor4_32( - sharedMemory[idx0], - sharedMemory[idx1], - sharedMemory[idx2], - sharedMemory[idx3]); + sharedMemory[__byte_perm(x1, 0, 0x4440)], + sharedMemory[__byte_perm(x2, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x3, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]); - idx0 = __byte_perm(x2, 0, 0x4440); - idx1 = __byte_perm(x3, 0, 0x4441) + 256; - idx2 = __byte_perm(x0, 0, 0x4442) + 512; - idx3 = __byte_perm(x1, 0, 0x4443) + 768; y2 = xor4_32( - sharedMemory[idx0], - sharedMemory[idx1], - sharedMemory[idx2], - sharedMemory[idx3]); // ^k2 + sharedMemory[__byte_perm(x2, 0, 0x4440)], + sharedMemory[__byte_perm(x3, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x0, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); // ^k2 + + y0 ^= k0; - idx0 = __byte_perm(x3, 0, 0x4440); - idx1 = __byte_perm(x0, 0, 0x4441) + 256; - idx2 = __byte_perm(x1, 0, 0x4442) + 512; - idx3 = __byte_perm(x2, 0, 0x4443) + 768; y3 = xor4_32( - sharedMemory[idx0], - sharedMemory[idx1], - sharedMemory[idx2], - sharedMemory[idx3]); // ^k3 + sharedMemory[__byte_perm(x3, 0, 0x4440)], + sharedMemory[__byte_perm(x0, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x1, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); // ^k3 } __device__ @@ -370,46 +353,27 @@ static void aes_round( uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) { - uint32_t idx0, idx1, idx2, idx3; - - idx0 = __byte_perm(x0, 0, 0x4440); - idx1 = __byte_perm(x1, 0, 0x4441) + 256; - idx2 = __byte_perm(x2, 0, 0x4442) + 512; - idx3 = __byte_perm(x3, 0, 0x4443) + 768; y0 = xor4_32( - sharedMemory[idx0], - sharedMemory[idx1], - sharedMemory[idx2], - sharedMemory[idx3]); - - idx0 = __byte_perm(x1, 0, 0x4440); - idx1 = __byte_perm(x2, 0, 0x4441) + 256; - idx2 = __byte_perm(x3, 0, 0x4442) + 512; - idx3 = __byte_perm(x0, 0, 0x4443) + 768; + sharedMemory[__byte_perm(x0, 0, 0x4440)], + sharedMemory[__byte_perm(x1, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x2, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]); y1 = xor4_32( - sharedMemory[idx0], - sharedMemory[idx1], - sharedMemory[idx2], - sharedMemory[idx3]); + sharedMemory[__byte_perm(x1, 0, 0x4440)], + sharedMemory[__byte_perm(x2, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x3, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]); - idx0 = __byte_perm(x2, 0, 0x4440); - idx1 = __byte_perm(x3, 0, 0x4441) + 256; - idx2 = __byte_perm(x0, 0, 0x4442) + 512; - idx3 = __byte_perm(x1, 0, 0x4443) + 768; y2 = xor4_32( - sharedMemory[idx0], - sharedMemory[idx1], - sharedMemory[idx2], - sharedMemory[idx3]); // ^k2 + sharedMemory[__byte_perm(x2, 0, 0x4440)], + sharedMemory[__byte_perm(x3, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x0, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); // ^k2 - idx0 = __byte_perm(x3, 0, 0x4440); - idx1 = __byte_perm(x0, 0, 0x4441) + 256; - idx2 = __byte_perm(x1, 0, 0x4442) + 512; - idx3 = __byte_perm(x2, 0, 0x4443) + 768; y3 = xor4_32( - sharedMemory[idx0], - sharedMemory[idx1], - sharedMemory[idx2], - sharedMemory[idx3]); // ^k3 + sharedMemory[__byte_perm(x3, 0, 0x4440)], + sharedMemory[__byte_perm(x0, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x1, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); // ^k3 } diff --git a/x11/cuda_x11_cubehash512.cu b/x11/cuda_x11_cubehash512.cu index 5f77b0b..cd2ccef 100644 --- a/x11/cuda_x11_cubehash512.cu +++ b/x11/cuda_x11_cubehash512.cu @@ -34,7 +34,8 @@ static const uint32_t c_IV_512[32] = { 0x7795D246, 0xD43E3B44 }; -static __device__ void rrounds(uint32_t x[2][2][2][2][2]) +__device__ __forceinline__ +static void rrounds(uint32_t x[2][2][2][2][2]) { int r; int j; @@ -150,8 +151,8 @@ static __device__ void rrounds(uint32_t x[2][2][2][2][2]) } } - -static __device__ void block_tox(uint32_t block[16], uint32_t x[2][2][2][2][2]) +__device__ __forceinline__ +static void block_tox(uint32_t block[16], uint32_t x[2][2][2][2][2]) { int k; int l; @@ -167,7 +168,8 @@ static __device__ void block_tox(uint32_t block[16], uint32_t x[2][2][2][2][2]) x[0][0][k][l][m] ^= *in++; } -static __device__ void hash_fromx(uint32_t hash[16], uint32_t x[2][2][2][2][2]) +__device__ __forceinline__ +static void hash_fromx(uint32_t hash[16], uint32_t x[2][2][2][2][2]) { int j; int k; @@ -186,7 +188,8 @@ static __device__ void hash_fromx(uint32_t hash[16], uint32_t x[2][2][2][2][2]) *out++ = x[0][j][k][l][m]; } -void __device__ Init(uint32_t x[2][2][2][2][2]) +__device__ +void Init(uint32_t x[2][2][2][2][2]) { int i,j,k,l,m; #if 0 @@ -227,7 +230,8 @@ void __device__ Init(uint32_t x[2][2][2][2][2]) #endif } -void __device__ Update32(uint32_t x[2][2][2][2][2], const BitSequence *data) +__device__ __forceinline__ +void Update32(uint32_t x[2][2][2][2][2], const BitSequence *data) { /* "xor the block into the first b bytes of the state" */ /* "and then transform the state invertibly through r identical rounds" */ @@ -235,7 +239,8 @@ void __device__ Update32(uint32_t x[2][2][2][2][2], const BitSequence *data) rrounds(x); } -void __device__ Final(uint32_t x[2][2][2][2][2], BitSequence *hashval) +__device__ __forceinline__ +void Final(uint32_t x[2][2][2][2][2], BitSequence *hashval) { int i; @@ -252,8 +257,9 @@ void __device__ Final(uint32_t x[2][2][2][2][2], BitSequence *hashval) /***************************************************/ -// Die Hash-Funktion -__global__ void x11_cubehash512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +// GPU Hash Function +__global__ +void x11_cubehash512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) { int thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) diff --git a/x11/cuda_x11_echo.cu b/x11/cuda_x11_echo.cu index 5f20754..523316a 100644 --- a/x11/cuda_x11_echo.cu +++ b/x11/cuda_x11_echo.cu @@ -75,41 +75,38 @@ __device__ __forceinline__ void cuda_echo_round( } // Mix Columns -#pragma unroll 4 - for(int i=0;i<4;i++) // Schleife über je 2*uint32_t + #pragma unroll + for (int i = 0; i<4; i++) // Schleife über je 2*uint32_t { -#pragma unroll 4 - for(int j=0;j<4;j++) // Schleife über die elemnte + #pragma unroll 64 + for (int idx = 0; idx<64; idx += 16) // Schleife über die elemnte { - int idx = j<<2; // j*4 - uint32_t a = W[ ((idx + 0)<<2) + i]; - uint32_t b = W[ ((idx + 1)<<2) + i]; - uint32_t c = W[ ((idx + 2)<<2) + i]; - uint32_t d = W[ ((idx + 3)<<2) + i]; + uint32_t a = W[idx + i]; + uint32_t b = W[idx + i + 4]; + uint32_t c = W[idx + i + 8]; + uint32_t d = W[idx + i + 12]; uint32_t ab = a ^ b; uint32_t bc = b ^ c; uint32_t cd = c ^ d; - uint32_t t; - t = ((ab & 0x80808080) >> 7); - uint32_t abx = t<<4 ^ t<<3 ^ t<<1 ^ t; - t = ((bc & 0x80808080) >> 7); - uint32_t bcx = t<<4 ^ t<<3 ^ t<<1 ^ t; - t = ((cd & 0x80808080) >> 7); - uint32_t cdx = t<<4 ^ t<<3 ^ t<<1 ^ t; - - abx ^= ((ab & 0x7F7F7F7F) << 1); - bcx ^= ((bc & 0x7F7F7F7F) << 1); - cdx ^= ((cd & 0x7F7F7F7F) << 1); - - W[ ((idx + 0)<<2) + i] = abx ^ bc ^ d; - W[ ((idx + 1)<<2) + i] = bcx ^ a ^ cd; - W[ ((idx + 2)<<2) + i] = cdx ^ ab ^ d; - W[ ((idx + 3)<<2) + i] = abx ^ bcx ^ cdx ^ ab ^ c; + uint32_t t, t2, t3; + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1); + uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); + uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); + + W[idx + i] = abx ^ bc ^ d; + W[idx + i + 4] = bcx ^ a ^ cd; + W[idx + i + 8] = cdx ^ ab ^ d; + W[idx + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c; } } + } __global__ void x11_echo512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)