From e18a54e8fc44de65d4b4be3f10daf72d7272aa0d Mon Sep 17 00:00:00 2001 From: sp-hash Date: Sat, 15 Nov 2014 11:59:29 +0100 Subject: [PATCH] sp echo optimisation + cleanup Original Commit : Removed sharedmem and reduced calculations with precalcing (ECHO hash). 750ti + 20KHASH(x11) tpruvot notes: Real change is more of 10 KH/s on stock clocks (but real) launch bounds disabled, no perf increase with 64 registers --- x11/cuda_x11_echo.cu | 364 +++++++++++++++++++++++++++---------------- 1 file changed, 226 insertions(+), 138 deletions(-) diff --git a/x11/cuda_x11_echo.cu b/x11/cuda_x11_echo.cu index 523316a..29505da 100644 --- a/x11/cuda_x11_echo.cu +++ b/x11/cuda_x11_echo.cu @@ -11,10 +11,10 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t __device__ __forceinline__ void AES_2ROUND( const uint32_t* __restrict__ sharedMemory, uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, - uint32_t &k0, uint32_t &k1, uint32_t &k2, uint32_t &k3) + uint32_t &k0) { uint32_t y0, y1, y2, y3; - + aes_round(sharedMemory, x0, x1, x2, x3, k0, @@ -28,178 +28,266 @@ __device__ __forceinline__ void AES_2ROUND( k0++; } -__device__ __forceinline__ void cuda_echo_round( - const uint32_t *sharedMemory, - uint32_t &k0, uint32_t &k1, uint32_t &k2, uint32_t &k3, - uint32_t *W, int round) +__constant__ uint32_t P[48] = { + 0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //8-12 + 0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //21-25 + 0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751, + 0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //34-38 + 0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7, + 0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968, + 0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af + //58-61 +}; + +__device__ __forceinline__ +void cuda_echo_round(const uint32_t *const __restrict__ sharedMemory, uint32_t *const __restrict__ hash) { - // W hat 16*4 als Abmaße - - // Big Sub Words -#pragma unroll 16 - for(int i=0;i<16;i++) + uint32_t k0; + uint32_t h[16]; + #pragma unroll + for (int i = 0; i < 16; i++) { - int idx = i<<2; // *4 - AES_2ROUND(sharedMemory, - W[idx+0], W[idx+1], W[idx+2], W[idx+3], - k0, k1, k2, k3); + h[i] = hash[i]; } - // Shift Rows -#pragma unroll 4 - for(int i=0;i<4;i++) + k0 = 512 + 8; + + #pragma unroll + for (int idx = 0; idx < 16; idx+= 4) { - uint32_t t; - - /// 1, 5, 9, 13 - t = W[4 + i]; - W[4 + i] = W[20 + i]; - W[20 + i] = W[36 + i]; - W[36 + i] = W[52 + i]; - W[52 + i] = t; - - // 2, 6, 10, 14 - t = W[8 + i]; - W[8 + i] = W[40 + i]; - W[40 + i] = t; - t = W[24 + i]; - W[24 + i] = W[56 + i]; - W[56 + i] = t; - - // 15, 11, 7, 3 - t = W[60 + i]; - W[60 + i] = W[44 + i]; - W[44 + i] = W[28 + i]; - W[28 + i] = W[12 + i]; - W[12 + i] = t; + AES_2ROUND(sharedMemory, + h[idx + 0], h[idx + 1], h[idx + 2], h[idx + 3], k0); } + k0 += 4; - // Mix Columns - #pragma unroll - for (int i = 0; i<4; i++) // Schleife über je 2*uint32_t + uint32_t W[64]; + + #pragma unroll 4 + for (int i = 0; i < 4; i++) { - #pragma unroll 64 - for (int idx = 0; idx<64; idx += 16) // Schleife über die elemnte - { + uint32_t a = P[i]; + uint32_t b = P[i + 4]; + uint32_t c = h[i + 8]; + uint32_t d = P[i + 8]; - uint32_t a = W[idx + i]; - uint32_t b = W[idx + i + 4]; - uint32_t c = W[idx + i + 8]; - uint32_t d = W[idx + i + 12]; + uint32_t ab = a ^ b; + uint32_t bc = b ^ c; + uint32_t cd = c ^ d; - uint32_t ab = a ^ b; - uint32_t bc = b ^ c; - uint32_t cd = c ^ d; - uint32_t t, t2, t3; - t = (ab & 0x80808080); - t2 = (bc & 0x80808080); - t3 = (cd & 0x80808080); + uint32_t t = (ab & 0x80808080); + uint32_t t2 = (bc & 0x80808080); + uint32_t t3 = (cd & 0x80808080); - uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1); - uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); - uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); + uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1); + uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); + uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); - W[idx + i] = abx ^ bc ^ d; - W[idx + i + 4] = bcx ^ a ^ cd; - W[idx + i + 8] = cdx ^ ab ^ d; - W[idx + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c; - } - } + W[0 + i] = abx ^ bc ^ d; + W[0 + i + 4] = bcx ^ a ^ cd; + W[0 + i + 8] = cdx ^ ab ^ d; + W[0 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c; -} + a = P[12 + i]; + b = h[i + 4]; + c = P[12 + i + 4]; + d = P[12 + i + 8]; -__global__ void x11_echo512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) -{ - __shared__ uint32_t sharedMemory[1024]; + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; - aes_gpu_init(sharedMemory); - int thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - - int hashPosition = nounce - startNounce; - uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3]; - - uint32_t W[64]; - uint32_t k0 = 512, k1 = 0, k2 = 0, k3 = 0; // K0 = bitlen - /* Initialisierung */ -#pragma unroll 8 - for(int i=0;i<32;i+=4) + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27 ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); + + W[16 + i] = abx ^ bc ^ d; + W[16 + i + 4] = bcx ^ a ^ cd; + W[16 + i + 8] = cdx ^ ab ^ d; + W[16 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c; + + a = h[i]; + b = P[24 + i + 0]; + c = P[24 + i + 4]; + d = P[24 + i + 8]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27 ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); + + W[32 + i] = abx ^ bc ^ d; + W[32 + i + 4] = bcx ^ a ^ cd; + W[32 + i + 8] = cdx ^ ab ^ d; + W[32 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c; + + a = P[36 + i ]; + b = P[36 + i +4 ]; + c = P[36 + i + 8]; + d = h[i + 12]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27 ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); + + W[48 + i] = abx ^ bc ^ d; + W[48 + i + 4] = bcx ^ a ^ cd; + W[48 + i + 8] = cdx ^ ab ^ d; + W[48 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c; + + } + + for (int k = 1; k < 10; k++) + { + + // Big Sub Words + #pragma unroll 16 + for (int i = 0; i < 16; i++) { - W[i + 0] = 512; - W[i + 1] = 0; - W[i + 2] = 0; - W[i + 3] = 0; + int idx = i << 2; // *4 + AES_2ROUND(sharedMemory, + W[idx + 0], W[idx + 1], W[idx + 2], W[idx + 3], + k0); } - // kopiere 32-byte großen hash -#pragma unroll 16 - for(int i=0;i<16;i++) - W[i+32] = Hash[i]; - W[48] = 0x80; // fest -#pragma unroll 10 - for(int i=49;i<59;i++) - W[i] = 0; - W[59] = 0x02000000; // fest - W[60] = k0; // bitlen - W[61] = k1; - W[62] = k2; - W[63] = k3; - - for(int i=0;i<10;i++) + // Shift Rows + #pragma unroll 4 + for (int i = 0; i < 4; i++) { - cuda_echo_round(sharedMemory, k0, k1, k2, k3, W, i); + uint32_t t; + + /// 1, 5, 9, 13 + t = W[4 + i]; + W[4 + i] = W[20 + i]; + W[20 + i] = W[36 + i]; + W[36 + i] = W[52 + i]; + W[52 + i] = t; + + // 2, 6, 10, 14 + t = W[8 + i]; + W[8 + i] = W[40 + i]; + W[40 + i] = t; + t = W[24 + i]; + W[24 + i] = W[56 + i]; + W[56 + i] = t; + + // 15, 11, 7, 3 + t = W[60 + i]; + W[60 + i] = W[44 + i]; + W[44 + i] = W[28 + i]; + W[28 + i] = W[12 + i]; + W[12 + i] = t; } -#pragma unroll 8 - for(int i=0;i<32;i+=4) + // Mix Columns + #pragma unroll 4 + for (int i = 0; i < 4; i++) // Schleife über je 2*uint32_t { - W[i ] ^= W[32 + i ] ^ 512; - W[i+1] ^= W[32 + i + 1]; - W[i+2] ^= W[32 + i + 2]; - W[i+3] ^= W[32 + i + 3]; + #pragma unroll 64 + for (int idx = 0; idx < 64; idx += 16) // Schleife über die elemnte + { + uint32_t a = W[idx + i]; + uint32_t b = W[idx + i + 4]; + uint32_t c = W[idx + i + 8]; + uint32_t d = W[idx + i + 12]; + + uint32_t ab = a ^ b; + uint32_t bc = b ^ c; + uint32_t cd = c ^ d; + + uint32_t t, t2, t3; + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + uint32_t abx = (t >> 7) * 27 ^ ((ab^t) << 1); + uint32_t bcx = (t2 >> 7) * 27 ^ ((bc^t2) << 1); + uint32_t cdx = (t3 >> 7) * 27 ^ ((cd^t3) << 1); + + W[idx + i] = abx ^ bc ^ d; + W[idx + i + 4] = bcx ^ a ^ cd; + W[idx + i + 8] = cdx ^ ab ^ d; + W[idx + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c; + } } + } -#pragma unroll 16 - for(int i=0;i<16;i++) - W[i] ^= Hash[i]; + #pragma unroll 8 + for (int i = 0; i<32; i += 4) + { + W[i] ^= W[32 + i] ^ 512; + W[i + 1] ^= W[32 + i + 1]; + W[i + 2] ^= W[32 + i + 2]; + W[i + 3] ^= W[32 + i + 3]; + } - // tsiv: I feel iffy about removing this, but it seems to break the full hash - // fortunately for X11 the flipped bit lands outside the first 32 bytes used as the final X11 hash - // try chaining more algos after echo (X13) and boom - //W[8] ^= 0x10; + #pragma unroll + for (int i = 0; i<16; i++) + hash[i] ^= W[i]; +} + +__global__ /* __launch_bounds__(320, 3) will force 64 registers on the 750 Ti */ +void x11_echo512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +{ + __shared__ uint32_t sharedMemory[1024]; + + aes_gpu_init(sharedMemory); - W[27] ^= 0x02000000; - W[28] ^= k0; + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + + int hashPosition = nounce - startNounce; + uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3]; -#pragma unroll 16 - for(int i=0;i<16;i++) - Hash[i] = W[i]; - } + cuda_echo_round(sharedMemory, Hash); + } } -// Setup-Funktionen -__host__ void x11_echo512_cpu_init(int thr_id, int threads) +__host__ +void x11_echo512_cpu_init(int thr_id, int threads) { aes_cpu_init(); } -__host__ void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +__host__ +void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { - const int threadsperblock = 256; - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; + const int threadsperblock = 256; -// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); - x11_echo512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); - MyStreamSynchronize(NULL, order, thr_id); + x11_echo512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + MyStreamSynchronize(NULL, order, thr_id); }