From b3becb67ddc384c2fbf6928b6df9f6fe147cfdb6 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Fri, 22 Aug 2014 19:42:57 +0200 Subject: [PATCH] groestl: small optimisation (nist5 + 100kH on a 750Ti) But, almost nothing on X15, no big changes... --- bitslice_transformations_quad.cu | 19 ++++++++----- quark/cuda_quark_groestl512.cu | 48 +++++++++++++++++--------------- 2 files changed, 38 insertions(+), 29 deletions(-) diff --git a/bitslice_transformations_quad.cu b/bitslice_transformations_quad.cu index 63ce6e2..8f7bcf8 100644 --- a/bitslice_transformations_quad.cu +++ b/bitslice_transformations_quad.cu @@ -7,14 +7,16 @@ #define __shfl(var, srcLane, width) (uint32_t)(var) #endif -__device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *output) +__device__ __forceinline__ +void to_bitslice_quad(uint32_t *input, uint32_t *output) { - int n = threadIdx.x % 4; uint32_t other[8]; -#pragma unroll 8 + const int n = threadIdx.x % 4; + + #pragma unroll for (int i = 0; i < 8; i++) { - input[i] =__shfl((int)input[i], n ^ (3*(n >=1 && n <=2)), 4); - other[i] =__shfl((int)input[i], (threadIdx.x + 1) % 4, 4); + input[i] = __shfl((int)input[i], n ^ (3*(n >=1 && n <=2)), 4); + other[i] = __shfl((int)input[i], (threadIdx.x + 1) % 4, 4); input[i] = __shfl((int)input[i], threadIdx.x & 2, 4); other[i] = __shfl((int)other[i], threadIdx.x & 2, 4); if (threadIdx.x & 1) { @@ -56,6 +58,7 @@ __device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *outp output[ 0] |= ((other[ 6] & 0x00000100) <<21); output[ 0] |= ((input[ 7] & 0x00000100) <<22); output[ 0] |= ((other[ 7] & 0x00000100) <<23); + output[ 1] |= ((input[ 0] & 0x00000002) >> 1); output[ 1] |= (other[ 0] & 0x00000002); output[ 1] |= ((input[ 1] & 0x00000002) << 1); @@ -88,6 +91,7 @@ __device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *outp output[ 1] |= ((other[ 6] & 0x00000200) <<20); output[ 1] |= ((input[ 7] & 0x00000200) <<21); output[ 1] |= ((other[ 7] & 0x00000200) <<22); + output[ 2] |= ((input[ 0] & 0x00000004) >> 2); output[ 2] |= ((other[ 0] & 0x00000004) >> 1); output[ 2] |= (input[ 1] & 0x00000004); @@ -282,7 +286,8 @@ __device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *outp output[ 7] |= ((other[ 7] & 0x00008000) <<16); } -__device__ __forceinline__ void from_bitslice_quad(uint32_t *input, uint32_t *output) +__device__ __forceinline__ +void from_bitslice_quad(uint32_t *input, uint32_t *output) { #pragma unroll 8 for (int i=0; i < 16; i+=2) output[i] = 0; @@ -421,6 +426,6 @@ __device__ __forceinline__ void from_bitslice_quad(uint32_t *input, uint32_t *ou if (threadIdx.x & 1) output[i] = __byte_perm(output[i], 0, 0x1032); output[i] = __byte_perm(output[i], __shfl((int)output[i], (threadIdx.x+1)%4, 4), 0x7610); output[i+1] = __shfl((int)output[i], (threadIdx.x+2)%4, 4); - if ((threadIdx.x % 4) != 0) output[i] = output[i+1] = 0; + if (threadIdx.x % 4) output[i] = output[i+1] = 0; } } diff --git a/quark/cuda_quark_groestl512.cu b/quark/cuda_quark_groestl512.cu index 90abb1b..9f39517 100644 --- a/quark/cuda_quark_groestl512.cu +++ b/quark/cuda_quark_groestl512.cu @@ -22,7 +22,7 @@ static cudaDeviceProp props[8]; #include "bitslice_transformations_quad.cu" __global__ __launch_bounds__(TPB, THF) -void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector) +void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t * __restrict g_hash, uint32_t * __restrict g_nonceVector) { // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2; @@ -32,18 +32,20 @@ void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32 uint32_t message[8]; uint32_t state[8]; - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - + uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread); int hashPosition = nounce - startNounce; - uint32_t *inpHash = &g_hash[hashPosition<<4]; + uint32_t *inpHash = &g_hash[hashPosition << 4]; + + const uint16_t thr = threadIdx.x % THF; -#pragma unroll 4 - for(int k=0;k<4;k++) message[k] = inpHash[(k<<2) + (threadIdx.x&0x03)]; -#pragma unroll 4 + #pragma unroll + for(int k=0;k<4;k++) message[k] = inpHash[(k * THF) + thr]; + + #pragma unroll for(int k=4;k<8;k++) message[k] = 0; - if ((threadIdx.x&0x03) == 0) message[4] = 0x80; - if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000; + if (thr == 0) message[4] = 0x80; + if (thr == 3) message[7] = 0x01000000; uint32_t msgBitsliced[8]; to_bitslice_quad(message, msgBitsliced); @@ -51,13 +53,13 @@ void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32 groestl512_progressMessage_quad(state, msgBitsliced); // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash - uint32_t *outpHash = &g_hash[hashPosition<<4]; + uint32_t *outpHash = inpHash; uint32_t hash[16]; from_bitslice_quad(state, hash); - if ((threadIdx.x & 0x03) == 0) + if (thr == 0) { -#pragma unroll 16 + #pragma unroll for(int k=0;k<16;k++) outpHash[k] = hash[k]; } } @@ -73,18 +75,20 @@ __global__ void __launch_bounds__(TPB, THF) uint32_t message[8]; uint32_t state[8]; - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread); int hashPosition = nounce - startNounce; - uint32_t *inpHash = &g_hash[hashPosition<<4]; + uint32_t * inpHash = &g_hash[hashPosition<<4]; + const uint16_t thr = threadIdx.x % THF; + + #pragma unroll + for(int k=0;k<4;k++) message[k] = inpHash[(k * THF) + thr]; -#pragma unroll 4 - for(int k=0;k<4;k++) message[k] = inpHash[(k<<2)+(threadIdx.x&0x03)]; -#pragma unroll 4 + #pragma unroll for(int k=4;k<8;k++) message[k] = 0; - if ((threadIdx.x&0x03) == 0) message[4] = 0x80; - if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000; + if (thr == 0) message[4] = 0x80; + if (thr == 3) message[7] = 0x01000000; uint32_t msgBitsliced[8]; to_bitslice_quad(message, msgBitsliced); @@ -108,13 +112,13 @@ __global__ void __launch_bounds__(TPB, THF) } // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash - uint32_t *outpHash = &g_hash[hashPosition<<4]; + uint32_t *outpHash = inpHash; uint32_t hash[16]; from_bitslice_quad(state, hash); - if ((threadIdx.x & 0x03) == 0) + if (thr == 0) { -#pragma unroll 16 + #pragma unroll for(int k=0;k<16;k++) outpHash[k] = hash[k]; } }