groestl: small optimisation (nist5 + 100kH on a 750Ti)

But, almost nothing on X15, no big changes...
2014-08-22 19:42:57 +02:00 · 2014-08-22 19:42:57 +02:00 · b3becb67dd
commit b3becb67dd
parent 0867fb15c6
2 changed files with 38 additions and 29 deletions
--- a/bitslice_transformations_quad.cu
+++ b/bitslice_transformations_quad.cu
@ -7,14 +7,16 @@
 #define __shfl(var, srcLane, width) (uint32_t)(var)
 #endif

-__device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *output)
+__device__ __forceinline__
+void to_bitslice_quad(uint32_t *input, uint32_t *output)
 {
-    int n = threadIdx.x % 4;
    uint32_t other[8];
-#pragma unroll 8
+    const int n = threadIdx.x % 4;
+
+    #pragma unroll
    for (int i = 0; i < 8; i++) {
-        input[i] =__shfl((int)input[i], n ^ (3*(n >=1 && n <=2)), 4);
-        other[i] =__shfl((int)input[i], (threadIdx.x + 1) % 4, 4);
+        input[i] = __shfl((int)input[i], n ^ (3*(n >=1 && n <=2)), 4);
+        other[i] = __shfl((int)input[i], (threadIdx.x + 1) % 4, 4);
        input[i] = __shfl((int)input[i], threadIdx.x & 2, 4);
        other[i] = __shfl((int)other[i], threadIdx.x & 2, 4);
        if (threadIdx.x & 1) {
@ -56,6 +58,7 @@ __device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *outp
    output[ 0] |= ((other[ 6] & 0x00000100) <<21);
    output[ 0] |= ((input[ 7] & 0x00000100) <<22);
    output[ 0] |= ((other[ 7] & 0x00000100) <<23);
+
    output[ 1] |= ((input[ 0] & 0x00000002) >> 1);
    output[ 1] |=  (other[ 0] & 0x00000002);
    output[ 1] |= ((input[ 1] & 0x00000002) << 1);
@ -88,6 +91,7 @@ __device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *outp
    output[ 1] |= ((other[ 6] & 0x00000200) <<20);
    output[ 1] |= ((input[ 7] & 0x00000200) <<21);
    output[ 1] |= ((other[ 7] & 0x00000200) <<22);
+
    output[ 2] |= ((input[ 0] & 0x00000004) >> 2);
    output[ 2] |= ((other[ 0] & 0x00000004) >> 1);
    output[ 2] |=  (input[ 1] & 0x00000004);
@ -282,7 +286,8 @@ __device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *outp
    output[ 7] |= ((other[ 7] & 0x00008000) <<16);
 }

-__device__ __forceinline__ void from_bitslice_quad(uint32_t *input, uint32_t *output)
+__device__ __forceinline__
+void from_bitslice_quad(uint32_t *input, uint32_t *output)
 {
 #pragma unroll 8
    for (int i=0; i < 16; i+=2) output[i] = 0;
@ -421,6 +426,6 @@ __device__ __forceinline__ void from_bitslice_quad(uint32_t *input, uint32_t *ou
        if (threadIdx.x & 1) output[i] = __byte_perm(output[i], 0, 0x1032);
        output[i] = __byte_perm(output[i], __shfl((int)output[i], (threadIdx.x+1)%4, 4), 0x7610);
        output[i+1] = __shfl((int)output[i], (threadIdx.x+2)%4, 4);
-        if ((threadIdx.x % 4) != 0) output[i] = output[i+1] = 0;
+        if (threadIdx.x % 4) output[i] = output[i+1] = 0;
    }
 }
--- a/quark/cuda_quark_groestl512.cu
+++ b/quark/cuda_quark_groestl512.cu
@ -22,7 +22,7 @@ static cudaDeviceProp props[8];
 #include "bitslice_transformations_quad.cu"

 __global__ __launch_bounds__(TPB, THF)
-void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
+void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t * __restrict g_hash, uint32_t * __restrict g_nonceVector)
 {
    // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
    int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
@ -32,18 +32,20 @@ void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32
        uint32_t message[8];
        uint32_t state[8];

-        uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
-
+        uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread);
        int hashPosition = nounce - startNounce;
-        uint32_t *inpHash = &g_hash[hashPosition<<4];
+        uint32_t *inpHash = &g_hash[hashPosition << 4];

-#pragma unroll 4
-        for(int k=0;k<4;k++) message[k] = inpHash[(k<<2) + (threadIdx.x&0x03)];
-#pragma unroll 4
+        const uint16_t thr = threadIdx.x % THF;
+
+        #pragma unroll
+        for(int k=0;k<4;k++) message[k] = inpHash[(k * THF) + thr];
+
+        #pragma unroll
        for(int k=4;k<8;k++) message[k] = 0;

-        if ((threadIdx.x&0x03) == 0) message[4] = 0x80;
-        if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000;
+        if (thr == 0) message[4] = 0x80;
+        if (thr == 3) message[7] = 0x01000000;

        uint32_t msgBitsliced[8];
        to_bitslice_quad(message, msgBitsliced);
@ -51,13 +53,13 @@ void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32
        groestl512_progressMessage_quad(state, msgBitsliced);

        // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash
-        uint32_t *outpHash = &g_hash[hashPosition<<4];
+        uint32_t *outpHash = inpHash;
        uint32_t hash[16];
        from_bitslice_quad(state, hash);

-        if ((threadIdx.x & 0x03) == 0)
+        if (thr == 0)
        {
-#pragma unroll 16
+            #pragma unroll
            for(int k=0;k<16;k++) outpHash[k] = hash[k];
        }
    }
@ -73,18 +75,20 @@ __global__ void __launch_bounds__(TPB, THF)
        uint32_t message[8];
        uint32_t state[8];

-        uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
+        uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread);

        int hashPosition = nounce - startNounce;
-        uint32_t *inpHash = &g_hash[hashPosition<<4];
+        uint32_t * inpHash = &g_hash[hashPosition<<4];
+        const uint16_t thr = threadIdx.x % THF;

-#pragma unroll 4
-        for(int k=0;k<4;k++) message[k] = inpHash[(k<<2)+(threadIdx.x&0x03)];
-#pragma unroll 4
+        #pragma unroll
+        for(int k=0;k<4;k++) message[k] = inpHash[(k * THF) + thr];
+
+        #pragma unroll
        for(int k=4;k<8;k++) message[k] = 0;

-        if ((threadIdx.x&0x03) == 0) message[4] = 0x80;
-        if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000;
+        if (thr == 0) message[4] = 0x80;
+        if (thr == 3) message[7] = 0x01000000;

        uint32_t msgBitsliced[8];
        to_bitslice_quad(message, msgBitsliced);
@ -108,13 +112,13 @@ __global__ void __launch_bounds__(TPB, THF)
        }

        // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash
-        uint32_t *outpHash = &g_hash[hashPosition<<4];
+        uint32_t *outpHash = inpHash;
        uint32_t hash[16];
        from_bitslice_quad(state, hash);

-        if ((threadIdx.x & 0x03) == 0)
+        if (thr == 0)
        {
-#pragma unroll 16
+            #pragma unroll
            for(int k=0;k<16;k++) outpHash[k] = hash[k];
        }
    }