groestl: small optimisation (nist5 + 100kH on a 750Ti)

But, almost nothing on X15, no big changes...
This commit is contained in:
Tanguy Pruvot 2014-08-22 19:42:57 +02:00
parent 0867fb15c6
commit b3becb67dd
2 changed files with 38 additions and 29 deletions

View File

@ -7,14 +7,16 @@
#define __shfl(var, srcLane, width) (uint32_t)(var)
#endif
__device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *output)
__device__ __forceinline__
void to_bitslice_quad(uint32_t *input, uint32_t *output)
{
int n = threadIdx.x % 4;
uint32_t other[8];
#pragma unroll 8
const int n = threadIdx.x % 4;
#pragma unroll
for (int i = 0; i < 8; i++) {
input[i] =__shfl((int)input[i], n ^ (3*(n >=1 && n <=2)), 4);
other[i] =__shfl((int)input[i], (threadIdx.x + 1) % 4, 4);
input[i] = __shfl((int)input[i], n ^ (3*(n >=1 && n <=2)), 4);
other[i] = __shfl((int)input[i], (threadIdx.x + 1) % 4, 4);
input[i] = __shfl((int)input[i], threadIdx.x & 2, 4);
other[i] = __shfl((int)other[i], threadIdx.x & 2, 4);
if (threadIdx.x & 1) {
@ -56,6 +58,7 @@ __device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *outp
output[ 0] |= ((other[ 6] & 0x00000100) <<21);
output[ 0] |= ((input[ 7] & 0x00000100) <<22);
output[ 0] |= ((other[ 7] & 0x00000100) <<23);
output[ 1] |= ((input[ 0] & 0x00000002) >> 1);
output[ 1] |= (other[ 0] & 0x00000002);
output[ 1] |= ((input[ 1] & 0x00000002) << 1);
@ -88,6 +91,7 @@ __device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *outp
output[ 1] |= ((other[ 6] & 0x00000200) <<20);
output[ 1] |= ((input[ 7] & 0x00000200) <<21);
output[ 1] |= ((other[ 7] & 0x00000200) <<22);
output[ 2] |= ((input[ 0] & 0x00000004) >> 2);
output[ 2] |= ((other[ 0] & 0x00000004) >> 1);
output[ 2] |= (input[ 1] & 0x00000004);
@ -282,7 +286,8 @@ __device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *outp
output[ 7] |= ((other[ 7] & 0x00008000) <<16);
}
__device__ __forceinline__ void from_bitslice_quad(uint32_t *input, uint32_t *output)
__device__ __forceinline__
void from_bitslice_quad(uint32_t *input, uint32_t *output)
{
#pragma unroll 8
for (int i=0; i < 16; i+=2) output[i] = 0;
@ -421,6 +426,6 @@ __device__ __forceinline__ void from_bitslice_quad(uint32_t *input, uint32_t *ou
if (threadIdx.x & 1) output[i] = __byte_perm(output[i], 0, 0x1032);
output[i] = __byte_perm(output[i], __shfl((int)output[i], (threadIdx.x+1)%4, 4), 0x7610);
output[i+1] = __shfl((int)output[i], (threadIdx.x+2)%4, 4);
if ((threadIdx.x % 4) != 0) output[i] = output[i+1] = 0;
if (threadIdx.x % 4) output[i] = output[i+1] = 0;
}
}

View File

@ -22,7 +22,7 @@ static cudaDeviceProp props[8];
#include "bitslice_transformations_quad.cu"
__global__ __launch_bounds__(TPB, THF)
void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t * __restrict g_hash, uint32_t * __restrict g_nonceVector)
{
// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
@ -32,18 +32,20 @@ void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32
uint32_t message[8];
uint32_t state[8];
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce;
uint32_t *inpHash = &g_hash[hashPosition<<4];
uint32_t *inpHash = &g_hash[hashPosition << 4];
#pragma unroll 4
for(int k=0;k<4;k++) message[k] = inpHash[(k<<2) + (threadIdx.x&0x03)];
#pragma unroll 4
const uint16_t thr = threadIdx.x % THF;
#pragma unroll
for(int k=0;k<4;k++) message[k] = inpHash[(k * THF) + thr];
#pragma unroll
for(int k=4;k<8;k++) message[k] = 0;
if ((threadIdx.x&0x03) == 0) message[4] = 0x80;
if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000;
if (thr == 0) message[4] = 0x80;
if (thr == 3) message[7] = 0x01000000;
uint32_t msgBitsliced[8];
to_bitslice_quad(message, msgBitsliced);
@ -51,13 +53,13 @@ void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32
groestl512_progressMessage_quad(state, msgBitsliced);
// Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash
uint32_t *outpHash = &g_hash[hashPosition<<4];
uint32_t *outpHash = inpHash;
uint32_t hash[16];
from_bitslice_quad(state, hash);
if ((threadIdx.x & 0x03) == 0)
if (thr == 0)
{
#pragma unroll 16
#pragma unroll
for(int k=0;k<16;k++) outpHash[k] = hash[k];
}
}
@ -73,18 +75,20 @@ __global__ void __launch_bounds__(TPB, THF)
uint32_t message[8];
uint32_t state[8];
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce;
uint32_t *inpHash = &g_hash[hashPosition<<4];
uint32_t * inpHash = &g_hash[hashPosition<<4];
const uint16_t thr = threadIdx.x % THF;
#pragma unroll 4
for(int k=0;k<4;k++) message[k] = inpHash[(k<<2)+(threadIdx.x&0x03)];
#pragma unroll 4
#pragma unroll
for(int k=0;k<4;k++) message[k] = inpHash[(k * THF) + thr];
#pragma unroll
for(int k=4;k<8;k++) message[k] = 0;
if ((threadIdx.x&0x03) == 0) message[4] = 0x80;
if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000;
if (thr == 0) message[4] = 0x80;
if (thr == 3) message[7] = 0x01000000;
uint32_t msgBitsliced[8];
to_bitslice_quad(message, msgBitsliced);
@ -108,13 +112,13 @@ __global__ void __launch_bounds__(TPB, THF)
}
// Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash
uint32_t *outpHash = &g_hash[hashPosition<<4];
uint32_t *outpHash = inpHash;
uint32_t hash[16];
from_bitslice_quad(state, hash);
if ((threadIdx.x & 0x03) == 0)
if (thr == 0)
{
#pragma unroll 16
#pragma unroll
for(int k=0;k<16;k++) outpHash[k] = hash[k];
}
}