groestl: small optimisation (nist5 + 100kH on a 750Ti)

But, almost nothing on X15, no big changes...
This commit is contained in:
Tanguy Pruvot 2014-08-22 19:42:57 +02:00
parent 0867fb15c6
commit b3becb67dd
2 changed files with 38 additions and 29 deletions

View File

@ -7,14 +7,16 @@
#define __shfl(var, srcLane, width) (uint32_t)(var) #define __shfl(var, srcLane, width) (uint32_t)(var)
#endif #endif
__device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *output) __device__ __forceinline__
void to_bitslice_quad(uint32_t *input, uint32_t *output)
{ {
int n = threadIdx.x % 4;
uint32_t other[8]; uint32_t other[8];
#pragma unroll 8 const int n = threadIdx.x % 4;
#pragma unroll
for (int i = 0; i < 8; i++) { for (int i = 0; i < 8; i++) {
input[i] =__shfl((int)input[i], n ^ (3*(n >=1 && n <=2)), 4); input[i] = __shfl((int)input[i], n ^ (3*(n >=1 && n <=2)), 4);
other[i] =__shfl((int)input[i], (threadIdx.x + 1) % 4, 4); other[i] = __shfl((int)input[i], (threadIdx.x + 1) % 4, 4);
input[i] = __shfl((int)input[i], threadIdx.x & 2, 4); input[i] = __shfl((int)input[i], threadIdx.x & 2, 4);
other[i] = __shfl((int)other[i], threadIdx.x & 2, 4); other[i] = __shfl((int)other[i], threadIdx.x & 2, 4);
if (threadIdx.x & 1) { if (threadIdx.x & 1) {
@ -56,6 +58,7 @@ __device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *outp
output[ 0] |= ((other[ 6] & 0x00000100) <<21); output[ 0] |= ((other[ 6] & 0x00000100) <<21);
output[ 0] |= ((input[ 7] & 0x00000100) <<22); output[ 0] |= ((input[ 7] & 0x00000100) <<22);
output[ 0] |= ((other[ 7] & 0x00000100) <<23); output[ 0] |= ((other[ 7] & 0x00000100) <<23);
output[ 1] |= ((input[ 0] & 0x00000002) >> 1); output[ 1] |= ((input[ 0] & 0x00000002) >> 1);
output[ 1] |= (other[ 0] & 0x00000002); output[ 1] |= (other[ 0] & 0x00000002);
output[ 1] |= ((input[ 1] & 0x00000002) << 1); output[ 1] |= ((input[ 1] & 0x00000002) << 1);
@ -88,6 +91,7 @@ __device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *outp
output[ 1] |= ((other[ 6] & 0x00000200) <<20); output[ 1] |= ((other[ 6] & 0x00000200) <<20);
output[ 1] |= ((input[ 7] & 0x00000200) <<21); output[ 1] |= ((input[ 7] & 0x00000200) <<21);
output[ 1] |= ((other[ 7] & 0x00000200) <<22); output[ 1] |= ((other[ 7] & 0x00000200) <<22);
output[ 2] |= ((input[ 0] & 0x00000004) >> 2); output[ 2] |= ((input[ 0] & 0x00000004) >> 2);
output[ 2] |= ((other[ 0] & 0x00000004) >> 1); output[ 2] |= ((other[ 0] & 0x00000004) >> 1);
output[ 2] |= (input[ 1] & 0x00000004); output[ 2] |= (input[ 1] & 0x00000004);
@ -282,7 +286,8 @@ __device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *outp
output[ 7] |= ((other[ 7] & 0x00008000) <<16); output[ 7] |= ((other[ 7] & 0x00008000) <<16);
} }
__device__ __forceinline__ void from_bitslice_quad(uint32_t *input, uint32_t *output) __device__ __forceinline__
void from_bitslice_quad(uint32_t *input, uint32_t *output)
{ {
#pragma unroll 8 #pragma unroll 8
for (int i=0; i < 16; i+=2) output[i] = 0; for (int i=0; i < 16; i+=2) output[i] = 0;
@ -421,6 +426,6 @@ __device__ __forceinline__ void from_bitslice_quad(uint32_t *input, uint32_t *ou
if (threadIdx.x & 1) output[i] = __byte_perm(output[i], 0, 0x1032); if (threadIdx.x & 1) output[i] = __byte_perm(output[i], 0, 0x1032);
output[i] = __byte_perm(output[i], __shfl((int)output[i], (threadIdx.x+1)%4, 4), 0x7610); output[i] = __byte_perm(output[i], __shfl((int)output[i], (threadIdx.x+1)%4, 4), 0x7610);
output[i+1] = __shfl((int)output[i], (threadIdx.x+2)%4, 4); output[i+1] = __shfl((int)output[i], (threadIdx.x+2)%4, 4);
if ((threadIdx.x % 4) != 0) output[i] = output[i+1] = 0; if (threadIdx.x % 4) output[i] = output[i+1] = 0;
} }
} }

View File

@ -22,7 +22,7 @@ static cudaDeviceProp props[8];
#include "bitslice_transformations_quad.cu" #include "bitslice_transformations_quad.cu"
__global__ __launch_bounds__(TPB, THF) __global__ __launch_bounds__(TPB, THF)
void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector) void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t * __restrict g_hash, uint32_t * __restrict g_nonceVector)
{ {
// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2; int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
@ -32,18 +32,20 @@ void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32
uint32_t message[8]; uint32_t message[8];
uint32_t state[8]; uint32_t state[8];
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce; int hashPosition = nounce - startNounce;
uint32_t *inpHash = &g_hash[hashPosition<<4]; uint32_t *inpHash = &g_hash[hashPosition << 4];
#pragma unroll 4 const uint16_t thr = threadIdx.x % THF;
for(int k=0;k<4;k++) message[k] = inpHash[(k<<2) + (threadIdx.x&0x03)];
#pragma unroll 4 #pragma unroll
for(int k=0;k<4;k++) message[k] = inpHash[(k * THF) + thr];
#pragma unroll
for(int k=4;k<8;k++) message[k] = 0; for(int k=4;k<8;k++) message[k] = 0;
if ((threadIdx.x&0x03) == 0) message[4] = 0x80; if (thr == 0) message[4] = 0x80;
if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000; if (thr == 3) message[7] = 0x01000000;
uint32_t msgBitsliced[8]; uint32_t msgBitsliced[8];
to_bitslice_quad(message, msgBitsliced); to_bitslice_quad(message, msgBitsliced);
@ -51,13 +53,13 @@ void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32
groestl512_progressMessage_quad(state, msgBitsliced); groestl512_progressMessage_quad(state, msgBitsliced);
// Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash
uint32_t *outpHash = &g_hash[hashPosition<<4]; uint32_t *outpHash = inpHash;
uint32_t hash[16]; uint32_t hash[16];
from_bitslice_quad(state, hash); from_bitslice_quad(state, hash);
if ((threadIdx.x & 0x03) == 0) if (thr == 0)
{ {
#pragma unroll 16 #pragma unroll
for(int k=0;k<16;k++) outpHash[k] = hash[k]; for(int k=0;k<16;k++) outpHash[k] = hash[k];
} }
} }
@ -73,18 +75,20 @@ __global__ void __launch_bounds__(TPB, THF)
uint32_t message[8]; uint32_t message[8];
uint32_t state[8]; uint32_t state[8];
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce; int hashPosition = nounce - startNounce;
uint32_t *inpHash = &g_hash[hashPosition<<4]; uint32_t * inpHash = &g_hash[hashPosition<<4];
const uint16_t thr = threadIdx.x % THF;
#pragma unroll 4 #pragma unroll
for(int k=0;k<4;k++) message[k] = inpHash[(k<<2)+(threadIdx.x&0x03)]; for(int k=0;k<4;k++) message[k] = inpHash[(k * THF) + thr];
#pragma unroll 4
#pragma unroll
for(int k=4;k<8;k++) message[k] = 0; for(int k=4;k<8;k++) message[k] = 0;
if ((threadIdx.x&0x03) == 0) message[4] = 0x80; if (thr == 0) message[4] = 0x80;
if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000; if (thr == 3) message[7] = 0x01000000;
uint32_t msgBitsliced[8]; uint32_t msgBitsliced[8];
to_bitslice_quad(message, msgBitsliced); to_bitslice_quad(message, msgBitsliced);
@ -108,13 +112,13 @@ __global__ void __launch_bounds__(TPB, THF)
} }
// Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash
uint32_t *outpHash = &g_hash[hashPosition<<4]; uint32_t *outpHash = inpHash;
uint32_t hash[16]; uint32_t hash[16];
from_bitslice_quad(state, hash); from_bitslice_quad(state, hash);
if ((threadIdx.x & 0x03) == 0) if (thr == 0)
{ {
#pragma unroll 16 #pragma unroll
for(int k=0;k<16;k++) outpHash[k] = hash[k]; for(int k=0;k<16;k++) outpHash[k] = hash[k];
} }
} }