groestl: small optimisation (nist5 + 100kH on a 750Ti)
But, almost nothing on X15, no big changes...
This commit is contained in:
parent
0867fb15c6
commit
b3becb67dd
@ -7,14 +7,16 @@
|
||||
#define __shfl(var, srcLane, width) (uint32_t)(var)
|
||||
#endif
|
||||
|
||||
__device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *output)
|
||||
__device__ __forceinline__
|
||||
void to_bitslice_quad(uint32_t *input, uint32_t *output)
|
||||
{
|
||||
int n = threadIdx.x % 4;
|
||||
uint32_t other[8];
|
||||
#pragma unroll 8
|
||||
const int n = threadIdx.x % 4;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < 8; i++) {
|
||||
input[i] =__shfl((int)input[i], n ^ (3*(n >=1 && n <=2)), 4);
|
||||
other[i] =__shfl((int)input[i], (threadIdx.x + 1) % 4, 4);
|
||||
input[i] = __shfl((int)input[i], n ^ (3*(n >=1 && n <=2)), 4);
|
||||
other[i] = __shfl((int)input[i], (threadIdx.x + 1) % 4, 4);
|
||||
input[i] = __shfl((int)input[i], threadIdx.x & 2, 4);
|
||||
other[i] = __shfl((int)other[i], threadIdx.x & 2, 4);
|
||||
if (threadIdx.x & 1) {
|
||||
@ -56,6 +58,7 @@ __device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *outp
|
||||
output[ 0] |= ((other[ 6] & 0x00000100) <<21);
|
||||
output[ 0] |= ((input[ 7] & 0x00000100) <<22);
|
||||
output[ 0] |= ((other[ 7] & 0x00000100) <<23);
|
||||
|
||||
output[ 1] |= ((input[ 0] & 0x00000002) >> 1);
|
||||
output[ 1] |= (other[ 0] & 0x00000002);
|
||||
output[ 1] |= ((input[ 1] & 0x00000002) << 1);
|
||||
@ -88,6 +91,7 @@ __device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *outp
|
||||
output[ 1] |= ((other[ 6] & 0x00000200) <<20);
|
||||
output[ 1] |= ((input[ 7] & 0x00000200) <<21);
|
||||
output[ 1] |= ((other[ 7] & 0x00000200) <<22);
|
||||
|
||||
output[ 2] |= ((input[ 0] & 0x00000004) >> 2);
|
||||
output[ 2] |= ((other[ 0] & 0x00000004) >> 1);
|
||||
output[ 2] |= (input[ 1] & 0x00000004);
|
||||
@ -282,7 +286,8 @@ __device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *outp
|
||||
output[ 7] |= ((other[ 7] & 0x00008000) <<16);
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void from_bitslice_quad(uint32_t *input, uint32_t *output)
|
||||
__device__ __forceinline__
|
||||
void from_bitslice_quad(uint32_t *input, uint32_t *output)
|
||||
{
|
||||
#pragma unroll 8
|
||||
for (int i=0; i < 16; i+=2) output[i] = 0;
|
||||
@ -421,6 +426,6 @@ __device__ __forceinline__ void from_bitslice_quad(uint32_t *input, uint32_t *ou
|
||||
if (threadIdx.x & 1) output[i] = __byte_perm(output[i], 0, 0x1032);
|
||||
output[i] = __byte_perm(output[i], __shfl((int)output[i], (threadIdx.x+1)%4, 4), 0x7610);
|
||||
output[i+1] = __shfl((int)output[i], (threadIdx.x+2)%4, 4);
|
||||
if ((threadIdx.x % 4) != 0) output[i] = output[i+1] = 0;
|
||||
if (threadIdx.x % 4) output[i] = output[i+1] = 0;
|
||||
}
|
||||
}
|
||||
|
@ -22,7 +22,7 @@ static cudaDeviceProp props[8];
|
||||
#include "bitslice_transformations_quad.cu"
|
||||
|
||||
__global__ __launch_bounds__(TPB, THF)
|
||||
void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
|
||||
void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t * __restrict g_hash, uint32_t * __restrict g_nonceVector)
|
||||
{
|
||||
// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
|
||||
@ -32,18 +32,20 @@ void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32
|
||||
uint32_t message[8];
|
||||
uint32_t state[8];
|
||||
|
||||
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
|
||||
|
||||
uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread);
|
||||
int hashPosition = nounce - startNounce;
|
||||
uint32_t *inpHash = &g_hash[hashPosition<<4];
|
||||
uint32_t *inpHash = &g_hash[hashPosition << 4];
|
||||
|
||||
#pragma unroll 4
|
||||
for(int k=0;k<4;k++) message[k] = inpHash[(k<<2) + (threadIdx.x&0x03)];
|
||||
#pragma unroll 4
|
||||
const uint16_t thr = threadIdx.x % THF;
|
||||
|
||||
#pragma unroll
|
||||
for(int k=0;k<4;k++) message[k] = inpHash[(k * THF) + thr];
|
||||
|
||||
#pragma unroll
|
||||
for(int k=4;k<8;k++) message[k] = 0;
|
||||
|
||||
if ((threadIdx.x&0x03) == 0) message[4] = 0x80;
|
||||
if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000;
|
||||
if (thr == 0) message[4] = 0x80;
|
||||
if (thr == 3) message[7] = 0x01000000;
|
||||
|
||||
uint32_t msgBitsliced[8];
|
||||
to_bitslice_quad(message, msgBitsliced);
|
||||
@ -51,13 +53,13 @@ void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32
|
||||
groestl512_progressMessage_quad(state, msgBitsliced);
|
||||
|
||||
// Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash
|
||||
uint32_t *outpHash = &g_hash[hashPosition<<4];
|
||||
uint32_t *outpHash = inpHash;
|
||||
uint32_t hash[16];
|
||||
from_bitslice_quad(state, hash);
|
||||
|
||||
if ((threadIdx.x & 0x03) == 0)
|
||||
if (thr == 0)
|
||||
{
|
||||
#pragma unroll 16
|
||||
#pragma unroll
|
||||
for(int k=0;k<16;k++) outpHash[k] = hash[k];
|
||||
}
|
||||
}
|
||||
@ -73,18 +75,20 @@ __global__ void __launch_bounds__(TPB, THF)
|
||||
uint32_t message[8];
|
||||
uint32_t state[8];
|
||||
|
||||
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
|
||||
uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread);
|
||||
|
||||
int hashPosition = nounce - startNounce;
|
||||
uint32_t *inpHash = &g_hash[hashPosition<<4];
|
||||
uint32_t * inpHash = &g_hash[hashPosition<<4];
|
||||
const uint16_t thr = threadIdx.x % THF;
|
||||
|
||||
#pragma unroll 4
|
||||
for(int k=0;k<4;k++) message[k] = inpHash[(k<<2)+(threadIdx.x&0x03)];
|
||||
#pragma unroll 4
|
||||
#pragma unroll
|
||||
for(int k=0;k<4;k++) message[k] = inpHash[(k * THF) + thr];
|
||||
|
||||
#pragma unroll
|
||||
for(int k=4;k<8;k++) message[k] = 0;
|
||||
|
||||
if ((threadIdx.x&0x03) == 0) message[4] = 0x80;
|
||||
if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000;
|
||||
if (thr == 0) message[4] = 0x80;
|
||||
if (thr == 3) message[7] = 0x01000000;
|
||||
|
||||
uint32_t msgBitsliced[8];
|
||||
to_bitslice_quad(message, msgBitsliced);
|
||||
@ -108,13 +112,13 @@ __global__ void __launch_bounds__(TPB, THF)
|
||||
}
|
||||
|
||||
// Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash
|
||||
uint32_t *outpHash = &g_hash[hashPosition<<4];
|
||||
uint32_t *outpHash = inpHash;
|
||||
uint32_t hash[16];
|
||||
from_bitslice_quad(state, hash);
|
||||
|
||||
if ((threadIdx.x & 0x03) == 0)
|
||||
if (thr == 0)
|
||||
{
|
||||
#pragma unroll 16
|
||||
#pragma unroll
|
||||
for(int k=0;k<16;k++) outpHash[k] = hash[k];
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user