
improve jh512 with vectors (nist5,quark,sib,x11+,zr5)

The main improvement is to reduce the number of instructions issued to read global memory, by loading and storing the hash state with 128-bit vector accesses instead of scalar u32 ones.

The cost is a few more registers (a 68-register minimum vs 64 before on SM 5.2),
so the forced launch bounds are reduced to allow 80 or 128 registers per thread.

Note: CUDA 6.5 seems unable to keep st.global.v4.u32 for all of these stores; after the first one its output degrades to v2 and scalar stores (7.5 is fine):
        st.global.v4.u32        [%rd2], {%r3783, %r3824, %r3823, %r3822};
        st.global.v2.u32        [%rd2+16], {%r3821, %r3820};
        st.global.u32   [%rd2+24], %r3819;
        st.global.u32   [%rd2+28], %r3818;
        st.global.u32   [%rd2+44], %r3814;
        st.global.u32   [%rd2+40], %r3815;
        ...

TODO: check alexis' variant; I wanted to keep this code in git first for reference...
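
Reading aid (my sketch, not part of the commit): the vector change boils down to replacing sixteen scalar u32 loads with four 128-bit accesses. A minimal sketch, assuming AS_UINT4 is the usual cast macro from cuda_helper.h:

    // assumption: #define AS_UINT4(addr) *((uint4*)(addr))  // as in cuda_helper.h
    __device__ void load_state_vectorized(uint32_t *Hash, uint32_t h[16])
    {
        // four ld.global.v4.u32 instead of sixteen ld.global.u32;
        // Hash must be 16-byte aligned for the vector form to be valid
        AS_UINT4(&h[ 0]) = AS_UINT4(&Hash[ 0]);
        AS_UINT4(&h[ 4]) = AS_UINT4(&Hash[ 4]);
        AS_UINT4(&h[ 8]) = AS_UINT4(&Hash[ 8]);
        AS_UINT4(&h[12]) = AS_UINT4(&Hash[12]);
    }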
Tanguy Pruvot, 8 years ago
commit b9da6c67f5
2 changed files:
  1. Makefile.am (3 changed lines)
  2. quark/cuda_jh512.cu (75 changed lines)

Makefile.am (3 changed lines)

@@ -139,6 +139,9 @@ x17/cuda_x17_sha512.o: x17/cuda_x17_sha512.cu
 quark/cuda_quark_blake512.o: quark/cuda_quark_blake512.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
 
+quark/cuda_jh512.o: quark/cuda_jh512.cu
+	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
+
 quark/cuda_quark_keccak512.o: quark/cuda_quark_keccak512.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=88 -o $@ -c $<
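
Side note (my suggestion, not in the commit): ptxas can confirm the per-kernel register counts mentioned in the message; adding the standard -Xptxas -v flag to a rule prints them at build time, e.g.:

    quark/cuda_jh512.o: quark/cuda_jh512.cu
    	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -Xptxas -v -o $@ -c $<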

quark/cuda_jh512.cu (75 changed lines)

@@ -1,9 +1,9 @@
-#include "cuda_helper.h"
+#include <cuda_helper.h>
 // #include <stdio.h> // printf
 // #include <unistd.h> // sleep
 
-/* 1344 bytes */
+/* 1344 bytes, align 16 is there to allow ld.const.v4 (made auto. by the compiler) */
 __constant__ static __align__(16) uint32_t c_E8_bslice32[42][8] = {
 	// Round 0 (Function0)
 	{ 0xa2ded572, 0x90d6ab81, 0x67f815df, 0xf6875a4d, 0x0a15847b, 0xc54f9f4e, 0x571523b7, 0x402bd1c3 },
@@ -58,11 +58,11 @@ __constant__ static __align__(16) uint32_t c_E8_bslice32[42][8] = {
 /*swapping bits 32i||32i+1||......||32i+15 with bits 32i+16||32i+17||......||32i+31 of 32-bit x*/
 //#define SWAP16(x) (x) = ((((x) & 0x0000ffffUL) << 16) | (((x) & 0xffff0000UL) >> 16));
-#define SWAP16(x) (x) = __byte_perm(x, x, 0x1032);
+#define SWAP16(x) (x) = __byte_perm(x, 0, 0x1032);
 
 /*swapping bits 16i||16i+1||......||16i+7 with bits 16i+8||16i+9||......||16i+15 of 32-bit x*/
 //#define SWAP8(x) (x) = ((((x) & 0x00ff00ffUL) << 8) | (((x) & 0xff00ff00UL) >> 8));
-#define SWAP8(x) (x) = __byte_perm(x, x, 0x2301);
+#define SWAP8(x) (x) = __byte_perm(x, 0, 0x2301);
 
 /*
 __device__ __forceinline__
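
Note on the SWAP16/SWAP8 change above: dropping the second x is behavior-neutral, since __byte_perm selector nibbles 0-3 index bytes of the first operand (4-7 would index the second) and both selectors here use only 0-3. A minimal check (my example, not part of the commit):

    __global__ void byte_perm_check(uint32_t *out)
    {
    	uint32_t x = 0x11223344u;
    	out[0] = __byte_perm(x, 0, 0x1032); // halfword swap: 0x33441122
    	out[1] = __byte_perm(x, 0, 0x2301); // byte swap per halfword: 0x22114433
    }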
@@ -90,10 +90,9 @@ static void SWAP4x4(uint32_t *x) {
 #pragma nounroll
 	// y is used as tmp register too
 	for (uint32_t y=0; y<4; y++, ++x) {
-		asm("and.b32 %1, %0, 0xF0F0F0F0;"
-			"xor.b32 %0, %0, %1;"
-			"shr.b32 %1, %1, 4;"
-			"vshl.u32.u32.u32.clamp.add %0, %0, 4, %1;\n\t"
+		asm("and.b32 %1, %0, 0xF0F0F0F0;\n\t"
+			"xor.b32 %0, %0, %1; shr.b32 %1, %1, 4;\n\t"
+			"vshl.u32.u32.u32.clamp.add %0, %0, 4, %1;"
 			: "+r"(*x) : "r"(y));
 	}
 }
@@ -103,10 +102,9 @@ static void SWAP2x4(uint32_t *x) {
 #pragma nounroll
 	// y is used as tmp register too
 	for (uint32_t y=0; y<4; y++, ++x) {
-		asm("and.b32 %1, %0, 0xCCCCCCCC;"
-			"xor.b32 %0, %0, %1;"
-			"shr.b32 %1, %1, 2;"
-			"vshl.u32.u32.u32.clamp.add %0, %0, 2, %1;\n\t"
+		asm("and.b32 %1, %0, 0xCCCCCCCC;\n\t"
+			"xor.b32 %0, %0, %1; shr.b32 %1, %1, 2;\n\t"
+			"vshl.u32.u32.u32.clamp.add %0, %0, 2, %1;"
 			: "+r"(*x) : "r"(y));
 	}
 }
@@ -116,10 +114,9 @@ static void SWAP1x4(uint32_t *x) {
 #pragma nounroll
 	// y is used as tmp register too
 	for (uint32_t y=0; y<4; y++, ++x) {
-		asm("and.b32 %1, %0, 0xAAAAAAAA;"
-			"xor.b32 %0, %0, %1;"
-			"shr.b32 %1, %1, 1;"
-			"vshl.u32.u32.u32.clamp.add %0, %0, 1, %1;\n\t"
+		asm("and.b32 %1, %0, 0xAAAAAAAA;\n\t"
+			"xor.b32 %0, %0, %1; shr.b32 %1, %1, 1;\n\t"
+			"vshl.u32.u32.u32.clamp.add %0, %0, 1, %1;"
 			: "+r"(*x) : "r"(y));
 	}
 }
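
For readers not fluent in PTX, the three swap routines above all follow the same pattern; here is a plain-C reference of SWAP4x4 (my reading of the asm, where vshl.u32.u32.u32.clamp.add folds the final shift-and-add into one instruction):

    static __device__ __forceinline__ void SWAP4x4_ref(uint32_t *x)
    {
    	// swap the two 4-bit halves of every byte; SWAP2x4/SWAP1x4 use
    	// masks 0xCCCCCCCC/0xAAAAAAAA with shifts 2/1 instead
    	for (int i = 0; i < 4; i++) {
    		uint32_t hi = x[i] & 0xF0F0F0F0u; // and.b32: keep high nibbles
    		uint32_t lo = x[i] ^ hi;          // xor.b32: keep low nibbles
    		x[i] = (lo << 4) + (hi >> 4);     // vshl...add: recombine swapped
    	}
    }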
@@ -272,15 +269,23 @@ static void E8(uint32_t x[8][4])
 	}
 }
 
-__global__ __launch_bounds__(256, 4)
-void quark_jh512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t* g_hash, const uint32_t *const __restrict__ g_nonceVector)
+__global__
+//__launch_bounds__(256,2)
+void quark_jh512_gpu_hash_64(const uint32_t threads, const uint32_t startNounce, uint32_t* g_hash, uint32_t * g_nonceVector)
 {
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
-		uint32_t hashPosition = nounce - startNounce;
-		uint32_t *Hash = &g_hash[hashPosition * 16U];
+		const uint32_t hashPosition = nounce - startNounce;
+		uint32_t *Hash = &g_hash[(size_t)16 * hashPosition];
+
+		uint32_t h[16];
+		AS_UINT4(&h[ 0]) = AS_UINT4(&Hash[ 0]);
+		AS_UINT4(&h[ 4]) = AS_UINT4(&Hash[ 4]);
+		AS_UINT4(&h[ 8]) = AS_UINT4(&Hash[ 8]);
+		AS_UINT4(&h[12]) = AS_UINT4(&Hash[12]);
 
 		uint32_t x[8][4] = { /* init */
 			{ 0x964bd16f, 0x17aa003e, 0x052e6a63, 0x43d5157a },
 			{ 0x8d5e228a, 0x0bef970c, 0x591234e9, 0x61c3b3f2 },
@@ -294,40 +299,26 @@ void quark_jh512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t* g
 #pragma unroll
 		for (int i = 0; i < 16; i++)
-			x[i/4][i & 3] ^= Hash[i];
+			x[i/4][i & 3] ^= h[i];
 
 		E8(x);
 
 #pragma unroll
-		for (uint8_t i = 0; i < 16; i++)
-			x[(i+16)/4][(i+16) & 3] ^= Hash[i];
+		for (int i = 0; i < 16; i++)
+			x[(i+16)/4][(i+16) & 3] ^= h[i];
 
 		x[0][0] ^= 0x80U;
 		x[3][3] ^= 0x00020000U;
 
 		E8(x);
 
 		x[4][0] ^= 0x80U;
 		x[7][3] ^= 0x00020000U;
 
-		Hash[0] = x[4][0];
-		Hash[1] = x[4][1];
-		Hash[2] = x[4][2];
-		Hash[3] = x[4][3];
-		Hash[4] = x[5][0];
-		Hash[5] = x[5][1];
-		Hash[6] = x[5][2];
-		Hash[7] = x[5][3];
-		Hash[8] = x[6][0];
-		Hash[9] = x[6][1];
-		Hash[10] = x[6][2];
-		Hash[11] = x[6][3];
-		Hash[12] = x[7][0];
-		Hash[13] = x[7][1];
-		Hash[14] = x[7][2];
-		Hash[15] = x[7][3];
+		AS_UINT4(&Hash[ 0]) = AS_UINT4(&x[4][0]);
+		AS_UINT4(&Hash[ 4]) = AS_UINT4(&x[5][0]);
+		AS_UINT4(&Hash[ 8]) = AS_UINT4(&x[6][0]);
+		AS_UINT4(&Hash[12]) = AS_UINT4(&x[7][0]);
 	}
 }
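
One more note on the new indexing line: hashPosition * 16U is a 32-bit product that wraps before being widened for address arithmetic, while (size_t)16 * hashPosition widens first. A minimal illustration (my example, extreme value for effect):

    __device__ uint32_t* hash_ptr(uint32_t *g_hash, uint32_t pos)
    {
    	// pos * 16U is computed in 32 bits and wraps at 2^32
    	// (e.g. pos = 0x10000000 would yield offset 0);
    	// widening one operand first keeps the product 64-bit
    	return &g_hash[(size_t)16 * pos];
    }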
