Browse Source

x11: small simd512 gpu_expand improvement

Tanguy Pruvot 10 years ago
parent
commit
66d2c5202b
  1. 1
      quark/cuda_skein512.cu
  2. 4
      x11/cuda_x11_cubehash512.cu
  3. 77
      x11/cuda_x11_simd512.cu

1
quark/cuda_skein512.cu

@ -549,7 +549,6 @@ void quark_skein512_gpu_hash_64_v30(int threads, uint32_t startNounce, uint64_t @@ -549,7 +549,6 @@ void quark_skein512_gpu_hash_64_v30(int threads, uint32_t startNounce, uint64_t
}
}
// Setup-Funktionen
__host__
void quark_skein512_cpu_init(int thr_id, int threads)
{

4
x11/cuda_x11_cubehash512.cu

@ -228,7 +228,7 @@ void Init(uint32_t x[2][2][2][2][2]) @@ -228,7 +228,7 @@ void Init(uint32_t x[2][2][2][2][2])
}
__device__ __forceinline__
void Update32(uint32_t x[2][2][2][2][2], const BitSequence *data)
static void Update32(uint32_t x[2][2][2][2][2], const BitSequence *data)
{
/* "xor the block into the first b bytes of the state" */
/* "and then transform the state invertibly through r identical rounds" */
@ -237,7 +237,7 @@ void Update32(uint32_t x[2][2][2][2][2], const BitSequence *data) @@ -237,7 +237,7 @@ void Update32(uint32_t x[2][2][2][2][2], const BitSequence *data)
}
__device__ __forceinline__
void Final(uint32_t x[2][2][2][2][2], BitSequence *hashval)
static void Final(uint32_t x[2][2][2][2][2], BitSequence *hashval)
{
int i;

77
x11/cuda_x11_simd512.cu

@ -16,8 +16,14 @@ uint4 *d_temp4[8]; @@ -16,8 +16,14 @@ uint4 *d_temp4[8];
// texture bound to d_temp4[thr_id], for read access in Compaction kernel
texture<uint4, 1, cudaReadModeElementType> texRef1D_128;
#define DEVICE_DIRECT_CONSTANTS
#ifdef DEVICE_DIRECT_CONSTANTS
__constant__ uint8_t c_perm[8][8] = {
#else
__constant__ uint8_t c_perm[8][8];
const uint8_t h_perm[8][8] = {
#endif
{ 2, 3, 6, 7, 0, 1, 4, 5 },
{ 6, 7, 2, 3, 4, 5, 0, 1 },
{ 7, 6, 5, 4, 3, 2, 1, 0 },
@ -28,16 +34,25 @@ const uint8_t h_perm[8][8] = { @@ -28,16 +34,25 @@ const uint8_t h_perm[8][8] = {
{ 4, 5, 2, 3, 6, 7, 0, 1 }
};
/* for simd_functions.cu */
#ifdef DEVICE_DIRECT_CONSTANTS
__constant__ uint32_t c_IV_512[32] = {
#else
__constant__ uint32_t c_IV_512[32];
const uint32_t h_IV_512[32] = {
#endif
0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e,
0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22
};
#ifdef DEVICE_DIRECT_CONSTANTS
__constant__ short c_FFT128_8_16_Twiddle[128] = {
#else
__constant__ short c_FFT128_8_16_Twiddle[128];
static const short h_FFT128_8_16_Twiddle[128] = {
#endif
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30,
1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22,
@ -48,8 +63,12 @@ static const short h_FFT128_8_16_Twiddle[128] = { @@ -48,8 +63,12 @@ static const short h_FFT128_8_16_Twiddle[128] = {
1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79
};
#ifdef DEVICE_DIRECT_CONSTANTS
__constant__ short c_FFT256_2_128_Twiddle[128] = {
#else
__constant__ short c_FFT256_2_128_Twiddle[128];
static const short h_FFT256_2_128_Twiddle[128] = {
#endif
1, 41,-118, 45, 46, 87, -31, 14,
60,-110, 116,-127, -67, 80, -61, 69,
2, 82, 21, 90, 92, -83, -62, 28,
@ -100,15 +119,14 @@ static const short h_FFT256_2_128_Twiddle[128] = { @@ -100,15 +119,14 @@ static const short h_FFT256_2_128_Twiddle[128] = {
#define REDUCE_FULL_S(x) \
EXTRA_REDUCE_S(REDUCE(x))
__device__ __forceinline__
void FFT_8(int *y, int stripe) {
/*
/**
* FFT_8 using w=4 as 8th root of unity
* Unrolled decimation in frequency (DIF) radix-2 NTT.
* Output data is in revbin_permuted order.
*/
__device__ __forceinline__
void FFT_8(int *y, int stripe)
{
#define X(i) y[stripe*i]
#define DO_REDUCE(i) \
@ -163,13 +181,14 @@ do { \ @@ -163,13 +181,14 @@ do { \
#undef BUTTERFLY
}
__device__ __forceinline__ void FFT_16(int *y) {
/**
* FFT_16 using w=2 as 16th root of unity
* Unrolled decimation in frequency (DIF) radix-2 NTT.
* Output data is in revbin_permuted order.
*/
__device__ __forceinline__
void FFT_16(int *y)
{
#define DO_REDUCE_FULL_S(i) \
do { \
y[i] = REDUCE(y[i]); \
@ -550,24 +569,24 @@ void Expansion(const uint32_t *data, uint4 *g_temp4) @@ -550,24 +569,24 @@ void Expansion(const uint32_t *data, uint4 *g_temp4)
}
/***************************************************/
__global__ void __launch_bounds__(TPB, 8)
x11_simd512_gpu_expand_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_temp4)
__global__ __launch_bounds__(TPB*2, 8)
void x11_simd512_gpu_expand_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_temp4)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x)/8;
int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 8;
if (thread < threads)
{
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce;
//uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
//int hashPosition = nounce - startNounce;
int hashPosition = thread;
uint32_t *inpHash = (uint32_t*)&g_hash[8 * hashPosition];
// Hash einlesen und auf 8 Threads und 2 Register verteilen
uint32_t Hash[2];
#pragma unroll 2
for (int i=0; i<2; i++)
Hash[i] = inpHash[8*i + (threadIdx.x & 7)];
int ndx = threadIdx.x & 7;
Hash[0] = inpHash[ndx];
Hash[1] = inpHash[ndx + 8];
// Puffer für expandierte Nachricht
uint4 *temp4 = &g_temp4[64 * hashPosition];
@ -577,8 +596,8 @@ x11_simd512_gpu_expand_64(int threads, uint32_t startNounce, uint64_t *g_hash, u @@ -577,8 +596,8 @@ x11_simd512_gpu_expand_64(int threads, uint32_t startNounce, uint64_t *g_hash, u
}
__global__ void __launch_bounds__(TPB, 4)
x11_simd512_gpu_compress1_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
__global__ __launch_bounds__(TPB, 4)
void x11_simd512_gpu_compress1_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
@ -591,8 +610,9 @@ x11_simd512_gpu_compress1_64(int threads, uint32_t startNounce, uint64_t *g_hash @@ -591,8 +610,9 @@ x11_simd512_gpu_compress1_64(int threads, uint32_t startNounce, uint64_t *g_hash
Compression1(Hash, hashPosition, g_fft4, g_state);
}
}
__global__ void __launch_bounds__(TPB, 4)
x11_simd512_gpu_compress2_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
__global__ __launch_bounds__(TPB, 4)
void x11_simd512_gpu_compress2_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
@ -605,9 +625,8 @@ x11_simd512_gpu_compress2_64(int threads, uint32_t startNounce, uint64_t *g_hash @@ -605,9 +625,8 @@ x11_simd512_gpu_compress2_64(int threads, uint32_t startNounce, uint64_t *g_hash
}
}
__global__ void __launch_bounds__(TPB, 4)
x11_simd512_gpu_compress_64_maxwell(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
__global__ __launch_bounds__(TPB, 4)
void x11_simd512_gpu_compress_64_maxwell(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
@ -622,9 +641,8 @@ x11_simd512_gpu_compress_64_maxwell(int threads, uint32_t startNounce, uint64_t @@ -622,9 +641,8 @@ x11_simd512_gpu_compress_64_maxwell(int threads, uint32_t startNounce, uint64_t
}
}
__global__ void __launch_bounds__(TPB, 4)
x11_simd512_gpu_final_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
__global__ __launch_bounds__(TPB, 4) /* 64, 12 seems ok */
void x11_simd512_gpu_final_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
@ -643,12 +661,12 @@ int x11_simd512_cpu_init(int thr_id, int threads) @@ -643,12 +661,12 @@ int x11_simd512_cpu_init(int thr_id, int threads)
{
CUDA_CALL_OR_RET_X(cudaMalloc(&d_temp4[thr_id], 64*sizeof(uint4)*threads), (int) err); /* todo: prevent -i 21 */
CUDA_CALL_OR_RET_X(cudaMalloc(&d_state[thr_id], 32*sizeof(int)*threads), (int) err);
#ifndef DEVICE_DIRECT_CONSTANTS
cudaMemcpyToSymbol(c_perm, h_perm, sizeof(h_perm), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice);
#endif
cudaMemcpyToSymbol(d_cw0, h_cw0, sizeof(h_cw0), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice);
@ -659,6 +677,7 @@ int x11_simd512_cpu_init(int thr_id, int threads) @@ -659,6 +677,7 @@ int x11_simd512_cpu_init(int thr_id, int threads)
texRef1D_128.normalized = 0;
texRef1D_128.filterMode = cudaFilterModePoint;
texRef1D_128.addressMode[0] = cudaAddressModeClamp;
CUDA_CALL_OR_RET_X(cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads), (int) err);
return 0;

Loading…
Cancel
Save