mirror of
https://github.com/GOSTSec/ccminer
synced 2025-08-30 07:42:11 +00:00
simd: then reindent the code
no changes, only error checks (cuda safe call)
This commit is contained in:
parent
b465fe6825
commit
93f4409dde
@ -1,13 +1,14 @@
|
|||||||
// Parallelisierung:
|
// Parallelization:
|
||||||
//
|
//
|
||||||
// FFT_8 wird 2 mal 8-fach parallel ausgeführt (in FFT_64)
|
// FFT_8 wird 2 times 8-fach parallel ausgeführt (in FFT_64)
|
||||||
// und 1 mal 16-fach parallel (in FFT_128_full)
|
// and 1 time 16-fach parallel (in FFT_128_full)
|
||||||
//
|
//
|
||||||
// STEP8_IF und STEP8_MAJ beinhalten je zwei 8-fach parallele Operationen
|
// STEP8_IF and STEP8_MAJ beinhalten je 2x 8-fach parallel Operations
|
||||||
|
|
||||||
#define TPB 64
|
#define TPB 64
|
||||||
|
|
||||||
#include "cuda_helper.h"
|
#include "cuda_helper.h"
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
// aus heavy.cu
|
// aus heavy.cu
|
||||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||||
@ -76,6 +77,7 @@ static const int h_FFT256_2_128_Twiddle[128] = {
|
|||||||
#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
|
#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
|
||||||
|
|
||||||
#define MAJ(x, y, z) (((z) & (y)) | (((z) | (y)) & (x)))
|
#define MAJ(x, y, z) (((z) & (y)) | (((z) | (y)) & (x)))
|
||||||
|
|
||||||
#include "x11/simd_functions.cu"
|
#include "x11/simd_functions.cu"
|
||||||
|
|
||||||
/********************* Message expansion ************************/
|
/********************* Message expansion ************************/
|
||||||
@ -84,7 +86,8 @@ static const int h_FFT256_2_128_Twiddle[128] = {
|
|||||||
* Reduce modulo 257; result is in [-127; 383]
|
* Reduce modulo 257; result is in [-127; 383]
|
||||||
* REDUCE(x) := (x&255) - (x>>8)
|
* REDUCE(x) := (x&255) - (x>>8)
|
||||||
*/
|
*/
|
||||||
#define REDUCE(x) (((x)&255) - ((x)>>8))
|
#define REDUCE(x) \
|
||||||
|
(((x)&255) - ((x)>>8))
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Reduce from [-127; 383] to [-128; 128]
|
* Reduce from [-127; 383] to [-128; 128]
|
||||||
@ -99,7 +102,8 @@ static const int h_FFT256_2_128_Twiddle[128] = {
|
|||||||
#define REDUCE_FULL_S(x) \
|
#define REDUCE_FULL_S(x) \
|
||||||
EXTRA_REDUCE_S(REDUCE(x))
|
EXTRA_REDUCE_S(REDUCE(x))
|
||||||
|
|
||||||
__device__ __forceinline__ void FFT_8(int *y, int stripe) {
|
__device__ __forceinline__
|
||||||
|
void FFT_8(int *y, int stripe) {
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* FFT_8 using w=4 as 8th root of unity
|
* FFT_8 using w=4 as 8th root of unity
|
||||||
@ -163,12 +167,11 @@ X(j) = (u-v) << (2*n); \
|
|||||||
|
|
||||||
__device__ __forceinline__ void FFT_16(int *y) {
|
__device__ __forceinline__ void FFT_16(int *y) {
|
||||||
|
|
||||||
/*
|
/**
|
||||||
* FFT_16 using w=2 as 16th root of unity
|
* FFT_16 using w=2 as 16th root of unity
|
||||||
* Unrolled decimation in frequency (DIF) radix-2 NTT.
|
* Unrolled decimation in frequency (DIF) radix-2 NTT.
|
||||||
* Output data is in revbin_permuted order.
|
* Output data is in revbin_permuted order.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#define DO_REDUCE_FULL_S(i) \
|
#define DO_REDUCE_FULL_S(i) \
|
||||||
do { \
|
do { \
|
||||||
y[i] = REDUCE(y[i]); \
|
y[i] = REDUCE(y[i]); \
|
||||||
@ -274,7 +277,9 @@ y[i] = EXTRA_REDUCE_S(y[i]); \
|
|||||||
#undef DO_REDUCE_FULL_S
|
#undef DO_REDUCE_FULL_S
|
||||||
}
|
}
|
||||||
|
|
||||||
__device__ __forceinline__ void FFT_128_full(int y[128]) {
|
__device__ __forceinline__
|
||||||
|
void FFT_128_full(int y[128])
|
||||||
|
{
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
FFT_8(y+0,2); // eight parallel FFT8's
|
FFT_8(y+0,2); // eight parallel FFT8's
|
||||||
@ -289,11 +294,9 @@ __device__ __forceinline__ void FFT_128_full(int y[128]) {
|
|||||||
FFT_16(y+2*i); // eight sequential FFT16's, each one executed in parallel by 8 threads
|
FFT_16(y+2*i); // eight sequential FFT16's, each one executed in parallel by 8 threads
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__
|
||||||
__device__ __forceinline__ void FFT_256_halfzero(int y[256]) {
|
void FFT_256_halfzero(int y[256])
|
||||||
|
{
|
||||||
int i;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* FFT_256 using w=41 as 256th root of unity.
|
* FFT_256 using w=41 as 256th root of unity.
|
||||||
* Decimation in frequency (DIF) NTT.
|
* Decimation in frequency (DIF) NTT.
|
||||||
@ -303,10 +306,10 @@ __device__ __forceinline__ void FFT_256_halfzero(int y[256]) {
|
|||||||
const int tmp = y[15];
|
const int tmp = y[15];
|
||||||
|
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for (i=0; i<8; i++)
|
for (int i=0; i<8; i++)
|
||||||
y[16+i] = REDUCE(y[i] * c_FFT256_2_128_Twiddle[8*i+(threadIdx.x&7)]);
|
y[16+i] = REDUCE(y[i] * c_FFT256_2_128_Twiddle[8*i+(threadIdx.x&7)]);
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for (i=8; i<16; i++)
|
for (int i=8; i<16; i++)
|
||||||
y[16+i] = 0;
|
y[16+i] = 0;
|
||||||
|
|
||||||
/* handle X^255 with an additional butterfly */
|
/* handle X^255 with an additional butterfly */
|
||||||
@ -323,19 +326,18 @@ __device__ __forceinline__ void FFT_256_halfzero(int y[256]) {
|
|||||||
|
|
||||||
/***************************************************/
|
/***************************************************/
|
||||||
|
|
||||||
__device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4)
|
__device__ __forceinline__
|
||||||
|
void Expansion(const uint32_t *data, uint4 *g_temp4)
|
||||||
{
|
{
|
||||||
int i;
|
|
||||||
|
|
||||||
/* Message Expansion using Number Theoretical Transform similar to FFT */
|
/* Message Expansion using Number Theoretical Transform similar to FFT */
|
||||||
int expanded[32];
|
int expanded[32];
|
||||||
#pragma unroll 4
|
#pragma unroll 4
|
||||||
for (i=0; i < 4; i++) {
|
for (int i=0; i < 4; i++) {
|
||||||
expanded[ i] = __byte_perm(__shfl((int)data[0], 2*i, 8), __shfl((int)data[0], (2*i)+1, 8), threadIdx.x&7)&0xff;
|
expanded[ i] = __byte_perm(__shfl((int)data[0], 2*i, 8), __shfl((int)data[0], (2*i)+1, 8), threadIdx.x&7)&0xff;
|
||||||
expanded[4+i] = __byte_perm(__shfl((int)data[1], 2*i, 8), __shfl((int)data[1], (2*i)+1, 8), threadIdx.x&7)&0xff;
|
expanded[4+i] = __byte_perm(__shfl((int)data[1], 2*i, 8), __shfl((int)data[1], (2*i)+1, 8), threadIdx.x&7)&0xff;
|
||||||
}
|
}
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for (i=8; i < 16; i++)
|
for (int i=8; i < 16; i++)
|
||||||
expanded[i] = 0;
|
expanded[i] = 0;
|
||||||
|
|
||||||
FFT_256_halfzero(expanded);
|
FFT_256_halfzero(expanded);
|
||||||
@ -447,7 +449,6 @@ __device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4)
|
|||||||
//{ 14, 78, 46, 110, 30, 94, 62, 126 }, { 15, 79, 47, 111, 31, 95, 63, 127 },
|
//{ 14, 78, 46, 110, 30, 94, 62, 126 }, { 15, 79, 47, 111, 31, 95, 63, 127 },
|
||||||
//{ 2, 66, 34, 98, 18, 82, 50, 114 }, { 3, 67, 35, 99, 19, 83, 51, 115 },
|
//{ 2, 66, 34, 98, 18, 82, 50, 114 }, { 3, 67, 35, 99, 19, 83, 51, 115 },
|
||||||
|
|
||||||
|
|
||||||
bool sel = ((threadIdx.x+2)&7) >= 4; // 2,3,4,5
|
bool sel = ((threadIdx.x+2)&7) >= 4; // 2,3,4,5
|
||||||
|
|
||||||
P1 = sel?expanded[0]:expanded[1]; Q1 = __shfl(P1, threadIdx.x^1, 8);
|
P1 = sel?expanded[0]:expanded[1]; Q1 = __shfl(P1, threadIdx.x^1, 8);
|
||||||
@ -474,7 +475,6 @@ __device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4)
|
|||||||
// 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
|
// 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
|
||||||
// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5
|
// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5
|
||||||
|
|
||||||
|
|
||||||
P1 = sel?expanded[1]:expanded[0]; Q1 = __shfl(P1, threadIdx.x^1, 8);
|
P1 = sel?expanded[1]:expanded[0]; Q1 = __shfl(P1, threadIdx.x^1, 8);
|
||||||
Q2 = sel?expanded[3]:expanded[2]; P2 = __shfl(Q2, threadIdx.x^1, 8);
|
Q2 = sel?expanded[3]:expanded[2]; P2 = __shfl(Q2, threadIdx.x^1, 8);
|
||||||
P = even? P1 : P2; Q = even? Q1 : Q2;
|
P = even? P1 : P2; Q = even? Q1 : Q2;
|
||||||
@ -552,7 +552,7 @@ __device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/***************************************************/
|
/***************************************************/
|
||||||
// Die Hash-Funktion
|
|
||||||
__global__ void __launch_bounds__(TPB,4)
|
__global__ void __launch_bounds__(TPB,4)
|
||||||
x11_simd512_gpu_expand_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_temp4)
|
x11_simd512_gpu_expand_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_temp4)
|
||||||
{
|
{
|
||||||
@ -567,6 +567,7 @@ x11_simd512_gpu_expand_64(int threads, uint32_t startNounce, uint64_t *g_hash, u
|
|||||||
|
|
||||||
// Hash einlesen und auf 8 Threads und 2 Register verteilen
|
// Hash einlesen und auf 8 Threads und 2 Register verteilen
|
||||||
uint32_t Hash[2];
|
uint32_t Hash[2];
|
||||||
|
|
||||||
#pragma unroll 2
|
#pragma unroll 2
|
||||||
for (int i=0; i<2; i++)
|
for (int i=0; i<2; i++)
|
||||||
Hash[i] = inpHash[8*i + (threadIdx.x & 7)];
|
Hash[i] = inpHash[8*i + (threadIdx.x & 7)];
|
||||||
@ -622,18 +623,11 @@ x11_simd512_gpu_final_64(int threads, uint32_t startNounce, uint64_t *g_hash, ui
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Setup-Funktionen
|
__host__
|
||||||
__host__ void x11_simd512_cpu_init(int thr_id, int threads)
|
void x11_simd512_cpu_init(int thr_id, int threads)
|
||||||
{
|
{
|
||||||
cudaMalloc( &d_state[thr_id], 32*sizeof(int)*threads );
|
CUDA_SAFE_CALL(cudaMalloc(&d_state[thr_id], 32*sizeof(int)*threads));
|
||||||
cudaMalloc( &d_temp4[thr_id], 64*sizeof(uint4)*threads );
|
CUDA_SAFE_CALL(cudaMalloc(&d_temp4[thr_id], 64*sizeof(uint4)*threads));
|
||||||
|
|
||||||
// Textur für 128 Bit Zugriffe
|
|
||||||
cudaChannelFormatDesc channelDesc128 = cudaCreateChannelDesc<uint4>();
|
|
||||||
texRef1D_128.normalized = 0;
|
|
||||||
texRef1D_128.filterMode = cudaFilterModePoint;
|
|
||||||
texRef1D_128.addressMode[0] = cudaAddressModeClamp;
|
|
||||||
cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads);
|
|
||||||
|
|
||||||
cudaMemcpyToSymbol(c_perm, h_perm, sizeof(h_perm), 0, cudaMemcpyHostToDevice);
|
cudaMemcpyToSymbol(c_perm, h_perm, sizeof(h_perm), 0, cudaMemcpyHostToDevice);
|
||||||
cudaMemcpyToSymbol(c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice);
|
cudaMemcpyToSymbol(c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice);
|
||||||
@ -644,28 +638,31 @@ __host__ void x11_simd512_cpu_init(int thr_id, int threads)
|
|||||||
cudaMemcpyToSymbol(d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice);
|
cudaMemcpyToSymbol(d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice);
|
||||||
cudaMemcpyToSymbol(d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice);
|
cudaMemcpyToSymbol(d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice);
|
||||||
cudaMemcpyToSymbol(d_cw3, h_cw3, sizeof(h_cw3), 0, cudaMemcpyHostToDevice);
|
cudaMemcpyToSymbol(d_cw3, h_cw3, sizeof(h_cw3), 0, cudaMemcpyHostToDevice);
|
||||||
|
|
||||||
|
// Texture for 128-Bit Zugriffe
|
||||||
|
cudaChannelFormatDesc channelDesc128 = cudaCreateChannelDesc<uint4>();
|
||||||
|
texRef1D_128.normalized = 0;
|
||||||
|
texRef1D_128.filterMode = cudaFilterModePoint;
|
||||||
|
texRef1D_128.addressMode[0] = cudaAddressModeClamp;
|
||||||
|
CUDA_SAFE_CALL(cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads));
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__ void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
|
__host__
|
||||||
|
void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
|
||||||
{
|
{
|
||||||
const int threadsperblock = TPB;
|
const int threadsperblock = TPB;
|
||||||
|
|
||||||
// Größe des dynamischen Shared Memory Bereichs
|
|
||||||
size_t shared_size = 0;
|
|
||||||
|
|
||||||
// berechne wie viele Thread Blocks wir brauchen
|
|
||||||
dim3 block(threadsperblock);
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
dim3 grid8(((threads + threadsperblock-1)/threadsperblock)*8);
|
dim3 grid8(((threads + threadsperblock-1)/threadsperblock)*8);
|
||||||
x11_simd512_gpu_expand_64<<<grid8, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id]);
|
|
||||||
|
x11_simd512_gpu_expand_64 <<<grid8, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id]);
|
||||||
|
|
||||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
|
|
||||||
// künstlich die Occupancy limitieren, um das totale Erschöpfen des Texture Cache zu vermeiden
|
x11_simd512_gpu_compress1_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
|
||||||
x11_simd512_gpu_compress1_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
|
x11_simd512_gpu_compress2_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
|
||||||
x11_simd512_gpu_compress2_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
|
|
||||||
|
|
||||||
x11_simd512_gpu_final_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
|
x11_simd512_gpu_final_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]);
|
||||||
|
|
||||||
MyStreamSynchronize(NULL, order, thr_id);
|
MyStreamSynchronize(NULL, order, thr_id);
|
||||||
}
|
}
|
||||||
|
@ -147,7 +147,6 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
|
|||||||
if (!init[thr_id])
|
if (!init[thr_id])
|
||||||
{
|
{
|
||||||
CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
|
CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
|
||||||
// Konstanten kopieren, Speicher belegen
|
|
||||||
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));
|
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));
|
||||||
|
|
||||||
quark_blake512_cpu_init(thr_id, throughput);
|
quark_blake512_cpu_init(thr_id, throughput);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user