mirror of
https://github.com/GOSTSec/ccminer
synced 2025-08-26 13:51:51 +00:00
simd: cleanup and ignore linux host warning
This commit is contained in:
parent
0d9d3520ac
commit
2308f555c3
@ -112,6 +112,9 @@ x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu
|
|||||||
x11/cuda_x11_luffa512_Cubehash.o: x11/cuda_x11_luffa512_Cubehash.cu
|
x11/cuda_x11_luffa512_Cubehash.o: x11/cuda_x11_luffa512_Cubehash.cu
|
||||||
$(NVCC) $(nvcc_FLAGS) --maxrregcount=76 -o $@ -c $<
|
$(NVCC) $(nvcc_FLAGS) --maxrregcount=76 -o $@ -c $<
|
||||||
|
|
||||||
|
x11/cuda_x11_simd512.o: x11/cuda_x11_simd512.cu
|
||||||
|
$(NVCC) $(nvcc_FLAGS) -Xcompiler -Wno-unused-variable -o $@ -c $<
|
||||||
|
|
||||||
x13/cuda_x13_hamsi512.o: x13/cuda_x13_hamsi512.cu
|
x13/cuda_x13_hamsi512.o: x13/cuda_x13_hamsi512.cu
|
||||||
$(NVCC) $(nvcc_FLAGS) --maxrregcount=72 -o $@ -c $<
|
$(NVCC) $(nvcc_FLAGS) --maxrregcount=72 -o $@ -c $<
|
||||||
|
|
||||||
|
@ -1,9 +1,6 @@
|
|||||||
// Parallelization:
|
/***************************************************************************************************
|
||||||
//
|
* SIMD512 SM3+ CUDA IMPLEMENTATION (requires cuda_x11_simd512_func.cuh)
|
||||||
// FFT_8 is executed 2 times, 8-way parallel (in FFT_64)
|
*/
|
||||||
// and 1 time, 16-way parallel (in FFT_128_full)
|
|
||||||
//
|
|
||||||
// STEP8_IF and STEP8_MAJ each contain 2x 8-way parallel operations
|
|
||||||
|
|
||||||
#include "miner.h"
|
#include "miner.h"
|
||||||
#include "cuda_helper.h"
|
#include "cuda_helper.h"
|
||||||
@ -34,7 +31,7 @@ const uint8_t h_perm[8][8] = {
|
|||||||
{ 4, 5, 2, 3, 6, 7, 0, 1 }
|
{ 4, 5, 2, 3, 6, 7, 0, 1 }
|
||||||
};
|
};
|
||||||
|
|
||||||
/* for simd_functions.cuh */
|
/* used in cuda_x11_simd512_func.cuh (SIMD_Compress2) */
|
||||||
#ifdef DEVICE_DIRECT_CONSTANTS
|
#ifdef DEVICE_DIRECT_CONSTANTS
|
||||||
__constant__ uint32_t c_IV_512[32] = {
|
__constant__ uint32_t c_IV_512[32] = {
|
||||||
#else
|
#else
|
||||||
@ -87,22 +84,18 @@ static const short h_FFT256_2_128_Twiddle[128] = {
|
|||||||
-30, 55, -58, -65, -95, -40, -98, 94
|
-30, 55, -58, -65, -95, -40, -98, 94
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/************* the round function ****************/
|
||||||
|
#define IF(x, y, z) (((y ^ z) & x) ^ z)
|
||||||
|
#define MAJ(x, y, z) ((z &y) | ((z|y) & x))
|
||||||
|
|
||||||
#include "cuda_x11_simd512_sm2.cuh"
|
#include "cuda_x11_simd512_sm2.cuh"
|
||||||
|
#include "cuda_x11_simd512_func.cuh"
|
||||||
|
|
||||||
#ifdef __INTELLISENSE__
|
#ifdef __INTELLISENSE__
|
||||||
/* just for vstudio code colors */
|
/* just for vstudio code colors */
|
||||||
#define __CUDA_ARCH__ 500
|
#define __CUDA_ARCH__ 500
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/************* the round function ****************/
|
|
||||||
|
|
||||||
#undef IF
|
|
||||||
#undef MAJ
|
|
||||||
#define IF(x, y, z) (((y ^ z) & x) ^ z)
|
|
||||||
#define MAJ(x, y, z) ((z &y) | ((z|y) & x))
|
|
||||||
|
|
||||||
#include "x11/cuda_x11_simd512_func.cuh"
|
|
||||||
|
|
||||||
#if __CUDA_ARCH__ >= 300
|
#if __CUDA_ARCH__ >= 300
|
||||||
|
|
||||||
/********************* Message expansion ************************/
|
/********************* Message expansion ************************/
|
||||||
@ -127,6 +120,13 @@ static const short h_FFT256_2_128_Twiddle[128] = {
|
|||||||
#define REDUCE_FULL_S(x) \
|
#define REDUCE_FULL_S(x) \
|
||||||
EXTRA_REDUCE_S(REDUCE(x))
|
EXTRA_REDUCE_S(REDUCE(x))
|
||||||
|
|
||||||
|
// Parallelization:
|
||||||
|
//
|
||||||
|
// FFT_8 is executed 2 times, 8-way parallel (in FFT_64)
|
||||||
|
// and 1 time, 16-way parallel (in FFT_128_full)
|
||||||
|
//
|
||||||
|
// STEP8_IF and STEP8_MAJ each contain 2x 8-way parallel operations
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* FFT_8 using w=4 as 8th root of unity
|
* FFT_8 using w=4 as 8th root of unity
|
||||||
* Unrolled decimation in frequency (DIF) radix-2 NTT.
|
* Unrolled decimation in frequency (DIF) radix-2 NTT.
|
||||||
@ -670,14 +670,13 @@ int x11_simd512_cpu_init(int thr_id, uint32_t threads)
|
|||||||
|
|
||||||
CUDA_CALL_OR_RET_X(cudaMalloc(&d_temp4[thr_id], 64*sizeof(uint4)*threads), (int) err); /* todo: prevent -i 21 */
|
CUDA_CALL_OR_RET_X(cudaMalloc(&d_temp4[thr_id], 64*sizeof(uint4)*threads), (int) err); /* todo: prevent -i 21 */
|
||||||
CUDA_CALL_OR_RET_X(cudaMalloc(&d_state[thr_id], 32*sizeof(int)*threads), (int) err);
|
CUDA_CALL_OR_RET_X(cudaMalloc(&d_state[thr_id], 32*sizeof(int)*threads), (int) err);
|
||||||
|
|
||||||
#ifndef DEVICE_DIRECT_CONSTANTS
|
#ifndef DEVICE_DIRECT_CONSTANTS
|
||||||
cudaMemcpyToSymbol(c_perm, h_perm, sizeof(h_perm), 0, cudaMemcpyHostToDevice);
|
cudaMemcpyToSymbol(c_perm, h_perm, sizeof(h_perm), 0, cudaMemcpyHostToDevice);
|
||||||
cudaMemcpyToSymbol(c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice);
|
cudaMemcpyToSymbol(c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice);
|
||||||
cudaMemcpyToSymbol(c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice);
|
cudaMemcpyToSymbol(c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice);
|
||||||
cudaMemcpyToSymbol(c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice);
|
cudaMemcpyToSymbol(c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice);
|
||||||
#endif
|
|
||||||
|
|
||||||
#if 0
|
|
||||||
cudaMemcpyToSymbol(d_cw0, h_cw0, sizeof(h_cw0), 0, cudaMemcpyHostToDevice);
|
cudaMemcpyToSymbol(d_cw0, h_cw0, sizeof(h_cw0), 0, cudaMemcpyHostToDevice);
|
||||||
cudaMemcpyToSymbol(d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice);
|
cudaMemcpyToSymbol(d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice);
|
||||||
cudaMemcpyToSymbol(d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice);
|
cudaMemcpyToSymbol(d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice);
|
||||||
|
@ -1046,8 +1046,12 @@ __device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, cons
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef DEVICE_DIRECT_CONSTANTS
|
||||||
static __constant__ uint32_t d_cw0[8][8] = {
|
static __constant__ uint32_t d_cw0[8][8] = {
|
||||||
//static const uint32_t h_cw0[8][8] = {
|
#else
|
||||||
|
static __constant__ uint32_t d_cw0[8][8];
|
||||||
|
static const uint32_t h_cw0[8][8] = {
|
||||||
|
#endif
|
||||||
0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6,
|
0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6,
|
||||||
0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380,
|
0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380,
|
||||||
0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8,
|
0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8,
|
||||||
@ -1070,8 +1074,12 @@ __device__ __forceinline__ void Round8_0_final(uint32_t *A, int r, int s, int t,
|
|||||||
STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A);
|
STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef DEVICE_DIRECT_CONSTANTS
|
||||||
static __constant__ uint32_t d_cw1[8][8] = {
|
static __constant__ uint32_t d_cw1[8][8] = {
|
||||||
//static const uint32_t h_cw1[8][8] = {
|
#else
|
||||||
|
static __constant__ uint32_t d_cw1[8][8];
|
||||||
|
static const uint32_t h_cw1[8][8] = {
|
||||||
|
#endif
|
||||||
0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7,
|
0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7,
|
||||||
0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2,
|
0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2,
|
||||||
0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A,
|
0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A,
|
||||||
@ -1094,8 +1102,12 @@ __device__ __forceinline__ void Round8_1_final(uint32_t *A, int r, int s, int t,
|
|||||||
STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A);
|
STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef DEVICE_DIRECT_CONSTANTS
|
||||||
static __constant__ uint32_t d_cw2[8][8] = {
|
static __constant__ uint32_t d_cw2[8][8] = {
|
||||||
//static const uint32_t h_cw2[8][8] = {
|
#else
|
||||||
|
static __constant__ uint32_t d_cw2[8][8];
|
||||||
|
static const uint32_t h_cw2[8][8] = {
|
||||||
|
#endif
|
||||||
0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3,
|
0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3,
|
||||||
0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3,
|
0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3,
|
||||||
0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539,
|
0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539,
|
||||||
@ -1118,8 +1130,12 @@ __device__ __forceinline__ void Round8_2_final(uint32_t *A, int r, int s, int t,
|
|||||||
STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A);
|
STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef DEVICE_DIRECT_CONSTANTS
|
||||||
static __constant__ uint32_t d_cw3[8][8] = {
|
static __constant__ uint32_t d_cw3[8][8] = {
|
||||||
//static const uint32_t h_cw3[8][8] = {
|
#else
|
||||||
|
static __constant__ uint32_t d_cw3[8][8];
|
||||||
|
static const uint32_t h_cw3[8][8] = {
|
||||||
|
#endif
|
||||||
0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D,
|
0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D,
|
||||||
0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B,
|
0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B,
|
||||||
0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A,
|
0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A,
|
||||||
|
@ -1,3 +1,9 @@
|
|||||||
|
/***************************************************************************************************
|
||||||
|
* SM 2.x SIMD512 CUDA Implementation without shuffle
|
||||||
|
*
|
||||||
|
* cbuchner 2014 / tpruvot 2015
|
||||||
|
*/
|
||||||
|
|
||||||
#include "cuda_helper.h"
|
#include "cuda_helper.h"
|
||||||
|
|
||||||
#ifdef __INTELLISENSE__
|
#ifdef __INTELLISENSE__
|
||||||
@ -9,7 +15,7 @@
|
|||||||
|
|
||||||
#define T32(x) (x)
|
#define T32(x) (x)
|
||||||
|
|
||||||
#ifndef DEVICE_DIRECT_CONSTANTS /* already made in SM 3+ implementation */
|
#if 0 /* already declared in SM 3+ implementation */
|
||||||
__constant__ uint32_t c_IV_512[32];
|
__constant__ uint32_t c_IV_512[32];
|
||||||
const uint32_t h_IV_512[32] = {
|
const uint32_t h_IV_512[32] = {
|
||||||
0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
|
0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
|
||||||
@ -51,9 +57,7 @@ static const int h_FFT256_2_128_Twiddle[128] = {
|
|||||||
};
|
};
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
__constant__ int c_FFT[256] =
|
__constant__ int c_FFT[256] = {
|
||||||
//const int h_FFT[256] =
|
|
||||||
{
|
|
||||||
// this is the FFT result in revbin permuted order
|
// this is the FFT result in revbin permuted order
|
||||||
4, -4, 32, -32, -60, 60, 60, -60, 101, -101, 58, -58, 112, -112, -11, 11, -92, 92,
|
4, -4, 32, -32, -60, 60, 60, -60, 101, -101, 58, -58, 112, -112, -11, 11, -92, 92,
|
||||||
-119, 119, 42, -42, -82, 82, 32, -32, 32, -32, 121, -121, 17, -17, -47, 47, 63,
|
-119, 119, 42, -42, -82, 82, 32, -32, 32, -32, 121, -121, 17, -17, -47, 47, 63,
|
||||||
@ -73,7 +77,6 @@ __constant__ int c_FFT[256] =
|
|||||||
};
|
};
|
||||||
|
|
||||||
__constant__ int c_P8[32][8] = {
|
__constant__ int c_P8[32][8] = {
|
||||||
//static const int h_P8[32][8] = {
|
|
||||||
{ 2, 66, 34, 98, 18, 82, 50, 114 },
|
{ 2, 66, 34, 98, 18, 82, 50, 114 },
|
||||||
{ 6, 70, 38, 102, 22, 86, 54, 118 },
|
{ 6, 70, 38, 102, 22, 86, 54, 118 },
|
||||||
{ 0, 64, 32, 96, 16, 80, 48, 112 },
|
{ 0, 64, 32, 96, 16, 80, 48, 112 },
|
||||||
@ -109,7 +112,6 @@ __constant__ int c_P8[32][8] = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
__constant__ int c_Q8[32][8] = {
|
__constant__ int c_Q8[32][8] = {
|
||||||
//static const int h_Q8[32][8] = {
|
|
||||||
{ 130, 194, 162, 226, 146, 210, 178, 242 },
|
{ 130, 194, 162, 226, 146, 210, 178, 242 },
|
||||||
{ 134, 198, 166, 230, 150, 214, 182, 246 },
|
{ 134, 198, 166, 230, 150, 214, 182, 246 },
|
||||||
{ 128, 192, 160, 224, 144, 208, 176, 240 },
|
{ 128, 192, 160, 224, 144, 208, 176, 240 },
|
||||||
@ -153,8 +155,8 @@ __constant__ int c_Q8[32][8] = {
|
|||||||
|
|
||||||
/************* the round function ****************/
|
/************* the round function ****************/
|
||||||
|
|
||||||
#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
|
//#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
|
||||||
#define MAJ(x, y, z) (((z) & (y)) | (((z) | (y)) & (x)))
|
//#define MAJ(x, y, z) (((z) & (y)) | (((z) | (y)) & (x)))
|
||||||
|
|
||||||
__device__ __forceinline__
|
__device__ __forceinline__
|
||||||
void STEP8_IF(const uint32_t *w, const int i, const int r, const int s, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D)
|
void STEP8_IF(const uint32_t *w, const int i, const int r, const int s, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D)
|
||||||
@ -193,7 +195,6 @@ void Round8(uint32_t A[32], const int y[256], int i, int r, int s, int t, int u)
|
|||||||
{
|
{
|
||||||
uint32_t w[8][8];
|
uint32_t w[8][8];
|
||||||
int code = i<2? 185: 233;
|
int code = i<2? 185: 233;
|
||||||
int a, b;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The FFT output y is in revbin permuted order,
|
* The FFT output y is in revbin permuted order,
|
||||||
@ -201,9 +202,9 @@ void Round8(uint32_t A[32], const int y[256], int i, int r, int s, int t, int u)
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(a=0; a<8; a++) {
|
for(int a=0; a<8; a++) {
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(b=0; b<8; b++) {
|
for(int b=0; b<8; b++) {
|
||||||
w[a][b] = __byte_perm( (y[c_P8[8*i+a][b]] * code), (y[c_Q8[8*i+a][b]] * code), 0x5410);
|
w[a][b] = __byte_perm( (y[c_P8[8*i+a][b]] * code), (y[c_Q8[8*i+a][b]] * code), 0x5410);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -249,22 +250,22 @@ void FFT_8(int *y, int stripe)
|
|||||||
* Unrolled decimation in frequency (DIF) radix-2 NTT.
|
* Unrolled decimation in frequency (DIF) radix-2 NTT.
|
||||||
* Output data is in revbin_permuted order.
|
* Output data is in revbin_permuted order.
|
||||||
*/
|
*/
|
||||||
#define X(i) y[stripe*i]
|
#define X(i) y[stripe*i]
|
||||||
|
|
||||||
#define DO_REDUCE(i) \
|
#define DO_REDUCE(i) \
|
||||||
X(i) = REDUCE(X(i))
|
X(i) = REDUCE(X(i))
|
||||||
|
|
||||||
#define DO_REDUCE_FULL_S(i) do { \
|
#define DO_REDUCE_FULL_S(i) { \
|
||||||
X(i) = REDUCE(X(i)); \
|
X(i) = REDUCE(X(i)); \
|
||||||
X(i) = EXTRA_REDUCE_S(X(i)); \
|
X(i) = EXTRA_REDUCE_S(X(i)); \
|
||||||
} while(0)
|
}
|
||||||
|
|
||||||
#define BUTTERFLY(i,j,n) do { \
|
#define BUTTERFLY(i,j,n) { \
|
||||||
int u= X(i); \
|
int u= X(i); \
|
||||||
int v= X(j); \
|
int v= X(j); \
|
||||||
X(i) = u+v; \
|
X(i) = u+v; \
|
||||||
X(j) = (u-v) << (2*n); \
|
X(j) = (u-v) << (2*n); \
|
||||||
} while(0)
|
}
|
||||||
|
|
||||||
BUTTERFLY(0, 4, 0);
|
BUTTERFLY(0, 4, 0);
|
||||||
BUTTERFLY(1, 5, 1);
|
BUTTERFLY(1, 5, 1);
|
||||||
@ -295,10 +296,10 @@ void FFT_8(int *y, int stripe)
|
|||||||
DO_REDUCE_FULL_S(6);
|
DO_REDUCE_FULL_S(6);
|
||||||
DO_REDUCE_FULL_S(7);
|
DO_REDUCE_FULL_S(7);
|
||||||
|
|
||||||
#undef X
|
#undef X
|
||||||
#undef DO_REDUCE
|
#undef DO_REDUCE
|
||||||
#undef DO_REDUCE_FULL_S
|
#undef DO_REDUCE_FULL_S
|
||||||
#undef BUTTERFLY
|
#undef BUTTERFLY
|
||||||
}
|
}
|
||||||
|
|
||||||
__device__ __forceinline__
|
__device__ __forceinline__
|
||||||
@ -315,19 +316,17 @@ void FFT_16(int *y, int stripe)
|
|||||||
#define DO_REDUCE(i) \
|
#define DO_REDUCE(i) \
|
||||||
X(i) = REDUCE(X(i))
|
X(i) = REDUCE(X(i))
|
||||||
|
|
||||||
#define DO_REDUCE_FULL_S(i) \
|
#define DO_REDUCE_FULL_S(i) { \
|
||||||
do { \
|
|
||||||
X(i) = REDUCE(X(i)); \
|
X(i) = REDUCE(X(i)); \
|
||||||
X(i) = EXTRA_REDUCE_S(X(i)); \
|
X(i) = EXTRA_REDUCE_S(X(i)); \
|
||||||
} while(0)
|
}
|
||||||
|
|
||||||
#define BUTTERFLY(i,j,n) \
|
#define BUTTERFLY(i,j,n) { \
|
||||||
do { \
|
|
||||||
int u= X(i); \
|
int u= X(i); \
|
||||||
int v= X(j); \
|
int v= X(j); \
|
||||||
X(i) = u+v; \
|
X(i) = u+v; \
|
||||||
X(j) = (u-v) << n; \
|
X(j) = (u-v) << n; \
|
||||||
} while(0)
|
}
|
||||||
|
|
||||||
BUTTERFLY(0, 8, 0);
|
BUTTERFLY(0, 8, 0);
|
||||||
BUTTERFLY(1, 9, 1);
|
BUTTERFLY(1, 9, 1);
|
||||||
@ -396,10 +395,10 @@ void FFT_16(int *y, int stripe)
|
|||||||
DO_REDUCE_FULL_S(14);
|
DO_REDUCE_FULL_S(14);
|
||||||
DO_REDUCE_FULL_S(15);
|
DO_REDUCE_FULL_S(15);
|
||||||
|
|
||||||
#undef X
|
#undef X
|
||||||
#undef DO_REDUCE
|
#undef DO_REDUCE
|
||||||
#undef DO_REDUCE_FULL_S
|
#undef DO_REDUCE_FULL_S
|
||||||
#undef BUTTERFLY
|
#undef BUTTERFLY
|
||||||
}
|
}
|
||||||
|
|
||||||
__device__ __forceinline__
|
__device__ __forceinline__
|
||||||
@ -549,7 +548,7 @@ void x11_simd512_gpu_hash_64_sm2(const uint32_t threads, const uint32_t startNou
|
|||||||
|
|
||||||
#else
|
#else
|
||||||
__global__ void x11_simd512_gpu_hash_64_sm2(const uint32_t threads, const uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) {}
|
__global__ void x11_simd512_gpu_hash_64_sm2(const uint32_t threads, const uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) {}
|
||||||
#endif /* __CUDA_ARCH__ */
|
#endif /* __CUDA_ARCH__ < 300 */
|
||||||
|
|
||||||
__host__
|
__host__
|
||||||
static void x11_simd512_cpu_init_sm2(int thr_id)
|
static void x11_simd512_cpu_init_sm2(int thr_id)
|
||||||
@ -559,9 +558,6 @@ static void x11_simd512_cpu_init_sm2(int thr_id)
|
|||||||
cudaMemcpyToSymbol( c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice);
|
cudaMemcpyToSymbol( c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice);
|
||||||
cudaMemcpyToSymbol( c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice);
|
cudaMemcpyToSymbol( c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice);
|
||||||
#endif
|
#endif
|
||||||
// cudaMemcpyToSymbol( c_FFT, h_FFT, sizeof(h_FFT), 0, cudaMemcpyHostToDevice);
|
|
||||||
// cudaMemcpyToSymbol( c_P8, h_P8, sizeof(h_P8), 0, cudaMemcpyHostToDevice);
|
|
||||||
// cudaMemcpyToSymbol( c_Q8, h_Q8, sizeof(h_Q8), 0, cudaMemcpyHostToDevice);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__
|
__host__
|
||||||
|
Loading…
x
Reference in New Issue
Block a user