Browse Source

simd: cleanup and ignore linux host warning

2upstream
Tanguy Pruvot 9 years ago
parent
commit
2308f555c3
  1. 3
      Makefile.am
  2. 35
      x11/cuda_x11_simd512.cu
  3. 88
      x11/cuda_x11_simd512_func.cuh
  4. 94
      x11/cuda_x11_simd512_sm2.cuh

3
Makefile.am

@ -112,6 +112,9 @@ x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu @@ -112,6 +112,9 @@ x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu
x11/cuda_x11_luffa512_Cubehash.o: x11/cuda_x11_luffa512_Cubehash.cu
$(NVCC) $(nvcc_FLAGS) --maxrregcount=76 -o $@ -c $<
x11/cuda_x11_simd512.o: x11/cuda_x11_simd512.cu
$(NVCC) $(nvcc_FLAGS) -Xcompiler -Wno-unused-variable -o $@ -c $<
x13/cuda_x13_hamsi512.o: x13/cuda_x13_hamsi512.cu
$(NVCC) $(nvcc_FLAGS) --maxrregcount=72 -o $@ -c $<

35
x11/cuda_x11_simd512.cu

@ -1,9 +1,6 @@ @@ -1,9 +1,6 @@
// Parallelization:
//
// FFT_8 wird 2 times 8-fach parallel ausgeführt (in FFT_64)
// and 1 time 16-fach parallel (in FFT_128_full)
//
// STEP8_IF and STEP8_MAJ beinhalten je 2x 8-fach parallel Operations
/***************************************************************************************************
* SIMD512 SM3+ CUDA IMPLEMENTATION (require cuda_x11_simd512_func.cuh)
*/
#include "miner.h"
#include "cuda_helper.h"
@ -34,7 +31,7 @@ const uint8_t h_perm[8][8] = { @@ -34,7 +31,7 @@ const uint8_t h_perm[8][8] = {
{ 4, 5, 2, 3, 6, 7, 0, 1 }
};
/* for simd_functions.cuh */
/* used in cuda_x11_simd512_func.cuh (SIMD_Compress2) */
#ifdef DEVICE_DIRECT_CONSTANTS
__constant__ uint32_t c_IV_512[32] = {
#else
@ -87,22 +84,18 @@ static const short h_FFT256_2_128_Twiddle[128] = { @@ -87,22 +84,18 @@ static const short h_FFT256_2_128_Twiddle[128] = {
-30, 55, -58, -65, -95, -40, -98, 94
};
/************* the round function ****************/
#define IF(x, y, z) (((y ^ z) & x) ^ z)
#define MAJ(x, y, z) ((z &y) | ((z|y) & x))
#include "cuda_x11_simd512_sm2.cuh"
#include "cuda_x11_simd512_func.cuh"
#ifdef __INTELLISENSE__
/* just for vstudio code colors */
#define __CUDA_ARCH__ 500
#endif
/************* the round function ****************/
#undef IF
#undef MAJ
#define IF(x, y, z) (((y ^ z) & x) ^ z)
#define MAJ(x, y, z) ((z &y) | ((z|y) & x))
#include "x11/cuda_x11_simd512_func.cuh"
#if __CUDA_ARCH__ >= 300
/********************* Message expansion ************************/
@ -127,6 +120,13 @@ static const short h_FFT256_2_128_Twiddle[128] = { @@ -127,6 +120,13 @@ static const short h_FFT256_2_128_Twiddle[128] = {
#define REDUCE_FULL_S(x) \
EXTRA_REDUCE_S(REDUCE(x))
// Parallelization:
//
// FFT_8 wird 2 times 8-fach parallel ausgeführt (in FFT_64)
// and 1 time 16-fach parallel (in FFT_128_full)
//
// STEP8_IF and STEP8_MAJ beinhalten je 2x 8-fach parallel Operations
/**
* FFT_8 using w=4 as 8th root of unity
* Unrolled decimation in frequency (DIF) radix-2 NTT.
@ -670,14 +670,13 @@ int x11_simd512_cpu_init(int thr_id, uint32_t threads) @@ -670,14 +670,13 @@ int x11_simd512_cpu_init(int thr_id, uint32_t threads)
CUDA_CALL_OR_RET_X(cudaMalloc(&d_temp4[thr_id], 64*sizeof(uint4)*threads), (int) err); /* todo: prevent -i 21 */
CUDA_CALL_OR_RET_X(cudaMalloc(&d_state[thr_id], 32*sizeof(int)*threads), (int) err);
#ifndef DEVICE_DIRECT_CONSTANTS
cudaMemcpyToSymbol(c_perm, h_perm, sizeof(h_perm), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice);
#endif
#if 0
cudaMemcpyToSymbol(d_cw0, h_cw0, sizeof(h_cw0), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice);

88
x11/cuda_x11_simd512_func.cuh

@ -1046,16 +1046,20 @@ __device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, cons @@ -1046,16 +1046,20 @@ __device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, cons
}
}
#ifdef DEVICE_DIRECT_CONSTANTS
static __constant__ uint32_t d_cw0[8][8] = {
//static const uint32_t h_cw0[8][8] = {
0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6,
0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380,
0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8,
0x0B90D4A4, 0x3D6D039D, 0x25944D53, 0xBAA0E034, 0x5BC71E5A, 0xB1F4F2FE, 0x12CADE09, 0x548D41C3,
0x3CB4F80D, 0x36ECEBC4, 0xA66443EE, 0x43351ABD, 0xC7A20C49, 0xEB0BB366, 0xF5293F98, 0x49B6DE09,
0x531B29EA, 0x02E402E4, 0xDB25C405, 0x53D4E543, 0x0AD71720, 0xE1A61A04, 0xB87534C1, 0x3EDF43EE,
0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E,
0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3
#else
static __constant__ uint32_t d_cw0[8][8];
static const uint32_t h_cw0[8][8] = {
#endif
0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6,
0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380,
0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8,
0x0B90D4A4, 0x3D6D039D, 0x25944D53, 0xBAA0E034, 0x5BC71E5A, 0xB1F4F2FE, 0x12CADE09, 0x548D41C3,
0x3CB4F80D, 0x36ECEBC4, 0xA66443EE, 0x43351ABD, 0xC7A20C49, 0xEB0BB366, 0xF5293F98, 0x49B6DE09,
0x531B29EA, 0x02E402E4, 0xDB25C405, 0x53D4E543, 0x0AD71720, 0xE1A61A04, 0xB87534C1, 0x3EDF43EE,
0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E,
0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3
};
__device__ __forceinline__ void Round8_0_final(uint32_t *A, int r, int s, int t, int u)
@ -1070,16 +1074,20 @@ __device__ __forceinline__ void Round8_0_final(uint32_t *A, int r, int s, int t, @@ -1070,16 +1074,20 @@ __device__ __forceinline__ void Round8_0_final(uint32_t *A, int r, int s, int t,
STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A);
}
#ifdef DEVICE_DIRECT_CONSTANTS
static __constant__ uint32_t d_cw1[8][8] = {
//static const uint32_t h_cw1[8][8] = {
0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7,
0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2,
0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A,
0xEBC4FD1C, 0x56B839D0, 0x5B0E21F7, 0x58E3DF7B, 0x5BC7427C, 0xEF613296, 0x1158109F, 0x5A55E318,
0xA7D6B703, 0x1158E76E, 0xB08255FF, 0x50F05771, 0xEEA8E8E0, 0xCB3FDB25, 0x2E40548D, 0xE1A60F2D,
0xACE5D616, 0xFD1CFD1C, 0x24DB3BFB, 0xAC2C1ABD, 0xF529E8E0, 0x1E5AE5FC, 0x478BCB3F, 0xC121BC12,
0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D,
0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80
#else
static __constant__ uint32_t d_cw1[8][8];
static const uint32_t h_cw1[8][8] = {
#endif
0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7,
0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2,
0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A,
0xEBC4FD1C, 0x56B839D0, 0x5B0E21F7, 0x58E3DF7B, 0x5BC7427C, 0xEF613296, 0x1158109F, 0x5A55E318,
0xA7D6B703, 0x1158E76E, 0xB08255FF, 0x50F05771, 0xEEA8E8E0, 0xCB3FDB25, 0x2E40548D, 0xE1A60F2D,
0xACE5D616, 0xFD1CFD1C, 0x24DB3BFB, 0xAC2C1ABD, 0xF529E8E0, 0x1E5AE5FC, 0x478BCB3F, 0xC121BC12,
0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D,
0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80
};
__device__ __forceinline__ void Round8_1_final(uint32_t *A, int r, int s, int t, int u)
@ -1094,16 +1102,20 @@ __device__ __forceinline__ void Round8_1_final(uint32_t *A, int r, int s, int t, @@ -1094,16 +1102,20 @@ __device__ __forceinline__ void Round8_1_final(uint32_t *A, int r, int s, int t,
STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A);
}
#ifdef DEVICE_DIRECT_CONSTANTS
static __constant__ uint32_t d_cw2[8][8] = {
//static const uint32_t h_cw2[8][8] = {
0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3,
0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3,
0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539,
0xE2E01D20, 0x2AC7D539, 0xC6A93957, 0x624C9DB4, 0x6C4F93B1, 0x641E9BE2, 0x452CBAD4, 0x263AD9C6,
0xC964369C, 0xC3053CFB, 0x452CBAD4, 0x95836A7D, 0x4AA2B55E, 0xAB5B54A5, 0xAC4453BC, 0x74808B80,
0xCB3634CA, 0xFC5C03A4, 0x4B8BB475, 0x21ADDE53, 0xE2E01D20, 0xDF3C20C4, 0xBD8F4271, 0xAA72558E,
0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468,
0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 0x949A6B66, 0x303DCFC3, 0x5932A6CE
#else
static __constant__ uint32_t d_cw2[8][8];
static const uint32_t h_cw2[8][8] = {
#endif
0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3,
0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3,
0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539,
0xE2E01D20, 0x2AC7D539, 0xC6A93957, 0x624C9DB4, 0x6C4F93B1, 0x641E9BE2, 0x452CBAD4, 0x263AD9C6,
0xC964369C, 0xC3053CFB, 0x452CBAD4, 0x95836A7D, 0x4AA2B55E, 0xAB5B54A5, 0xAC4453BC, 0x74808B80,
0xCB3634CA, 0xFC5C03A4, 0x4B8BB475, 0x21ADDE53, 0xE2E01D20, 0xDF3C20C4, 0xBD8F4271, 0xAA72558E,
0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468,
0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 0x949A6B66, 0x303DCFC3, 0x5932A6CE
};
__device__ __forceinline__ void Round8_2_final(uint32_t *A, int r, int s, int t, int u)
@ -1118,16 +1130,20 @@ __device__ __forceinline__ void Round8_2_final(uint32_t *A, int r, int s, int t, @@ -1118,16 +1130,20 @@ __device__ __forceinline__ void Round8_2_final(uint32_t *A, int r, int s, int t,
STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A);
}
#ifdef DEVICE_DIRECT_CONSTANTS
static __constant__ uint32_t d_cw3[8][8] = {
//static const uint32_t h_cw3[8][8] = {
0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D,
0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B,
0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A,
0xB38C4C74, 0xBAD4452C, 0x70DC8F24, 0xAB5B54A5, 0x46FEB902, 0x1A65E59B, 0x0DA7F259, 0xA32A5CD6,
0xD62229DE, 0xB81947E7, 0x6D3892C8, 0x15D8EA28, 0xE59B1A65, 0x065FF9A1, 0xB2A34D5D, 0x6A7D9583,
0x975568AB, 0xFC5C03A4, 0x2E6BD195, 0x966C6994, 0xF2590DA7, 0x263AD9C6, 0x5A1BA5E5, 0xB0D14F2F,
0x975568AB, 0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA,
0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D
#else
static __constant__ uint32_t d_cw3[8][8];
static const uint32_t h_cw3[8][8] = {
#endif
0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D,
0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B,
0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A,
0xB38C4C74, 0xBAD4452C, 0x70DC8F24, 0xAB5B54A5, 0x46FEB902, 0x1A65E59B, 0x0DA7F259, 0xA32A5CD6,
0xD62229DE, 0xB81947E7, 0x6D3892C8, 0x15D8EA28, 0xE59B1A65, 0x065FF9A1, 0xB2A34D5D, 0x6A7D9583,
0x975568AB, 0xFC5C03A4, 0x2E6BD195, 0x966C6994, 0xF2590DA7, 0x263AD9C6, 0x5A1BA5E5, 0xB0D14F2F,
0x975568AB, 0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA,
0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D
};
__device__ __forceinline__ void Round8_3_final(uint32_t *A, int r, int s, int t, int u)

94
x11/cuda_x11_simd512_sm2.cuh

@ -1,3 +1,9 @@ @@ -1,3 +1,9 @@
/***************************************************************************************************
* SM 2.x SIMD512 CUDA Implementation without shuffle
*
* cbuchner 2014 / tpruvot 2015
*/
#include "cuda_helper.h"
#ifdef __INTELLISENSE__
@ -9,7 +15,7 @@ @@ -9,7 +15,7 @@
#define T32(x) (x)
#ifndef DEVICE_DIRECT_CONSTANTS /* already made in SM 3+ implementation */
#if 0 /* already declared in SM 3+ implementation */
__constant__ uint32_t c_IV_512[32];
const uint32_t h_IV_512[32] = {
0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
@ -51,9 +57,7 @@ static const int h_FFT256_2_128_Twiddle[128] = { @@ -51,9 +57,7 @@ static const int h_FFT256_2_128_Twiddle[128] = {
};
#endif
__constant__ int c_FFT[256] =
//const int h_FFT[256] =
{
__constant__ int c_FFT[256] = {
// this is the FFT result in revbin permuted order
4, -4, 32, -32, -60, 60, 60, -60, 101, -101, 58, -58, 112, -112, -11, 11, -92, 92,
-119, 119, 42, -42, -82, 82, 32, -32, 32, -32, 121, -121, 17, -17, -47, 47, 63,
@ -73,7 +77,6 @@ __constant__ int c_FFT[256] = @@ -73,7 +77,6 @@ __constant__ int c_FFT[256] =
};
__constant__ int c_P8[32][8] = {
//static const int h_P8[32][8] = {
{ 2, 66, 34, 98, 18, 82, 50, 114 },
{ 6, 70, 38, 102, 22, 86, 54, 118 },
{ 0, 64, 32, 96, 16, 80, 48, 112 },
@ -109,7 +112,6 @@ __constant__ int c_P8[32][8] = { @@ -109,7 +112,6 @@ __constant__ int c_P8[32][8] = {
};
__constant__ int c_Q8[32][8] = {
//static const int h_Q8[32][8] = {
{ 130, 194, 162, 226, 146, 210, 178, 242 },
{ 134, 198, 166, 230, 150, 214, 182, 246 },
{ 128, 192, 160, 224, 144, 208, 176, 240 },
@ -153,8 +155,8 @@ __constant__ int c_Q8[32][8] = { @@ -153,8 +155,8 @@ __constant__ int c_Q8[32][8] = {
/************* the round function ****************/
#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
#define MAJ(x, y, z) (((z) & (y)) | (((z) | (y)) & (x)))
//#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
//#define MAJ(x, y, z) (((z) & (y)) | (((z) | (y)) & (x)))
__device__ __forceinline__
void STEP8_IF(const uint32_t *w, const int i, const int r, const int s, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D)
@ -193,7 +195,6 @@ void Round8(uint32_t A[32], const int y[256], int i, int r, int s, int t, int u) @@ -193,7 +195,6 @@ void Round8(uint32_t A[32], const int y[256], int i, int r, int s, int t, int u)
{
uint32_t w[8][8];
int code = i<2? 185: 233;
int a, b;
/*
* The FFT output y is in revbin permuted order,
@ -201,9 +202,9 @@ void Round8(uint32_t A[32], const int y[256], int i, int r, int s, int t, int u) @@ -201,9 +202,9 @@ void Round8(uint32_t A[32], const int y[256], int i, int r, int s, int t, int u)
*/
#pragma unroll 8
for(a=0; a<8; a++) {
for(int a=0; a<8; a++) {
#pragma unroll 8
for(b=0; b<8; b++) {
for(int b=0; b<8; b++) {
w[a][b] = __byte_perm( (y[c_P8[8*i+a][b]] * code), (y[c_Q8[8*i+a][b]] * code), 0x5410);
}
}
@ -244,27 +245,27 @@ void Round8(uint32_t A[32], const int y[256], int i, int r, int s, int t, int u) @@ -244,27 +245,27 @@ void Round8(uint32_t A[32], const int y[256], int i, int r, int s, int t, int u)
__device__ __forceinline__
void FFT_8(int *y, int stripe)
{
/*
* FFT_8 using w=4 as 8th root of unity
* Unrolled decimation in frequency (DIF) radix-2 NTT.
* Output data is in revbin_permuted order.
*/
#define X(i) y[stripe*i]
/*
* FFT_8 using w=4 as 8th root of unity
* Unrolled decimation in frequency (DIF) radix-2 NTT.
* Output data is in revbin_permuted order.
*/
#define X(i) y[stripe*i]
#define DO_REDUCE(i) \
X(i) = REDUCE(X(i))
#define DO_REDUCE(i) \
X(i) = REDUCE(X(i))
#define DO_REDUCE_FULL_S(i) do { \
X(i) = REDUCE(X(i)); \
X(i) = EXTRA_REDUCE_S(X(i)); \
} while(0)
#define DO_REDUCE_FULL_S(i) { \
X(i) = REDUCE(X(i)); \
X(i) = EXTRA_REDUCE_S(X(i)); \
}
#define BUTTERFLY(i,j,n) do { \
int u= X(i); \
int v= X(j); \
X(i) = u+v; \
X(j) = (u-v) << (2*n); \
} while(0)
#define BUTTERFLY(i,j,n) { \
int u= X(i); \
int v= X(j); \
X(i) = u+v; \
X(j) = (u-v) << (2*n); \
}
BUTTERFLY(0, 4, 0);
BUTTERFLY(1, 5, 1);
@ -295,10 +296,10 @@ void FFT_8(int *y, int stripe) @@ -295,10 +296,10 @@ void FFT_8(int *y, int stripe)
DO_REDUCE_FULL_S(6);
DO_REDUCE_FULL_S(7);
#undef X
#undef DO_REDUCE
#undef DO_REDUCE_FULL_S
#undef BUTTERFLY
#undef X
#undef DO_REDUCE
#undef DO_REDUCE_FULL_S
#undef BUTTERFLY
}
__device__ __forceinline__
@ -315,19 +316,17 @@ void FFT_16(int *y, int stripe) @@ -315,19 +316,17 @@ void FFT_16(int *y, int stripe)
#define DO_REDUCE(i) \
X(i) = REDUCE(X(i))
#define DO_REDUCE_FULL_S(i) \
do { \
#define DO_REDUCE_FULL_S(i) { \
X(i) = REDUCE(X(i)); \
X(i) = EXTRA_REDUCE_S(X(i)); \
} while(0)
}
#define BUTTERFLY(i,j,n) \
do { \
#define BUTTERFLY(i,j,n) { \
int u= X(i); \
int v= X(j); \
X(i) = u+v; \
X(j) = (u-v) << n; \
} while(0)
}
BUTTERFLY(0, 8, 0);
BUTTERFLY(1, 9, 1);
@ -396,10 +395,10 @@ void FFT_16(int *y, int stripe) @@ -396,10 +395,10 @@ void FFT_16(int *y, int stripe)
DO_REDUCE_FULL_S(14);
DO_REDUCE_FULL_S(15);
#undef X
#undef DO_REDUCE
#undef DO_REDUCE_FULL_S
#undef BUTTERFLY
#undef X
#undef DO_REDUCE
#undef DO_REDUCE_FULL_S
#undef BUTTERFLY
}
__device__ __forceinline__
@ -549,19 +548,16 @@ void x11_simd512_gpu_hash_64_sm2(const uint32_t threads, const uint32_t startNou @@ -549,19 +548,16 @@ void x11_simd512_gpu_hash_64_sm2(const uint32_t threads, const uint32_t startNou
#else
__global__ void x11_simd512_gpu_hash_64_sm2(const uint32_t threads, const uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) {}
#endif /* __CUDA_ARCH__ */
#endif /* __CUDA_ARCH__ < 300 */
__host__
static void x11_simd512_cpu_init_sm2(int thr_id)
static void x11_simd512_cpu_init_sm2(int thr_id)
{
#ifndef DEVICE_DIRECT_CONSTANTS
cudaMemcpyToSymbol( c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice);
#endif
// cudaMemcpyToSymbol( c_FFT, h_FFT, sizeof(h_FFT), 0, cudaMemcpyHostToDevice);
// cudaMemcpyToSymbol( c_P8, h_P8, sizeof(h_P8), 0, cudaMemcpyHostToDevice);
// cudaMemcpyToSymbol( c_Q8, h_Q8, sizeof(h_Q8), 0, cudaMemcpyHostToDevice);
}
__host__
@ -576,4 +572,4 @@ static void x11_simd512_cpu_hash_64_sm2(int thr_id, uint32_t threads, uint32_t s @@ -576,4 +572,4 @@ static void x11_simd512_cpu_hash_64_sm2(int thr_id, uint32_t threads, uint32_t s
x11_simd512_gpu_hash_64_sm2<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id);
}
}

Loading…
Cancel
Save