From 2308f555c3237e4ddf0e8ee91eaa185fcbf3b536 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 1 Nov 2015 13:34:44 +0100 Subject: [PATCH] simd: cleanup and ignore linux host warning --- Makefile.am | 3 ++ x11/cuda_x11_simd512.cu | 35 +++++++------ x11/cuda_x11_simd512_func.cuh | 88 ++++++++++++++++++-------------- x11/cuda_x11_simd512_sm2.cuh | 94 +++++++++++++++++------------------ 4 files changed, 117 insertions(+), 103 deletions(-) diff --git a/Makefile.am b/Makefile.am index b4e91d0..e53fb46 100644 --- a/Makefile.am +++ b/Makefile.am @@ -112,6 +112,9 @@ x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu x11/cuda_x11_luffa512_Cubehash.o: x11/cuda_x11_luffa512_Cubehash.cu $(NVCC) $(nvcc_FLAGS) --maxrregcount=76 -o $@ -c $< +x11/cuda_x11_simd512.o: x11/cuda_x11_simd512.cu + $(NVCC) $(nvcc_FLAGS) -Xcompiler -Wno-unused-variable -o $@ -c $< + x13/cuda_x13_hamsi512.o: x13/cuda_x13_hamsi512.cu $(NVCC) $(nvcc_FLAGS) --maxrregcount=72 -o $@ -c $< diff --git a/x11/cuda_x11_simd512.cu b/x11/cuda_x11_simd512.cu index 48958b2..5495eda 100644 --- a/x11/cuda_x11_simd512.cu +++ b/x11/cuda_x11_simd512.cu @@ -1,9 +1,6 @@ -// Parallelization: -// -// FFT_8 wird 2 times 8-fach parallel ausgeführt (in FFT_64) -// and 1 time 16-fach parallel (in FFT_128_full) -// -// STEP8_IF and STEP8_MAJ beinhalten je 2x 8-fach parallel Operations +/*************************************************************************************************** + * SIMD512 SM3+ CUDA IMPLEMENTATION (require cuda_x11_simd512_func.cuh) + */ #include "miner.h" #include "cuda_helper.h" @@ -34,7 +31,7 @@ const uint8_t h_perm[8][8] = { { 4, 5, 2, 3, 6, 7, 0, 1 } }; -/* for simd_functions.cuh */ +/* used in cuda_x11_simd512_func.cuh (SIMD_Compress2) */ #ifdef DEVICE_DIRECT_CONSTANTS __constant__ uint32_t c_IV_512[32] = { #else @@ -87,22 +84,18 @@ static const short h_FFT256_2_128_Twiddle[128] = { -30, 55, -58, -65, -95, -40, -98, 94 }; +/************* the round function ****************/ +#define IF(x, y, z) (((y ^ z) & x) ^ z) +#define MAJ(x, y, z) ((z &y) | ((z|y) & x)) + #include "cuda_x11_simd512_sm2.cuh" +#include "cuda_x11_simd512_func.cuh" #ifdef __INTELLISENSE__ /* just for vstudio code colors */ #define __CUDA_ARCH__ 500 #endif -/************* the round function ****************/ - -#undef IF -#undef MAJ -#define IF(x, y, z) (((y ^ z) & x) ^ z) -#define MAJ(x, y, z) ((z &y) | ((z|y) & x)) - -#include "x11/cuda_x11_simd512_func.cuh" - #if __CUDA_ARCH__ >= 300 /********************* Message expansion ************************/ @@ -127,6 +120,13 @@ static const short h_FFT256_2_128_Twiddle[128] = { #define REDUCE_FULL_S(x) \ EXTRA_REDUCE_S(REDUCE(x)) +// Parallelization: +// +// FFT_8 wird 2 times 8-fach parallel ausgeführt (in FFT_64) +// and 1 time 16-fach parallel (in FFT_128_full) +// +// STEP8_IF and STEP8_MAJ beinhalten je 2x 8-fach parallel Operations + /** * FFT_8 using w=4 as 8th root of unity * Unrolled decimation in frequency (DIF) radix-2 NTT. @@ -670,14 +670,13 @@ int x11_simd512_cpu_init(int thr_id, uint32_t threads) CUDA_CALL_OR_RET_X(cudaMalloc(&d_temp4[thr_id], 64*sizeof(uint4)*threads), (int) err); /* todo: prevent -i 21 */ CUDA_CALL_OR_RET_X(cudaMalloc(&d_state[thr_id], 32*sizeof(int)*threads), (int) err); + #ifndef DEVICE_DIRECT_CONSTANTS cudaMemcpyToSymbol(c_perm, h_perm, sizeof(h_perm), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice); -#endif -#if 0 cudaMemcpyToSymbol(d_cw0, h_cw0, sizeof(h_cw0), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice); diff --git a/x11/cuda_x11_simd512_func.cuh b/x11/cuda_x11_simd512_func.cuh index 7128d61..f61eaa4 100644 --- a/x11/cuda_x11_simd512_func.cuh +++ b/x11/cuda_x11_simd512_func.cuh @@ -1046,16 +1046,20 @@ __device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, cons } } +#ifdef DEVICE_DIRECT_CONSTANTS static __constant__ uint32_t d_cw0[8][8] = { -//static const uint32_t h_cw0[8][8] = { - 0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6, - 0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380, - 0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8, - 0x0B90D4A4, 0x3D6D039D, 0x25944D53, 0xBAA0E034, 0x5BC71E5A, 0xB1F4F2FE, 0x12CADE09, 0x548D41C3, - 0x3CB4F80D, 0x36ECEBC4, 0xA66443EE, 0x43351ABD, 0xC7A20C49, 0xEB0BB366, 0xF5293F98, 0x49B6DE09, - 0x531B29EA, 0x02E402E4, 0xDB25C405, 0x53D4E543, 0x0AD71720, 0xE1A61A04, 0xB87534C1, 0x3EDF43EE, - 0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E, - 0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3 +#else +static __constant__ uint32_t d_cw0[8][8]; +static const uint32_t h_cw0[8][8] = { +#endif + 0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6, + 0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380, + 0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8, + 0x0B90D4A4, 0x3D6D039D, 0x25944D53, 0xBAA0E034, 0x5BC71E5A, 0xB1F4F2FE, 0x12CADE09, 0x548D41C3, + 0x3CB4F80D, 0x36ECEBC4, 0xA66443EE, 0x43351ABD, 0xC7A20C49, 0xEB0BB366, 0xF5293F98, 0x49B6DE09, + 0x531B29EA, 0x02E402E4, 0xDB25C405, 0x53D4E543, 0x0AD71720, 0xE1A61A04, 0xB87534C1, 0x3EDF43EE, + 0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E, + 0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3 }; __device__ __forceinline__ void Round8_0_final(uint32_t *A, int r, int s, int t, int u) @@ -1070,16 +1074,20 @@ __device__ __forceinline__ void Round8_0_final(uint32_t *A, int r, int s, int t, STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A); } +#ifdef DEVICE_DIRECT_CONSTANTS static __constant__ uint32_t d_cw1[8][8] = { -//static const uint32_t h_cw1[8][8] = { - 0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7, - 0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2, - 0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A, - 0xEBC4FD1C, 0x56B839D0, 0x5B0E21F7, 0x58E3DF7B, 0x5BC7427C, 0xEF613296, 0x1158109F, 0x5A55E318, - 0xA7D6B703, 0x1158E76E, 0xB08255FF, 0x50F05771, 0xEEA8E8E0, 0xCB3FDB25, 0x2E40548D, 0xE1A60F2D, - 0xACE5D616, 0xFD1CFD1C, 0x24DB3BFB, 0xAC2C1ABD, 0xF529E8E0, 0x1E5AE5FC, 0x478BCB3F, 0xC121BC12, - 0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D, - 0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80 +#else +static __constant__ uint32_t d_cw1[8][8]; +static const uint32_t h_cw1[8][8] = { +#endif + 0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7, + 0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2, + 0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A, + 0xEBC4FD1C, 0x56B839D0, 0x5B0E21F7, 0x58E3DF7B, 0x5BC7427C, 0xEF613296, 0x1158109F, 0x5A55E318, + 0xA7D6B703, 0x1158E76E, 0xB08255FF, 0x50F05771, 0xEEA8E8E0, 0xCB3FDB25, 0x2E40548D, 0xE1A60F2D, + 0xACE5D616, 0xFD1CFD1C, 0x24DB3BFB, 0xAC2C1ABD, 0xF529E8E0, 0x1E5AE5FC, 0x478BCB3F, 0xC121BC12, + 0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D, + 0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80 }; __device__ __forceinline__ void Round8_1_final(uint32_t *A, int r, int s, int t, int u) @@ -1094,16 +1102,20 @@ __device__ __forceinline__ void Round8_1_final(uint32_t *A, int r, int s, int t, STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A); } +#ifdef DEVICE_DIRECT_CONSTANTS static __constant__ uint32_t d_cw2[8][8] = { -//static const uint32_t h_cw2[8][8] = { - 0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3, - 0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3, - 0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539, - 0xE2E01D20, 0x2AC7D539, 0xC6A93957, 0x624C9DB4, 0x6C4F93B1, 0x641E9BE2, 0x452CBAD4, 0x263AD9C6, - 0xC964369C, 0xC3053CFB, 0x452CBAD4, 0x95836A7D, 0x4AA2B55E, 0xAB5B54A5, 0xAC4453BC, 0x74808B80, - 0xCB3634CA, 0xFC5C03A4, 0x4B8BB475, 0x21ADDE53, 0xE2E01D20, 0xDF3C20C4, 0xBD8F4271, 0xAA72558E, - 0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468, - 0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 0x949A6B66, 0x303DCFC3, 0x5932A6CE +#else +static __constant__ uint32_t d_cw2[8][8]; +static const uint32_t h_cw2[8][8] = { +#endif + 0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3, + 0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3, + 0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539, + 0xE2E01D20, 0x2AC7D539, 0xC6A93957, 0x624C9DB4, 0x6C4F93B1, 0x641E9BE2, 0x452CBAD4, 0x263AD9C6, + 0xC964369C, 0xC3053CFB, 0x452CBAD4, 0x95836A7D, 0x4AA2B55E, 0xAB5B54A5, 0xAC4453BC, 0x74808B80, + 0xCB3634CA, 0xFC5C03A4, 0x4B8BB475, 0x21ADDE53, 0xE2E01D20, 0xDF3C20C4, 0xBD8F4271, 0xAA72558E, + 0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468, + 0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 0x949A6B66, 0x303DCFC3, 0x5932A6CE }; __device__ __forceinline__ void Round8_2_final(uint32_t *A, int r, int s, int t, int u) @@ -1118,16 +1130,20 @@ __device__ __forceinline__ void Round8_2_final(uint32_t *A, int r, int s, int t, STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A); } +#ifdef DEVICE_DIRECT_CONSTANTS static __constant__ uint32_t d_cw3[8][8] = { -//static const uint32_t h_cw3[8][8] = { - 0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D, - 0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B, - 0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A, - 0xB38C4C74, 0xBAD4452C, 0x70DC8F24, 0xAB5B54A5, 0x46FEB902, 0x1A65E59B, 0x0DA7F259, 0xA32A5CD6, - 0xD62229DE, 0xB81947E7, 0x6D3892C8, 0x15D8EA28, 0xE59B1A65, 0x065FF9A1, 0xB2A34D5D, 0x6A7D9583, - 0x975568AB, 0xFC5C03A4, 0x2E6BD195, 0x966C6994, 0xF2590DA7, 0x263AD9C6, 0x5A1BA5E5, 0xB0D14F2F, - 0x975568AB, 0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA, - 0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D +#else +static __constant__ uint32_t d_cw3[8][8]; +static const uint32_t h_cw3[8][8] = { +#endif + 0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D, + 0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B, + 0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A, + 0xB38C4C74, 0xBAD4452C, 0x70DC8F24, 0xAB5B54A5, 0x46FEB902, 0x1A65E59B, 0x0DA7F259, 0xA32A5CD6, + 0xD62229DE, 0xB81947E7, 0x6D3892C8, 0x15D8EA28, 0xE59B1A65, 0x065FF9A1, 0xB2A34D5D, 0x6A7D9583, + 0x975568AB, 0xFC5C03A4, 0x2E6BD195, 0x966C6994, 0xF2590DA7, 0x263AD9C6, 0x5A1BA5E5, 0xB0D14F2F, + 0x975568AB, 0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA, + 0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D }; __device__ __forceinline__ void Round8_3_final(uint32_t *A, int r, int s, int t, int u) diff --git a/x11/cuda_x11_simd512_sm2.cuh b/x11/cuda_x11_simd512_sm2.cuh index 7ed927c..1c5b314 100644 --- a/x11/cuda_x11_simd512_sm2.cuh +++ b/x11/cuda_x11_simd512_sm2.cuh @@ -1,3 +1,9 @@ +/*************************************************************************************************** + * SM 2.x SIMD512 CUDA Implementation without shuffle + * + * cbuchner 2014 / tpruvot 2015 + */ + #include "cuda_helper.h" #ifdef __INTELLISENSE__ @@ -9,7 +15,7 @@ #define T32(x) (x) -#ifndef DEVICE_DIRECT_CONSTANTS /* already made in SM 3+ implementation */ +#if 0 /* already declared in SM 3+ implementation */ __constant__ uint32_t c_IV_512[32]; const uint32_t h_IV_512[32] = { 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558, @@ -51,9 +57,7 @@ static const int h_FFT256_2_128_Twiddle[128] = { }; #endif -__constant__ int c_FFT[256] = -//const int h_FFT[256] = -{ +__constant__ int c_FFT[256] = { // this is the FFT result in revbin permuted order 4, -4, 32, -32, -60, 60, 60, -60, 101, -101, 58, -58, 112, -112, -11, 11, -92, 92, -119, 119, 42, -42, -82, 82, 32, -32, 32, -32, 121, -121, 17, -17, -47, 47, 63, @@ -73,7 +77,6 @@ __constant__ int c_FFT[256] = }; __constant__ int c_P8[32][8] = { -//static const int h_P8[32][8] = { { 2, 66, 34, 98, 18, 82, 50, 114 }, { 6, 70, 38, 102, 22, 86, 54, 118 }, { 0, 64, 32, 96, 16, 80, 48, 112 }, @@ -109,7 +112,6 @@ __constant__ int c_P8[32][8] = { }; __constant__ int c_Q8[32][8] = { -//static const int h_Q8[32][8] = { { 130, 194, 162, 226, 146, 210, 178, 242 }, { 134, 198, 166, 230, 150, 214, 182, 246 }, { 128, 192, 160, 224, 144, 208, 176, 240 }, @@ -153,8 +155,8 @@ __constant__ int c_Q8[32][8] = { /************* the round function ****************/ -#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) -#define MAJ(x, y, z) (((z) & (y)) | (((z) | (y)) & (x))) +//#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) +//#define MAJ(x, y, z) (((z) & (y)) | (((z) | (y)) & (x))) __device__ __forceinline__ void STEP8_IF(const uint32_t *w, const int i, const int r, const int s, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D) @@ -193,7 +195,6 @@ void Round8(uint32_t A[32], const int y[256], int i, int r, int s, int t, int u) { uint32_t w[8][8]; int code = i<2? 185: 233; - int a, b; /* * The FFT output y is in revbin permuted order, @@ -201,9 +202,9 @@ void Round8(uint32_t A[32], const int y[256], int i, int r, int s, int t, int u) */ #pragma unroll 8 - for(a=0; a<8; a++) { + for(int a=0; a<8; a++) { #pragma unroll 8 - for(b=0; b<8; b++) { + for(int b=0; b<8; b++) { w[a][b] = __byte_perm( (y[c_P8[8*i+a][b]] * code), (y[c_Q8[8*i+a][b]] * code), 0x5410); } } @@ -244,27 +245,27 @@ void Round8(uint32_t A[32], const int y[256], int i, int r, int s, int t, int u) __device__ __forceinline__ void FFT_8(int *y, int stripe) { - /* - * FFT_8 using w=4 as 8th root of unity - * Unrolled decimation in frequency (DIF) radix-2 NTT. - * Output data is in revbin_permuted order. - */ -#define X(i) y[stripe*i] + /* + * FFT_8 using w=4 as 8th root of unity + * Unrolled decimation in frequency (DIF) radix-2 NTT. + * Output data is in revbin_permuted order. + */ + #define X(i) y[stripe*i] -#define DO_REDUCE(i) \ - X(i) = REDUCE(X(i)) + #define DO_REDUCE(i) \ + X(i) = REDUCE(X(i)) -#define DO_REDUCE_FULL_S(i) do { \ - X(i) = REDUCE(X(i)); \ - X(i) = EXTRA_REDUCE_S(X(i)); \ -} while(0) + #define DO_REDUCE_FULL_S(i) { \ + X(i) = REDUCE(X(i)); \ + X(i) = EXTRA_REDUCE_S(X(i)); \ + } -#define BUTTERFLY(i,j,n) do { \ - int u= X(i); \ - int v= X(j); \ - X(i) = u+v; \ - X(j) = (u-v) << (2*n); \ -} while(0) + #define BUTTERFLY(i,j,n) { \ + int u= X(i); \ + int v= X(j); \ + X(i) = u+v; \ + X(j) = (u-v) << (2*n); \ + } BUTTERFLY(0, 4, 0); BUTTERFLY(1, 5, 1); @@ -295,10 +296,10 @@ void FFT_8(int *y, int stripe) DO_REDUCE_FULL_S(6); DO_REDUCE_FULL_S(7); -#undef X -#undef DO_REDUCE -#undef DO_REDUCE_FULL_S -#undef BUTTERFLY + #undef X + #undef DO_REDUCE + #undef DO_REDUCE_FULL_S + #undef BUTTERFLY } __device__ __forceinline__ @@ -315,19 +316,17 @@ void FFT_16(int *y, int stripe) #define DO_REDUCE(i) \ X(i) = REDUCE(X(i)) - #define DO_REDUCE_FULL_S(i) \ - do { \ + #define DO_REDUCE_FULL_S(i) { \ X(i) = REDUCE(X(i)); \ X(i) = EXTRA_REDUCE_S(X(i)); \ - } while(0) + } - #define BUTTERFLY(i,j,n) \ - do { \ + #define BUTTERFLY(i,j,n) { \ int u= X(i); \ int v= X(j); \ X(i) = u+v; \ X(j) = (u-v) << n; \ - } while(0) + } BUTTERFLY(0, 8, 0); BUTTERFLY(1, 9, 1); @@ -396,10 +395,10 @@ void FFT_16(int *y, int stripe) DO_REDUCE_FULL_S(14); DO_REDUCE_FULL_S(15); -#undef X -#undef DO_REDUCE -#undef DO_REDUCE_FULL_S -#undef BUTTERFLY + #undef X + #undef DO_REDUCE + #undef DO_REDUCE_FULL_S + #undef BUTTERFLY } __device__ __forceinline__ @@ -549,19 +548,16 @@ void x11_simd512_gpu_hash_64_sm2(const uint32_t threads, const uint32_t startNou #else __global__ void x11_simd512_gpu_hash_64_sm2(const uint32_t threads, const uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) {} -#endif /* __CUDA_ARCH__ */ +#endif /* __CUDA_ARCH__ < 300 */ __host__ -static void x11_simd512_cpu_init_sm2(int thr_id) +static void x11_simd512_cpu_init_sm2(int thr_id) { #ifndef DEVICE_DIRECT_CONSTANTS cudaMemcpyToSymbol( c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol( c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol( c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice); #endif -// cudaMemcpyToSymbol( c_FFT, h_FFT, sizeof(h_FFT), 0, cudaMemcpyHostToDevice); -// cudaMemcpyToSymbol( c_P8, h_P8, sizeof(h_P8), 0, cudaMemcpyHostToDevice); -// cudaMemcpyToSymbol( c_Q8, h_Q8, sizeof(h_Q8), 0, cudaMemcpyHostToDevice); } __host__ @@ -576,4 +572,4 @@ static void x11_simd512_cpu_hash_64_sm2(int thr_id, uint32_t threads, uint32_t s x11_simd512_gpu_hash_64_sm2<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); MyStreamSynchronize(NULL, order, thr_id); -} \ No newline at end of file +}