Browse Source

simd: cleanup and ignore linux host warning

master
Tanguy Pruvot 9 years ago
parent
commit
2308f555c3
  1. 3
      Makefile.am
  2. 35
      x11/cuda_x11_simd512.cu
  3. 88
      x11/cuda_x11_simd512_func.cuh
  4. 94
      x11/cuda_x11_simd512_sm2.cuh

3
Makefile.am

@ -112,6 +112,9 @@ x11/cuda_x11_luffa512.o: x11/cuda_x11_luffa512.cu
x11/cuda_x11_luffa512_Cubehash.o: x11/cuda_x11_luffa512_Cubehash.cu x11/cuda_x11_luffa512_Cubehash.o: x11/cuda_x11_luffa512_Cubehash.cu
$(NVCC) $(nvcc_FLAGS) --maxrregcount=76 -o $@ -c $< $(NVCC) $(nvcc_FLAGS) --maxrregcount=76 -o $@ -c $<
x11/cuda_x11_simd512.o: x11/cuda_x11_simd512.cu
$(NVCC) $(nvcc_FLAGS) -Xcompiler -Wno-unused-variable -o $@ -c $<
x13/cuda_x13_hamsi512.o: x13/cuda_x13_hamsi512.cu x13/cuda_x13_hamsi512.o: x13/cuda_x13_hamsi512.cu
$(NVCC) $(nvcc_FLAGS) --maxrregcount=72 -o $@ -c $< $(NVCC) $(nvcc_FLAGS) --maxrregcount=72 -o $@ -c $<

35
x11/cuda_x11_simd512.cu

@ -1,9 +1,6 @@
// Parallelization: /***************************************************************************************************
// * SIMD512 SM3+ CUDA IMPLEMENTATION (require cuda_x11_simd512_func.cuh)
// FFT_8 wird 2 times 8-fach parallel ausgeführt (in FFT_64) */
// and 1 time 16-fach parallel (in FFT_128_full)
//
// STEP8_IF and STEP8_MAJ beinhalten je 2x 8-fach parallel Operations
#include "miner.h" #include "miner.h"
#include "cuda_helper.h" #include "cuda_helper.h"
@ -34,7 +31,7 @@ const uint8_t h_perm[8][8] = {
{ 4, 5, 2, 3, 6, 7, 0, 1 } { 4, 5, 2, 3, 6, 7, 0, 1 }
}; };
/* for simd_functions.cuh */ /* used in cuda_x11_simd512_func.cuh (SIMD_Compress2) */
#ifdef DEVICE_DIRECT_CONSTANTS #ifdef DEVICE_DIRECT_CONSTANTS
__constant__ uint32_t c_IV_512[32] = { __constant__ uint32_t c_IV_512[32] = {
#else #else
@ -87,22 +84,18 @@ static const short h_FFT256_2_128_Twiddle[128] = {
-30, 55, -58, -65, -95, -40, -98, 94 -30, 55, -58, -65, -95, -40, -98, 94
}; };
/************* the round function ****************/
#define IF(x, y, z) (((y ^ z) & x) ^ z)
#define MAJ(x, y, z) ((z &y) | ((z|y) & x))
#include "cuda_x11_simd512_sm2.cuh" #include "cuda_x11_simd512_sm2.cuh"
#include "cuda_x11_simd512_func.cuh"
#ifdef __INTELLISENSE__ #ifdef __INTELLISENSE__
/* just for vstudio code colors */ /* just for vstudio code colors */
#define __CUDA_ARCH__ 500 #define __CUDA_ARCH__ 500
#endif #endif
/************* the round function ****************/
#undef IF
#undef MAJ
#define IF(x, y, z) (((y ^ z) & x) ^ z)
#define MAJ(x, y, z) ((z &y) | ((z|y) & x))
#include "x11/cuda_x11_simd512_func.cuh"
#if __CUDA_ARCH__ >= 300 #if __CUDA_ARCH__ >= 300
/********************* Message expansion ************************/ /********************* Message expansion ************************/
@ -127,6 +120,13 @@ static const short h_FFT256_2_128_Twiddle[128] = {
#define REDUCE_FULL_S(x) \ #define REDUCE_FULL_S(x) \
EXTRA_REDUCE_S(REDUCE(x)) EXTRA_REDUCE_S(REDUCE(x))
// Parallelization:
//
// FFT_8 wird 2 times 8-fach parallel ausgeführt (in FFT_64)
// and 1 time 16-fach parallel (in FFT_128_full)
//
// STEP8_IF and STEP8_MAJ beinhalten je 2x 8-fach parallel Operations
/** /**
* FFT_8 using w=4 as 8th root of unity * FFT_8 using w=4 as 8th root of unity
* Unrolled decimation in frequency (DIF) radix-2 NTT. * Unrolled decimation in frequency (DIF) radix-2 NTT.
@ -670,14 +670,13 @@ int x11_simd512_cpu_init(int thr_id, uint32_t threads)
CUDA_CALL_OR_RET_X(cudaMalloc(&d_temp4[thr_id], 64*sizeof(uint4)*threads), (int) err); /* todo: prevent -i 21 */ CUDA_CALL_OR_RET_X(cudaMalloc(&d_temp4[thr_id], 64*sizeof(uint4)*threads), (int) err); /* todo: prevent -i 21 */
CUDA_CALL_OR_RET_X(cudaMalloc(&d_state[thr_id], 32*sizeof(int)*threads), (int) err); CUDA_CALL_OR_RET_X(cudaMalloc(&d_state[thr_id], 32*sizeof(int)*threads), (int) err);
#ifndef DEVICE_DIRECT_CONSTANTS #ifndef DEVICE_DIRECT_CONSTANTS
cudaMemcpyToSymbol(c_perm, h_perm, sizeof(h_perm), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(c_perm, h_perm, sizeof(h_perm), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice);
#endif
#if 0
cudaMemcpyToSymbol(d_cw0, h_cw0, sizeof(h_cw0), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(d_cw0, h_cw0, sizeof(h_cw0), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice);

88
x11/cuda_x11_simd512_func.cuh

@ -1046,16 +1046,20 @@ __device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, cons
} }
} }
#ifdef DEVICE_DIRECT_CONSTANTS
static __constant__ uint32_t d_cw0[8][8] = { static __constant__ uint32_t d_cw0[8][8] = {
//static const uint32_t h_cw0[8][8] = { #else
0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6, static __constant__ uint32_t d_cw0[8][8];
0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380, static const uint32_t h_cw0[8][8] = {
0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8, #endif
0x0B90D4A4, 0x3D6D039D, 0x25944D53, 0xBAA0E034, 0x5BC71E5A, 0xB1F4F2FE, 0x12CADE09, 0x548D41C3, 0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6,
0x3CB4F80D, 0x36ECEBC4, 0xA66443EE, 0x43351ABD, 0xC7A20C49, 0xEB0BB366, 0xF5293F98, 0x49B6DE09, 0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380,
0x531B29EA, 0x02E402E4, 0xDB25C405, 0x53D4E543, 0x0AD71720, 0xE1A61A04, 0xB87534C1, 0x3EDF43EE, 0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8,
0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E, 0x0B90D4A4, 0x3D6D039D, 0x25944D53, 0xBAA0E034, 0x5BC71E5A, 0xB1F4F2FE, 0x12CADE09, 0x548D41C3,
0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3 0x3CB4F80D, 0x36ECEBC4, 0xA66443EE, 0x43351ABD, 0xC7A20C49, 0xEB0BB366, 0xF5293F98, 0x49B6DE09,
0x531B29EA, 0x02E402E4, 0xDB25C405, 0x53D4E543, 0x0AD71720, 0xE1A61A04, 0xB87534C1, 0x3EDF43EE,
0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E,
0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3
}; };
__device__ __forceinline__ void Round8_0_final(uint32_t *A, int r, int s, int t, int u) __device__ __forceinline__ void Round8_0_final(uint32_t *A, int r, int s, int t, int u)
@ -1070,16 +1074,20 @@ __device__ __forceinline__ void Round8_0_final(uint32_t *A, int r, int s, int t,
STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A); STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A);
} }
#ifdef DEVICE_DIRECT_CONSTANTS
static __constant__ uint32_t d_cw1[8][8] = { static __constant__ uint32_t d_cw1[8][8] = {
//static const uint32_t h_cw1[8][8] = { #else
0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7, static __constant__ uint32_t d_cw1[8][8];
0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2, static const uint32_t h_cw1[8][8] = {
0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A, #endif
0xEBC4FD1C, 0x56B839D0, 0x5B0E21F7, 0x58E3DF7B, 0x5BC7427C, 0xEF613296, 0x1158109F, 0x5A55E318, 0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7,
0xA7D6B703, 0x1158E76E, 0xB08255FF, 0x50F05771, 0xEEA8E8E0, 0xCB3FDB25, 0x2E40548D, 0xE1A60F2D, 0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2,
0xACE5D616, 0xFD1CFD1C, 0x24DB3BFB, 0xAC2C1ABD, 0xF529E8E0, 0x1E5AE5FC, 0x478BCB3F, 0xC121BC12, 0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A,
0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D, 0xEBC4FD1C, 0x56B839D0, 0x5B0E21F7, 0x58E3DF7B, 0x5BC7427C, 0xEF613296, 0x1158109F, 0x5A55E318,
0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80 0xA7D6B703, 0x1158E76E, 0xB08255FF, 0x50F05771, 0xEEA8E8E0, 0xCB3FDB25, 0x2E40548D, 0xE1A60F2D,
0xACE5D616, 0xFD1CFD1C, 0x24DB3BFB, 0xAC2C1ABD, 0xF529E8E0, 0x1E5AE5FC, 0x478BCB3F, 0xC121BC12,
0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D,
0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80
}; };
__device__ __forceinline__ void Round8_1_final(uint32_t *A, int r, int s, int t, int u) __device__ __forceinline__ void Round8_1_final(uint32_t *A, int r, int s, int t, int u)
@ -1094,16 +1102,20 @@ __device__ __forceinline__ void Round8_1_final(uint32_t *A, int r, int s, int t,
STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A); STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A);
} }
#ifdef DEVICE_DIRECT_CONSTANTS
static __constant__ uint32_t d_cw2[8][8] = { static __constant__ uint32_t d_cw2[8][8] = {
//static const uint32_t h_cw2[8][8] = { #else
0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3, static __constant__ uint32_t d_cw2[8][8];
0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3, static const uint32_t h_cw2[8][8] = {
0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539, #endif
0xE2E01D20, 0x2AC7D539, 0xC6A93957, 0x624C9DB4, 0x6C4F93B1, 0x641E9BE2, 0x452CBAD4, 0x263AD9C6, 0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3,
0xC964369C, 0xC3053CFB, 0x452CBAD4, 0x95836A7D, 0x4AA2B55E, 0xAB5B54A5, 0xAC4453BC, 0x74808B80, 0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3,
0xCB3634CA, 0xFC5C03A4, 0x4B8BB475, 0x21ADDE53, 0xE2E01D20, 0xDF3C20C4, 0xBD8F4271, 0xAA72558E, 0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539,
0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468, 0xE2E01D20, 0x2AC7D539, 0xC6A93957, 0x624C9DB4, 0x6C4F93B1, 0x641E9BE2, 0x452CBAD4, 0x263AD9C6,
0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 0x949A6B66, 0x303DCFC3, 0x5932A6CE 0xC964369C, 0xC3053CFB, 0x452CBAD4, 0x95836A7D, 0x4AA2B55E, 0xAB5B54A5, 0xAC4453BC, 0x74808B80,
0xCB3634CA, 0xFC5C03A4, 0x4B8BB475, 0x21ADDE53, 0xE2E01D20, 0xDF3C20C4, 0xBD8F4271, 0xAA72558E,
0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468,
0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 0x949A6B66, 0x303DCFC3, 0x5932A6CE
}; };
__device__ __forceinline__ void Round8_2_final(uint32_t *A, int r, int s, int t, int u) __device__ __forceinline__ void Round8_2_final(uint32_t *A, int r, int s, int t, int u)
@ -1118,16 +1130,20 @@ __device__ __forceinline__ void Round8_2_final(uint32_t *A, int r, int s, int t,
STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A); STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A);
} }
#ifdef DEVICE_DIRECT_CONSTANTS
static __constant__ uint32_t d_cw3[8][8] = { static __constant__ uint32_t d_cw3[8][8] = {
//static const uint32_t h_cw3[8][8] = { #else
0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D, static __constant__ uint32_t d_cw3[8][8];
0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B, static const uint32_t h_cw3[8][8] = {
0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A, #endif
0xB38C4C74, 0xBAD4452C, 0x70DC8F24, 0xAB5B54A5, 0x46FEB902, 0x1A65E59B, 0x0DA7F259, 0xA32A5CD6, 0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D,
0xD62229DE, 0xB81947E7, 0x6D3892C8, 0x15D8EA28, 0xE59B1A65, 0x065FF9A1, 0xB2A34D5D, 0x6A7D9583, 0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B,
0x975568AB, 0xFC5C03A4, 0x2E6BD195, 0x966C6994, 0xF2590DA7, 0x263AD9C6, 0x5A1BA5E5, 0xB0D14F2F, 0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A,
0x975568AB, 0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA, 0xB38C4C74, 0xBAD4452C, 0x70DC8F24, 0xAB5B54A5, 0x46FEB902, 0x1A65E59B, 0x0DA7F259, 0xA32A5CD6,
0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D 0xD62229DE, 0xB81947E7, 0x6D3892C8, 0x15D8EA28, 0xE59B1A65, 0x065FF9A1, 0xB2A34D5D, 0x6A7D9583,
0x975568AB, 0xFC5C03A4, 0x2E6BD195, 0x966C6994, 0xF2590DA7, 0x263AD9C6, 0x5A1BA5E5, 0xB0D14F2F,
0x975568AB, 0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA,
0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D
}; };
__device__ __forceinline__ void Round8_3_final(uint32_t *A, int r, int s, int t, int u) __device__ __forceinline__ void Round8_3_final(uint32_t *A, int r, int s, int t, int u)

94
x11/cuda_x11_simd512_sm2.cuh

@ -1,3 +1,9 @@
/***************************************************************************************************
* SM 2.x SIMD512 CUDA Implementation without shuffle
*
* cbuchner 2014 / tpruvot 2015
*/
#include "cuda_helper.h" #include "cuda_helper.h"
#ifdef __INTELLISENSE__ #ifdef __INTELLISENSE__
@ -9,7 +15,7 @@
#define T32(x) (x) #define T32(x) (x)
#ifndef DEVICE_DIRECT_CONSTANTS /* already made in SM 3+ implementation */ #if 0 /* already declared in SM 3+ implementation */
__constant__ uint32_t c_IV_512[32]; __constant__ uint32_t c_IV_512[32];
const uint32_t h_IV_512[32] = { const uint32_t h_IV_512[32] = {
0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558, 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
@ -51,9 +57,7 @@ static const int h_FFT256_2_128_Twiddle[128] = {
}; };
#endif #endif
__constant__ int c_FFT[256] = __constant__ int c_FFT[256] = {
//const int h_FFT[256] =
{
// this is the FFT result in revbin permuted order // this is the FFT result in revbin permuted order
4, -4, 32, -32, -60, 60, 60, -60, 101, -101, 58, -58, 112, -112, -11, 11, -92, 92, 4, -4, 32, -32, -60, 60, 60, -60, 101, -101, 58, -58, 112, -112, -11, 11, -92, 92,
-119, 119, 42, -42, -82, 82, 32, -32, 32, -32, 121, -121, 17, -17, -47, 47, 63, -119, 119, 42, -42, -82, 82, 32, -32, 32, -32, 121, -121, 17, -17, -47, 47, 63,
@ -73,7 +77,6 @@ __constant__ int c_FFT[256] =
}; };
__constant__ int c_P8[32][8] = { __constant__ int c_P8[32][8] = {
//static const int h_P8[32][8] = {
{ 2, 66, 34, 98, 18, 82, 50, 114 }, { 2, 66, 34, 98, 18, 82, 50, 114 },
{ 6, 70, 38, 102, 22, 86, 54, 118 }, { 6, 70, 38, 102, 22, 86, 54, 118 },
{ 0, 64, 32, 96, 16, 80, 48, 112 }, { 0, 64, 32, 96, 16, 80, 48, 112 },
@ -109,7 +112,6 @@ __constant__ int c_P8[32][8] = {
}; };
__constant__ int c_Q8[32][8] = { __constant__ int c_Q8[32][8] = {
//static const int h_Q8[32][8] = {
{ 130, 194, 162, 226, 146, 210, 178, 242 }, { 130, 194, 162, 226, 146, 210, 178, 242 },
{ 134, 198, 166, 230, 150, 214, 182, 246 }, { 134, 198, 166, 230, 150, 214, 182, 246 },
{ 128, 192, 160, 224, 144, 208, 176, 240 }, { 128, 192, 160, 224, 144, 208, 176, 240 },
@ -153,8 +155,8 @@ __constant__ int c_Q8[32][8] = {
/************* the round function ****************/ /************* the round function ****************/
#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) //#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
#define MAJ(x, y, z) (((z) & (y)) | (((z) | (y)) & (x))) //#define MAJ(x, y, z) (((z) & (y)) | (((z) | (y)) & (x)))
__device__ __forceinline__ __device__ __forceinline__
void STEP8_IF(const uint32_t *w, const int i, const int r, const int s, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D) void STEP8_IF(const uint32_t *w, const int i, const int r, const int s, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D)
@ -193,7 +195,6 @@ void Round8(uint32_t A[32], const int y[256], int i, int r, int s, int t, int u)
{ {
uint32_t w[8][8]; uint32_t w[8][8];
int code = i<2? 185: 233; int code = i<2? 185: 233;
int a, b;
/* /*
* The FFT output y is in revbin permuted order, * The FFT output y is in revbin permuted order,
@ -201,9 +202,9 @@ void Round8(uint32_t A[32], const int y[256], int i, int r, int s, int t, int u)
*/ */
#pragma unroll 8 #pragma unroll 8
for(a=0; a<8; a++) { for(int a=0; a<8; a++) {
#pragma unroll 8 #pragma unroll 8
for(b=0; b<8; b++) { for(int b=0; b<8; b++) {
w[a][b] = __byte_perm( (y[c_P8[8*i+a][b]] * code), (y[c_Q8[8*i+a][b]] * code), 0x5410); w[a][b] = __byte_perm( (y[c_P8[8*i+a][b]] * code), (y[c_Q8[8*i+a][b]] * code), 0x5410);
} }
} }
@ -244,27 +245,27 @@ void Round8(uint32_t A[32], const int y[256], int i, int r, int s, int t, int u)
__device__ __forceinline__ __device__ __forceinline__
void FFT_8(int *y, int stripe) void FFT_8(int *y, int stripe)
{ {
/* /*
* FFT_8 using w=4 as 8th root of unity * FFT_8 using w=4 as 8th root of unity
* Unrolled decimation in frequency (DIF) radix-2 NTT. * Unrolled decimation in frequency (DIF) radix-2 NTT.
* Output data is in revbin_permuted order. * Output data is in revbin_permuted order.
*/ */
#define X(i) y[stripe*i] #define X(i) y[stripe*i]
#define DO_REDUCE(i) \ #define DO_REDUCE(i) \
X(i) = REDUCE(X(i)) X(i) = REDUCE(X(i))
#define DO_REDUCE_FULL_S(i) do { \ #define DO_REDUCE_FULL_S(i) { \
X(i) = REDUCE(X(i)); \ X(i) = REDUCE(X(i)); \
X(i) = EXTRA_REDUCE_S(X(i)); \ X(i) = EXTRA_REDUCE_S(X(i)); \
} while(0) }
#define BUTTERFLY(i,j,n) do { \ #define BUTTERFLY(i,j,n) { \
int u= X(i); \ int u= X(i); \
int v= X(j); \ int v= X(j); \
X(i) = u+v; \ X(i) = u+v; \
X(j) = (u-v) << (2*n); \ X(j) = (u-v) << (2*n); \
} while(0) }
BUTTERFLY(0, 4, 0); BUTTERFLY(0, 4, 0);
BUTTERFLY(1, 5, 1); BUTTERFLY(1, 5, 1);
@ -295,10 +296,10 @@ void FFT_8(int *y, int stripe)
DO_REDUCE_FULL_S(6); DO_REDUCE_FULL_S(6);
DO_REDUCE_FULL_S(7); DO_REDUCE_FULL_S(7);
#undef X #undef X
#undef DO_REDUCE #undef DO_REDUCE
#undef DO_REDUCE_FULL_S #undef DO_REDUCE_FULL_S
#undef BUTTERFLY #undef BUTTERFLY
} }
__device__ __forceinline__ __device__ __forceinline__
@ -315,19 +316,17 @@ void FFT_16(int *y, int stripe)
#define DO_REDUCE(i) \ #define DO_REDUCE(i) \
X(i) = REDUCE(X(i)) X(i) = REDUCE(X(i))
#define DO_REDUCE_FULL_S(i) \ #define DO_REDUCE_FULL_S(i) { \
do { \
X(i) = REDUCE(X(i)); \ X(i) = REDUCE(X(i)); \
X(i) = EXTRA_REDUCE_S(X(i)); \ X(i) = EXTRA_REDUCE_S(X(i)); \
} while(0) }
#define BUTTERFLY(i,j,n) \ #define BUTTERFLY(i,j,n) { \
do { \
int u= X(i); \ int u= X(i); \
int v= X(j); \ int v= X(j); \
X(i) = u+v; \ X(i) = u+v; \
X(j) = (u-v) << n; \ X(j) = (u-v) << n; \
} while(0) }
BUTTERFLY(0, 8, 0); BUTTERFLY(0, 8, 0);
BUTTERFLY(1, 9, 1); BUTTERFLY(1, 9, 1);
@ -396,10 +395,10 @@ void FFT_16(int *y, int stripe)
DO_REDUCE_FULL_S(14); DO_REDUCE_FULL_S(14);
DO_REDUCE_FULL_S(15); DO_REDUCE_FULL_S(15);
#undef X #undef X
#undef DO_REDUCE #undef DO_REDUCE
#undef DO_REDUCE_FULL_S #undef DO_REDUCE_FULL_S
#undef BUTTERFLY #undef BUTTERFLY
} }
__device__ __forceinline__ __device__ __forceinline__
@ -549,19 +548,16 @@ void x11_simd512_gpu_hash_64_sm2(const uint32_t threads, const uint32_t startNou
#else #else
__global__ void x11_simd512_gpu_hash_64_sm2(const uint32_t threads, const uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) {} __global__ void x11_simd512_gpu_hash_64_sm2(const uint32_t threads, const uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) {}
#endif /* __CUDA_ARCH__ */ #endif /* __CUDA_ARCH__ < 300 */
__host__ __host__
static void x11_simd512_cpu_init_sm2(int thr_id) static void x11_simd512_cpu_init_sm2(int thr_id)
{ {
#ifndef DEVICE_DIRECT_CONSTANTS #ifndef DEVICE_DIRECT_CONSTANTS
cudaMemcpyToSymbol( c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol( c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol( c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol( c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice);
#endif #endif
// cudaMemcpyToSymbol( c_FFT, h_FFT, sizeof(h_FFT), 0, cudaMemcpyHostToDevice);
// cudaMemcpyToSymbol( c_P8, h_P8, sizeof(h_P8), 0, cudaMemcpyHostToDevice);
// cudaMemcpyToSymbol( c_Q8, h_Q8, sizeof(h_Q8), 0, cudaMemcpyHostToDevice);
} }
__host__ __host__
@ -576,4 +572,4 @@ static void x11_simd512_cpu_hash_64_sm2(int thr_id, uint32_t threads, uint32_t s
x11_simd512_gpu_hash_64_sm2<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); x11_simd512_gpu_hash_64_sm2<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
} }

Loading…
Cancel
Save