diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu index c326942..64d7aab 100644 --- a/JHA/jackpotcoin.cu +++ b/JHA/jackpotcoin.cu @@ -89,7 +89,7 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x000f; + ptarget[7] = 0x000f; if (!init[thr_id]) { @@ -100,7 +100,6 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc proper_exit(EXIT_CODE_CUDA_ERROR); } - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); jackpot_keccak512_cpu_init(thr_id, throughput); diff --git a/bench.cpp b/bench.cpp index aab5d0f..6e94e31 100644 --- a/bench.cpp +++ b/bench.cpp @@ -100,16 +100,12 @@ bool bench_algo_switch_next(int thr_id) if (device_sm[dev_id] && device_sm[dev_id] < 300) { // incompatible SM 2.1 kernels... - if (algo == ALGO_FRESH) algo++; if (algo == ALGO_GROESTL) algo++; if (algo == ALGO_MYR_GR) algo++; - if (algo == ALGO_JACKPOT) algo++; + if (algo == ALGO_JACKPOT) algo++; // compact shuffle if (algo == ALGO_LYRA2v2) algo++; if (algo == ALGO_NEOSCRYPT) algo++; - if (algo == ALGO_QUARK) algo++; - if (algo == ALGO_QUBIT) algo++; - if (algo == ALGO_S3) algo++; // to check... - while (algo >= ALGO_X11 && algo <= ALGO_X17) algo++; + if (algo == ALGO_QUARK) algo++; // todo if (algo == ALGO_WHIRLPOOLX) algo++; } // and unwanted ones... diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 0f1389f..e6c8ebc 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -251,6 +251,7 @@ + 76 @@ -463,9 +464,7 @@ 48 - - true - + @@ -476,15 +475,13 @@ 128 + 64 - - true - @@ -530,4 +527,4 @@ - + \ No newline at end of file diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index bfe4539..d427426 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -443,6 +443,9 @@ Source Files\CUDA\quark + + Source Files\CUDA\x11 + @@ -523,9 +526,9 @@ Source Files\CUDA - + Source Files\CUDA\x11 - + Source Files\CUDA\x11 @@ -544,6 +547,9 @@ Source Files\CUDA\x11 + + Source Files\CUDA\x11 + Source Files\CUDA\x11 @@ -553,9 +559,6 @@ Source Files\CUDA\x11 - - Source Files\CUDA\x11 - Source Files\CUDA\x13 @@ -707,4 +710,4 @@ Ressources - + \ No newline at end of file diff --git a/pools.cpp b/pools.cpp index fb295d7..51405ef 100644 --- a/pools.cpp +++ b/pools.cpp @@ -360,6 +360,7 @@ bool parse_pool_array(json_t *obj) void pool_dump_infos() { struct pool_infos *p; + if (opt_benchmark) return; for (int i=0; i>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); //MyStreamSynchronize(NULL, order, thr_id); } @@ -1465,8 +1466,8 @@ void x11_shavite512_cpu_init(int thr_id, uint32_t threads) __host__ void x11_shavite512_setBlock_80(void *pdata) { - // Message mit Padding bereitstellen - // lediglich die korrekte Nonce ist noch ab Byte 76 einzusetzen. + // Message with Padding + // The nonce is at Byte 76. unsigned char PaddedMessage[128]; memcpy(PaddedMessage, pdata, 80); memset(PaddedMessage+80, 0, 48); diff --git a/x11/cuda_x11_simd512.cu b/x11/cuda_x11_simd512.cu index 797fa49..48958b2 100644 --- a/x11/cuda_x11_simd512.cu +++ b/x11/cuda_x11_simd512.cu @@ -5,11 +5,11 @@ // // STEP8_IF and STEP8_MAJ beinhalten je 2x 8-fach parallel Operations -#define TPB 128 - #include "miner.h" #include "cuda_helper.h" +#define TPB 128 + uint32_t *d_state[MAX_GPUS]; uint4 *d_temp4[MAX_GPUS]; @@ -34,7 +34,7 @@ const uint8_t h_perm[8][8] = { { 4, 5, 2, 3, 6, 7, 0, 1 } }; -/* for simd_functions.cu */ +/* for simd_functions.cuh */ #ifdef DEVICE_DIRECT_CONSTANTS __constant__ uint32_t c_IV_512[32] = { #else @@ -87,15 +87,23 @@ static const short h_FFT256_2_128_Twiddle[128] = { -30, 55, -58, -65, -95, -40, -98, 94 }; +#include "cuda_x11_simd512_sm2.cuh" -/************* the round function ****************/ +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#define __CUDA_ARCH__ 500 +#endif +/************* the round function ****************/ +#undef IF +#undef MAJ #define IF(x, y, z) (((y ^ z) & x) ^ z) #define MAJ(x, y, z) ((z &y) | ((z|y) & x)) +#include "x11/cuda_x11_simd512_func.cuh" -#include "x11/simd_functions.cu" +#if __CUDA_ARCH__ >= 300 /********************* Message expansion ************************/ @@ -347,7 +355,6 @@ void FFT_256_halfzero(int y[256]) FFT_128_full(y+16); } - /***************************************************/ __device__ __forceinline__ @@ -643,10 +650,23 @@ void x11_simd512_gpu_final_64(uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, } } +#else +__global__ void x11_simd512_gpu_expand_64(uint32_t threads, uint32_t *g_hash, uint4 *g_temp4) {} +__global__ void x11_simd512_gpu_compress1_64(uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) {} +__global__ void x11_simd512_gpu_compress2_64(uint32_t threads, uint4 *g_fft4, uint32_t *g_state) {} +__global__ void x11_simd512_gpu_compress_64_maxwell(uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) {} +__global__ void x11_simd512_gpu_final_64(uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) {} +#endif /* SM3+ */ + __host__ int x11_simd512_cpu_init(int thr_id, uint32_t threads) { + int dev_id = device_map[thr_id]; cuda_get_arch(thr_id); + if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) { + x11_simd512_cpu_init_sm2(thr_id); + return 0; + } CUDA_CALL_OR_RET_X(cudaMalloc(&d_temp4[thr_id], 64*sizeof(uint4)*threads), (int) err); /* todo: prevent -i 21 */ CUDA_CALL_OR_RET_X(cudaMalloc(&d_state[thr_id], 32*sizeof(int)*threads), (int) err); @@ -656,10 +676,13 @@ int x11_simd512_cpu_init(int thr_id, uint32_t threads) cudaMemcpyToSymbol(c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice); #endif + +#if 0 cudaMemcpyToSymbol(d_cw0, h_cw0, sizeof(h_cw0), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(d_cw3, h_cw3, sizeof(h_cw3), 0, cudaMemcpyHostToDevice); +#endif // Texture for 128-Bit Zugriffe cudaChannelFormatDesc channelDesc128 = cudaCreateChannelDesc(); @@ -675,27 +698,31 @@ int x11_simd512_cpu_init(int thr_id, uint32_t threads) __host__ void x11_simd512_cpu_free(int thr_id) { - cudaFree(d_temp4[thr_id]); - cudaFree(d_state[thr_id]); + int dev_id = device_map[thr_id]; + if (device_sm[dev_id] >= 300 && cuda_arch[dev_id] >= 300) { + cudaFree(d_temp4[thr_id]); + cudaFree(d_state[thr_id]); + } } __host__ void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { const uint32_t threadsperblock = TPB; + int dev_id = device_map[thr_id]; dim3 block(threadsperblock); dim3 grid((threads + threadsperblock-1) / threadsperblock); dim3 gridX8(grid.x * 8); - if (d_nonceVector != NULL) { - applog(LOG_ERR, "Sorry, nonce Vector param was removed!"); + if (d_nonceVector != NULL || device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) { + x11_simd512_cpu_hash_64_sm2(thr_id, threads, startNounce, d_nonceVector, d_hash, order); return; } x11_simd512_gpu_expand_64 <<>> (threads, d_hash, d_temp4[thr_id]); - if (device_sm[device_map[thr_id]] >= 500 && cuda_arch[device_map[thr_id]] >= 500) { + if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500) { x11_simd512_gpu_compress_64_maxwell <<< grid, block >>> (threads, d_hash, d_temp4[thr_id], d_state[thr_id]); } else { x11_simd512_gpu_compress1_64 <<< grid, block >>> (threads, d_hash, d_temp4[thr_id], d_state[thr_id]); diff --git a/x11/simd_functions.cu b/x11/cuda_x11_simd512_func.cuh similarity index 98% rename from x11/simd_functions.cu rename to x11/cuda_x11_simd512_func.cuh index 6230e71..7128d61 100644 --- a/x11/simd_functions.cu +++ b/x11/cuda_x11_simd512_func.cuh @@ -1,3 +1,5 @@ +#define SIMD_FUNCTIONS_CUH + __device__ __forceinline__ void STEP8_IF_0(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) { uint32_t temp; @@ -1043,8 +1045,9 @@ __device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, cons A[j] = R[j]; } } -static __constant__ uint32_t d_cw0[8][8]; -static const uint32_t h_cw0[8][8] = { + +static __constant__ uint32_t d_cw0[8][8] = { +//static const uint32_t h_cw0[8][8] = { 0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6, 0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380, 0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8, @@ -1054,10 +1057,9 @@ static const uint32_t h_cw0[8][8] = { 0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E, 0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3 }; -__device__ __forceinline__ void Round8_0_final(uint32_t *A, - int r, int s, int t, int u) { - +__device__ __forceinline__ void Round8_0_final(uint32_t *A, int r, int s, int t, int u) +{ STEP8_IF_0(d_cw0[0], r, s, A, &A[8], &A[16], &A[24]); STEP8_IF_1(d_cw0[1], s, t, &A[24], A, &A[8], &A[16]); STEP8_IF_2(d_cw0[2], t, u, &A[16], &A[24], A, &A[8]); @@ -1067,8 +1069,9 @@ __device__ __forceinline__ void Round8_0_final(uint32_t *A, STEP8_MAJ_6(d_cw0[6], t, u, &A[16], &A[24], A, &A[8]); STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A); } -static __constant__ uint32_t d_cw1[8][8]; -static const uint32_t h_cw1[8][8] = { + +static __constant__ uint32_t d_cw1[8][8] = { +//static const uint32_t h_cw1[8][8] = { 0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7, 0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2, 0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A, @@ -1078,10 +1081,9 @@ static const uint32_t h_cw1[8][8] = { 0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D, 0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80 }; -__device__ __forceinline__ void Round8_1_final(uint32_t *A, - int r, int s, int t, int u) { - +__device__ __forceinline__ void Round8_1_final(uint32_t *A, int r, int s, int t, int u) +{ STEP8_IF_8(d_cw1[0], r, s, A, &A[8], &A[16], &A[24]); STEP8_IF_9(d_cw1[1], s, t, &A[24], A, &A[8], &A[16]); STEP8_IF_10(d_cw1[2], t, u, &A[16], &A[24], A, &A[8]); @@ -1091,8 +1093,9 @@ __device__ __forceinline__ void Round8_1_final(uint32_t *A, STEP8_MAJ_14(d_cw1[6], t, u, &A[16], &A[24], A, &A[8]); STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A); } -static __constant__ uint32_t d_cw2[8][8]; -static const uint32_t h_cw2[8][8] = { + +static __constant__ uint32_t d_cw2[8][8] = { +//static const uint32_t h_cw2[8][8] = { 0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3, 0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3, 0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539, @@ -1102,10 +1105,9 @@ static const uint32_t h_cw2[8][8] = { 0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468, 0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 0x949A6B66, 0x303DCFC3, 0x5932A6CE }; -__device__ __forceinline__ void Round8_2_final(uint32_t *A, - int r, int s, int t, int u) { - +__device__ __forceinline__ void Round8_2_final(uint32_t *A, int r, int s, int t, int u) +{ STEP8_IF_16(d_cw2[0], r, s, A, &A[8], &A[16], &A[24]); STEP8_IF_17(d_cw2[1], s, t, &A[24], A, &A[8], &A[16]); STEP8_IF_18(d_cw2[2], t, u, &A[16], &A[24], A, &A[8]); @@ -1115,8 +1117,9 @@ __device__ __forceinline__ void Round8_2_final(uint32_t *A, STEP8_MAJ_22(d_cw2[6], t, u, &A[16], &A[24], A, &A[8]); STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A); } -static __constant__ uint32_t d_cw3[8][8]; -static const uint32_t h_cw3[8][8] = { + +static __constant__ uint32_t d_cw3[8][8] = { +//static const uint32_t h_cw3[8][8] = { 0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D, 0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B, 0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A, @@ -1126,10 +1129,9 @@ static const uint32_t h_cw3[8][8] = { 0x975568AB, 0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA, 0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D }; -__device__ __forceinline__ void Round8_3_final(uint32_t *A, - int r, int s, int t, int u) { - +__device__ __forceinline__ void Round8_3_final(uint32_t *A, int r, int s, int t, int u) +{ STEP8_IF_24(d_cw3[0], r, s, A, &A[8], &A[16], &A[24]); STEP8_IF_25(d_cw3[1], s, t, &A[24], A, &A[8], &A[16]); STEP8_IF_26(d_cw3[2], t, u, &A[16], &A[24], A, &A[8]); diff --git a/x11/cuda_x11_simd512_sm2.cuh b/x11/cuda_x11_simd512_sm2.cuh new file mode 100644 index 0000000..7ed927c --- /dev/null +++ b/x11/cuda_x11_simd512_sm2.cuh @@ -0,0 +1,579 @@ +#include "cuda_helper.h" + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#define __CUDA_ARCH__ 210 +#endif + +#if __CUDA_ARCH__ < 300 + +#define T32(x) (x) + +#ifndef DEVICE_DIRECT_CONSTANTS /* already made in SM 3+ implementation */ +__constant__ uint32_t c_IV_512[32]; +const uint32_t h_IV_512[32] = { + 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558, + 0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e, + 0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257, + 0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 +}; + +__constant__ int c_FFT128_8_16_Twiddle[128]; +static const int h_FFT128_8_16_Twiddle[128] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30, + 1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22, + 1, -67, 120, -73, 8, -22, -68, -70, 64, 81, -30, -46, -2,-123, 17,-111, + 1,-118, 46, -31, 60, 116, -67, -61, 2, 21, 92, -62, 120, -25, 123,-122, + 1, 116, 92,-122, -17, 84, -22, 18, 32, 114, 117, -49, -30, 118, 67, 62, + 1, -31, -67, 21, 120, -122, -73, -50, 8, 9, -22, -89, -68, 52, -70, 114, + 1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79 +}; + +__constant__ int c_FFT256_2_128_Twiddle[128]; +static const int h_FFT256_2_128_Twiddle[128] = { + 1, 41,-118, 45, 46, 87, -31, 14, + 60,-110, 116,-127, -67, 80, -61, 69, + 2, 82, 21, 90, 92, -83, -62, 28, + 120, 37, -25, 3, 123, -97,-122,-119, + 4, -93, 42, -77, -73, 91,-124, 56, + -17, 74, -50, 6, -11, 63, 13, 19, + 8, 71, 84, 103, 111, -75, 9, 112, + -34,-109,-100, 12, -22, 126, 26, 38, + 16,-115, -89, -51, -35, 107, 18, -33, + -68, 39, 57, 24, -44, -5, 52, 76, + 32, 27, 79,-102, -70, -43, 36, -66, + 121, 78, 114, 48, -88, -10, 104,-105, + 64, 54, -99, 53, 117, -86, 72, 125, + -15,-101, -29, 96, 81, -20, -49, 47, + 128, 108, 59, 106, -23, 85,-113, -7, + -30, 55, -58, -65, -95, -40, -98, 94 +}; +#endif + +__constant__ int c_FFT[256] = +//const int h_FFT[256] = +{ + // this is the FFT result in revbin permuted order + 4, -4, 32, -32, -60, 60, 60, -60, 101, -101, 58, -58, 112, -112, -11, 11, -92, 92, + -119, 119, 42, -42, -82, 82, 32, -32, 32, -32, 121, -121, 17, -17, -47, 47, 63, + -63, 107, -107, -76, 76, -119, 119, -83, 83, 126, -126, 94, -94, -23, 23, -76, + 76, -47, 47, 92, -92, -117, 117, 73, -73, -53, 53, 88, -88, -80, 80, -47, 47, + 5, -5, 67, -67, 34, -34, 4, -4, 87, -87, -28, 28, -70, 70, -110, 110, -18, 18, 93, + -93, 51, -51, 36, -36, 118, -118, -106, 106, 45, -45, -108, 108, -44, 44, 117, + -117, -121, 121, -37, 37, 65, -65, 37, -37, 40, -40, -42, 42, 91, -91, -128, 128, + -21, 21, 94, -94, -98, 98, -47, 47, 28, -28, 115, -115, 16, -16, -20, 20, 122, + -122, 115, -115, 46, -46, 84, -84, -127, 127, 57, -57, 127, -127, -80, 80, 24, + -24, 15, -15, 29, -29, -78, 78, -126, 126, 16, -16, 52, -52, 55, -55, 110, -110, + -51, 51, -120, 120, -124, 124, -24, 24, -76, 76, 26, -26, -21, 21, -64, 64, -99, + 99, 85, -85, -15, 15, -120, 120, -116, 116, 85, -85, 12, -12, -24, 24, 4, -4, + 79, -79, 76, -76, 23, -23, 4, -4, -108, 108, -20, 20, 73, -73, -42, 42, -7, 7, + -29, 29, -123, 123, 49, -49, -96, 96, -68, 68, -112, 112, 116, -116, -24, 24, 93, + -93, -125, 125, -86, 86, 117, -117, -91, 91, 42, -42, 87, -87, -117, 117, 102, -102 +}; + +__constant__ int c_P8[32][8] = { +//static const int h_P8[32][8] = { + { 2, 66, 34, 98, 18, 82, 50, 114 }, + { 6, 70, 38, 102, 22, 86, 54, 118 }, + { 0, 64, 32, 96, 16, 80, 48, 112 }, + { 4, 68, 36, 100, 20, 84, 52, 116 }, + { 14, 78, 46, 110, 30, 94, 62, 126 }, + { 10, 74, 42, 106, 26, 90, 58, 122 }, + { 12, 76, 44, 108, 28, 92, 60, 124 }, + { 8, 72, 40, 104, 24, 88, 56, 120 }, + { 15, 79, 47, 111, 31, 95, 63, 127 }, + { 13, 77, 45, 109, 29, 93, 61, 125 }, + { 3, 67, 35, 99, 19, 83, 51, 115 }, + { 1, 65, 33, 97, 17, 81, 49, 113 }, + { 9, 73, 41, 105, 25, 89, 57, 121 }, + { 11, 75, 43, 107, 27, 91, 59, 123 }, + { 5, 69, 37, 101, 21, 85, 53, 117 }, + { 7, 71, 39, 103, 23, 87, 55, 119 }, + { 8, 72, 40, 104, 24, 88, 56, 120 }, + { 4, 68, 36, 100, 20, 84, 52, 116 }, + { 14, 78, 46, 110, 30, 94, 62, 126 }, + { 2, 66, 34, 98, 18, 82, 50, 114 }, + { 6, 70, 38, 102, 22, 86, 54, 118 }, + { 10, 74, 42, 106, 26, 90, 58, 122 }, + { 0, 64, 32, 96, 16, 80, 48, 112 }, + { 12, 76, 44, 108, 28, 92, 60, 124 }, + { 134, 198, 166, 230, 150, 214, 182, 246 }, + { 128, 192, 160, 224, 144, 208, 176, 240 }, + { 136, 200, 168, 232, 152, 216, 184, 248 }, + { 142, 206, 174, 238, 158, 222, 190, 254 }, + { 140, 204, 172, 236, 156, 220, 188, 252 }, + { 138, 202, 170, 234, 154, 218, 186, 250 }, + { 130, 194, 162, 226, 146, 210, 178, 242 }, + { 132, 196, 164, 228, 148, 212, 180, 244 }, +}; + +__constant__ int c_Q8[32][8] = { +//static const int h_Q8[32][8] = { + { 130, 194, 162, 226, 146, 210, 178, 242 }, + { 134, 198, 166, 230, 150, 214, 182, 246 }, + { 128, 192, 160, 224, 144, 208, 176, 240 }, + { 132, 196, 164, 228, 148, 212, 180, 244 }, + { 142, 206, 174, 238, 158, 222, 190, 254 }, + { 138, 202, 170, 234, 154, 218, 186, 250 }, + { 140, 204, 172, 236, 156, 220, 188, 252 }, + { 136, 200, 168, 232, 152, 216, 184, 248 }, + { 143, 207, 175, 239, 159, 223, 191, 255 }, + { 141, 205, 173, 237, 157, 221, 189, 253 }, + { 131, 195, 163, 227, 147, 211, 179, 243 }, + { 129, 193, 161, 225, 145, 209, 177, 241 }, + { 137, 201, 169, 233, 153, 217, 185, 249 }, + { 139, 203, 171, 235, 155, 219, 187, 251 }, + { 133, 197, 165, 229, 149, 213, 181, 245 }, + { 135, 199, 167, 231, 151, 215, 183, 247 }, + { 9, 73, 41, 105, 25, 89, 57, 121 }, + { 5, 69, 37, 101, 21, 85, 53, 117 }, + { 15, 79, 47, 111, 31, 95, 63, 127 }, + { 3, 67, 35, 99, 19, 83, 51, 115 }, + { 7, 71, 39, 103, 23, 87, 55, 119 }, + { 11, 75, 43, 107, 27, 91, 59, 123 }, + { 1, 65, 33, 97, 17, 81, 49, 113 }, + { 13, 77, 45, 109, 29, 93, 61, 125 }, + { 135, 199, 167, 231, 151, 215, 183, 247 }, + { 129, 193, 161, 225, 145, 209, 177, 241 }, + { 137, 201, 169, 233, 153, 217, 185, 249 }, + { 143, 207, 175, 239, 159, 223, 191, 255 }, + { 141, 205, 173, 237, 157, 221, 189, 253 }, + { 139, 203, 171, 235, 155, 219, 187, 251 }, + { 131, 195, 163, 227, 147, 211, 179, 243 }, + { 133, 197, 165, 229, 149, 213, 181, 245 }, +}; + +#define p8_xor(x) ( ((x)%7) == 0 ? 1 : \ + ((x)%7) == 1 ? 6 : \ + ((x)%7) == 2 ? 2 : \ + ((x)%7) == 3 ? 3 : \ + ((x)%7) == 4 ? 5 : \ + ((x)%7) == 5 ? 7 : 4 ) + +/************* the round function ****************/ + +#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) +#define MAJ(x, y, z) (((z) & (y)) | (((z) | (y)) & (x))) + +__device__ __forceinline__ +void STEP8_IF(const uint32_t *w, const int i, const int r, const int s, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D) +{ + uint32_t R[8]; + #pragma unroll 8 + for(int j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + #pragma unroll 8 + for(int j=0; j<8; j++) { + D[j] = D[j] + w[j] + IF(A[j], B[j], C[j]); + D[j] = T32(ROTL32(T32(D[j]), s) + R[j^p8_xor(i)]); + A[j] = R[j]; + } +} + +__device__ __forceinline__ +void STEP8_MAJ(const uint32_t *w, const int i, const int r, const int s, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D) +{ + uint32_t R[8]; + #pragma unroll 8 + for(int j=0; j<8; j++) { + R[j] = ROTL32(A[j], r); + } + #pragma unroll 8 + for(int j=0; j<8; j++) { + D[j] = D[j] + w[j] + MAJ(A[j], B[j], C[j]); + D[j] = T32(ROTL32(T32(D[j]), s) + R[j^p8_xor(i)]); + A[j] = R[j]; + } +} + +__device__ __forceinline__ +void Round8(uint32_t A[32], const int y[256], int i, int r, int s, int t, int u) +{ + uint32_t w[8][8]; + int code = i<2? 185: 233; + int a, b; + + /* + * The FFT output y is in revbin permuted order, + * but this is included in the tables P and Q + */ + + #pragma unroll 8 + for(a=0; a<8; a++) { + #pragma unroll 8 + for(b=0; b<8; b++) { + w[a][b] = __byte_perm( (y[c_P8[8*i+a][b]] * code), (y[c_Q8[8*i+a][b]] * code), 0x5410); + } + } + + STEP8_IF(w[0], 8*i+0, r, s, A, &A[8], &A[16], &A[24]); + STEP8_IF(w[1], 8*i+1, s, t, &A[24], A, &A[8], &A[16]); + STEP8_IF(w[2], 8*i+2, t, u, &A[16], &A[24], A, &A[8]); + STEP8_IF(w[3], 8*i+3, u, r, &A[8], &A[16], &A[24], A); + + STEP8_MAJ(w[4], 8*i+4, r, s, A, &A[8], &A[16], &A[24]); + STEP8_MAJ(w[5], 8*i+5, s, t, &A[24], A, &A[8], &A[16]); + STEP8_MAJ(w[6], 8*i+6, t, u, &A[16], &A[24], A, &A[8]); + STEP8_MAJ(w[7], 8*i+7, u, r, &A[8], &A[16], &A[24], A); +} + + +/********************* Message expansion ************************/ + +/* + * Reduce modulo 257; result is in [-127; 383] + * REDUCE(x) := (x&255) - (x>>8) + */ +#define REDUCE(x) (((x)&255) - ((x)>>8)) + +/* + * Reduce from [-127; 383] to [-128; 128] + * EXTRA_REDUCE_S(x) := x<=128 ? x : x-257 + */ +#define EXTRA_REDUCE_S(x) \ + ((x)<=128 ? (x) : (x)-257) + +/* + * Reduce modulo 257; result is in [-128; 128] + */ +#define REDUCE_FULL_S(x) \ + EXTRA_REDUCE_S(REDUCE(x)) + +__device__ __forceinline__ +void FFT_8(int *y, int stripe) +{ + /* + * FFT_8 using w=4 as 8th root of unity + * Unrolled decimation in frequency (DIF) radix-2 NTT. + * Output data is in revbin_permuted order. + */ +#define X(i) y[stripe*i] + +#define DO_REDUCE(i) \ + X(i) = REDUCE(X(i)) + +#define DO_REDUCE_FULL_S(i) do { \ + X(i) = REDUCE(X(i)); \ + X(i) = EXTRA_REDUCE_S(X(i)); \ +} while(0) + +#define BUTTERFLY(i,j,n) do { \ + int u= X(i); \ + int v= X(j); \ + X(i) = u+v; \ + X(j) = (u-v) << (2*n); \ +} while(0) + + BUTTERFLY(0, 4, 0); + BUTTERFLY(1, 5, 1); + BUTTERFLY(2, 6, 2); + BUTTERFLY(3, 7, 3); + + DO_REDUCE(6); + DO_REDUCE(7); + + BUTTERFLY(0, 2, 0); + BUTTERFLY(4, 6, 0); + BUTTERFLY(1, 3, 2); + BUTTERFLY(5, 7, 2); + + DO_REDUCE(7); + + BUTTERFLY(0, 1, 0); + BUTTERFLY(2, 3, 0); + BUTTERFLY(4, 5, 0); + BUTTERFLY(6, 7, 0); + + DO_REDUCE_FULL_S(0); + DO_REDUCE_FULL_S(1); + DO_REDUCE_FULL_S(2); + DO_REDUCE_FULL_S(3); + DO_REDUCE_FULL_S(4); + DO_REDUCE_FULL_S(5); + DO_REDUCE_FULL_S(6); + DO_REDUCE_FULL_S(7); + +#undef X +#undef DO_REDUCE +#undef DO_REDUCE_FULL_S +#undef BUTTERFLY +} + +__device__ __forceinline__ +void FFT_16(int *y, int stripe) +{ + /* + * FFT_16 using w=2 as 16th root of unity + * Unrolled decimation in frequency (DIF) radix-2 NTT. + * Output data is in revbin_permuted order. + */ + + #define X(i) y[stripe*i] + + #define DO_REDUCE(i) \ + X(i) = REDUCE(X(i)) + + #define DO_REDUCE_FULL_S(i) \ + do { \ + X(i) = REDUCE(X(i)); \ + X(i) = EXTRA_REDUCE_S(X(i)); \ + } while(0) + + #define BUTTERFLY(i,j,n) \ + do { \ + int u= X(i); \ + int v= X(j); \ + X(i) = u+v; \ + X(j) = (u-v) << n; \ + } while(0) + + BUTTERFLY(0, 8, 0); + BUTTERFLY(1, 9, 1); + BUTTERFLY(2, 10, 2); + BUTTERFLY(3, 11, 3); + BUTTERFLY(4, 12, 4); + BUTTERFLY(5, 13, 5); + BUTTERFLY(6, 14, 6); + BUTTERFLY(7, 15, 7); + + DO_REDUCE(11); + DO_REDUCE(12); + DO_REDUCE(13); + DO_REDUCE(14); + DO_REDUCE(15); + + BUTTERFLY( 0, 4, 0); + BUTTERFLY( 1, 5, 2); + BUTTERFLY( 2, 6, 4); + BUTTERFLY( 3, 7, 6); + + BUTTERFLY( 8, 12, 0); + BUTTERFLY( 9, 13, 2); + BUTTERFLY(10, 14, 4); + BUTTERFLY(11, 15, 6); + + DO_REDUCE(5); + DO_REDUCE(7); + DO_REDUCE(13); + DO_REDUCE(15); + + BUTTERFLY( 0, 2, 0); + BUTTERFLY( 1, 3, 4); + BUTTERFLY( 4, 6, 0); + BUTTERFLY( 5, 7, 4); + + BUTTERFLY( 8, 10, 0); + BUTTERFLY(12, 14, 0); + BUTTERFLY( 9, 11, 4); + BUTTERFLY(13, 15, 4); + + BUTTERFLY( 0, 1, 0); + BUTTERFLY( 2, 3, 0); + BUTTERFLY( 4, 5, 0); + BUTTERFLY( 6, 7, 0); + + BUTTERFLY( 8, 9, 0); + BUTTERFLY(10, 11, 0); + BUTTERFLY(12, 13, 0); + BUTTERFLY(14, 15, 0); + + DO_REDUCE_FULL_S( 0); + DO_REDUCE_FULL_S( 1); + DO_REDUCE_FULL_S( 2); + DO_REDUCE_FULL_S( 3); + DO_REDUCE_FULL_S( 4); + DO_REDUCE_FULL_S( 5); + DO_REDUCE_FULL_S( 6); + DO_REDUCE_FULL_S( 7); + DO_REDUCE_FULL_S( 8); + DO_REDUCE_FULL_S( 9); + DO_REDUCE_FULL_S(10); + DO_REDUCE_FULL_S(11); + DO_REDUCE_FULL_S(12); + DO_REDUCE_FULL_S(13); + DO_REDUCE_FULL_S(14); + DO_REDUCE_FULL_S(15); + +#undef X +#undef DO_REDUCE +#undef DO_REDUCE_FULL_S +#undef BUTTERFLY +} + +__device__ __forceinline__ +void FFT_128_full(int *y) +{ + #pragma unroll 16 + for (int i=0; i<16; i++) { + FFT_8(y+i,16); + } + + #pragma unroll 128 + for (int i=0; i<128; i++) + /*if (i & 7)*/ y[i] = REDUCE(y[i]*c_FFT128_8_16_Twiddle[i]); + + #pragma unroll 8 + for (int i=0; i<8; i++) { + FFT_16(y+16*i,1); + } +} + +__device__ __forceinline__ +void FFT_256_halfzero(int y[256]) +{ + /* + * FFT_256 using w=41 as 256th root of unity. + * Decimation in frequency (DIF) NTT. + * Output data is in revbin_permuted order. + * In place. + */ + const int tmp = y[127]; + + #pragma unroll 127 + for (int i=0; i<127; i++) + y[128+i] = REDUCE(y[i] * c_FFT256_2_128_Twiddle[i]); + + /* handle X^255 with an additionnal butterfly */ + y[127] = REDUCE(tmp + 1); + y[255] = REDUCE((tmp - 1) * c_FFT256_2_128_Twiddle[127]); + + FFT_128_full(y); + FFT_128_full(y+128); +} + +__device__ __forceinline__ +void SIMD_Compress(uint32_t A[32], const int *expanded, const uint32_t *M) +{ + uint32_t IV[4][8]; + + /* Save the chaining value for the feed-forward */ + + #pragma unroll 8 + for(int i=0; i<8; i++) { + IV[0][i] = A[i]; + IV[1][i] = (&A[8])[i]; + IV[2][i] = (&A[16])[i]; + IV[3][i] = (&A[24])[i]; + } + + /* XOR the message to the chaining value */ + /* we can XOR word-by-word */ + #pragma unroll 8 + for(int i=0; i<8; i++) { + A[i] ^= M[i]; + (&A[8])[i] ^= M[8+i]; + } + + /* Run the feistel ladders with the expanded message */ + Round8(A, expanded, 0, 3, 23, 17, 27); + Round8(A, expanded, 1, 28, 19, 22, 7); + Round8(A, expanded, 2, 29, 9, 15, 5); + Round8(A, expanded, 3, 4, 13, 10, 25); + + STEP8_IF(IV[0], 32, 4, 13, A, &A[8], &A[16], &A[24]); + STEP8_IF(IV[1], 33, 13, 10, &A[24], A, &A[8], &A[16]); + STEP8_IF(IV[2], 34, 10, 25, &A[16], &A[24], A, &A[8]); + STEP8_IF(IV[3], 35, 25, 4, &A[8], &A[16], &A[24], A); +} + + +/***************************************************/ + +__device__ __forceinline__ +void SIMDHash(const uint32_t *data, uint32_t *hashval) +{ + uint32_t A[32]; + uint32_t buffer[16]; + + #pragma unroll 32 + for (int i=0; i < 32; i++) A[i] = c_IV_512[i]; + + #pragma unroll 16 + for (int i=0; i < 16; i++) buffer[i] = data[i]; + + /* Message Expansion using Number Theoretical Transform similar to FFT */ + int expanded[256]; + { + #pragma unroll 16 + for(int i=0; i<64; i+=4) { + expanded[i+0] = __byte_perm(buffer[i/4],0,0x4440); + expanded[i+1] = __byte_perm(buffer[i/4],0,0x4441); + expanded[i+2] = __byte_perm(buffer[i/4],0,0x4442); + expanded[i+3] = __byte_perm(buffer[i/4],0,0x4443); + } + + #pragma unroll 16 + for(int i=64; i<128; i+=4) { + expanded[i+0] = 0; + expanded[i+1] = 0; + expanded[i+2] = 0; + expanded[i+3] = 0; + } + + FFT_256_halfzero(expanded); + } + + /* Compression Function */ + SIMD_Compress(A, expanded, buffer); + + /* Padding Round with known input (hence the FFT can be precomputed) */ + buffer[0] = 512; + + #pragma unroll 15 + for (int i=1; i < 16; i++) buffer[i] = 0; + + SIMD_Compress(A, c_FFT, buffer); + + #pragma unroll 16 + for (int i=0; i < 16; i++) + hashval[i] = A[i]; +} + +/***************************************************/ +__global__ +void x11_simd512_gpu_hash_64_sm2(const uint32_t threads, const uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); + + const int hashPosition = nounce - startNounce; + uint32_t *Hash = (uint32_t*) &g_hash[8 * hashPosition]; + + SIMDHash(Hash, Hash); + } +} + +#else +__global__ void x11_simd512_gpu_hash_64_sm2(const uint32_t threads, const uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) {} +#endif /* __CUDA_ARCH__ */ + +__host__ +static void x11_simd512_cpu_init_sm2(int thr_id) +{ +#ifndef DEVICE_DIRECT_CONSTANTS + cudaMemcpyToSymbol( c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol( c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol( c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice); +#endif +// cudaMemcpyToSymbol( c_FFT, h_FFT, sizeof(h_FFT), 0, cudaMemcpyHostToDevice); +// cudaMemcpyToSymbol( c_P8, h_P8, sizeof(h_P8), 0, cudaMemcpyHostToDevice); +// cudaMemcpyToSymbol( c_Q8, h_Q8, sizeof(h_Q8), 0, cudaMemcpyHostToDevice); +} + +__host__ +static void x11_simd512_cpu_hash_64_sm2(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +{ + const int threadsperblock = 256; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + size_t shared_size = 0; + + x11_simd512_gpu_hash_64_sm2<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + MyStreamSynchronize(NULL, order, thr_id); +} \ No newline at end of file diff --git a/x11/s3.cu b/x11/s3.cu index 4f47ddf..45609c8 100644 --- a/x11/s3.cu +++ b/x11/s3.cu @@ -43,6 +43,21 @@ extern "C" void s3hash(void *output, const void *input) memcpy(output, hash, 32); } +#ifdef _DEBUG +#define TRACE(algo) { \ + if (max_nonce == 1 && pdata[19] <= 1) { \ + uint32_t* debugbuf = NULL; \ + cudaMallocHost(&debugbuf, 32); \ + cudaMemcpy(debugbuf, d_hash[thr_id], 32, cudaMemcpyDeviceToHost); \ + printf("S3 %s %08x %08x %08x %08x...%08x\n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \ + swab32(debugbuf[2]), swab32(debugbuf[3]), swab32(debugbuf[7])); \ + cudaFreeHost(debugbuf); \ + } \ +} +#else +#define TRACE(algo) {} +#endif + static bool init[MAX_GPUS] = { 0 }; /* Main S3 entry point */ @@ -60,18 +75,24 @@ extern "C" int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, un if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0xF; + ptarget[7] = 0xF; if (!init[thr_id]) { cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); x11_shavite512_cpu_init(thr_id, throughput); x11_simd512_cpu_init(thr_id, throughput); quark_skein512_cpu_init(thr_id, throughput); - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0); - cuda_check_cpu_init(thr_id, throughput); init[thr_id] = true; @@ -90,8 +111,11 @@ extern "C" int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, un int order = 0; x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("shavite:"); x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("simd :"); quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("skein :"); *hashes_done = pdata[19] - first_nonce + throughput; diff --git a/x11/x11.cu b/x11/x11.cu index c4d7609..d2e93ca 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -114,11 +114,17 @@ extern "C" int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, u if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x5; + ptarget[7] = 0x5; if (!init[thr_id]) { cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } quark_blake512_cpu_init(thr_id, throughput); quark_bmw512_cpu_init(thr_id, throughput);