From 3d03a1b9fd0a14c65f231ed65f9cebb267e63a4f Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Mon, 28 May 2018 15:21:00 +0200 Subject: [PATCH] phi2 algo --- Makefile.am | 2 +- algos.h | 2 + ccminer.cpp | 8 +- ccminer.vcxproj | 5 +- ccminer.vcxproj.filters | 20 ++- lyra2/cuda_lyra2.cu | 122 ++++++++++++++--- lyra2/cuda_lyra2_sm2.cuh | 65 ++++++++- lyra2/cuda_lyra2_sm5.cuh | 64 ++++++++- miner.h | 5 +- phi/cuda_phi2.cu | 89 ++++++++++++ {x11 => phi}/phi.cu | 8 +- phi/phi2.cu | 255 +++++++++++++++++++++++++++++++++++ util.cpp | 2 +- x11/cuda_streebog_maxwell.cu | 21 ++- x16/cuda_x16_echo512_64.cu | 26 +++- 15 files changed, 648 insertions(+), 46 deletions(-) create mode 100644 phi/cuda_phi2.cu rename {x11 => phi}/phi.cu (97%) create mode 100644 phi/phi2.cu diff --git a/Makefile.am b/Makefile.am index 8f33d48..5d5652c 100644 --- a/Makefile.am +++ b/Makefile.am @@ -81,7 +81,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ x16/cuda_x16_shabal512.cu x16/cuda_x16_simd512_80.cu \ x16/cuda_x16_echo512_64.cu \ x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \ - x11/phi.cu x11/cuda_streebog_maxwell.cu \ + phi/phi.cu phi/phi2.cu phi/cuda_phi2.cu x11/cuda_streebog_maxwell.cu \ x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu # scrypt diff --git a/algos.h b/algos.h index f141086..229d8e9 100644 --- a/algos.h +++ b/algos.h @@ -39,6 +39,7 @@ enum sha_algos { ALGO_NIST5, ALGO_PENTABLAKE, ALGO_PHI, + ALGO_PHI2, ALGO_POLYTIMOS, ALGO_QUARK, ALGO_QUBIT, @@ -112,6 +113,7 @@ static const char *algo_names[] = { "nist5", "penta", "phi", + "phi2", "polytimos", "quark", "qubit", diff --git a/ccminer.cpp b/ccminer.cpp index a48b194..c1567a1 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -269,7 +269,8 @@ Options:\n\ neoscrypt FeatherCoin, Phoenix, UFO...\n\ nist5 NIST5 (TalkCoin)\n\ penta Pentablake hash (5x Blake 512)\n\ - phi BHCoin\n\ + phi LUX initial algo\n\ + phi2 LUX v2 with lyra2\n\ polytimos Politimos\n\ quark Quark\n\ qubit Qubit\n\ @@ -1708,6 +1709,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) case ALGO_LBRY: case ALGO_LYRA2v2: case ALGO_LYRA2Z: + case ALGO_PHI2: case ALGO_TIMETRAVEL: case ALGO_BITCORE: case ALGO_X16R: @@ -2245,6 +2247,7 @@ static void *miner_thread(void *userdata) case ALGO_HSR: case ALGO_LYRA2v2: case ALGO_PHI: + case ALGO_PHI2: case ALGO_POLYTIMOS: case ALGO_S3: case ALGO_SKUNK: @@ -2436,6 +2439,9 @@ static void *miner_thread(void *userdata) case ALGO_PHI: rc = scanhash_phi(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_PHI2: + rc = scanhash_phi2(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_POLYTIMOS: rc = scanhash_polytimos(thr_id, &work, max_nonce, &hashes_done); break; diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 1db063e..f20449a 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -525,6 +525,7 @@ + @@ -537,6 +538,9 @@ 48 + + + compute_50,sm_50;compute_52,sm_52 @@ -567,7 +571,6 @@ - diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index b2ee453..96220ae 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -115,7 +115,10 @@ {1e548d79-c217-4203-989a-a592fe2b2de3} - + + {311e8d79-1612-4f0f-8591-23a592f2b2d3} + + {xde48d89-fx12-1323-129a-b592fe2b2de3} @@ -545,6 +548,9 @@ Source Files\CUDA\lyra2 + + Source Files\CUDA\lyra2 + Source Files\CUDA\lyra2 @@ -781,6 +787,15 @@ Source Files\CUDA + + Source Files\CUDA\phi + + + Source Files\CUDA\phi + + + Source Files\CUDA\phi + Source Files\CUDA\skunk @@ -799,9 +814,6 @@ Source Files\CUDA\tribus - - 
Source Files\CUDA\x11 - Source Files\CUDA\x11 diff --git a/lyra2/cuda_lyra2.cu b/lyra2/cuda_lyra2.cu index a280200..5cdb6ee 100644 --- a/lyra2/cuda_lyra2.cu +++ b/lyra2/cuda_lyra2.cu @@ -1,6 +1,7 @@ /** * Lyra2 (v1) cuda implementation based on djm34 work * tpruvot@github 2015, Nanashi 08/2016 (from 1.8-r2) + * tpruvot@github 2018 for phi2 double lyra2-32 support */ #include @@ -228,9 +229,7 @@ void reduceDuplex(uint2 state[4], uint32_t thread, const uint32_t threads) { uint2 state1[3]; -#if __CUDA_ARCH__ > 500 -#pragma unroll -#endif + #pragma unroll for (int i = 0; i < Nrow; i++) { ST4S(0, Ncol - i - 1, state, thread, threads); @@ -305,7 +304,7 @@ void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uin LD4S(state1, rowIn, i, thread, threads); LD4S(state2, rowInOut, i, thread, threads); -#pragma unroll + #pragma unroll for (int j = 0; j < 3; j++) state[j] ^= state1[j] + state2[j]; @@ -334,7 +333,7 @@ void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uin LD4S(state1, rowOut, i, thread, threads); -#pragma unroll + #pragma unroll for (int j = 0; j < 3; j++) state1[j] ^= state[j]; @@ -412,11 +411,9 @@ __global__ __launch_bounds__(64, 1) void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) { uint2x4 state[4]; - state[0].x = state[1].x = __ldg(&g_hash[thread + threads * 0]); state[0].y = state[1].y = __ldg(&g_hash[thread + threads * 1]); state[0].z = state[1].z = __ldg(&g_hash[thread + threads * 2]); @@ -436,10 +433,9 @@ void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash) __global__ __launch_bounds__(TPB52, 1) -void lyra2_gpu_hash_32_2(uint32_t threads, uint64_t *g_hash) +void lyra2_gpu_hash_32_2(const uint32_t threads, uint64_t *g_hash) { const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; - if (thread < threads) { uint2 state[4]; @@ -484,11 +480,9 @@ __global__ __launch_bounds__(64, 1) void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash) { const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; - - uint28 state[4]; - if (thread < threads) { + uint2x4 state[4]; state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]); state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]); state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]); @@ -501,7 +495,57 @@ void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash) g_hash[thread + threads * 1] = state[0].y; g_hash[thread + threads * 2] = state[0].z; g_hash[thread + threads * 3] = state[0].w; + } +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_64_1(uint32_t threads, uint2* const d_hash_512, const uint32_t round) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint2x4 state[4]; + const size_t offset = (size_t)8 * thread + (round * 4U); + uint2 *psrc = (uint2*)(&d_hash_512[offset]); + state[0].x = state[1].x = __ldg(&psrc[0]); + state[0].y = state[1].y = __ldg(&psrc[1]); + state[0].z = state[1].z = __ldg(&psrc[2]); + state[0].w = state[1].w = __ldg(&psrc[3]); + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + for (int i = 0; i<24; i++) + round_lyra(state); + + ((uint2x4*)DMatrix)[threads * 0 + thread] = state[0]; + ((uint2x4*)DMatrix)[threads * 1 + thread] = state[1]; + ((uint2x4*)DMatrix)[threads * 2 + thread] = state[2]; + ((uint2x4*)DMatrix)[threads * 3 + thread] = state[3]; + } +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_64_3(uint32_t threads, uint2 *d_hash_512, const 
uint32_t round) +{ + // This kernel outputs 2x 256-bits hashes in 512-bits chain offsets in 2 rounds + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + if (thread < threads) + { + uint2x4 state[4]; + state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]); + state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]); + state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]); + state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]); + + for (int i = 0; i < 12; i++) + round_lyra(state); + + const size_t offset = (size_t)8 * thread + (round * 4U); + uint2 *pdst = (uint2*)(&d_hash_512[offset]); + pdst[0] = state[0].x; + pdst[1] = state[0].y; + pdst[2] = state[0].z; + pdst[3] = state[0].w; } } #else @@ -513,6 +557,8 @@ __device__ void* DMatrix; __global__ void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash) {} __global__ void lyra2_gpu_hash_32_2(uint32_t threads, uint64_t *g_hash) {} __global__ void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_64_1(uint32_t threads, uint2* const d_hash_512, const uint32_t round) {} +__global__ void lyra2_gpu_hash_64_3(uint32_t threads, uint2 *d_hash_512, const uint32_t round) {} #endif __host__ @@ -545,9 +591,7 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_hash, bool gtx7 if (cuda_arch[dev_id] >= 520) { lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash); - lyra2_gpu_hash_32_2 <<< grid1, block1, 24 * (8 - 0) * sizeof(uint2) * tpb >>> (threads, d_hash); - lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash); } else if (cuda_arch[dev_id] >= 500) @@ -562,11 +606,57 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_hash, bool gtx7 shared_mem = 6144; lyra2_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash); - lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash); - lyra2_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash); } else lyra2_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, d_hash); } + +__host__ +void lyra2_cuda_hash_64(int thr_id, const uint32_t threads, uint64_t* d_hash_256, uint32_t* d_hash_512, bool gtx750ti) +{ + int dev_id = device_map[thr_id % MAX_GPUS]; + uint32_t tpb = TPB52; + if (cuda_arch[dev_id] >= 520) tpb = TPB52; + else if (cuda_arch[dev_id] >= 500) tpb = TPB50; + else if (cuda_arch[dev_id] >= 200) tpb = TPB20; + + dim3 grid1((size_t(threads) * 4 + tpb - 1) / tpb); + dim3 block1(4, tpb >> 2); + + dim3 grid2((threads + 64 - 1) / 64); + dim3 block2(64); + + if (cuda_arch[dev_id] >= 520) + { + const size_t shared_mem = sizeof(uint2) * tpb * 192; // 49152; + lyra2_gpu_hash_64_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0); + lyra2_gpu_hash_32_2 <<< grid1, block1, shared_mem >>> (threads, d_hash_256); + lyra2_gpu_hash_64_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0); + + lyra2_gpu_hash_64_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1); + lyra2_gpu_hash_32_2 <<< grid1, block1, shared_mem >>> (threads, d_hash_256); + lyra2_gpu_hash_64_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1); + } + else if (cuda_arch[dev_id] >= 500) + { + size_t shared_mem = gtx750ti ? 
8192 : 6144; // 8 or 10 warps
+		lyra2_gpu_hash_64_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0);
+		lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash_256);
+		lyra2_gpu_hash_64_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0);
+
+		lyra2_gpu_hash_64_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1);
+		lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash_256);
+		lyra2_gpu_hash_64_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1);
+	}
+	else {
+		// alternative method for SM 3.x
+		hash64_to_lyra32(thr_id, threads, d_hash_512, d_hash_256, 0);
+		lyra2_cpu_hash_32(thr_id, threads, d_hash_256, gtx750ti);
+		hash64_from_lyra32(thr_id, threads, d_hash_512, d_hash_256, 0);
+		hash64_to_lyra32(thr_id, threads, d_hash_512, d_hash_256, 1);
+		lyra2_cpu_hash_32(thr_id, threads, d_hash_256, gtx750ti);
+		hash64_from_lyra32(thr_id, threads, d_hash_512, d_hash_256, 1);
+	}
+}
diff --git a/lyra2/cuda_lyra2_sm2.cuh b/lyra2/cuda_lyra2_sm2.cuh
index da621d0..cc0bd82 100644
--- a/lyra2/cuda_lyra2_sm2.cuh
+++ b/lyra2/cuda_lyra2_sm2.cuh
@@ -3,7 +3,7 @@
 #ifdef __INTELLISENSE__
 /* just for vstudio code colors, only uncomment that temporary, dont commit it */
 //#undef __CUDA_ARCH__
-//#define __CUDA_ARCH__ 500
+//#define __CUDA_ARCH__ 300
 #endif
 
 #include "cuda_helper.h"
@@ -226,3 +226,66 @@ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint64_t *g_hash)
 /* if __CUDA_ARCH__ < 200 .. host */
 __global__ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint64_t *g_hash) {}
 #endif
+
+// -------------------------------------------------------------------------------------------------------------------------
+
+// lyra2 can't be used as-is in 512-bit hash chains; thanks to djm for these weird offsets, used since the first lyra2 algo...
+
+#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ <= 350
+
+__global__ __launch_bounds__(128, 8)
+void hash64_to_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		const size_t offset = (size_t) 16 * thread + (round * 8U);
+		uint2 *psrc = (uint2*) (&d_hash64[offset]);
+		uint2 *pdst = (uint2*) (&d_hash_lyra[thread]);
+		pdst[threads*0] = __ldg(&psrc[0]);
+		pdst[threads*1] = __ldg(&psrc[1]);
+		pdst[threads*2] = __ldg(&psrc[2]);
+		pdst[threads*3] = __ldg(&psrc[3]);
+	}
+}
+
+__global__ __launch_bounds__(128, 8)
+void hash64_from_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		const size_t offset = (size_t) 16 * thread + (round * 8U);
+		uint2 *psrc = (uint2*) (&d_hash_lyra[thread]);
+		uint2 *pdst = (uint2*) (&d_hash64[offset]);
+		pdst[0] = psrc[0];
+		pdst[1] = psrc[threads*1];
+		pdst[2] = psrc[threads*2];
+		pdst[3] = psrc[threads*3];
+	}
+}
+#else
+/* if __CUDA_ARCH__ < 200 .. 
host */
+__global__ void hash64_to_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) {}
+__global__ void hash64_from_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) {}
+#endif
+
+__host__
+void hash64_to_lyra32(int thr_id, const uint32_t threads, uint32_t* d_hash64, uint64_t* d_hash_lyra, const uint32_t round)
+{
+	const uint32_t threadsperblock = 128;
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	hash64_to_lyra32_gpu <<<grid, block>>> (threads, d_hash64, (uint2*) d_hash_lyra, round);
+}
+
+__host__
+void hash64_from_lyra32(int thr_id, const uint32_t threads, uint32_t* d_hash64, uint64_t* d_hash_lyra, const uint32_t round)
+{
+	const uint32_t threadsperblock = 128;
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	hash64_from_lyra32_gpu <<<grid, block>>> (threads, d_hash64, (uint2*) d_hash_lyra, round);
+}
diff --git a/lyra2/cuda_lyra2_sm5.cuh b/lyra2/cuda_lyra2_sm5.cuh
index 4a3caeb..85adfd9 100644
--- a/lyra2/cuda_lyra2_sm5.cuh
+++ b/lyra2/cuda_lyra2_sm5.cuh
@@ -591,13 +591,12 @@ void reduceDuplexRowV50_8(const int rowInOut, uint2 state[4], const uint32_t thr
 __global__ __launch_bounds__(64, 1)
 void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint2 *g_hash)
 {
-	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-
 	const uint2x4 blake2b_IV[2] = {
 		{ { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } },
 		{ { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } }
 	};
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 
 	if (thread < threads)
 	{
 		uint2x4 state[4];
@@ -629,7 +628,6 @@ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint2 *g_hash)
 	if (thread < threads)
 	{
 		uint2 state[4];
-
 		state[0] = __ldg(&DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x]);
 		state[1] = __ldg(&DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x]);
 		state[2] = __ldg(&DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x]);
@@ -669,7 +667,6 @@ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash)
 	if (thread < threads)
 	{
 		uint2x4 state[4];
-
 		state[0] = __ldg4(&((uint2x4*)DMatrix)[0 * threads + thread]);
 		state[1] = __ldg4(&((uint2x4*)DMatrix)[1 * threads + thread]);
 		state[2] = __ldg4(&((uint2x4*)DMatrix)[2 * threads + thread]);
@@ -685,9 +682,68 @@ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash)
 	}
 }
 
+__global__ __launch_bounds__(64, 1)
+void lyra2_gpu_hash_64_1_sm5(uint32_t threads, uint2* const d_hash_512, const uint32_t round)
+{
+	const uint2x4 blake2b_IV[2] = {
+		{ { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } },
+		{ { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } }
+	};
+	// This kernel loads 2x 256-bits hashes from 512-bits chain offsets in 2 steps
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint2x4 state[4];
+		const size_t offset = (size_t)8 * thread + (round * 4U);
+		uint2 *psrc = (uint2*)(&d_hash_512[offset]);
+		state[0].x = state[1].x = __ldg(&psrc[0]);
+		state[0].y = state[1].y = __ldg(&psrc[1]);
+		state[0].z = state[1].z = __ldg(&psrc[2]);
+		state[0].w = state[1].w = __ldg(&psrc[3]);
+
+		state[1] = state[0];
+		state[2] = blake2b_IV[0];
+		state[3] = blake2b_IV[1];
+
+		for (int i = 0; 
i<24; i++) + round_lyra(state); + + ((uint2x4*)DMatrix)[threads * 0 + thread] = state[0]; + ((uint2x4*)DMatrix)[threads * 1 + thread] = state[1]; + ((uint2x4*)DMatrix)[threads * 2 + thread] = state[2]; + ((uint2x4*)DMatrix)[threads * 3 + thread] = state[3]; + } +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_64_3_sm5(uint32_t threads, uint2 *d_hash_512, const uint32_t round) +{ + // This kernel outputs 2x 256-bits hashes in 512-bits chain offsets in 2 steps + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + if (thread < threads) + { + uint2x4 state[4]; + state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]); + state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]); + state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]); + state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]); + + for (int i = 0; i < 12; i++) + round_lyra(state); + + const size_t offset = (size_t)8 * thread + (round * 4U); + uint2 *pdst = (uint2*)(&d_hash_512[offset]); + pdst[0] = state[0].x; + pdst[1] = state[0].y; + pdst[2] = state[0].z; + pdst[3] = state[0].w; + } +} #else /* if __CUDA_ARCH__ != 500 .. host */ __global__ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint2 *g_hash) {} __global__ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint2 *g_hash) {} __global__ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_64_1_sm5(uint32_t threads, uint2* const d_hash_512, const uint32_t round) {} +__global__ void lyra2_gpu_hash_64_3_sm5(uint32_t threads, uint2 *d_hash_512, const uint32_t round) {} #endif diff --git a/miner.h b/miner.h index 16f57ab..d3118dc 100644 --- a/miner.h +++ b/miner.h @@ -303,6 +303,7 @@ extern int scanhash_neoscrypt(int thr_id, struct work *work, uint32_t max_nonce, extern int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_phi2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_polytimos(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_quark(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); @@ -371,6 +372,7 @@ extern void free_neoscrypt(int thr_id); extern void free_nist5(int thr_id); extern void free_pentablake(int thr_id); extern void free_phi(int thr_id); +extern void free_phi2(int thr_id); extern void free_polytimos(int thr_id); extern void free_quark(int thr_id); extern void free_qubit(int thr_id); @@ -918,7 +920,8 @@ void myriadhash(void *state, const void *input); void neoscrypt(uchar *output, const uchar *input, uint32_t profile); void nist5hash(void *state, const void *input); void pentablakehash(void *output, const void *input); -void phihash(void *output, const void *input); +void phi_hash(void *output, const void *input); +void phi2_hash(void *output, const void *input); void polytimos_hash(void *output, const void *input); void quarkhash(void *state, const void *input); void qubithash(void *state, const void *input); diff --git a/phi/cuda_phi2.cu b/phi/cuda_phi2.cu new file mode 100644 index 0000000..a0bcf6d --- /dev/null +++ b/phi/cuda_phi2.cu @@ -0,0 +1,89 @@ 
+#include <stdio.h>
+#include <memory.h>
+
+#include "cuda_helper.h"
+
+__global__ __launch_bounds__(128, 8)
+void phi_filter_gpu(const uint32_t threads, const uint32_t* d_hash, uint32_t* d_branch2, uint32_t* d_NonceBranch)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		const uint32_t offset = thread * 16U; // 64U / sizeof(uint32_t);
+		uint4 *psrc = (uint4*) (&d_hash[offset]);
+		d_NonceBranch[thread] = ((uint8_t*)psrc)[0] & 1;
+		if (d_NonceBranch[thread]) return;
+		if (d_branch2) {
+			uint4 *pdst = (uint4*)(&d_branch2[offset]);
+			uint4 data;
+			data = psrc[0]; pdst[0] = data;
+			data = psrc[1]; pdst[1] = data;
+			data = psrc[2]; pdst[2] = data;
+			data = psrc[3]; pdst[3] = data;
+		}
+	}
+}
+
+__global__ __launch_bounds__(128, 8)
+void phi_merge_gpu(const uint32_t threads, uint32_t* d_hash, uint32_t* d_branch2, uint32_t* const d_NonceBranch)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads && !d_NonceBranch[thread])
+	{
+		const uint32_t offset = thread * 16U;
+		uint4 *psrc = (uint4*) (&d_branch2[offset]);
+		uint4 *pdst = (uint4*) (&d_hash[offset]);
+		uint4 data;
+		data = psrc[0]; pdst[0] = data;
+		data = psrc[1]; pdst[1] = data;
+		data = psrc[2]; pdst[2] = data;
+		data = psrc[3]; pdst[3] = data;
+	}
+}
+
+__global__
+void phi_final_compress_gpu(const uint32_t threads, uint32_t* d_hash)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		const uint32_t offset = thread * 16U;
+		uint2 *psrc = (uint2*) (&d_hash[offset]);
+		uint2 *pdst = (uint2*) (&d_hash[offset]);
+		uint2 data;
+		data = psrc[4]; pdst[0] ^= data;
+		data = psrc[5]; pdst[1] ^= data;
+		data = psrc[6]; pdst[2] ^= data;
+		data = psrc[7]; pdst[3] ^= data;
+	}
+}
+
+__host__
+uint32_t phi_filter_cuda(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_br2, uint32_t* d_nonces)
+{
+	const uint32_t threadsperblock = 128;
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+	// extract algo permutation hashes to a second branch buffer
+	phi_filter_gpu <<<grid, block>>> (threads, inpHashes, d_br2, d_nonces);
+	return threads;
+}
+
+__host__
+void phi_merge_cuda(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_br2, uint32_t* d_nonces)
+{
+	const uint32_t threadsperblock = 128;
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+	// put back second branch hashes to the common buffer d_hash
+	phi_merge_gpu <<<grid, block>>> (threads, outpHashes, d_br2, d_nonces);
+}
+
+__host__
+void phi_final_compress_cuda(const int thr_id, const uint32_t threads, uint32_t *d_hashes)
+{
+	const uint32_t threadsperblock = 128;
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+	phi_final_compress_gpu <<<grid, block>>> (threads, d_hashes);
+}
diff --git a/x11/phi.cu b/phi/phi.cu
similarity index 97%
rename from x11/phi.cu
rename to phi/phi.cu
index ab1f308..ba2a967 100644
--- a/x11/phi.cu
+++ b/phi/phi.cu
@@ -19,7 +19,7 @@ extern "C" {
 
 #include "miner.h"
 #include "cuda_helper.h"
-#include "cuda_x11.h"
+#include "x11/cuda_x11.h"
 
 extern void skein512_cpu_setBlock_80(void *pdata);
 extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int swap);
@@ -38,7 +38,7 @@ extern void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash,
 static uint32_t *d_hash[MAX_GPUS];
 static uint32_t *d_resNonce[MAX_GPUS];
 
-extern "C" void phihash(void *output, 
const void *input) +extern "C" void phi_hash(void *output, const void *input) { unsigned char _ALIGN(128) hash[128] = { 0 }; @@ -162,7 +162,7 @@ extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, u uint32_t _ALIGN(64) vhash[8]; if (!use_compat_kernels[thr_id]) work->nonces[0] += startNonce; be32enc(&endiandata[19], work->nonces[0]); - phihash(vhash, endiandata); + phi_hash(vhash, endiandata); if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { work->valid_nonces = 1; @@ -173,7 +173,7 @@ extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, u if (work->nonces[1] != UINT32_MAX) { work->nonces[1] += startNonce; be32enc(&endiandata[19], work->nonces[1]); - phihash(vhash, endiandata); + phi_hash(vhash, endiandata); bn_set_target_ratio(work, vhash, 1); work->valid_nonces++; pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; diff --git a/phi/phi2.cu b/phi/phi2.cu new file mode 100644 index 0000000..537217f --- /dev/null +++ b/phi/phi2.cu @@ -0,0 +1,255 @@ +// +// PHI2 algo +// CubeHash + Lyra2 x2 + JH + Gost or Echo + Skein +// +// Implemented by tpruvot in May 2018 +// + +extern "C" { +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_streebog.h" +#include "sph/sph_echo.h" +#include "lyra2/Lyra2.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include "x11/cuda_x11.h" + +#include +#include + +extern void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata); +extern void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); + +extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); +extern void lyra2_cuda_hash_64(int thr_id, const uint32_t threads, uint64_t* d_hash_256, uint32_t* d_hash_512, bool gtx750ti); + +extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); +extern void phi_streebog_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t *g_hash, uint32_t *d_filter); +extern void phi_echo512_cpu_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t* g_hash, uint32_t* d_filter); + +extern uint32_t phi_filter_cuda(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_br2, uint32_t* d_nonces); +extern void phi_merge_cuda(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_br2, uint32_t* d_nonces); +extern void phi_final_compress_cuda(const int thr_id, const uint32_t threads, uint32_t *d_hashes); + +static uint64_t* d_matrix[MAX_GPUS]; +static uint32_t* d_hash_512[MAX_GPUS]; +static uint64_t* d_hash_256[MAX_GPUS]; +static uint32_t* d_hash_br2[MAX_GPUS]; +static uint32_t* d_nonce_br[MAX_GPUS]; + +extern "C" void phi2_hash(void *output, const void *input) +{ + unsigned char _ALIGN(128) hash[128] = { 0 }; + unsigned char _ALIGN(128) hashA[64] = { 0 }; + unsigned char _ALIGN(128) hashB[64] = { 0 }; + + sph_cubehash512_context ctx_cubehash; + sph_jh512_context ctx_jh; + sph_gost512_context ctx_gost; + sph_echo512_context ctx_echo; + sph_skein512_context ctx_skein; + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, input, 80); + sph_cubehash512_close(&ctx_cubehash, (void*)hashB); + + LYRA2(&hashA[ 0], 32, &hashB[ 0], 32, &hashB[ 0], 32, 1, 8, 8); + LYRA2(&hashA[32], 32, &hashB[32], 32, &hashB[32], 32, 1, 8, 8); + + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, (const void*)hashA, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + if (hash[0] & 1) { + sph_gost512_init(&ctx_gost); + 
sph_gost512(&ctx_gost, (const void*)hash, 64); + sph_gost512_close(&ctx_gost, (void*)hash); + } else { + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + } + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, (const void*)hash, 64); + sph_skein512_close(&ctx_skein, (void*)hash); + + for (int i=0; i<32; i++) + hash[i] ^= hash[i+32]; + + memcpy(output, hash, 32); +} + +//#define _DEBUG +#define _DEBUG_PREFIX "phi-" +#include "cuda_debug.cuh" + +static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; +static __thread bool gtx750ti = false; + +extern "C" int scanhash_phi2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + + const uint32_t first_nonce = pdata[19]; + const int dev_id = device_map[thr_id]; + + int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 17 : 16; + if (device_sm[dev_id] == 500) intensity = 15; + if (device_sm[dev_id] == 600) intensity = 17; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + if (init[thr_id]) throughput = max(throughput & 0xffffff80, 128); // for shared mem + + if (opt_benchmark) + ptarget[7] = 0xff; + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + gtx750ti = (strstr(device_name[dev_id], "GTX 750 Ti") != NULL); + + size_t matrix_sz = device_sm[dev_id] > 500 ? 
sizeof(uint64_t) * 16 : sizeof(uint64_t) * 8 * 8 * 3 * 4; + CUDA_CALL_OR_RET_X(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput), -1); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_256[thr_id], (size_t)32 * throughput), -1); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_512[thr_id], (size_t)64 * throughput), -1); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_nonce_br[thr_id], sizeof(uint32_t) * throughput), -1); + if (use_compat_kernels[thr_id]) { + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_br2[thr_id], (size_t)64 * throughput), -1); + } + + x11_cubehash512_cpu_init(thr_id, throughput); + lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]); + quark_jh512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + if (use_compat_kernels[thr_id]) x11_echo512_cpu_init(thr_id, throughput); + + cuda_check_cpu_init(thr_id, throughput); + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k = 0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + cuda_check_cpu_setTarget(ptarget); + cubehash512_setBlock_80(thr_id, endiandata); + + do { + int order = 0; + + cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash_512[thr_id]); order++; + TRACE("cube "); + + lyra2_cuda_hash_64(thr_id, throughput, d_hash_256[thr_id], d_hash_512[thr_id], gtx750ti); + order++; + TRACE("lyra "); + + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_512[thr_id], order++); + TRACE("jh "); + + order++; + if (!use_compat_kernels[thr_id]) { + phi_filter_cuda(thr_id, throughput, d_hash_512[thr_id], NULL, d_nonce_br[thr_id]); + phi_streebog_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]); + phi_echo512_cpu_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]); + phi_echo512_cpu_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]); + } else { + // todo: nonces vector to reduce amount of hashes to compute + phi_filter_cuda(thr_id, throughput, d_hash_512[thr_id], d_hash_br2[thr_id], d_nonce_br[thr_id]); + streebog_cpu_hash_64(thr_id, throughput, d_hash_512[thr_id]); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order); + phi_merge_cuda(thr_id, throughput, d_hash_512[thr_id], d_hash_br2[thr_id], d_nonce_br[thr_id]); + } + TRACE("mix "); + + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_512[thr_id], order++); + TRACE("skein "); + + phi_final_compress_cuda(thr_id, throughput, d_hash_512[thr_id]); + TRACE("xor "); + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash_512[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + phi2_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + *hashes_done = pdata[19] - first_nonce + throughput; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash_512[thr_id], 1); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + phi2_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + if (pdata[19] > max_nonce) pdata[19] = max_nonce; + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + 
gpu_increment_reject(thr_id);
+			if (!opt_quiet)
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! thr=%x", work->nonces[0], throughput);
+			pdata[19] = work->nonces[0] + 1;
+			continue;
+		}
+	}
+
+	if ((uint64_t)throughput + pdata[19] >= max_nonce) {
+		pdata[19] = max_nonce;
+		break;
+	}
+	pdata[19] += throughput;
+
+	} while (!work_restart[thr_id].restart);
+
+	*hashes_done = pdata[19] - first_nonce;
+	return 0;
+}
+
+// cleanup
+extern "C" void free_phi2(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaThreadSynchronize();
+	cudaFree(d_matrix[thr_id]);
+	cudaFree(d_hash_512[thr_id]);
+	cudaFree(d_hash_256[thr_id]);
+	cudaFree(d_nonce_br[thr_id]);
+	if (use_compat_kernels[thr_id]) cudaFree(d_hash_br2[thr_id]);
+
+	cuda_check_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
diff --git a/util.cpp b/util.cpp
index 70dc626..ee1c1ee 100644
--- a/util.cpp
+++ b/util.cpp
@@ -2250,7 +2250,7 @@ void print_hash_tests(void)
 	pentablakehash(&hash[0], &buf[0]);
 	printpfx("pentablake", hash);
 
-	phihash(&hash[0], &buf[0]);
+	phi2_hash(&hash[0], &buf[0]);
 	printpfx("phi", hash);
 
 	polytimos_hash(&hash[0], &buf[0]);
diff --git a/x11/cuda_streebog_maxwell.cu b/x11/cuda_streebog_maxwell.cu
index 6a06332..4ff580b 100644
--- a/x11/cuda_streebog_maxwell.cu
+++ b/x11/cuda_streebog_maxwell.cu
@@ -207,7 +207,7 @@ __launch_bounds__(TPB, 3)
 #else
 __launch_bounds__(TPB, 3)
 #endif
-void streebog_gpu_hash_64_maxwell(uint64_t *g_hash)
+void streebog_gpu_hash_64_sm5(uint64_t *g_hash, uint32_t* const d_filter, const uint32_t filter_val)
 {
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	uint2 buf[8], t[8], temp[8], K0[8], hash[8];
@@ -222,13 +222,16 @@
 	shared[6][threadIdx.x] = __ldg(&T62[threadIdx.x]);
 	shared[7][threadIdx.x] = __ldg(&T72[threadIdx.x]);
 
+	//__threadfence_block();
+	__syncthreads();
+
+	if (d_filter && d_filter[thread] != filter_val) return;
+
 	uint64_t* inout = &g_hash[thread<<3];
 
 	*(uint2x4*)&hash[0] = __ldg4((uint2x4*)&inout[0]);
 	*(uint2x4*)&hash[4] = __ldg4((uint2x4*)&inout[4]);
 
-	__threadfence_block();
-
 	K0[0] = vectorize(0x74a5d4ce2efc83b3);
 
 #pragma unroll 8
@@ -301,9 +304,17 @@
 }
 
 __host__
-void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *d_hash)
+void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *g_hash)
+{
+	dim3 grid((threads + TPB-1) / TPB);
+	dim3 block(TPB);
+	streebog_gpu_hash_64_sm5 <<<grid, block>>> ((uint64_t*)g_hash, NULL, 0);
+}
+
+__host__
+void phi_streebog_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t *g_hash, uint32_t *d_filter)
 {
 	dim3 grid((threads + TPB-1) / TPB);
 	dim3 block(TPB);
-	streebog_gpu_hash_64_maxwell <<<grid, block>>> ((uint64_t*)d_hash);
+	streebog_gpu_hash_64_sm5 <<<grid, block>>> ((uint64_t*)g_hash, d_filter, 1);
 }
diff --git a/x16/cuda_x16_echo512_64.cu b/x16/cuda_x16_echo512_64.cu
index ac18ff6..3a0f268 100644
--- a/x16/cuda_x16_echo512_64.cu
+++ b/x16/cuda_x16_echo512_64.cu
@@ -79,11 +79,12 @@ static void echo_round_alexis(const uint32_t sharedMemory[4][256], uint32_t *W,
 }
 
 __global__ __launch_bounds__(128, 5) /* will force 80 registers */
-static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
+static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t* g_hash, uint32_t* const d_filter, const uint32_t filter_val)
 {
 	__shared__ uint32_t sharedMemory[4][256];
 
 	aes_gpu_init128(sharedMemory);
+	__syncthreads();
 
 	const uint32_t thread = (blockDim.x * blockIdx.x + 
threadIdx.x);
 
 	uint32_t k0;
@@ -91,6 +92,9 @@ static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
 	uint32_t hash[16];
 	if (thread < threads)
 	{
+		// phi2 filter (2 hash chain branches)
+		if (d_filter && d_filter[thread] != filter_val) return;
+
 		uint32_t *Hash = &g_hash[thread<<4];
 
 		*(uint2x4*)&h[ 0] = __ldg4((uint2x4*)&Hash[ 0]);
@@ -99,8 +103,6 @@
 		*(uint2x4*)&hash[ 0] = *(uint2x4*)&h[ 0];
 		*(uint2x4*)&hash[ 8] = *(uint2x4*)&h[ 8];
 
-		__syncthreads();
-
 		const uint32_t P[48] = {
 			0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
 			0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, //8-12
@@ -217,7 +219,6 @@
 			W[48 + i + 4] = a ^ cd ^ bcx;
 			W[48 + i + 8] = d ^ ab ^ cdx;
 			W[48 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx;
-
 		}
 
 		for (int k = 1; k < 10; k++)
@@ -237,12 +238,23 @@
 }
 
 __host__
-void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash){
-
+void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash)
+{
 	const uint32_t threadsperblock = 128;
 
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	x16_echo512_gpu_hash_64<<<grid, block>>>(threads, d_hash);
+	x16_echo512_gpu_hash_64 <<<grid, block>>> (threads, d_hash, NULL, 0);
 }
+
+__host__
+void phi_echo512_cpu_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t* g_hash, uint32_t* d_filter)
+{
+	const uint32_t threadsperblock = 128;
+
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	x16_echo512_gpu_hash_64 <<<grid, block>>> (threads, g_hash, d_filter, 0);
+}
\ No newline at end of file
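
Note (illustration only, not part of the patch): the SM 3.x fallback added in cuda_lyra2_sm2.cuh converts between two buffer layouts. d_hash_512 stores one 512-bit hash per thread contiguously (16 uint32_t), while the lyra2 kernels expect their 256-bit input column-wise, with a stride of `threads` 64-bit words. A plain CPU model of the two helpers, with hypothetical names (the `_cpu` suffix and `u2` type are not in the repo):

#include <stddef.h>
#include <stdint.h>

typedef struct { uint32_t x, y; } u2; /* stand-in for CUDA's uint2 */

/* model of hash64_to_lyra32_gpu: round 0 reads the low 256 bits
 * (words 0..7 of the 512-bit hash), round 1 the high 256 bits (words 8..15) */
static void hash64_to_lyra32_cpu(uint32_t threads, const uint32_t *d_hash64,
                                 u2 *d_hash_lyra, uint32_t round)
{
	for (uint32_t thread = 0; thread < threads; thread++) {
		const size_t offset = (size_t) 16 * thread + (round * 8U);
		const u2 *psrc = (const u2*) &d_hash64[offset];
		for (uint32_t i = 0; i < 4; i++)
			d_hash_lyra[(size_t)threads * i + thread] = psrc[i]; /* column-wise */
	}
}

/* model of hash64_from_lyra32_gpu: the inverse copy, back into the 512-bit chain */
static void hash64_from_lyra32_cpu(uint32_t threads, uint32_t *d_hash64,
                                   const u2 *d_hash_lyra, uint32_t round)
{
	for (uint32_t thread = 0; thread < threads; thread++) {
		const size_t offset = (size_t) 16 * thread + (round * 8U);
		u2 *pdst = (u2*) &d_hash64[offset];
		for (uint32_t i = 0; i < 4; i++)
			pdst[i] = d_hash_lyra[(size_t)threads * i + thread];
	}
}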
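Note (illustration only, not part of the patch): the filter/merge kernels implement the phi2 branch on the first byte of the JH output, as in the phi2_hash CPU reference above. The sketch below mirrors that control flow; the function-pointer signatures are illustrative, not the repo's sph API:

#include <stdint.h>

/* gost512/echo512/skein512 stand for streebog-512, echo-512 and skein-512
 * compressing a 64-byte buffer in place (hypothetical callbacks) */
static void phi2_branch_and_fold(uint8_t hash[64],
                                 void (*gost512)(uint8_t*),
                                 void (*echo512)(uint8_t*),
                                 void (*skein512)(uint8_t*))
{
	if (hash[0] & 1) {         /* d_NonceBranch[thread] = first byte & 1 */
		gost512(hash);     /* streebog branch, filter_val = 1 */
	} else {
		echo512(hash);     /* echo branch runs twice, filter_val = 0 */
		echo512(hash);
	}
	skein512(hash);            /* skein runs on all hashes, unfiltered */
	for (int i = 0; i < 32; i++)
		hash[i] ^= hash[i + 32]; /* phi_final_compress: fold 512 -> 256 bits */
}

On pre-SM5 devices the same effect is obtained with the d_hash_br2 copy: phi_filter_gpu saves the echo-branch hashes aside, streebog then runs unfiltered over the whole buffer, echo runs twice on the saved copies, and phi_merge_gpu writes those copies back, discarding the unwanted streebog results.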
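Note (illustration only, not part of the patch): scanhash_phi2 masks the throughput with 0xffffff80 "for shared mem", i.e. the launch geometry is kept at a multiple of 128 threads with a floor of 128. An equivalent, more explicit form of that one-liner (helper name is hypothetical):

#include <stdint.h>

static uint32_t phi2_align_throughput(uint32_t t)
{
	t &= 0xffffff80u;           /* round down to a multiple of 128 */
	return (t < 128) ? 128 : t; /* same as max(t & 0xffffff80, 128) */
}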