mirror of
https://github.com/GOSTSec/ccminer
synced 2025-01-24 21:45:04 +00:00
phi2 algo
This commit is contained in:
parent
b8190e4aa7
commit
3d03a1b9fd
@ -81,7 +81,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
|
|||||||
x16/cuda_x16_shabal512.cu x16/cuda_x16_simd512_80.cu \
|
x16/cuda_x16_shabal512.cu x16/cuda_x16_simd512_80.cu \
|
||||||
x16/cuda_x16_echo512_64.cu \
|
x16/cuda_x16_echo512_64.cu \
|
||||||
x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \
|
x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \
|
||||||
x11/phi.cu x11/cuda_streebog_maxwell.cu \
|
phi/phi.cu phi/phi2.cu phi/cuda_phi2.cu x11/cuda_streebog_maxwell.cu \
|
||||||
x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu
|
x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu
|
||||||
|
|
||||||
# scrypt
|
# scrypt
|
||||||
|
2
algos.h
2
algos.h
@ -39,6 +39,7 @@ enum sha_algos {
|
|||||||
ALGO_NIST5,
|
ALGO_NIST5,
|
||||||
ALGO_PENTABLAKE,
|
ALGO_PENTABLAKE,
|
||||||
ALGO_PHI,
|
ALGO_PHI,
|
||||||
|
ALGO_PHI2,
|
||||||
ALGO_POLYTIMOS,
|
ALGO_POLYTIMOS,
|
||||||
ALGO_QUARK,
|
ALGO_QUARK,
|
||||||
ALGO_QUBIT,
|
ALGO_QUBIT,
|
||||||
@ -112,6 +113,7 @@ static const char *algo_names[] = {
|
|||||||
"nist5",
|
"nist5",
|
||||||
"penta",
|
"penta",
|
||||||
"phi",
|
"phi",
|
||||||
|
"phi2",
|
||||||
"polytimos",
|
"polytimos",
|
||||||
"quark",
|
"quark",
|
||||||
"qubit",
|
"qubit",
|
||||||
|
@ -269,7 +269,8 @@ Options:\n\
|
|||||||
neoscrypt FeatherCoin, Phoenix, UFO...\n\
|
neoscrypt FeatherCoin, Phoenix, UFO...\n\
|
||||||
nist5 NIST5 (TalkCoin)\n\
|
nist5 NIST5 (TalkCoin)\n\
|
||||||
penta Pentablake hash (5x Blake 512)\n\
|
penta Pentablake hash (5x Blake 512)\n\
|
||||||
phi BHCoin\n\
|
phi LUX initial algo\n\
|
||||||
|
phi2 LUX v2 with lyra2\n\
|
||||||
polytimos Politimos\n\
|
polytimos Politimos\n\
|
||||||
quark Quark\n\
|
quark Quark\n\
|
||||||
qubit Qubit\n\
|
qubit Qubit\n\
|
||||||
@ -1708,6 +1709,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
|
|||||||
case ALGO_LBRY:
|
case ALGO_LBRY:
|
||||||
case ALGO_LYRA2v2:
|
case ALGO_LYRA2v2:
|
||||||
case ALGO_LYRA2Z:
|
case ALGO_LYRA2Z:
|
||||||
|
case ALGO_PHI2:
|
||||||
case ALGO_TIMETRAVEL:
|
case ALGO_TIMETRAVEL:
|
||||||
case ALGO_BITCORE:
|
case ALGO_BITCORE:
|
||||||
case ALGO_X16R:
|
case ALGO_X16R:
|
||||||
@ -2245,6 +2247,7 @@ static void *miner_thread(void *userdata)
|
|||||||
case ALGO_HSR:
|
case ALGO_HSR:
|
||||||
case ALGO_LYRA2v2:
|
case ALGO_LYRA2v2:
|
||||||
case ALGO_PHI:
|
case ALGO_PHI:
|
||||||
|
case ALGO_PHI2:
|
||||||
case ALGO_POLYTIMOS:
|
case ALGO_POLYTIMOS:
|
||||||
case ALGO_S3:
|
case ALGO_S3:
|
||||||
case ALGO_SKUNK:
|
case ALGO_SKUNK:
|
||||||
@ -2436,6 +2439,9 @@ static void *miner_thread(void *userdata)
|
|||||||
case ALGO_PHI:
|
case ALGO_PHI:
|
||||||
rc = scanhash_phi(thr_id, &work, max_nonce, &hashes_done);
|
rc = scanhash_phi(thr_id, &work, max_nonce, &hashes_done);
|
||||||
break;
|
break;
|
||||||
|
case ALGO_PHI2:
|
||||||
|
rc = scanhash_phi2(thr_id, &work, max_nonce, &hashes_done);
|
||||||
|
break;
|
||||||
case ALGO_POLYTIMOS:
|
case ALGO_POLYTIMOS:
|
||||||
rc = scanhash_polytimos(thr_id, &work, max_nonce, &hashes_done);
|
rc = scanhash_polytimos(thr_id, &work, max_nonce, &hashes_done);
|
||||||
break;
|
break;
|
||||||
|
@ -525,6 +525,7 @@
|
|||||||
<CudaCompile Include="lyra2\lyra2REv2.cu" />
|
<CudaCompile Include="lyra2\lyra2REv2.cu" />
|
||||||
<CudaCompile Include="lyra2\cuda_lyra2v2.cu" />
|
<CudaCompile Include="lyra2\cuda_lyra2v2.cu" />
|
||||||
<ClInclude Include="lyra2\cuda_lyra2_sm2.cuh" />
|
<ClInclude Include="lyra2\cuda_lyra2_sm2.cuh" />
|
||||||
|
<ClInclude Include="lyra2\cuda_lyra2_sm5.cuh" />
|
||||||
<ClInclude Include="lyra2\cuda_lyra2v2_sm3.cuh" />
|
<ClInclude Include="lyra2\cuda_lyra2v2_sm3.cuh" />
|
||||||
<CudaCompile Include="lyra2\lyra2Z.cu" />
|
<CudaCompile Include="lyra2\lyra2Z.cu" />
|
||||||
<CudaCompile Include="lyra2\cuda_lyra2Z.cu" />
|
<CudaCompile Include="lyra2\cuda_lyra2Z.cu" />
|
||||||
@ -537,6 +538,9 @@
|
|||||||
<CudaCompile Include="cuda_skeincoin.cu">
|
<CudaCompile Include="cuda_skeincoin.cu">
|
||||||
<MaxRegCount>48</MaxRegCount>
|
<MaxRegCount>48</MaxRegCount>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
|
<CudaCompile Include="phi\phi.cu" />
|
||||||
|
<CudaCompile Include="phi\phi2.cu" />
|
||||||
|
<CudaCompile Include="phi\cuda_phi2.cu" />
|
||||||
<CudaCompile Include="skunk\skunk.cu" />
|
<CudaCompile Include="skunk\skunk.cu" />
|
||||||
<CudaCompile Include="skunk\cuda_skunk.cu">
|
<CudaCompile Include="skunk\cuda_skunk.cu">
|
||||||
<CodeGeneration>compute_50,sm_50;compute_52,sm_52</CodeGeneration>
|
<CodeGeneration>compute_50,sm_50;compute_52,sm_52</CodeGeneration>
|
||||||
@ -567,7 +571,6 @@
|
|||||||
<CudaCompile Include="x11\cuda_streebog_maxwell.cu" />
|
<CudaCompile Include="x11\cuda_streebog_maxwell.cu" />
|
||||||
<CudaCompile Include="x11\c11.cu" />
|
<CudaCompile Include="x11\c11.cu" />
|
||||||
<CudaCompile Include="x11\fresh.cu" />
|
<CudaCompile Include="x11\fresh.cu" />
|
||||||
<CudaCompile Include="x11\phi.cu" />
|
|
||||||
<CudaCompile Include="x11\sib.cu" />
|
<CudaCompile Include="x11\sib.cu" />
|
||||||
<CudaCompile Include="x11\s3.cu" />
|
<CudaCompile Include="x11\s3.cu" />
|
||||||
<CudaCompile Include="x11\timetravel.cu" />
|
<CudaCompile Include="x11\timetravel.cu" />
|
||||||
|
@ -115,7 +115,10 @@
|
|||||||
<Filter Include="Source Files\CUDA\tribus">
|
<Filter Include="Source Files\CUDA\tribus">
|
||||||
<UniqueIdentifier>{1e548d79-c217-4203-989a-a592fe2b2de3}</UniqueIdentifier>
|
<UniqueIdentifier>{1e548d79-c217-4203-989a-a592fe2b2de3}</UniqueIdentifier>
|
||||||
</Filter>
|
</Filter>
|
||||||
<Filter Include="Source Files\CUDA\x12">
|
<Filter Include="Source Files\CUDA\phi">
|
||||||
|
<UniqueIdentifier>{311e8d79-1612-4f0f-8591-23a592f2b2d3}</UniqueIdentifier>
|
||||||
|
</Filter>
|
||||||
|
<Filter Include="Source Files\CUDA\x12">
|
||||||
<UniqueIdentifier>{xde48d89-fx12-1323-129a-b592fe2b2de3}</UniqueIdentifier>
|
<UniqueIdentifier>{xde48d89-fx12-1323-129a-b592fe2b2de3}</UniqueIdentifier>
|
||||||
</Filter>
|
</Filter>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
@ -545,6 +548,9 @@
|
|||||||
<ClInclude Include="lyra2\cuda_lyra2_sm2.cuh">
|
<ClInclude Include="lyra2\cuda_lyra2_sm2.cuh">
|
||||||
<Filter>Source Files\CUDA\lyra2</Filter>
|
<Filter>Source Files\CUDA\lyra2</Filter>
|
||||||
</ClInclude>
|
</ClInclude>
|
||||||
|
<ClInclude Include="lyra2\cuda_lyra2_sm5.cuh">
|
||||||
|
<Filter>Source Files\CUDA\lyra2</Filter>
|
||||||
|
</ClInclude>
|
||||||
<ClInclude Include="lyra2\cuda_lyra2Z_sm5.cuh">
|
<ClInclude Include="lyra2\cuda_lyra2Z_sm5.cuh">
|
||||||
<Filter>Source Files\CUDA\lyra2</Filter>
|
<Filter>Source Files\CUDA\lyra2</Filter>
|
||||||
</ClInclude>
|
</ClInclude>
|
||||||
@ -781,6 +787,15 @@
|
|||||||
<CudaCompile Include="polytimos.cu">
|
<CudaCompile Include="polytimos.cu">
|
||||||
<Filter>Source Files\CUDA</Filter>
|
<Filter>Source Files\CUDA</Filter>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
|
<CudaCompile Include="phi\phi.cu">
|
||||||
|
<Filter>Source Files\CUDA\phi</Filter>
|
||||||
|
</CudaCompile>
|
||||||
|
<CudaCompile Include="phi\phi2.cu">
|
||||||
|
<Filter>Source Files\CUDA\phi</Filter>
|
||||||
|
</CudaCompile>
|
||||||
|
<CudaCompile Include="phi\cuda_phi2.cu">
|
||||||
|
<Filter>Source Files\CUDA\phi</Filter>
|
||||||
|
</CudaCompile>
|
||||||
<CudaCompile Include="skunk\skunk.cu">
|
<CudaCompile Include="skunk\skunk.cu">
|
||||||
<Filter>Source Files\CUDA\skunk</Filter>
|
<Filter>Source Files\CUDA\skunk</Filter>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
@ -799,9 +814,6 @@
|
|||||||
<ClInclude Include="tribus\cuda_echo512_aes.cuh">
|
<ClInclude Include="tribus\cuda_echo512_aes.cuh">
|
||||||
<Filter>Source Files\CUDA\tribus</Filter>
|
<Filter>Source Files\CUDA\tribus</Filter>
|
||||||
</ClInclude>
|
</ClInclude>
|
||||||
<CudaCompile Include="x11\phi.cu">
|
|
||||||
<Filter>Source Files\CUDA\x11</Filter>
|
|
||||||
</CudaCompile>
|
|
||||||
<CudaCompile Include="x11\sib.cu">
|
<CudaCompile Include="x11\sib.cu">
|
||||||
<Filter>Source Files\CUDA\x11</Filter>
|
<Filter>Source Files\CUDA\x11</Filter>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
/**
|
/**
|
||||||
* Lyra2 (v1) cuda implementation based on djm34 work
|
* Lyra2 (v1) cuda implementation based on djm34 work
|
||||||
* tpruvot@github 2015, Nanashi 08/2016 (from 1.8-r2)
|
* tpruvot@github 2015, Nanashi 08/2016 (from 1.8-r2)
|
||||||
|
* tpruvot@github 2018 for phi2 double lyra2-32 support
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
@ -228,9 +229,7 @@ void reduceDuplex(uint2 state[4], uint32_t thread, const uint32_t threads)
|
|||||||
{
|
{
|
||||||
uint2 state1[3];
|
uint2 state1[3];
|
||||||
|
|
||||||
#if __CUDA_ARCH__ > 500
|
#pragma unroll
|
||||||
#pragma unroll
|
|
||||||
#endif
|
|
||||||
for (int i = 0; i < Nrow; i++)
|
for (int i = 0; i < Nrow; i++)
|
||||||
{
|
{
|
||||||
ST4S(0, Ncol - i - 1, state, thread, threads);
|
ST4S(0, Ncol - i - 1, state, thread, threads);
|
||||||
@ -305,7 +304,7 @@ void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uin
|
|||||||
LD4S(state1, rowIn, i, thread, threads);
|
LD4S(state1, rowIn, i, thread, threads);
|
||||||
LD4S(state2, rowInOut, i, thread, threads);
|
LD4S(state2, rowInOut, i, thread, threads);
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int j = 0; j < 3; j++)
|
for (int j = 0; j < 3; j++)
|
||||||
state[j] ^= state1[j] + state2[j];
|
state[j] ^= state1[j] + state2[j];
|
||||||
|
|
||||||
@ -334,7 +333,7 @@ void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uin
|
|||||||
|
|
||||||
LD4S(state1, rowOut, i, thread, threads);
|
LD4S(state1, rowOut, i, thread, threads);
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int j = 0; j < 3; j++)
|
for (int j = 0; j < 3; j++)
|
||||||
state1[j] ^= state[j];
|
state1[j] ^= state[j];
|
||||||
|
|
||||||
@ -412,11 +411,9 @@ __global__ __launch_bounds__(64, 1)
|
|||||||
void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash)
|
void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash)
|
||||||
{
|
{
|
||||||
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
|
|
||||||
if (thread < threads)
|
if (thread < threads)
|
||||||
{
|
{
|
||||||
uint2x4 state[4];
|
uint2x4 state[4];
|
||||||
|
|
||||||
state[0].x = state[1].x = __ldg(&g_hash[thread + threads * 0]);
|
state[0].x = state[1].x = __ldg(&g_hash[thread + threads * 0]);
|
||||||
state[0].y = state[1].y = __ldg(&g_hash[thread + threads * 1]);
|
state[0].y = state[1].y = __ldg(&g_hash[thread + threads * 1]);
|
||||||
state[0].z = state[1].z = __ldg(&g_hash[thread + threads * 2]);
|
state[0].z = state[1].z = __ldg(&g_hash[thread + threads * 2]);
|
||||||
@ -436,10 +433,9 @@ void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash)
|
|||||||
|
|
||||||
__global__
|
__global__
|
||||||
__launch_bounds__(TPB52, 1)
|
__launch_bounds__(TPB52, 1)
|
||||||
void lyra2_gpu_hash_32_2(uint32_t threads, uint64_t *g_hash)
|
void lyra2_gpu_hash_32_2(const uint32_t threads, uint64_t *g_hash)
|
||||||
{
|
{
|
||||||
const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y;
|
const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y;
|
||||||
|
|
||||||
if (thread < threads)
|
if (thread < threads)
|
||||||
{
|
{
|
||||||
uint2 state[4];
|
uint2 state[4];
|
||||||
@ -484,11 +480,9 @@ __global__ __launch_bounds__(64, 1)
|
|||||||
void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash)
|
void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash)
|
||||||
{
|
{
|
||||||
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
|
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
|
||||||
|
|
||||||
uint28 state[4];
|
|
||||||
|
|
||||||
if (thread < threads)
|
if (thread < threads)
|
||||||
{
|
{
|
||||||
|
uint2x4 state[4];
|
||||||
state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]);
|
state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]);
|
||||||
state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]);
|
state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]);
|
||||||
state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]);
|
state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]);
|
||||||
@ -501,7 +495,57 @@ void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash)
|
|||||||
g_hash[thread + threads * 1] = state[0].y;
|
g_hash[thread + threads * 1] = state[0].y;
|
||||||
g_hash[thread + threads * 2] = state[0].z;
|
g_hash[thread + threads * 2] = state[0].z;
|
||||||
g_hash[thread + threads * 3] = state[0].w;
|
g_hash[thread + threads * 3] = state[0].w;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ __launch_bounds__(64, 1)
|
||||||
|
void lyra2_gpu_hash_64_1(uint32_t threads, uint2* const d_hash_512, const uint32_t round)
|
||||||
|
{
|
||||||
|
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
|
if (thread < threads)
|
||||||
|
{
|
||||||
|
uint2x4 state[4];
|
||||||
|
const size_t offset = (size_t)8 * thread + (round * 4U);
|
||||||
|
uint2 *psrc = (uint2*)(&d_hash_512[offset]);
|
||||||
|
state[0].x = state[1].x = __ldg(&psrc[0]);
|
||||||
|
state[0].y = state[1].y = __ldg(&psrc[1]);
|
||||||
|
state[0].z = state[1].z = __ldg(&psrc[2]);
|
||||||
|
state[0].w = state[1].w = __ldg(&psrc[3]);
|
||||||
|
state[2] = blake2b_IV[0];
|
||||||
|
state[3] = blake2b_IV[1];
|
||||||
|
|
||||||
|
for (int i = 0; i<24; i++)
|
||||||
|
round_lyra(state);
|
||||||
|
|
||||||
|
((uint2x4*)DMatrix)[threads * 0 + thread] = state[0];
|
||||||
|
((uint2x4*)DMatrix)[threads * 1 + thread] = state[1];
|
||||||
|
((uint2x4*)DMatrix)[threads * 2 + thread] = state[2];
|
||||||
|
((uint2x4*)DMatrix)[threads * 3 + thread] = state[3];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ __launch_bounds__(64, 1)
|
||||||
|
void lyra2_gpu_hash_64_3(uint32_t threads, uint2 *d_hash_512, const uint32_t round)
|
||||||
|
{
|
||||||
|
// This kernel outputs 2x 256-bits hashes in 512-bits chain offsets in 2 rounds
|
||||||
|
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
|
||||||
|
if (thread < threads)
|
||||||
|
{
|
||||||
|
uint2x4 state[4];
|
||||||
|
state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]);
|
||||||
|
state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]);
|
||||||
|
state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]);
|
||||||
|
state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]);
|
||||||
|
|
||||||
|
for (int i = 0; i < 12; i++)
|
||||||
|
round_lyra(state);
|
||||||
|
|
||||||
|
const size_t offset = (size_t)8 * thread + (round * 4U);
|
||||||
|
uint2 *pdst = (uint2*)(&d_hash_512[offset]);
|
||||||
|
pdst[0] = state[0].x;
|
||||||
|
pdst[1] = state[0].y;
|
||||||
|
pdst[2] = state[0].z;
|
||||||
|
pdst[3] = state[0].w;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
@ -513,6 +557,8 @@ __device__ void* DMatrix;
|
|||||||
__global__ void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash) {}
|
__global__ void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash) {}
|
||||||
__global__ void lyra2_gpu_hash_32_2(uint32_t threads, uint64_t *g_hash) {}
|
__global__ void lyra2_gpu_hash_32_2(uint32_t threads, uint64_t *g_hash) {}
|
||||||
__global__ void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash) {}
|
__global__ void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash) {}
|
||||||
|
__global__ void lyra2_gpu_hash_64_1(uint32_t threads, uint2* const d_hash_512, const uint32_t round) {}
|
||||||
|
__global__ void lyra2_gpu_hash_64_3(uint32_t threads, uint2 *d_hash_512, const uint32_t round) {}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
__host__
|
__host__
|
||||||
@ -545,9 +591,7 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_hash, bool gtx7
|
|||||||
if (cuda_arch[dev_id] >= 520)
|
if (cuda_arch[dev_id] >= 520)
|
||||||
{
|
{
|
||||||
lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash);
|
lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash);
|
||||||
|
|
||||||
lyra2_gpu_hash_32_2 <<< grid1, block1, 24 * (8 - 0) * sizeof(uint2) * tpb >>> (threads, d_hash);
|
lyra2_gpu_hash_32_2 <<< grid1, block1, 24 * (8 - 0) * sizeof(uint2) * tpb >>> (threads, d_hash);
|
||||||
|
|
||||||
lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash);
|
lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash);
|
||||||
}
|
}
|
||||||
else if (cuda_arch[dev_id] >= 500)
|
else if (cuda_arch[dev_id] >= 500)
|
||||||
@ -562,11 +606,57 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_hash, bool gtx7
|
|||||||
shared_mem = 6144;
|
shared_mem = 6144;
|
||||||
|
|
||||||
lyra2_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash);
|
lyra2_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash);
|
||||||
|
|
||||||
lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash);
|
lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash);
|
||||||
|
|
||||||
lyra2_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash);
|
lyra2_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
lyra2_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, d_hash);
|
lyra2_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, d_hash);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__host__
|
||||||
|
void lyra2_cuda_hash_64(int thr_id, const uint32_t threads, uint64_t* d_hash_256, uint32_t* d_hash_512, bool gtx750ti)
|
||||||
|
{
|
||||||
|
int dev_id = device_map[thr_id % MAX_GPUS];
|
||||||
|
uint32_t tpb = TPB52;
|
||||||
|
if (cuda_arch[dev_id] >= 520) tpb = TPB52;
|
||||||
|
else if (cuda_arch[dev_id] >= 500) tpb = TPB50;
|
||||||
|
else if (cuda_arch[dev_id] >= 200) tpb = TPB20;
|
||||||
|
|
||||||
|
dim3 grid1((size_t(threads) * 4 + tpb - 1) / tpb);
|
||||||
|
dim3 block1(4, tpb >> 2);
|
||||||
|
|
||||||
|
dim3 grid2((threads + 64 - 1) / 64);
|
||||||
|
dim3 block2(64);
|
||||||
|
|
||||||
|
if (cuda_arch[dev_id] >= 520)
|
||||||
|
{
|
||||||
|
const size_t shared_mem = sizeof(uint2) * tpb * 192; // 49152;
|
||||||
|
lyra2_gpu_hash_64_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0);
|
||||||
|
lyra2_gpu_hash_32_2 <<< grid1, block1, shared_mem >>> (threads, d_hash_256);
|
||||||
|
lyra2_gpu_hash_64_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0);
|
||||||
|
|
||||||
|
lyra2_gpu_hash_64_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1);
|
||||||
|
lyra2_gpu_hash_32_2 <<< grid1, block1, shared_mem >>> (threads, d_hash_256);
|
||||||
|
lyra2_gpu_hash_64_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1);
|
||||||
|
}
|
||||||
|
else if (cuda_arch[dev_id] >= 500)
|
||||||
|
{
|
||||||
|
size_t shared_mem = gtx750ti ? 8192 : 6144; // 8 or 10 warps
|
||||||
|
lyra2_gpu_hash_64_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0);
|
||||||
|
lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash_256);
|
||||||
|
lyra2_gpu_hash_64_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0);
|
||||||
|
|
||||||
|
lyra2_gpu_hash_64_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1);
|
||||||
|
lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash_256);
|
||||||
|
lyra2_gpu_hash_64_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// alternative method for SM 3.x
|
||||||
|
hash64_to_lyra32(thr_id, threads, d_hash_512, d_hash_256, 0);
|
||||||
|
lyra2_cpu_hash_32(thr_id, threads, d_hash_256, gtx750ti);
|
||||||
|
hash64_from_lyra32(thr_id, threads, d_hash_512, d_hash_256, 0);
|
||||||
|
hash64_to_lyra32(thr_id, threads, d_hash_512, d_hash_256, 1);
|
||||||
|
lyra2_cpu_hash_32(thr_id, threads, d_hash_256, gtx750ti);
|
||||||
|
hash64_from_lyra32(thr_id, threads, d_hash_512, d_hash_256, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -3,7 +3,7 @@
|
|||||||
#ifdef __INTELLISENSE__
|
#ifdef __INTELLISENSE__
|
||||||
/* just for vstudio code colors, only uncomment that temporary, dont commit it */
|
/* just for vstudio code colors, only uncomment that temporary, dont commit it */
|
||||||
//#undef __CUDA_ARCH__
|
//#undef __CUDA_ARCH__
|
||||||
//#define __CUDA_ARCH__ 500
|
//#define __CUDA_ARCH__ 300
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "cuda_helper.h"
|
#include "cuda_helper.h"
|
||||||
@ -226,3 +226,66 @@ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint64_t *g_hash)
|
|||||||
/* if __CUDA_ARCH__ < 200 .. host */
|
/* if __CUDA_ARCH__ < 200 .. host */
|
||||||
__global__ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint64_t *g_hash) {}
|
__global__ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint64_t *g_hash) {}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// -------------------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// lyra2 cant be used as-is in 512-bits hash chains, tx to djm for these weird offsets since first lyra2 algo...
|
||||||
|
|
||||||
|
#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ <= 350
|
||||||
|
|
||||||
|
__global__ __launch_bounds__(128, 8)
|
||||||
|
void hash64_to_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round)
|
||||||
|
{
|
||||||
|
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
|
if (thread < threads)
|
||||||
|
{
|
||||||
|
const size_t offset = (size_t) 16 * thread + (round * 8U);
|
||||||
|
uint2 *psrc = (uint2*) (&d_hash64[offset]);
|
||||||
|
uint2 *pdst = (uint2*) (&d_hash_lyra[thread]);
|
||||||
|
pdst[threads*0] = __ldg(&psrc[0]);
|
||||||
|
pdst[threads*1] = __ldg(&psrc[1]);
|
||||||
|
pdst[threads*2] = __ldg(&psrc[2]);
|
||||||
|
pdst[threads*3] = __ldg(&psrc[3]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ __launch_bounds__(128, 8)
|
||||||
|
void hash64_from_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round)
|
||||||
|
{
|
||||||
|
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
|
if (thread < threads)
|
||||||
|
{
|
||||||
|
const size_t offset = (size_t) 16 * thread + (round * 8U);
|
||||||
|
uint2 *psrc = (uint2*) (&d_hash_lyra[thread]);
|
||||||
|
uint2 *pdst = (uint2*) (&d_hash64[offset]);
|
||||||
|
pdst[0] = psrc[0];
|
||||||
|
pdst[1] = psrc[threads*1];
|
||||||
|
pdst[2] = psrc[threads*2];
|
||||||
|
pdst[3] = psrc[threads*3];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
/* if __CUDA_ARCH__ < 200 .. host */
|
||||||
|
__global__ void hash64_to_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) {}
|
||||||
|
__global__ void hash64_from_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) {}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
__host__
|
||||||
|
void hash64_to_lyra32(int thr_id, const uint32_t threads, uint32_t* d_hash64, uint64_t* d_hash_lyra, const uint32_t round)
|
||||||
|
{
|
||||||
|
const uint32_t threadsperblock = 128;
|
||||||
|
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
|
||||||
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
|
hash64_to_lyra32_gpu <<<grid, block>>> (threads, d_hash64, (uint2*) d_hash_lyra, round);
|
||||||
|
}
|
||||||
|
|
||||||
|
__host__
|
||||||
|
void hash64_from_lyra32(int thr_id, const uint32_t threads, uint32_t* d_hash64, uint64_t* d_hash_lyra, const uint32_t round)
|
||||||
|
{
|
||||||
|
const uint32_t threadsperblock = 128;
|
||||||
|
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
|
||||||
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
|
hash64_from_lyra32_gpu <<<grid, block>>> (threads, d_hash64, (uint2*) d_hash_lyra, round);
|
||||||
|
}
|
||||||
|
@ -591,13 +591,12 @@ void reduceDuplexRowV50_8(const int rowInOut, uint2 state[4], const uint32_t thr
|
|||||||
__global__ __launch_bounds__(64, 1)
|
__global__ __launch_bounds__(64, 1)
|
||||||
void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint2 *g_hash)
|
void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint2 *g_hash)
|
||||||
{
|
{
|
||||||
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
|
||||||
|
|
||||||
const uint2x4 blake2b_IV[2] = {
|
const uint2x4 blake2b_IV[2] = {
|
||||||
{ { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } },
|
{ { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } },
|
||||||
{ { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } }
|
{ { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
if (thread < threads)
|
if (thread < threads)
|
||||||
{
|
{
|
||||||
uint2x4 state[4];
|
uint2x4 state[4];
|
||||||
@ -629,7 +628,6 @@ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint2 *g_hash)
|
|||||||
if (thread < threads)
|
if (thread < threads)
|
||||||
{
|
{
|
||||||
uint2 state[4];
|
uint2 state[4];
|
||||||
|
|
||||||
state[0] = __ldg(&DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x]);
|
state[0] = __ldg(&DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x]);
|
||||||
state[1] = __ldg(&DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x]);
|
state[1] = __ldg(&DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x]);
|
||||||
state[2] = __ldg(&DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x]);
|
state[2] = __ldg(&DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x]);
|
||||||
@ -669,7 +667,6 @@ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash)
|
|||||||
if (thread < threads)
|
if (thread < threads)
|
||||||
{
|
{
|
||||||
uint2x4 state[4];
|
uint2x4 state[4];
|
||||||
|
|
||||||
state[0] = __ldg4(&((uint2x4*)DMatrix)[0 * threads + thread]);
|
state[0] = __ldg4(&((uint2x4*)DMatrix)[0 * threads + thread]);
|
||||||
state[1] = __ldg4(&((uint2x4*)DMatrix)[1 * threads + thread]);
|
state[1] = __ldg4(&((uint2x4*)DMatrix)[1 * threads + thread]);
|
||||||
state[2] = __ldg4(&((uint2x4*)DMatrix)[2 * threads + thread]);
|
state[2] = __ldg4(&((uint2x4*)DMatrix)[2 * threads + thread]);
|
||||||
@ -685,9 +682,68 @@ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__global__ __launch_bounds__(64, 1)
|
||||||
|
void lyra2_gpu_hash_64_1_sm5(uint32_t threads, uint2* const d_hash_512, const uint32_t round)
|
||||||
|
{
|
||||||
|
const uint2x4 blake2b_IV[2] = {
|
||||||
|
{ { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } },
|
||||||
|
{ { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } }
|
||||||
|
};
|
||||||
|
// This kernel loads 2x 256-bits hashes from 512-bits chain offsets in 2 steps
|
||||||
|
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
|
if (thread < threads)
|
||||||
|
{
|
||||||
|
uint2x4 state[4];
|
||||||
|
const size_t offset = (size_t)8 * thread + (round * 4U);
|
||||||
|
uint2 *psrc = (uint2*)(&d_hash_512[offset]);
|
||||||
|
state[0].x = state[1].x = __ldg(&psrc[0]);
|
||||||
|
state[0].y = state[1].y = __ldg(&psrc[1]);
|
||||||
|
state[0].z = state[1].z = __ldg(&psrc[2]);
|
||||||
|
state[0].w = state[1].w = __ldg(&psrc[3]);
|
||||||
|
|
||||||
|
state[1] = state[0];
|
||||||
|
state[2] = blake2b_IV[0];
|
||||||
|
state[3] = blake2b_IV[1];
|
||||||
|
|
||||||
|
for (int i = 0; i<24; i++)
|
||||||
|
round_lyra(state);
|
||||||
|
|
||||||
|
((uint2x4*)DMatrix)[threads * 0 + thread] = state[0];
|
||||||
|
((uint2x4*)DMatrix)[threads * 1 + thread] = state[1];
|
||||||
|
((uint2x4*)DMatrix)[threads * 2 + thread] = state[2];
|
||||||
|
((uint2x4*)DMatrix)[threads * 3 + thread] = state[3];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ __launch_bounds__(64, 1)
|
||||||
|
void lyra2_gpu_hash_64_3_sm5(uint32_t threads, uint2 *d_hash_512, const uint32_t round)
|
||||||
|
{
|
||||||
|
// This kernel outputs 2x 256-bits hashes in 512-bits chain offsets in 2 steps
|
||||||
|
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
|
||||||
|
if (thread < threads)
|
||||||
|
{
|
||||||
|
uint2x4 state[4];
|
||||||
|
state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]);
|
||||||
|
state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]);
|
||||||
|
state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]);
|
||||||
|
state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]);
|
||||||
|
|
||||||
|
for (int i = 0; i < 12; i++)
|
||||||
|
round_lyra(state);
|
||||||
|
|
||||||
|
const size_t offset = (size_t)8 * thread + (round * 4U);
|
||||||
|
uint2 *pdst = (uint2*)(&d_hash_512[offset]);
|
||||||
|
pdst[0] = state[0].x;
|
||||||
|
pdst[1] = state[0].y;
|
||||||
|
pdst[2] = state[0].z;
|
||||||
|
pdst[3] = state[0].w;
|
||||||
|
}
|
||||||
|
}
|
||||||
#else
|
#else
|
||||||
/* if __CUDA_ARCH__ != 500 .. host */
|
/* if __CUDA_ARCH__ != 500 .. host */
|
||||||
__global__ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint2 *g_hash) {}
|
__global__ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint2 *g_hash) {}
|
||||||
__global__ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint2 *g_hash) {}
|
__global__ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint2 *g_hash) {}
|
||||||
__global__ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash) {}
|
__global__ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash) {}
|
||||||
|
__global__ void lyra2_gpu_hash_64_1_sm5(uint32_t threads, uint2* const d_hash_512, const uint32_t round) {}
|
||||||
|
__global__ void lyra2_gpu_hash_64_3_sm5(uint32_t threads, uint2 *d_hash_512, const uint32_t round) {}
|
||||||
#endif
|
#endif
|
||||||
|
5
miner.h
5
miner.h
@ -303,6 +303,7 @@ extern int scanhash_neoscrypt(int thr_id, struct work *work, uint32_t max_nonce,
|
|||||||
extern int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
|
extern int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
|
||||||
extern int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
|
extern int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
|
||||||
extern int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
|
extern int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
|
||||||
|
extern int scanhash_phi2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
|
||||||
extern int scanhash_polytimos(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
|
extern int scanhash_polytimos(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
|
||||||
extern int scanhash_quark(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
|
extern int scanhash_quark(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
|
||||||
extern int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
|
extern int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
|
||||||
@ -371,6 +372,7 @@ extern void free_neoscrypt(int thr_id);
|
|||||||
extern void free_nist5(int thr_id);
|
extern void free_nist5(int thr_id);
|
||||||
extern void free_pentablake(int thr_id);
|
extern void free_pentablake(int thr_id);
|
||||||
extern void free_phi(int thr_id);
|
extern void free_phi(int thr_id);
|
||||||
|
extern void free_phi2(int thr_id);
|
||||||
extern void free_polytimos(int thr_id);
|
extern void free_polytimos(int thr_id);
|
||||||
extern void free_quark(int thr_id);
|
extern void free_quark(int thr_id);
|
||||||
extern void free_qubit(int thr_id);
|
extern void free_qubit(int thr_id);
|
||||||
@ -918,7 +920,8 @@ void myriadhash(void *state, const void *input);
|
|||||||
void neoscrypt(uchar *output, const uchar *input, uint32_t profile);
|
void neoscrypt(uchar *output, const uchar *input, uint32_t profile);
|
||||||
void nist5hash(void *state, const void *input);
|
void nist5hash(void *state, const void *input);
|
||||||
void pentablakehash(void *output, const void *input);
|
void pentablakehash(void *output, const void *input);
|
||||||
void phihash(void *output, const void *input);
|
void phi_hash(void *output, const void *input);
|
||||||
|
void phi2_hash(void *output, const void *input);
|
||||||
void polytimos_hash(void *output, const void *input);
|
void polytimos_hash(void *output, const void *input);
|
||||||
void quarkhash(void *state, const void *input);
|
void quarkhash(void *state, const void *input);
|
||||||
void qubithash(void *state, const void *input);
|
void qubithash(void *state, const void *input);
|
||||||
|
89
phi/cuda_phi2.cu
Normal file
89
phi/cuda_phi2.cu
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
#include <stdio.h>
|
||||||
|
#include <memory.h>
|
||||||
|
|
||||||
|
#include "cuda_helper.h"
|
||||||
|
|
||||||
|
__global__ __launch_bounds__(128, 8)
|
||||||
|
void phi_filter_gpu(const uint32_t threads, const uint32_t* d_hash, uint32_t* d_branch2, uint32_t* d_NonceBranch)
|
||||||
|
{
|
||||||
|
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
|
if (thread < threads)
|
||||||
|
{
|
||||||
|
const uint32_t offset = thread * 16U; // 64U / sizeof(uint32_t);
|
||||||
|
uint4 *psrc = (uint4*) (&d_hash[offset]);
|
||||||
|
d_NonceBranch[thread] = ((uint8_t*)psrc)[0] & 1;
|
||||||
|
if (d_NonceBranch[thread]) return;
|
||||||
|
if (d_branch2) {
|
||||||
|
uint4 *pdst = (uint4*)(&d_branch2[offset]);
|
||||||
|
uint4 data;
|
||||||
|
data = psrc[0]; pdst[0] = data;
|
||||||
|
data = psrc[1]; pdst[1] = data;
|
||||||
|
data = psrc[2]; pdst[2] = data;
|
||||||
|
data = psrc[3]; pdst[3] = data;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ __launch_bounds__(128, 8)
|
||||||
|
void phi_merge_gpu(const uint32_t threads, uint32_t* d_hash, uint32_t* d_branch2, uint32_t* const d_NonceBranch)
|
||||||
|
{
|
||||||
|
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
|
if (thread < threads && !d_NonceBranch[thread])
|
||||||
|
{
|
||||||
|
const uint32_t offset = thread * 16U;
|
||||||
|
uint4 *psrc = (uint4*) (&d_branch2[offset]);
|
||||||
|
uint4 *pdst = (uint4*) (&d_hash[offset]);
|
||||||
|
uint4 data;
|
||||||
|
data = psrc[0]; pdst[0] = data;
|
||||||
|
data = psrc[1]; pdst[1] = data;
|
||||||
|
data = psrc[2]; pdst[2] = data;
|
||||||
|
data = psrc[3]; pdst[3] = data;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__
|
||||||
|
void phi_final_compress_gpu(const uint32_t threads, uint32_t* d_hash)
|
||||||
|
{
|
||||||
|
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
|
if (thread < threads)
|
||||||
|
{
|
||||||
|
const uint32_t offset = thread * 16U;
|
||||||
|
uint2 *psrc = (uint2*) (&d_hash[offset]);
|
||||||
|
uint2 *pdst = (uint2*) (&d_hash[offset]);
|
||||||
|
uint2 data;
|
||||||
|
data = psrc[4]; pdst[0] ^= data;
|
||||||
|
data = psrc[5]; pdst[1] ^= data;
|
||||||
|
data = psrc[6]; pdst[2] ^= data;
|
||||||
|
data = psrc[7]; pdst[3] ^= data;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__host__
|
||||||
|
uint32_t phi_filter_cuda(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_br2, uint32_t* d_nonces)
|
||||||
|
{
|
||||||
|
const uint32_t threadsperblock = 128;
|
||||||
|
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
|
||||||
|
dim3 block(threadsperblock);
|
||||||
|
// extract algo permution hashes to a second branch buffer
|
||||||
|
phi_filter_gpu <<<grid, block>>> (threads, inpHashes, d_br2, d_nonces);
|
||||||
|
return threads;
|
||||||
|
}
|
||||||
|
|
||||||
|
__host__
|
||||||
|
void phi_merge_cuda(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_br2, uint32_t* d_nonces)
|
||||||
|
{
|
||||||
|
const uint32_t threadsperblock = 128;
|
||||||
|
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
|
||||||
|
dim3 block(threadsperblock);
|
||||||
|
// put back second branch hashes to the common buffer d_hash
|
||||||
|
phi_merge_gpu <<<grid, block>>> (threads, outpHashes, d_br2, d_nonces);
|
||||||
|
}
|
||||||
|
|
||||||
|
__host__
|
||||||
|
void phi_final_compress_cuda(const int thr_id, const uint32_t threads, uint32_t *d_hashes)
|
||||||
|
{
|
||||||
|
const uint32_t threadsperblock = 128;
|
||||||
|
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
|
||||||
|
dim3 block(threadsperblock);
|
||||||
|
phi_final_compress_gpu <<<grid, block>>> (threads, d_hashes);
|
||||||
|
}
|
@ -19,7 +19,7 @@ extern "C" {
|
|||||||
|
|
||||||
#include "miner.h"
|
#include "miner.h"
|
||||||
#include "cuda_helper.h"
|
#include "cuda_helper.h"
|
||||||
#include "cuda_x11.h"
|
#include "x11/cuda_x11.h"
|
||||||
|
|
||||||
extern void skein512_cpu_setBlock_80(void *pdata);
|
extern void skein512_cpu_setBlock_80(void *pdata);
|
||||||
extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int swap);
|
extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int swap);
|
||||||
@ -38,7 +38,7 @@ extern void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash,
|
|||||||
static uint32_t *d_hash[MAX_GPUS];
|
static uint32_t *d_hash[MAX_GPUS];
|
||||||
static uint32_t *d_resNonce[MAX_GPUS];
|
static uint32_t *d_resNonce[MAX_GPUS];
|
||||||
|
|
||||||
extern "C" void phihash(void *output, const void *input)
|
extern "C" void phi_hash(void *output, const void *input)
|
||||||
{
|
{
|
||||||
unsigned char _ALIGN(128) hash[128] = { 0 };
|
unsigned char _ALIGN(128) hash[128] = { 0 };
|
||||||
|
|
||||||
@ -162,7 +162,7 @@ extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, u
|
|||||||
uint32_t _ALIGN(64) vhash[8];
|
uint32_t _ALIGN(64) vhash[8];
|
||||||
if (!use_compat_kernels[thr_id]) work->nonces[0] += startNonce;
|
if (!use_compat_kernels[thr_id]) work->nonces[0] += startNonce;
|
||||||
be32enc(&endiandata[19], work->nonces[0]);
|
be32enc(&endiandata[19], work->nonces[0]);
|
||||||
phihash(vhash, endiandata);
|
phi_hash(vhash, endiandata);
|
||||||
|
|
||||||
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
|
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
|
||||||
work->valid_nonces = 1;
|
work->valid_nonces = 1;
|
||||||
@ -173,7 +173,7 @@ extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, u
|
|||||||
if (work->nonces[1] != UINT32_MAX) {
|
if (work->nonces[1] != UINT32_MAX) {
|
||||||
work->nonces[1] += startNonce;
|
work->nonces[1] += startNonce;
|
||||||
be32enc(&endiandata[19], work->nonces[1]);
|
be32enc(&endiandata[19], work->nonces[1]);
|
||||||
phihash(vhash, endiandata);
|
phi_hash(vhash, endiandata);
|
||||||
bn_set_target_ratio(work, vhash, 1);
|
bn_set_target_ratio(work, vhash, 1);
|
||||||
work->valid_nonces++;
|
work->valid_nonces++;
|
||||||
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
|
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
|
255
phi/phi2.cu
Normal file
255
phi/phi2.cu
Normal file
@ -0,0 +1,255 @@
|
|||||||
|
//
|
||||||
|
// PHI2 algo
|
||||||
|
// CubeHash + Lyra2 x2 + JH + Gost or Echo + Skein
|
||||||
|
//
|
||||||
|
// Implemented by tpruvot in May 2018
|
||||||
|
//
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
#include "sph/sph_skein.h"
|
||||||
|
#include "sph/sph_jh.h"
|
||||||
|
#include "sph/sph_cubehash.h"
|
||||||
|
#include "sph/sph_streebog.h"
|
||||||
|
#include "sph/sph_echo.h"
|
||||||
|
#include "lyra2/Lyra2.h"
|
||||||
|
}
|
||||||
|
|
||||||
|
#include "miner.h"
|
||||||
|
#include "cuda_helper.h"
|
||||||
|
#include "x11/cuda_x11.h"
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <memory.h>
|
||||||
|
|
||||||
|
extern void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata);
|
||||||
|
extern void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash);
|
||||||
|
|
||||||
|
extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix);
|
||||||
|
extern void lyra2_cuda_hash_64(int thr_id, const uint32_t threads, uint64_t* d_hash_256, uint32_t* d_hash_512, bool gtx750ti);
|
||||||
|
|
||||||
|
extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
|
||||||
|
extern void phi_streebog_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t *g_hash, uint32_t *d_filter);
|
||||||
|
extern void phi_echo512_cpu_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t* g_hash, uint32_t* d_filter);
|
||||||
|
|
||||||
|
extern uint32_t phi_filter_cuda(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_br2, uint32_t* d_nonces);
|
||||||
|
extern void phi_merge_cuda(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_br2, uint32_t* d_nonces);
|
||||||
|
extern void phi_final_compress_cuda(const int thr_id, const uint32_t threads, uint32_t *d_hashes);
|
||||||
|
|
||||||
|
static uint64_t* d_matrix[MAX_GPUS];
|
||||||
|
static uint32_t* d_hash_512[MAX_GPUS];
|
||||||
|
static uint64_t* d_hash_256[MAX_GPUS];
|
||||||
|
static uint32_t* d_hash_br2[MAX_GPUS];
|
||||||
|
static uint32_t* d_nonce_br[MAX_GPUS];
|
||||||
|
|
||||||
|
extern "C" void phi2_hash(void *output, const void *input)
|
||||||
|
{
|
||||||
|
unsigned char _ALIGN(128) hash[128] = { 0 };
|
||||||
|
unsigned char _ALIGN(128) hashA[64] = { 0 };
|
||||||
|
unsigned char _ALIGN(128) hashB[64] = { 0 };
|
||||||
|
|
||||||
|
sph_cubehash512_context ctx_cubehash;
|
||||||
|
sph_jh512_context ctx_jh;
|
||||||
|
sph_gost512_context ctx_gost;
|
||||||
|
sph_echo512_context ctx_echo;
|
||||||
|
sph_skein512_context ctx_skein;
|
||||||
|
|
||||||
|
sph_cubehash512_init(&ctx_cubehash);
|
||||||
|
sph_cubehash512(&ctx_cubehash, input, 80);
|
||||||
|
sph_cubehash512_close(&ctx_cubehash, (void*)hashB);
|
||||||
|
|
||||||
|
LYRA2(&hashA[ 0], 32, &hashB[ 0], 32, &hashB[ 0], 32, 1, 8, 8);
|
||||||
|
LYRA2(&hashA[32], 32, &hashB[32], 32, &hashB[32], 32, 1, 8, 8);
|
||||||
|
|
||||||
|
sph_jh512_init(&ctx_jh);
|
||||||
|
sph_jh512(&ctx_jh, (const void*)hashA, 64);
|
||||||
|
sph_jh512_close(&ctx_jh, (void*)hash);
|
||||||
|
|
||||||
|
if (hash[0] & 1) {
|
||||||
|
sph_gost512_init(&ctx_gost);
|
||||||
|
sph_gost512(&ctx_gost, (const void*)hash, 64);
|
||||||
|
sph_gost512_close(&ctx_gost, (void*)hash);
|
||||||
|
} else {
|
||||||
|
sph_echo512_init(&ctx_echo);
|
||||||
|
sph_echo512(&ctx_echo, (const void*)hash, 64);
|
||||||
|
sph_echo512_close(&ctx_echo, (void*)hash);
|
||||||
|
|
||||||
|
sph_echo512_init(&ctx_echo);
|
||||||
|
sph_echo512(&ctx_echo, (const void*)hash, 64);
|
||||||
|
sph_echo512_close(&ctx_echo, (void*)hash);
|
||||||
|
}
|
||||||
|
|
||||||
|
sph_skein512_init(&ctx_skein);
|
||||||
|
sph_skein512(&ctx_skein, (const void*)hash, 64);
|
||||||
|
sph_skein512_close(&ctx_skein, (void*)hash);
|
||||||
|
|
||||||
|
for (int i=0; i<32; i++)
|
||||||
|
hash[i] ^= hash[i+32];
|
||||||
|
|
||||||
|
memcpy(output, hash, 32);
|
||||||
|
}
|
||||||
|
|
||||||
|
//#define _DEBUG
|
||||||
|
#define _DEBUG_PREFIX "phi-"
|
||||||
|
#include "cuda_debug.cuh"
|
||||||
|
|
||||||
|
static bool init[MAX_GPUS] = { 0 };
|
||||||
|
static bool use_compat_kernels[MAX_GPUS] = { 0 };
|
||||||
|
static __thread bool gtx750ti = false;
|
||||||
|
|
||||||
|
extern "C" int scanhash_phi2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
|
||||||
|
{
|
||||||
|
uint32_t *pdata = work->data;
|
||||||
|
uint32_t *ptarget = work->target;
|
||||||
|
|
||||||
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
const int dev_id = device_map[thr_id];
|
||||||
|
|
||||||
|
int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 17 : 16;
|
||||||
|
if (device_sm[dev_id] == 500) intensity = 15;
|
||||||
|
if (device_sm[dev_id] == 600) intensity = 17;
|
||||||
|
|
||||||
|
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
|
||||||
|
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
|
||||||
|
if (init[thr_id]) throughput = max(throughput & 0xffffff80, 128); // for shared mem
|
||||||
|
|
||||||
|
if (opt_benchmark)
|
||||||
|
ptarget[7] = 0xff;
|
||||||
|
|
||||||
|
if (!init[thr_id])
|
||||||
|
{
|
||||||
|
cudaSetDevice(dev_id);
|
||||||
|
if (opt_cudaschedule == -1 && gpu_threads == 1) {
|
||||||
|
cudaDeviceReset();
|
||||||
|
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
|
||||||
|
}
|
||||||
|
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
|
||||||
|
|
||||||
|
cuda_get_arch(thr_id);
|
||||||
|
use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
|
||||||
|
gtx750ti = (strstr(device_name[dev_id], "GTX 750 Ti") != NULL);
|
||||||
|
|
||||||
|
size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 16 : sizeof(uint64_t) * 8 * 8 * 3 * 4;
|
||||||
|
CUDA_CALL_OR_RET_X(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput), -1);
|
||||||
|
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_256[thr_id], (size_t)32 * throughput), -1);
|
||||||
|
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_512[thr_id], (size_t)64 * throughput), -1);
|
||||||
|
CUDA_CALL_OR_RET_X(cudaMalloc(&d_nonce_br[thr_id], sizeof(uint32_t) * throughput), -1);
|
||||||
|
if (use_compat_kernels[thr_id]) {
|
||||||
|
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_br2[thr_id], (size_t)64 * throughput), -1);
|
||||||
|
}
|
||||||
|
|
||||||
|
x11_cubehash512_cpu_init(thr_id, throughput);
|
||||||
|
lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]);
|
||||||
|
quark_jh512_cpu_init(thr_id, throughput);
|
||||||
|
quark_skein512_cpu_init(thr_id, throughput);
|
||||||
|
if (use_compat_kernels[thr_id]) x11_echo512_cpu_init(thr_id, throughput);
|
||||||
|
|
||||||
|
cuda_check_cpu_init(thr_id, throughput);
|
||||||
|
init[thr_id] = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_t endiandata[20];
|
||||||
|
for (int k = 0; k < 20; k++)
|
||||||
|
be32enc(&endiandata[k], pdata[k]);
|
||||||
|
|
||||||
|
cuda_check_cpu_setTarget(ptarget);
|
||||||
|
cubehash512_setBlock_80(thr_id, endiandata);
|
||||||
|
|
||||||
|
do {
|
||||||
|
int order = 0;
|
||||||
|
|
||||||
|
cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash_512[thr_id]); order++;
|
||||||
|
TRACE("cube ");
|
||||||
|
|
||||||
|
lyra2_cuda_hash_64(thr_id, throughput, d_hash_256[thr_id], d_hash_512[thr_id], gtx750ti);
|
||||||
|
order++;
|
||||||
|
TRACE("lyra ");
|
||||||
|
|
||||||
|
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_512[thr_id], order++);
|
||||||
|
TRACE("jh ");
|
||||||
|
|
||||||
|
order++;
|
||||||
|
if (!use_compat_kernels[thr_id]) {
|
||||||
|
phi_filter_cuda(thr_id, throughput, d_hash_512[thr_id], NULL, d_nonce_br[thr_id]);
|
||||||
|
phi_streebog_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]);
|
||||||
|
phi_echo512_cpu_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]);
|
||||||
|
phi_echo512_cpu_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]);
|
||||||
|
} else {
|
||||||
|
// todo: nonces vector to reduce amount of hashes to compute
|
||||||
|
phi_filter_cuda(thr_id, throughput, d_hash_512[thr_id], d_hash_br2[thr_id], d_nonce_br[thr_id]);
|
||||||
|
streebog_cpu_hash_64(thr_id, throughput, d_hash_512[thr_id]);
|
||||||
|
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order);
|
||||||
|
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order);
|
||||||
|
phi_merge_cuda(thr_id, throughput, d_hash_512[thr_id], d_hash_br2[thr_id], d_nonce_br[thr_id]);
|
||||||
|
}
|
||||||
|
TRACE("mix ");
|
||||||
|
|
||||||
|
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_512[thr_id], order++);
|
||||||
|
TRACE("skein ");
|
||||||
|
|
||||||
|
phi_final_compress_cuda(thr_id, throughput, d_hash_512[thr_id]);
|
||||||
|
TRACE("xor ");
|
||||||
|
|
||||||
|
work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash_512[thr_id]);
|
||||||
|
if (work->nonces[0] != UINT32_MAX)
|
||||||
|
{
|
||||||
|
const uint32_t Htarg = ptarget[7];
|
||||||
|
uint32_t _ALIGN(64) vhash[8];
|
||||||
|
be32enc(&endiandata[19], work->nonces[0]);
|
||||||
|
phi2_hash(vhash, endiandata);
|
||||||
|
|
||||||
|
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
|
||||||
|
work->valid_nonces = 1;
|
||||||
|
work_set_target_ratio(work, vhash);
|
||||||
|
*hashes_done = pdata[19] - first_nonce + throughput;
|
||||||
|
work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash_512[thr_id], 1);
|
||||||
|
if (work->nonces[1] != 0) {
|
||||||
|
be32enc(&endiandata[19], work->nonces[1]);
|
||||||
|
phi2_hash(vhash, endiandata);
|
||||||
|
bn_set_target_ratio(work, vhash, 1);
|
||||||
|
work->valid_nonces++;
|
||||||
|
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
|
||||||
|
} else {
|
||||||
|
pdata[19] = work->nonces[0] + 1; // cursor
|
||||||
|
}
|
||||||
|
if (pdata[19] > max_nonce) pdata[19] = max_nonce;
|
||||||
|
return work->valid_nonces;
|
||||||
|
}
|
||||||
|
else if (vhash[7] > Htarg) {
|
||||||
|
gpu_increment_reject(thr_id);
|
||||||
|
if (!opt_quiet)
|
||||||
|
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! thr=%x", work->nonces[0], throughput);
|
||||||
|
pdata[19] = work->nonces[0] + 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((uint64_t)throughput + pdata[19] >= max_nonce) {
|
||||||
|
pdata[19] = max_nonce;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
pdata[19] += throughput;
|
||||||
|
|
||||||
|
} while (!work_restart[thr_id].restart);
|
||||||
|
|
||||||
|
*hashes_done = pdata[19] - first_nonce;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// cleanup
|
||||||
|
extern "C" void free_phi2(int thr_id)
|
||||||
|
{
|
||||||
|
if (!init[thr_id])
|
||||||
|
return;
|
||||||
|
|
||||||
|
cudaThreadSynchronize();
|
||||||
|
cudaFree(d_matrix[thr_id]);
|
||||||
|
cudaFree(d_hash_512[thr_id]);
|
||||||
|
cudaFree(d_hash_256[thr_id]);
|
||||||
|
cudaFree(d_nonce_br[thr_id]);
|
||||||
|
if (use_compat_kernels[thr_id]) cudaFree(d_hash_br2[thr_id]);
|
||||||
|
|
||||||
|
cuda_check_cpu_free(thr_id);
|
||||||
|
init[thr_id] = false;
|
||||||
|
|
||||||
|
cudaDeviceSynchronize();
|
||||||
|
}
|
2
util.cpp
2
util.cpp
@ -2250,7 +2250,7 @@ void print_hash_tests(void)
|
|||||||
pentablakehash(&hash[0], &buf[0]);
|
pentablakehash(&hash[0], &buf[0]);
|
||||||
printpfx("pentablake", hash);
|
printpfx("pentablake", hash);
|
||||||
|
|
||||||
phihash(&hash[0], &buf[0]);
|
phi2_hash(&hash[0], &buf[0]);
|
||||||
printpfx("phi", hash);
|
printpfx("phi", hash);
|
||||||
|
|
||||||
polytimos_hash(&hash[0], &buf[0]);
|
polytimos_hash(&hash[0], &buf[0]);
|
||||||
|
@ -207,7 +207,7 @@ __launch_bounds__(TPB, 3)
|
|||||||
#else
|
#else
|
||||||
__launch_bounds__(TPB, 3)
|
__launch_bounds__(TPB, 3)
|
||||||
#endif
|
#endif
|
||||||
void streebog_gpu_hash_64_maxwell(uint64_t *g_hash)
|
void streebog_gpu_hash_64_sm5(uint64_t *g_hash, uint32_t* const d_filter, const uint32_t filter_val)
|
||||||
{
|
{
|
||||||
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
uint2 buf[8], t[8], temp[8], K0[8], hash[8];
|
uint2 buf[8], t[8], temp[8], K0[8], hash[8];
|
||||||
@ -222,13 +222,16 @@ void streebog_gpu_hash_64_maxwell(uint64_t *g_hash)
|
|||||||
shared[6][threadIdx.x] = __ldg(&T62[threadIdx.x]);
|
shared[6][threadIdx.x] = __ldg(&T62[threadIdx.x]);
|
||||||
shared[7][threadIdx.x] = __ldg(&T72[threadIdx.x]);
|
shared[7][threadIdx.x] = __ldg(&T72[threadIdx.x]);
|
||||||
|
|
||||||
|
//__threadfence_block();
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
if (d_filter && d_filter[thread] != filter_val) return;
|
||||||
|
|
||||||
uint64_t* inout = &g_hash[thread<<3];
|
uint64_t* inout = &g_hash[thread<<3];
|
||||||
|
|
||||||
*(uint2x4*)&hash[0] = __ldg4((uint2x4*)&inout[0]);
|
*(uint2x4*)&hash[0] = __ldg4((uint2x4*)&inout[0]);
|
||||||
*(uint2x4*)&hash[4] = __ldg4((uint2x4*)&inout[4]);
|
*(uint2x4*)&hash[4] = __ldg4((uint2x4*)&inout[4]);
|
||||||
|
|
||||||
__threadfence_block();
|
|
||||||
|
|
||||||
K0[0] = vectorize(0x74a5d4ce2efc83b3);
|
K0[0] = vectorize(0x74a5d4ce2efc83b3);
|
||||||
|
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
@ -301,9 +304,17 @@ void streebog_gpu_hash_64_maxwell(uint64_t *g_hash)
|
|||||||
}
|
}
|
||||||
|
|
||||||
__host__
|
__host__
|
||||||
void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *d_hash)
|
void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *g_hash)
|
||||||
{
|
{
|
||||||
dim3 grid((threads + TPB-1) / TPB);
|
dim3 grid((threads + TPB-1) / TPB);
|
||||||
dim3 block(TPB);
|
dim3 block(TPB);
|
||||||
streebog_gpu_hash_64_maxwell <<<grid, block>>> ((uint64_t*)d_hash);
|
streebog_gpu_hash_64_sm5 <<<grid, block>>> ((uint64_t*)g_hash, NULL, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
__host__
|
||||||
|
void phi_streebog_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t *g_hash, uint32_t *d_filter)
|
||||||
|
{
|
||||||
|
dim3 grid((threads + TPB-1) / TPB);
|
||||||
|
dim3 block(TPB);
|
||||||
|
streebog_gpu_hash_64_sm5 <<<grid, block>>> ((uint64_t*)g_hash, d_filter, 1);
|
||||||
}
|
}
|
||||||
|
@ -79,11 +79,12 @@ static void echo_round_alexis(const uint32_t sharedMemory[4][256], uint32_t *W,
|
|||||||
}
|
}
|
||||||
|
|
||||||
__global__ __launch_bounds__(128, 5) /* will force 80 registers */
|
__global__ __launch_bounds__(128, 5) /* will force 80 registers */
|
||||||
static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
|
static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t* g_hash, uint32_t* const d_filter, const uint32_t filter_val)
|
||||||
{
|
{
|
||||||
__shared__ uint32_t sharedMemory[4][256];
|
__shared__ uint32_t sharedMemory[4][256];
|
||||||
|
|
||||||
aes_gpu_init128(sharedMemory);
|
aes_gpu_init128(sharedMemory);
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
uint32_t k0;
|
uint32_t k0;
|
||||||
@ -91,6 +92,9 @@ static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
|
|||||||
uint32_t hash[16];
|
uint32_t hash[16];
|
||||||
if (thread < threads)
|
if (thread < threads)
|
||||||
{
|
{
|
||||||
|
// phi2 filter (2 hash chain branches)
|
||||||
|
if (d_filter && d_filter[thread] != filter_val) return;
|
||||||
|
|
||||||
uint32_t *Hash = &g_hash[thread<<4];
|
uint32_t *Hash = &g_hash[thread<<4];
|
||||||
|
|
||||||
*(uint2x4*)&h[ 0] = __ldg4((uint2x4*)&Hash[ 0]);
|
*(uint2x4*)&h[ 0] = __ldg4((uint2x4*)&Hash[ 0]);
|
||||||
@ -99,8 +103,6 @@ static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
|
|||||||
*(uint2x4*)&hash[ 0] = *(uint2x4*)&h[ 0];
|
*(uint2x4*)&hash[ 0] = *(uint2x4*)&h[ 0];
|
||||||
*(uint2x4*)&hash[ 8] = *(uint2x4*)&h[ 8];
|
*(uint2x4*)&hash[ 8] = *(uint2x4*)&h[ 8];
|
||||||
|
|
||||||
__syncthreads();
|
|
||||||
|
|
||||||
const uint32_t P[48] = {
|
const uint32_t P[48] = {
|
||||||
0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
|
0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
|
||||||
//8-12
|
//8-12
|
||||||
@ -217,7 +219,6 @@ static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
|
|||||||
W[48 + i + 4] = a ^ cd ^ bcx;
|
W[48 + i + 4] = a ^ cd ^ bcx;
|
||||||
W[48 + i + 8] = d ^ ab ^ cdx;
|
W[48 + i + 8] = d ^ ab ^ cdx;
|
||||||
W[48 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx;
|
W[48 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int k = 1; k < 10; k++)
|
for (int k = 1; k < 10; k++)
|
||||||
@ -237,12 +238,23 @@ static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
|
|||||||
}
|
}
|
||||||
|
|
||||||
__host__
|
__host__
|
||||||
void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash){
|
void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash)
|
||||||
|
{
|
||||||
const uint32_t threadsperblock = 128;
|
const uint32_t threadsperblock = 128;
|
||||||
|
|
||||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
dim3 block(threadsperblock);
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
x16_echo512_gpu_hash_64<<<grid, block>>>(threads, d_hash);
|
x16_echo512_gpu_hash_64 <<<grid, block>>> (threads, d_hash, NULL, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__host__
|
||||||
|
void phi_echo512_cpu_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t* g_hash, uint32_t* d_filter)
|
||||||
|
{
|
||||||
|
const uint32_t threadsperblock = 128;
|
||||||
|
|
||||||
|
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
|
||||||
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
|
x16_echo512_gpu_hash_64 <<<grid, block>>> (threads, g_hash, d_filter, 0);
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user