
phi2 algo

pull/5/head
Tanguy Pruvot 7 years ago
parent
commit
3d03a1b9fd
  1. Makefile.am (2)
  2. algos.h (2)
  3. ccminer.cpp (8)
  4. ccminer.vcxproj (5)
  5. ccminer.vcxproj.filters (20)
  6. lyra2/cuda_lyra2.cu (122)
  7. lyra2/cuda_lyra2_sm2.cuh (65)
  8. lyra2/cuda_lyra2_sm5.cuh (64)
  9. miner.h (5)
  10. phi/cuda_phi2.cu (89)
  11. phi/phi.cu (8)
  12. phi/phi2.cu (255)
  13. util.cpp (2)
  14. x11/cuda_streebog_maxwell.cu (21)
  15. x16/cuda_x16_echo512_64.cu (26)

Makefile.am (2)

@@ -81,7 +81,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
 	x16/cuda_x16_shabal512.cu x16/cuda_x16_simd512_80.cu \
 	x16/cuda_x16_echo512_64.cu \
 	x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \
-	x11/phi.cu x11/cuda_streebog_maxwell.cu \
+	phi/phi.cu phi/phi2.cu phi/cuda_phi2.cu x11/cuda_streebog_maxwell.cu \
 	x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu
 # scrypt

algos.h (2)

@@ -39,6 +39,7 @@ enum sha_algos {
 	ALGO_NIST5,
 	ALGO_PENTABLAKE,
 	ALGO_PHI,
+	ALGO_PHI2,
 	ALGO_POLYTIMOS,
 	ALGO_QUARK,
 	ALGO_QUBIT,
@@ -112,6 +113,7 @@ static const char *algo_names[] = {
 	"nist5",
 	"penta",
 	"phi",
+	"phi2",
 	"polytimos",
 	"quark",
 	"qubit",

ccminer.cpp (8)

@@ -269,7 +269,8 @@ Options:\n\
 			neoscrypt   FeatherCoin, Phoenix, UFO...\n\
 			nist5       NIST5 (TalkCoin)\n\
 			penta       Pentablake hash (5x Blake 512)\n\
-			phi         BHCoin\n\
+			phi         LUX initial algo\n\
+			phi2        LUX v2 with lyra2\n\
 			polytimos   Politimos\n\
 			quark       Quark\n\
 			qubit       Qubit\n\
@@ -1708,6 +1709,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 	case ALGO_LBRY:
 	case ALGO_LYRA2v2:
 	case ALGO_LYRA2Z:
+	case ALGO_PHI2:
 	case ALGO_TIMETRAVEL:
 	case ALGO_BITCORE:
 	case ALGO_X16R:
@@ -2245,6 +2247,7 @@ static void *miner_thread(void *userdata)
 	case ALGO_HSR:
 	case ALGO_LYRA2v2:
 	case ALGO_PHI:
+	case ALGO_PHI2:
 	case ALGO_POLYTIMOS:
 	case ALGO_S3:
 	case ALGO_SKUNK:
@@ -2436,6 +2439,9 @@ static void *miner_thread(void *userdata)
 	case ALGO_PHI:
 		rc = scanhash_phi(thr_id, &work, max_nonce, &hashes_done);
 		break;
+	case ALGO_PHI2:
+		rc = scanhash_phi2(thr_id, &work, max_nonce, &hashes_done);
+		break;
 	case ALGO_POLYTIMOS:
 		rc = scanhash_polytimos(thr_id, &work, max_nonce, &hashes_done);
 		break;

ccminer.vcxproj (5)

@@ -525,6 +525,7 @@
     <CudaCompile Include="lyra2\lyra2REv2.cu" />
     <CudaCompile Include="lyra2\cuda_lyra2v2.cu" />
     <ClInclude Include="lyra2\cuda_lyra2_sm2.cuh" />
+    <ClInclude Include="lyra2\cuda_lyra2_sm5.cuh" />
     <ClInclude Include="lyra2\cuda_lyra2v2_sm3.cuh" />
     <CudaCompile Include="lyra2\lyra2Z.cu" />
     <CudaCompile Include="lyra2\cuda_lyra2Z.cu" />
@@ -537,6 +538,9 @@
     <CudaCompile Include="cuda_skeincoin.cu">
       <MaxRegCount>48</MaxRegCount>
     </CudaCompile>
+    <CudaCompile Include="phi\phi.cu" />
+    <CudaCompile Include="phi\phi2.cu" />
+    <CudaCompile Include="phi\cuda_phi2.cu" />
     <CudaCompile Include="skunk\skunk.cu" />
     <CudaCompile Include="skunk\cuda_skunk.cu">
       <CodeGeneration>compute_50,sm_50;compute_52,sm_52</CodeGeneration>
@@ -567,7 +571,6 @@
     <CudaCompile Include="x11\cuda_streebog_maxwell.cu" />
     <CudaCompile Include="x11\c11.cu" />
     <CudaCompile Include="x11\fresh.cu" />
-    <CudaCompile Include="x11\phi.cu" />
     <CudaCompile Include="x11\sib.cu" />
     <CudaCompile Include="x11\s3.cu" />
     <CudaCompile Include="x11\timetravel.cu" />

ccminer.vcxproj.filters (20)

@@ -115,7 +115,10 @@
     <Filter Include="Source Files\CUDA\tribus">
       <UniqueIdentifier>{1e548d79-c217-4203-989a-a592fe2b2de3}</UniqueIdentifier>
     </Filter>
+    <Filter Include="Source Files\CUDA\phi">
+      <UniqueIdentifier>{311e8d79-1612-4f0f-8591-23a592f2b2d3}</UniqueIdentifier>
+    </Filter>
     <Filter Include="Source Files\CUDA\x12">
       <UniqueIdentifier>{xde48d89-fx12-1323-129a-b592fe2b2de3}</UniqueIdentifier>
     </Filter>
   </ItemGroup>
@@ -545,6 +548,9 @@
     <ClInclude Include="lyra2\cuda_lyra2_sm2.cuh">
       <Filter>Source Files\CUDA\lyra2</Filter>
     </ClInclude>
+    <ClInclude Include="lyra2\cuda_lyra2_sm5.cuh">
+      <Filter>Source Files\CUDA\lyra2</Filter>
+    </ClInclude>
     <ClInclude Include="lyra2\cuda_lyra2Z_sm5.cuh">
       <Filter>Source Files\CUDA\lyra2</Filter>
     </ClInclude>
@@ -781,6 +787,15 @@
     <CudaCompile Include="polytimos.cu">
       <Filter>Source Files\CUDA</Filter>
     </CudaCompile>
+    <CudaCompile Include="phi\phi.cu">
+      <Filter>Source Files\CUDA\phi</Filter>
+    </CudaCompile>
+    <CudaCompile Include="phi\phi2.cu">
+      <Filter>Source Files\CUDA\phi</Filter>
+    </CudaCompile>
+    <CudaCompile Include="phi\cuda_phi2.cu">
+      <Filter>Source Files\CUDA\phi</Filter>
+    </CudaCompile>
     <CudaCompile Include="skunk\skunk.cu">
       <Filter>Source Files\CUDA\skunk</Filter>
     </CudaCompile>
@@ -799,9 +814,6 @@
     <ClInclude Include="tribus\cuda_echo512_aes.cuh">
       <Filter>Source Files\CUDA\tribus</Filter>
     </ClInclude>
-    <CudaCompile Include="x11\phi.cu">
-      <Filter>Source Files\CUDA\x11</Filter>
-    </CudaCompile>
     <CudaCompile Include="x11\sib.cu">
       <Filter>Source Files\CUDA\x11</Filter>
     </CudaCompile>

lyra2/cuda_lyra2.cu (122)

@@ -1,6 +1,7 @@
 /**
  * Lyra2 (v1) cuda implementation based on djm34 work
  * tpruvot@github 2015, Nanashi 08/2016 (from 1.8-r2)
+ * tpruvot@github 2018 for phi2 double lyra2-32 support
  */

 #include <stdio.h>
@@ -228,9 +229,7 @@ void reduceDuplex(uint2 state[4], uint32_t thread, const uint32_t threads)
 {
 	uint2 state1[3];

-#if __CUDA_ARCH__ > 500
 	#pragma unroll
-#endif
 	for (int i = 0; i < Nrow; i++)
 	{
 		ST4S(0, Ncol - i - 1, state, thread, threads);
@@ -305,7 +304,7 @@ void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uin
 		LD4S(state1, rowIn, i, thread, threads);
 		LD4S(state2, rowInOut, i, thread, threads);

 		#pragma unroll
 		for (int j = 0; j < 3; j++)
 			state[j] ^= state1[j] + state2[j];
@@ -334,7 +333,7 @@ void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uin
 		LD4S(state1, rowOut, i, thread, threads);

 		#pragma unroll
 		for (int j = 0; j < 3; j++)
 			state1[j] ^= state[j];
@@ -412,11 +411,9 @@ __global__ __launch_bounds__(64, 1)
 void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash)
 {
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);

 	if (thread < threads)
 	{
 		uint2x4 state[4];
 		state[0].x = state[1].x = __ldg(&g_hash[thread + threads * 0]);
 		state[0].y = state[1].y = __ldg(&g_hash[thread + threads * 1]);
 		state[0].z = state[1].z = __ldg(&g_hash[thread + threads * 2]);
@@ -436,10 +433,9 @@ void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash)
 __global__
 __launch_bounds__(TPB52, 1)
-void lyra2_gpu_hash_32_2(uint32_t threads, uint64_t *g_hash)
+void lyra2_gpu_hash_32_2(const uint32_t threads, uint64_t *g_hash)
 {
 	const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y;

 	if (thread < threads)
 	{
 		uint2 state[4];
@@ -484,11 +480,9 @@ __global__ __launch_bounds__(64, 1)
 void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash)
 {
 	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
-	uint28 state[4];

 	if (thread < threads)
 	{
+		uint2x4 state[4];
 		state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]);
 		state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]);
 		state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]);
@@ -501,7 +495,57 @@ void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash)
 		g_hash[thread + threads * 1] = state[0].y;
 		g_hash[thread + threads * 2] = state[0].z;
 		g_hash[thread + threads * 3] = state[0].w;
 	}
 }
+
+__global__ __launch_bounds__(64, 1)
+void lyra2_gpu_hash_64_1(uint32_t threads, uint2* const d_hash_512, const uint32_t round)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint2x4 state[4];
+		const size_t offset = (size_t)8 * thread + (round * 4U);
+		uint2 *psrc = (uint2*)(&d_hash_512[offset]);
+		state[0].x = state[1].x = __ldg(&psrc[0]);
+		state[0].y = state[1].y = __ldg(&psrc[1]);
+		state[0].z = state[1].z = __ldg(&psrc[2]);
+		state[0].w = state[1].w = __ldg(&psrc[3]);
+		state[2] = blake2b_IV[0];
+		state[3] = blake2b_IV[1];
+
+		for (int i = 0; i<24; i++)
+			round_lyra(state);
+
+		((uint2x4*)DMatrix)[threads * 0 + thread] = state[0];
+		((uint2x4*)DMatrix)[threads * 1 + thread] = state[1];
+		((uint2x4*)DMatrix)[threads * 2 + thread] = state[2];
+		((uint2x4*)DMatrix)[threads * 3 + thread] = state[3];
+	}
+}
+
+__global__ __launch_bounds__(64, 1)
+void lyra2_gpu_hash_64_3(uint32_t threads, uint2 *d_hash_512, const uint32_t round)
+{
+	// This kernel outputs 2x 256-bits hashes in 512-bits chain offsets in 2 rounds
+	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
+	if (thread < threads)
+	{
+		uint2x4 state[4];
+		state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]);
+		state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]);
+		state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]);
+		state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]);
+
+		for (int i = 0; i < 12; i++)
+			round_lyra(state);
+
+		const size_t offset = (size_t)8 * thread + (round * 4U);
+		uint2 *pdst = (uint2*)(&d_hash_512[offset]);
+		pdst[0] = state[0].x;
+		pdst[1] = state[0].y;
+		pdst[2] = state[0].z;
+		pdst[3] = state[0].w;
+	}
+}

 #else
@@ -513,6 +557,8 @@ __device__ void* DMatrix;
 __global__ void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash) {}
 __global__ void lyra2_gpu_hash_32_2(uint32_t threads, uint64_t *g_hash) {}
 __global__ void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash) {}
+__global__ void lyra2_gpu_hash_64_1(uint32_t threads, uint2* const d_hash_512, const uint32_t round) {}
+__global__ void lyra2_gpu_hash_64_3(uint32_t threads, uint2 *d_hash_512, const uint32_t round) {}
 #endif

 __host__
@@ -545,9 +591,7 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_hash, bool gtx7
 	if (cuda_arch[dev_id] >= 520)
 	{
 		lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash);
 		lyra2_gpu_hash_32_2 <<< grid1, block1, 24 * (8 - 0) * sizeof(uint2) * tpb >>> (threads, d_hash);
 		lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash);
 	}
 	else if (cuda_arch[dev_id] >= 500)
@@ -562,11 +606,57 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_hash, bool gtx7
 			shared_mem = 6144;

 		lyra2_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash);
 		lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash);
 		lyra2_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash);
 	}
 	else
 		lyra2_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, d_hash);
 }
+
+__host__
+void lyra2_cuda_hash_64(int thr_id, const uint32_t threads, uint64_t* d_hash_256, uint32_t* d_hash_512, bool gtx750ti)
+{
+	int dev_id = device_map[thr_id % MAX_GPUS];
+	uint32_t tpb = TPB52;
+
+	if (cuda_arch[dev_id] >= 520) tpb = TPB52;
+	else if (cuda_arch[dev_id] >= 500) tpb = TPB50;
+	else if (cuda_arch[dev_id] >= 200) tpb = TPB20;
+
+	dim3 grid1((size_t(threads) * 4 + tpb - 1) / tpb);
+	dim3 block1(4, tpb >> 2);
+	dim3 grid2((threads + 64 - 1) / 64);
+	dim3 block2(64);
+
+	if (cuda_arch[dev_id] >= 520)
+	{
+		const size_t shared_mem = sizeof(uint2) * tpb * 192; // 49152;
+		lyra2_gpu_hash_64_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0);
+		lyra2_gpu_hash_32_2 <<< grid1, block1, shared_mem >>> (threads, d_hash_256);
+		lyra2_gpu_hash_64_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0);
+
+		lyra2_gpu_hash_64_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1);
+		lyra2_gpu_hash_32_2 <<< grid1, block1, shared_mem >>> (threads, d_hash_256);
+		lyra2_gpu_hash_64_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1);
+	}
+	else if (cuda_arch[dev_id] >= 500)
+	{
+		size_t shared_mem = gtx750ti ? 8192 : 6144; // 8 or 10 warps
+		lyra2_gpu_hash_64_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0);
+		lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash_256);
+		lyra2_gpu_hash_64_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0);
+
+		lyra2_gpu_hash_64_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1);
+		lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash_256);
+		lyra2_gpu_hash_64_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1);
+	}
+	else {
+		// alternative method for SM 3.x
+		hash64_to_lyra32(thr_id, threads, d_hash_512, d_hash_256, 0);
+		lyra2_cpu_hash_32(thr_id, threads, d_hash_256, gtx750ti);
+		hash64_from_lyra32(thr_id, threads, d_hash_512, d_hash_256, 0);
+
+		hash64_to_lyra32(thr_id, threads, d_hash_512, d_hash_256, 1);
+		lyra2_cpu_hash_32(thr_id, threads, d_hash_256, gtx750ti);
+		hash64_from_lyra32(thr_id, threads, d_hash_512, d_hash_256, 1);
+	}
+}
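Note on the addressing used by lyra2_gpu_hash_64_1 / lyra2_gpu_hash_64_3 above: each 512-bit chain hash occupies eight consecutive uint2 values (64 bytes) per thread in d_hash_512, and the round argument offsets by four uint2 to select the lower (round 0) or upper (round 1) 256-bit half. A small standalone sketch of that byte arithmetic follows; it is an illustration only, and the launch size and printout are assumptions, not part of the commit.

/* illustration only: byte ranges touched by lyra2_gpu_hash_64_1/_3 per (thread, round) */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint32_t threads = 4; /* hypothetical launch size */
	for (uint32_t thread = 0; thread < threads; thread++) {
		for (uint32_t round = 0; round < 2; round++) {
			/* offset counted in uint2 (8-byte) units: 8 per 512-bit hash, +4 for the upper half */
			size_t offset = (size_t)8 * thread + round * 4U;
			printf("thread %u, round %u -> bytes [%zu..%zu) of d_hash_512\n",
				thread, round, offset * 8, offset * 8 + 32);
		}
	}
	return 0;
}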

lyra2/cuda_lyra2_sm2.cuh (65)

@@ -3,7 +3,7 @@
 #ifdef __INTELLISENSE__
 /* just for vstudio code colors, only uncomment that temporary, dont commit it */
 //#undef __CUDA_ARCH__
-//#define __CUDA_ARCH__ 500
+//#define __CUDA_ARCH__ 300
 #endif

 #include "cuda_helper.h"
@@ -226,3 +226,66 @@ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint64_t *g_hash)
 /* if __CUDA_ARCH__ < 200 .. host */
 __global__ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint64_t *g_hash) {}
 #endif
+
+// -------------------------------------------------------------------------------------------------------------------------
+// lyra2 cant be used as-is in 512-bits hash chains, tx to djm for these weird offsets since first lyra2 algo...
+
+#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ <= 350
+
+__global__ __launch_bounds__(128, 8)
+void hash64_to_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		const size_t offset = (size_t) 16 * thread + (round * 8U);
+		uint2 *psrc = (uint2*) (&d_hash64[offset]);
+		uint2 *pdst = (uint2*) (&d_hash_lyra[thread]);
+		pdst[threads*0] = __ldg(&psrc[0]);
+		pdst[threads*1] = __ldg(&psrc[1]);
+		pdst[threads*2] = __ldg(&psrc[2]);
+		pdst[threads*3] = __ldg(&psrc[3]);
+	}
+}
+
+__global__ __launch_bounds__(128, 8)
+void hash64_from_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		const size_t offset = (size_t) 16 * thread + (round * 8U);
+		uint2 *psrc = (uint2*) (&d_hash_lyra[thread]);
+		uint2 *pdst = (uint2*) (&d_hash64[offset]);
+		pdst[0] = psrc[0];
+		pdst[1] = psrc[threads*1];
+		pdst[2] = psrc[threads*2];
+		pdst[3] = psrc[threads*3];
+	}
+}
+
+#else
+/* if __CUDA_ARCH__ < 200 .. host */
+__global__ void hash64_to_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) {}
+__global__ void hash64_from_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) {}
+#endif
+
+__host__
+void hash64_to_lyra32(int thr_id, const uint32_t threads, uint32_t* d_hash64, uint64_t* d_hash_lyra, const uint32_t round)
+{
+	const uint32_t threadsperblock = 128;
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+	hash64_to_lyra32_gpu <<<grid, block>>> (threads, d_hash64, (uint2*) d_hash_lyra, round);
+}
+
+__host__
+void hash64_from_lyra32(int thr_id, const uint32_t threads, uint32_t* d_hash64, uint64_t* d_hash_lyra, const uint32_t round)
+{
+	const uint32_t threadsperblock = 128;
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+	hash64_from_lyra32_gpu <<<grid, block>>> (threads, d_hash64, (uint2*) d_hash_lyra, round);
+}

lyra2/cuda_lyra2_sm5.cuh (64)

@@ -591,13 +591,12 @@ void reduceDuplexRowV50_8(const int rowInOut, uint2 state[4], const uint32_t thr
 __global__ __launch_bounds__(64, 1)
 void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint2 *g_hash)
 {
-	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	const uint2x4 blake2b_IV[2] = {
 		{ { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } },
 		{ { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } }
 	};
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint2x4 state[4];
@@ -629,7 +628,6 @@ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint2 *g_hash)
 	if (thread < threads)
 	{
 		uint2 state[4];
 		state[0] = __ldg(&DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x]);
 		state[1] = __ldg(&DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x]);
 		state[2] = __ldg(&DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x]);
@@ -669,7 +667,6 @@ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash)
 	if (thread < threads)
 	{
 		uint2x4 state[4];
 		state[0] = __ldg4(&((uint2x4*)DMatrix)[0 * threads + thread]);
 		state[1] = __ldg4(&((uint2x4*)DMatrix)[1 * threads + thread]);
 		state[2] = __ldg4(&((uint2x4*)DMatrix)[2 * threads + thread]);
@@ -685,9 +682,68 @@
 	}
 }

+__global__ __launch_bounds__(64, 1)
+void lyra2_gpu_hash_64_1_sm5(uint32_t threads, uint2* const d_hash_512, const uint32_t round)
+{
+	const uint2x4 blake2b_IV[2] = {
+		{ { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } },
+		{ { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } }
+	};
+
+	// This kernel loads 2x 256-bits hashes from 512-bits chain offsets in 2 steps
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint2x4 state[4];
+		const size_t offset = (size_t)8 * thread + (round * 4U);
+		uint2 *psrc = (uint2*)(&d_hash_512[offset]);
+		state[0].x = state[1].x = __ldg(&psrc[0]);
+		state[0].y = state[1].y = __ldg(&psrc[1]);
+		state[0].z = state[1].z = __ldg(&psrc[2]);
+		state[0].w = state[1].w = __ldg(&psrc[3]);
+		state[1] = state[0];
+		state[2] = blake2b_IV[0];
+		state[3] = blake2b_IV[1];
+
+		for (int i = 0; i<24; i++)
+			round_lyra(state);
+
+		((uint2x4*)DMatrix)[threads * 0 + thread] = state[0];
+		((uint2x4*)DMatrix)[threads * 1 + thread] = state[1];
+		((uint2x4*)DMatrix)[threads * 2 + thread] = state[2];
+		((uint2x4*)DMatrix)[threads * 3 + thread] = state[3];
+	}
+}
+
+__global__ __launch_bounds__(64, 1)
+void lyra2_gpu_hash_64_3_sm5(uint32_t threads, uint2 *d_hash_512, const uint32_t round)
+{
+	// This kernel outputs 2x 256-bits hashes in 512-bits chain offsets in 2 steps
+	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
+	if (thread < threads)
+	{
+		uint2x4 state[4];
+		state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]);
+		state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]);
+		state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]);
+		state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]);
+
+		for (int i = 0; i < 12; i++)
+			round_lyra(state);
+
+		const size_t offset = (size_t)8 * thread + (round * 4U);
+		uint2 *pdst = (uint2*)(&d_hash_512[offset]);
+		pdst[0] = state[0].x;
+		pdst[1] = state[0].y;
+		pdst[2] = state[0].z;
+		pdst[3] = state[0].w;
+	}
+}
+
 #else
 /* if __CUDA_ARCH__ != 500 .. host */
 __global__ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint2 *g_hash) {}
 __global__ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint2 *g_hash) {}
 __global__ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash) {}
+__global__ void lyra2_gpu_hash_64_1_sm5(uint32_t threads, uint2* const d_hash_512, const uint32_t round) {}
+__global__ void lyra2_gpu_hash_64_3_sm5(uint32_t threads, uint2 *d_hash_512, const uint32_t round) {}
 #endif

miner.h (5)

@@ -303,6 +303,7 @@ extern int scanhash_neoscrypt(int thr_id, struct work *work, uint32_t max_nonce,
 extern int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
+extern int scanhash_phi2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_polytimos(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_quark(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@@ -371,6 +372,7 @@ extern void free_neoscrypt(int thr_id);
 extern void free_nist5(int thr_id);
 extern void free_pentablake(int thr_id);
 extern void free_phi(int thr_id);
+extern void free_phi2(int thr_id);
 extern void free_polytimos(int thr_id);
 extern void free_quark(int thr_id);
 extern void free_qubit(int thr_id);
@@ -918,7 +920,8 @@ void myriadhash(void *state, const void *input);
 void neoscrypt(uchar *output, const uchar *input, uint32_t profile);
 void nist5hash(void *state, const void *input);
 void pentablakehash(void *output, const void *input);
-void phihash(void *output, const void *input);
+void phi_hash(void *output, const void *input);
+void phi2_hash(void *output, const void *input);
 void polytimos_hash(void *output, const void *input);
 void quarkhash(void *state, const void *input);
 void qubithash(void *state, const void *input);

phi/cuda_phi2.cu (89)

@@ -0,0 +1,89 @@
#include <stdio.h>
#include <memory.h>

#include "cuda_helper.h"

__global__ __launch_bounds__(128, 8)
void phi_filter_gpu(const uint32_t threads, const uint32_t* d_hash, uint32_t* d_branch2, uint32_t* d_NonceBranch)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		const uint32_t offset = thread * 16U; // 64U / sizeof(uint32_t);
		uint4 *psrc = (uint4*) (&d_hash[offset]);
		d_NonceBranch[thread] = ((uint8_t*)psrc)[0] & 1;
		if (d_NonceBranch[thread]) return;
		if (d_branch2) {
			uint4 *pdst = (uint4*)(&d_branch2[offset]);
			uint4 data;
			data = psrc[0]; pdst[0] = data;
			data = psrc[1]; pdst[1] = data;
			data = psrc[2]; pdst[2] = data;
			data = psrc[3]; pdst[3] = data;
		}
	}
}

__global__ __launch_bounds__(128, 8)
void phi_merge_gpu(const uint32_t threads, uint32_t* d_hash, uint32_t* d_branch2, uint32_t* const d_NonceBranch)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads && !d_NonceBranch[thread])
	{
		const uint32_t offset = thread * 16U;
		uint4 *psrc = (uint4*) (&d_branch2[offset]);
		uint4 *pdst = (uint4*) (&d_hash[offset]);
		uint4 data;
		data = psrc[0]; pdst[0] = data;
		data = psrc[1]; pdst[1] = data;
		data = psrc[2]; pdst[2] = data;
		data = psrc[3]; pdst[3] = data;
	}
}

__global__
void phi_final_compress_gpu(const uint32_t threads, uint32_t* d_hash)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		const uint32_t offset = thread * 16U;
		uint2 *psrc = (uint2*) (&d_hash[offset]);
		uint2 *pdst = (uint2*) (&d_hash[offset]);
		uint2 data;
		data = psrc[4]; pdst[0] ^= data;
		data = psrc[5]; pdst[1] ^= data;
		data = psrc[6]; pdst[2] ^= data;
		data = psrc[7]; pdst[3] ^= data;
	}
}

__host__
uint32_t phi_filter_cuda(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_br2, uint32_t* d_nonces)
{
	const uint32_t threadsperblock = 128;
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);
	// extract algo permutation hashes to a second branch buffer
	phi_filter_gpu <<<grid, block>>> (threads, inpHashes, d_br2, d_nonces);
	return threads;
}

__host__
void phi_merge_cuda(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_br2, uint32_t* d_nonces)
{
	const uint32_t threadsperblock = 128;
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);
	// put back second branch hashes to the common buffer d_hash
	phi_merge_gpu <<<grid, block>>> (threads, outpHashes, d_br2, d_nonces);
}

__host__
void phi_final_compress_cuda(const int thr_id, const uint32_t threads, uint32_t *d_hashes)
{
	const uint32_t threadsperblock = 128;
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);
	phi_final_compress_gpu <<<grid, block>>> (threads, d_hashes);
}
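These helpers drive the two-branch step of phi2: phi_filter_gpu records hash byte 0 & 1 for every nonce (and, on the compat path, copies the even-branch hashes into d_branch2), the Streebog and Echo kernels then skip nonces whose flag does not match their filter value, and phi_merge_gpu copies the Echo results back into the main buffer. A plain-C sketch of what the filter records follows; it is an illustration only, with a made-up buffer size and sample values.

/* illustration only: the per-nonce branch flag recorded by phi_filter_gpu (hash byte 0 & 1) */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	enum { THREADS = 4, HASH_BYTES = 64 };
	uint8_t d_hash[THREADS * HASH_BYTES];   /* 64 bytes of chain state per nonce */
	uint32_t d_NonceBranch[THREADS];

	memset(d_hash, 0, sizeof(d_hash));
	d_hash[0 * HASH_BYTES] = 0x01;          /* sample values: nonces 0 and 2 take the odd branch */
	d_hash[2 * HASH_BYTES] = 0x03;

	for (uint32_t thread = 0; thread < THREADS; thread++) {
		d_NonceBranch[thread] = d_hash[thread * HASH_BYTES] & 1;
		/* 1 -> Streebog (GOST) branch, 0 -> double Echo-512 branch (see phi2_hash below) */
		printf("nonce slot %u -> branch %u\n", thread, d_NonceBranch[thread]);
	}
	return 0;
}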

x11/phi.cu → phi/phi.cu (8)

@@ -19,7 +19,7 @@ extern "C" {
 #include "miner.h"
 #include "cuda_helper.h"
-#include "cuda_x11.h"
+#include "x11/cuda_x11.h"

 extern void skein512_cpu_setBlock_80(void *pdata);
 extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int swap);
@@ -38,7 +38,7 @@ extern void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash,
 static uint32_t *d_hash[MAX_GPUS];
 static uint32_t *d_resNonce[MAX_GPUS];

-extern "C" void phihash(void *output, const void *input)
+extern "C" void phi_hash(void *output, const void *input)
 {
 	unsigned char _ALIGN(128) hash[128] = { 0 };
@@ -162,7 +162,7 @@ extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, u
 			uint32_t _ALIGN(64) vhash[8];
 			if (!use_compat_kernels[thr_id]) work->nonces[0] += startNonce;
 			be32enc(&endiandata[19], work->nonces[0]);
-			phihash(vhash, endiandata);
+			phi_hash(vhash, endiandata);

 			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
 				work->valid_nonces = 1;
@@ -173,7 +173,7 @@ extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, u
 			if (work->nonces[1] != UINT32_MAX) {
 				work->nonces[1] += startNonce;
 				be32enc(&endiandata[19], work->nonces[1]);
-				phihash(vhash, endiandata);
+				phi_hash(vhash, endiandata);
 				bn_set_target_ratio(work, vhash, 1);
 				work->valid_nonces++;
 				pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;

phi/phi2.cu (255)

@@ -0,0 +1,255 @@
//
// PHI2 algo
// CubeHash + Lyra2 x2 + JH + Gost or Echo + Skein
//
// Implemented by tpruvot in May 2018
//

extern "C" {
#include "sph/sph_skein.h"
#include "sph/sph_jh.h"
#include "sph/sph_cubehash.h"
#include "sph/sph_streebog.h"
#include "sph/sph_echo.h"

#include "lyra2/Lyra2.h"
}

#include "miner.h"
#include "cuda_helper.h"
#include "x11/cuda_x11.h"

#include <stdio.h>
#include <memory.h>

extern void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata);
extern void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash);

extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix);
extern void lyra2_cuda_hash_64(int thr_id, const uint32_t threads, uint64_t* d_hash_256, uint32_t* d_hash_512, bool gtx750ti);

extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
extern void phi_streebog_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t *g_hash, uint32_t *d_filter);
extern void phi_echo512_cpu_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t* g_hash, uint32_t* d_filter);

extern uint32_t phi_filter_cuda(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_br2, uint32_t* d_nonces);
extern void phi_merge_cuda(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_br2, uint32_t* d_nonces);
extern void phi_final_compress_cuda(const int thr_id, const uint32_t threads, uint32_t *d_hashes);

static uint64_t* d_matrix[MAX_GPUS];
static uint32_t* d_hash_512[MAX_GPUS];
static uint64_t* d_hash_256[MAX_GPUS];
static uint32_t* d_hash_br2[MAX_GPUS];
static uint32_t* d_nonce_br[MAX_GPUS];
extern "C" void phi2_hash(void *output, const void *input)
{
unsigned char _ALIGN(128) hash[128] = { 0 };
unsigned char _ALIGN(128) hashA[64] = { 0 };
unsigned char _ALIGN(128) hashB[64] = { 0 };
sph_cubehash512_context ctx_cubehash;
sph_jh512_context ctx_jh;
sph_gost512_context ctx_gost;
sph_echo512_context ctx_echo;
sph_skein512_context ctx_skein;
sph_cubehash512_init(&ctx_cubehash);
sph_cubehash512(&ctx_cubehash, input, 80);
sph_cubehash512_close(&ctx_cubehash, (void*)hashB);
LYRA2(&hashA[ 0], 32, &hashB[ 0], 32, &hashB[ 0], 32, 1, 8, 8);
LYRA2(&hashA[32], 32, &hashB[32], 32, &hashB[32], 32, 1, 8, 8);
sph_jh512_init(&ctx_jh);
sph_jh512(&ctx_jh, (const void*)hashA, 64);
sph_jh512_close(&ctx_jh, (void*)hash);
if (hash[0] & 1) {
sph_gost512_init(&ctx_gost);
sph_gost512(&ctx_gost, (const void*)hash, 64);
sph_gost512_close(&ctx_gost, (void*)hash);
} else {
sph_echo512_init(&ctx_echo);
sph_echo512(&ctx_echo, (const void*)hash, 64);
sph_echo512_close(&ctx_echo, (void*)hash);
sph_echo512_init(&ctx_echo);
sph_echo512(&ctx_echo, (const void*)hash, 64);
sph_echo512_close(&ctx_echo, (void*)hash);
}
sph_skein512_init(&ctx_skein);
sph_skein512(&ctx_skein, (const void*)hash, 64);
sph_skein512_close(&ctx_skein, (void*)hash);
for (int i=0; i<32; i++)
hash[i] ^= hash[i+32];
memcpy(output, hash, 32);
}
//#define _DEBUG
#define _DEBUG_PREFIX "phi-"
#include "cuda_debug.cuh"
static bool init[MAX_GPUS] = { 0 };
static bool use_compat_kernels[MAX_GPUS] = { 0 };
static __thread bool gtx750ti = false;

extern "C" int scanhash_phi2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
	uint32_t *pdata = work->data;
	uint32_t *ptarget = work->target;
	const uint32_t first_nonce = pdata[19];
	const int dev_id = device_map[thr_id];

	int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 17 : 16;
	if (device_sm[dev_id] == 500) intensity = 15;
	if (device_sm[dev_id] == 600) intensity = 17;

	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
	if (init[thr_id]) throughput = max(throughput & 0xffffff80, 128); // for shared mem

	if (opt_benchmark)
		ptarget[7] = 0xff;

	if (!init[thr_id])
	{
		cudaSetDevice(dev_id);
		if (opt_cudaschedule == -1 && gpu_threads == 1) {
			cudaDeviceReset();
			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
		}
		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);

		cuda_get_arch(thr_id);
		use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
		gtx750ti = (strstr(device_name[dev_id], "GTX 750 Ti") != NULL);

		size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 16 : sizeof(uint64_t) * 8 * 8 * 3 * 4;
		CUDA_CALL_OR_RET_X(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput), -1);
		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_256[thr_id], (size_t)32 * throughput), -1);
		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_512[thr_id], (size_t)64 * throughput), -1);
		CUDA_CALL_OR_RET_X(cudaMalloc(&d_nonce_br[thr_id], sizeof(uint32_t) * throughput), -1);
		if (use_compat_kernels[thr_id]) {
			CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_br2[thr_id], (size_t)64 * throughput), -1);
		}

		x11_cubehash512_cpu_init(thr_id, throughput);
		lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]);
		quark_jh512_cpu_init(thr_id, throughput);
		quark_skein512_cpu_init(thr_id, throughput);
		if (use_compat_kernels[thr_id]) x11_echo512_cpu_init(thr_id, throughput);

		cuda_check_cpu_init(thr_id, throughput);

		init[thr_id] = true;
	}

	uint32_t endiandata[20];
	for (int k = 0; k < 20; k++)
		be32enc(&endiandata[k], pdata[k]);

	cuda_check_cpu_setTarget(ptarget);
	cubehash512_setBlock_80(thr_id, endiandata);

	do {
		int order = 0;

		cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash_512[thr_id]); order++;
		TRACE("cube ");

		lyra2_cuda_hash_64(thr_id, throughput, d_hash_256[thr_id], d_hash_512[thr_id], gtx750ti);
		order++;
		TRACE("lyra ");

		quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_512[thr_id], order++);
		TRACE("jh ");
		order++;

		if (!use_compat_kernels[thr_id]) {
			phi_filter_cuda(thr_id, throughput, d_hash_512[thr_id], NULL, d_nonce_br[thr_id]);
			phi_streebog_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]);
			phi_echo512_cpu_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]);
			phi_echo512_cpu_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]);
		} else {
			// todo: nonces vector to reduce amount of hashes to compute
			phi_filter_cuda(thr_id, throughput, d_hash_512[thr_id], d_hash_br2[thr_id], d_nonce_br[thr_id]);
			streebog_cpu_hash_64(thr_id, throughput, d_hash_512[thr_id]);
			x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order);
			x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order);
			phi_merge_cuda(thr_id, throughput, d_hash_512[thr_id], d_hash_br2[thr_id], d_nonce_br[thr_id]);
		}
		TRACE("mix ");

		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_512[thr_id], order++);
		TRACE("skein ");

		phi_final_compress_cuda(thr_id, throughput, d_hash_512[thr_id]);
		TRACE("xor ");

		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash_512[thr_id]);
		if (work->nonces[0] != UINT32_MAX)
		{
			const uint32_t Htarg = ptarget[7];
			uint32_t _ALIGN(64) vhash[8];
			be32enc(&endiandata[19], work->nonces[0]);
			phi2_hash(vhash, endiandata);

			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
				work->valid_nonces = 1;
				work_set_target_ratio(work, vhash);
				*hashes_done = pdata[19] - first_nonce + throughput;
				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash_512[thr_id], 1);
				if (work->nonces[1] != 0) {
					be32enc(&endiandata[19], work->nonces[1]);
					phi2_hash(vhash, endiandata);
					bn_set_target_ratio(work, vhash, 1);
					work->valid_nonces++;
					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
				} else {
					pdata[19] = work->nonces[0] + 1; // cursor
				}
				if (pdata[19] > max_nonce) pdata[19] = max_nonce;
				return work->valid_nonces;
			}
			else if (vhash[7] > Htarg) {
				gpu_increment_reject(thr_id);
				if (!opt_quiet)
					gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! thr=%x", work->nonces[0], throughput);
				pdata[19] = work->nonces[0] + 1;
				continue;
			}
		}

		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
			pdata[19] = max_nonce;
			break;
		}

		pdata[19] += throughput;

	} while (!work_restart[thr_id].restart);

	*hashes_done = pdata[19] - first_nonce;

	return 0;
}

// cleanup
extern "C" void free_phi2(int thr_id)
{
	if (!init[thr_id])
		return;

	cudaThreadSynchronize();

	cudaFree(d_matrix[thr_id]);
	cudaFree(d_hash_512[thr_id]);
	cudaFree(d_hash_256[thr_id]);
	cudaFree(d_nonce_br[thr_id]);
	if (use_compat_kernels[thr_id]) cudaFree(d_hash_br2[thr_id]);

	cuda_check_cpu_free(thr_id);
	init[thr_id] = false;

	cudaDeviceSynchronize();
}
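The last step on both paths folds the 512-bit Skein output down to the final 256-bit hash: the loop in phi2_hash XORs bytes 32..63 into bytes 0..31, which is what phi_final_compress_gpu does on the GPU with uint2 words. A standalone sketch of that fold, for illustration only (the sample input is made up):

/* illustration only: the phi2 final fold, 64-byte skein output -> 32-byte result */
#include <stdint.h>
#include <stdio.h>

static void phi2_fold(uint8_t out[32], const uint8_t hash[64])
{
	for (int i = 0; i < 32; i++)
		out[i] = hash[i] ^ hash[i + 32];
}

int main(void)
{
	uint8_t h[64], out[32];
	for (int i = 0; i < 64; i++) h[i] = (uint8_t)i;   /* made-up input */
	phi2_fold(out, h);
	for (int i = 0; i < 32; i++) printf("%02x", out[i]);
	printf("\n");
	return 0;
}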

util.cpp (2)

@@ -2250,7 +2250,7 @@ void print_hash_tests(void)
 	pentablakehash(&hash[0], &buf[0]);
 	printpfx("pentablake", hash);

-	phihash(&hash[0], &buf[0]);
+	phi2_hash(&hash[0], &buf[0]);
 	printpfx("phi", hash);

 	polytimos_hash(&hash[0], &buf[0]);

x11/cuda_streebog_maxwell.cu (21)

@@ -207,7 +207,7 @@ __launch_bounds__(TPB, 3)
 #else
 __launch_bounds__(TPB, 3)
 #endif
-void streebog_gpu_hash_64_maxwell(uint64_t *g_hash)
+void streebog_gpu_hash_64_sm5(uint64_t *g_hash, uint32_t* const d_filter, const uint32_t filter_val)
 {
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	uint2 buf[8], t[8], temp[8], K0[8], hash[8];
@@ -222,13 +222,16 @@ void streebog_gpu_hash_64_maxwell(uint64_t *g_hash)
 	shared[6][threadIdx.x] = __ldg(&T62[threadIdx.x]);
 	shared[7][threadIdx.x] = __ldg(&T72[threadIdx.x]);

+	//__threadfence_block();
+	__syncthreads();
+
+	if (d_filter && d_filter[thread] != filter_val) return;
+
 	uint64_t* inout = &g_hash[thread<<3];

 	*(uint2x4*)&hash[0] = __ldg4((uint2x4*)&inout[0]);
 	*(uint2x4*)&hash[4] = __ldg4((uint2x4*)&inout[4]);
-
-	__threadfence_block();

 	K0[0] = vectorize(0x74a5d4ce2efc83b3);
 #pragma unroll 8
@@ -301,9 +304,17 @@
 }

 __host__
-void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *d_hash)
+void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *g_hash)
+{
+	dim3 grid((threads + TPB-1) / TPB);
+	dim3 block(TPB);
+	streebog_gpu_hash_64_sm5 <<<grid, block>>> ((uint64_t*)g_hash, NULL, 0);
+}
+
+__host__
+void phi_streebog_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t *g_hash, uint32_t *d_filter)
 {
 	dim3 grid((threads + TPB-1) / TPB);
 	dim3 block(TPB);
-	streebog_gpu_hash_64_maxwell <<<grid, block>>> ((uint64_t*)d_hash);
+	streebog_gpu_hash_64_sm5 <<<grid, block>>> ((uint64_t*)g_hash, d_filter, 1);
 }

x16/cuda_x16_echo512_64.cu (26)

@@ -79,11 +79,12 @@ static void echo_round_alexis(const uint32_t sharedMemory[4][256], uint32_t *W,
 }

 __global__ __launch_bounds__(128, 5) /* will force 80 registers */
-static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
+static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t* g_hash, uint32_t* const d_filter, const uint32_t filter_val)
 {
 	__shared__ uint32_t sharedMemory[4][256];

 	aes_gpu_init128(sharedMemory);
+	__syncthreads();

 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	uint32_t k0;
@@ -91,6 +92,9 @@ static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
 	uint32_t hash[16];
 	if (thread < threads)
 	{
+		// phi2 filter (2 hash chain branches)
+		if (d_filter && d_filter[thread] != filter_val) return;
+
 		uint32_t *Hash = &g_hash[thread<<4];

 		*(uint2x4*)&h[ 0] = __ldg4((uint2x4*)&Hash[ 0]);
@@ -99,8 +103,6 @@ static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
 		*(uint2x4*)&hash[ 0] = *(uint2x4*)&h[ 0];
 		*(uint2x4*)&hash[ 8] = *(uint2x4*)&h[ 8];
-		__syncthreads();

 		const uint32_t P[48] = {
 			0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
 			//8-12
@@ -217,7 +219,6 @@
 			W[48 + i + 4] = a ^ cd ^ bcx;
 			W[48 + i + 8] = d ^ ab ^ cdx;
 			W[48 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx;
 		}

 		for (int k = 1; k < 10; k++)
@@ -237,12 +238,23 @@
 }

 __host__
-void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash){
+void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash)
+{
 	const uint32_t threadsperblock = 128;

 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);

-	x16_echo512_gpu_hash_64<<<grid, block>>>(threads, d_hash);
+	x16_echo512_gpu_hash_64 <<<grid, block>>> (threads, d_hash, NULL, 0);
 }
+
+__host__
+void phi_echo512_cpu_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t* g_hash, uint32_t* d_filter)
+{
+	const uint32_t threadsperblock = 128;
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+	x16_echo512_gpu_hash_64 <<<grid, block>>> (threads, g_hash, d_filter, 0);
+}