Add pentablake algo (-a penta)

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
This commit is contained in:
Tanguy Pruvot 2014-09-06 21:54:46 +02:00
parent 95ac1d0f19
commit 402e416853
8 changed files with 637 additions and 4 deletions

View File

@ -33,7 +33,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \ quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \
quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu quark/quarkcoin.cu quark/animecoin.cu \ quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu quark/quarkcoin.cu quark/animecoin.cu \
quark/cuda_quark_compactionTest.cu \ quark/cuda_quark_compactionTest.cu \
cuda_nist5.cu blake32.cu \ cuda_nist5.cu blake32.cu pentablake.cu \
sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \ sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \
sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \ sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \ sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \

View File

@ -362,6 +362,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
blake256hash(vhashcpu, endiandata, blakerounds); blake256hash(vhashcpu, endiandata, blakerounds);
if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) { if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) {
applog(LOG_NOTICE, "GPU found more than one result yippee!"); applog(LOG_NOTICE, "GPU found more than one result yippee!");
rc = 2;
} else { } else {
extra_results[0] = MAXU; extra_results[0] = MAXU;
} }
@ -380,9 +381,14 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
} }
} }
if ((uint64_t) pdata[19] + throughput > (uint64_t) max_nonce) {
pdata[19] = max_nonce - first_nonce + 1;
break;
}
pdata[19] += throughput; pdata[19] += throughput;
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart); } while (!work_restart[thr_id].restart);
exit_scan: exit_scan:
*hashes_done = pdata[19] - first_nonce + 1; *hashes_done = pdata[19] - first_nonce + 1;
@ -395,6 +401,6 @@ exit_scan:
} }
#endif #endif
// wait proper end of all threads // wait proper end of all threads
cudaDeviceSynchronize(); //cudaDeviceSynchronize();
return rc; return rc;
} }

View File

@ -405,6 +405,12 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
<AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
<FastMath>true</FastMath> <FastMath>true</FastMath>
</CudaCompile> </CudaCompile>
<CudaCompile Include="pentablake.cu">
<MaxRegCount>80</MaxRegCount>
<AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options="-O2 -dlcm=cg" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
<FastMath>true</FastMath>
</CudaCompile>
<CudaCompile Include="quark\animecoin.cu"> <CudaCompile Include="quark\animecoin.cu">
<AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options=-O2 %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options=-O2 %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>

View File

@ -445,5 +445,8 @@
<CudaCompile Include="blake32.cu"> <CudaCompile Include="blake32.cu">
<Filter>Source Files\CUDA</Filter> <Filter>Source Files\CUDA</Filter>
</CudaCompile> </CudaCompile>
<CudaCompile Include="pentablake.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
</ItemGroup> </ItemGroup>
</Project> </Project>

View File

@ -136,8 +136,9 @@ typedef enum {
ALGO_JACKPOT, ALGO_JACKPOT,
ALGO_MJOLLNIR, /* Mjollnir hash */ ALGO_MJOLLNIR, /* Mjollnir hash */
ALGO_MYR_GR, ALGO_MYR_GR,
ALGO_QUARK,
ALGO_NIST5, ALGO_NIST5,
ALGO_PENTABLAKE,
ALGO_QUARK,
ALGO_WHC, ALGO_WHC,
ALGO_X11, ALGO_X11,
ALGO_X13, ALGO_X13,
@ -159,6 +160,7 @@ static const char *algo_names[] = {
"mjollnir", "mjollnir",
"myr-gr", "myr-gr",
"nist5", "nist5",
"penta",
"quark", "quark",
"whirl", "whirl",
"x11", "x11",
@ -242,6 +244,7 @@ Options:\n\
mjollnir Mjollnircoin hash\n\ mjollnir Mjollnircoin hash\n\
myr-gr Myriad-Groestl hash\n\ myr-gr Myriad-Groestl hash\n\
nist5 NIST5 (TalkCoin) hash\n\ nist5 NIST5 (TalkCoin) hash\n\
penta Pentablake hash (5x Blake 512)\n\
quark Quark hash\n\ quark Quark hash\n\
whirl Whirlcoin (old whirlpool)\n\ whirl Whirlcoin (old whirlpool)\n\
x11 X11 (DarkCoin) hash\n\ x11 X11 (DarkCoin) hash\n\
@ -1089,6 +1092,11 @@ static void *miner_thread(void *userdata)
max_nonce, &hashes_done); max_nonce, &hashes_done);
break; break;
case ALGO_PENTABLAKE:
rc = scanhash_pentablake(thr_id, work.data, work.target,
max_nonce, &hashes_done);
break;
case ALGO_WHC: case ALGO_WHC:
rc = scanhash_whc(thr_id, work.data, work.target, rc = scanhash_whc(thr_id, work.data, work.target,
max_nonce, &hashes_done); max_nonce, &hashes_done);

View File

@ -249,6 +249,10 @@ extern int scanhash_nist5(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done); unsigned long *hashes_done);
extern int scanhash_pentablake(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
extern int scanhash_whc(int thr_id, uint32_t *pdata, extern int scanhash_whc(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done); unsigned long *hashes_done);
@ -284,6 +288,7 @@ struct work_restart {
char padding[128 - sizeof(unsigned long)]; char padding[128 - sizeof(unsigned long)];
}; };
extern bool opt_benchmark;
extern bool opt_debug; extern bool opt_debug;
extern bool opt_debug_rpc; extern bool opt_debug_rpc;
extern bool opt_quiet; extern bool opt_quiet;
@ -428,6 +433,7 @@ unsigned int jackpothash(void *state, const void *input);
void groestlhash(void *state, const void *input); void groestlhash(void *state, const void *input);
void myriadhash(void *state, const void *input); void myriadhash(void *state, const void *input);
void nist5hash(void *state, const void *input); void nist5hash(void *state, const void *input);
void pentablakehash(void *output, const void *input);
void quarkhash(void *state, const void *input); void quarkhash(void *state, const void *input);
void wcoinhash(void *state, const void *input); void wcoinhash(void *state, const void *input);
void x11hash(void *output, const void *input); void x11hash(void *output, const void *input);

600
pentablake.cu Normal file
View File

@ -0,0 +1,600 @@
/**
* Penta Blake-512 Cuda Kernel (Tested on SM 5.0)
*
* Tanguy Pruvot - Aug. 2014
*/
#include "miner.h"
extern "C" {
#include "sph/sph_blake.h"
#include <stdint.h>
#include <memory.h>
}
/* threads per block */
#define TPB 192
/* hash by cpu with blake 256 */
extern "C" void pentablakehash(void *output, const void *input)
{
unsigned char hash[128];
#define hashB hash + 64
sph_blake512_context ctx;
sph_blake512_init(&ctx);
sph_blake512(&ctx, input, 80);
sph_blake512_close(&ctx, hash);
sph_blake512(&ctx, hash, 64);
sph_blake512_close(&ctx, hashB);
sph_blake512(&ctx, hashB, 64);
sph_blake512_close(&ctx, hash);
sph_blake512(&ctx, hash, 64);
sph_blake512_close(&ctx, hashB);
sph_blake512(&ctx, hashB, 64);
sph_blake512_close(&ctx, hash);
memcpy(output, hash, 32);
}
#include "cuda_helper.h"
#define MAXU 0xffffffffU
// in cpu-miner.c
extern bool opt_n_threads;
extern bool opt_benchmark;
extern int device_map[8];
__constant__
static uint32_t __align__(32) c_Target[8];
__constant__
static uint64_t __align__(32) c_data[32];
static uint32_t *d_hash[8];
static uint32_t *d_resNounce[8];
static uint32_t *h_resNounce[8];
static uint32_t extra_results[2] = { MAXU, MAXU };
/* prefer uint32_t to prevent size conversions = speed +5/10 % */
__constant__
static uint32_t __align__(32) c_sigma[16][16];
const uint32_t host_sigma[16][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};
__device__ __constant__
static const uint64_t __align__(32) c_IV512[8] = {
0x6a09e667f3bcc908ULL,
0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL,
0xa54ff53a5f1d36f1ULL,
0x510e527fade682d1ULL,
0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL,
0x5be0cd19137e2179ULL
};
__device__ __constant__
const uint64_t c_u512[16] =
{
0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL,
0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL,
0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL,
0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL,
0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL,
0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
};
#define G(a,b,c,d,x) { \
uint32_t idx1 = c_sigma[i][x]; \
uint32_t idx2 = c_sigma[i][x+1]; \
v[a] += (m[idx1] ^ c_u512[idx2]) + v[b]; \
v[d] = ROTR64(v[d] ^ v[a], 32); \
v[c] += v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 25); \
v[a] += (m[idx2] ^ c_u512[idx1]) + v[b]; \
v[d] = ROTR64(v[d] ^ v[a], 16); \
v[c] += v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 11); \
}
// Hash-Padding
__device__ __constant__
static const uint64_t d_constHashPadding[8] = {
0x0000000000000080ull,
0,
0,
0,
0,
0x0100000000000000ull,
0,
0x0002000000000000ull
};
#if 0
__device__ __constant__
static const uint64_t __align__(32) c_Padding[16] = {
0, 0, 0, 0,
0x80000000ULL, 0, 0, 0,
0, 0, 0, 0,
0, 1, 0, 640,
};
__device__ static
void pentablake_compress(uint64_t *h, const uint64_t *block, const uint32_t T0)
{
uint64_t v[16], m[16];
m[0] = block[0];
m[1] = block[1];
m[2] = block[2];
m[3] = block[3];
for (uint32_t i = 4; i < 16; i++) {
m[i] = (T0 == 0x200) ? block[i] : c_Padding[i];
}
//#pragma unroll 8
for(uint32_t i = 0; i < 8; i++)
v[i] = h[i];
v[ 8] = c_u512[0];
v[ 9] = c_u512[1];
v[10] = c_u512[2];
v[11] = c_u512[3];
v[12] = xor1(c_u512[4], T0);
v[13] = xor1(c_u512[5], T0);
v[14] = c_u512[6];
v[15] = c_u512[7];
for (uint32_t i = 0; i < 16; i++) {
/* column step */
G(0, 4, 0x8, 0xC, 0x0);
G(1, 5, 0x9, 0xD, 0x2);
G(2, 6, 0xA, 0xE, 0x4);
G(3, 7, 0xB, 0xF, 0x6);
/* diagonal step */
G(0, 5, 0xA, 0xF, 0x8);
G(1, 6, 0xB, 0xC, 0xA);
G(2, 7, 0x8, 0xD, 0xC);
G(3, 4, 0x9, 0xE, 0xE);
}
//#pragma unroll 16
for (uint32_t i = 0; i < 16; i++) {
uint32_t j = i % 8;
h[j] ^= v[i];
}
}
__global__
void pentablake_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint32_t nounce = startNounce + thread;
uint64_t h[8];
#pragma unroll
for(int i=0; i<8; i++) {
h[i] = c_IV512[i];
}
uint64_t ending[4];
ending[0] = c_data[16];
ending[1] = c_data[17];
ending[2] = c_data[18];
ending[3] = nounce; /* our tested value */
pentablake_compress(h, ending, 640);
// -----------------------------------
for (int r = 0; r < 4; r++) {
uint64_t data[8];
for (int i = 0; i < 7; i++) {
data[i] = h[i];
}
pentablake_compress(h, data, 512); /* todo: use h,h when ok*/
}
}
}
#endif
__device__ static
void pentablake_compress(uint64_t *h, const uint64_t *block, const uint64_t T0)
{
uint64_t v[16], m[16], i;
#pragma unroll 16
for(i = 0; i < 16; i++) {
m[i] = cuda_swab64(block[i]);
}
#pragma unroll 8
for (i = 0; i < 8; i++)
v[i] = h[i];
v[ 8] = c_u512[0];
v[ 9] = c_u512[1];
v[10] = c_u512[2];
v[11] = c_u512[3];
v[12] = c_u512[4] ^ T0;
v[13] = c_u512[5] ^ T0;
v[14] = c_u512[6];
v[15] = c_u512[7];
//#pragma unroll 16
for( i = 0; i < 16; i++)
{
/* column step */
G(0, 4, 0x8, 0xC, 0x0);
G(1, 5, 0x9, 0xD, 0x2);
G(2, 6, 0xA, 0xE, 0x4);
G(3, 7, 0xB, 0xF, 0x6);
/* diagonal step */
G(0, 5, 0xA, 0xF, 0x8);
G(1, 6, 0xB, 0xC, 0xA);
G(2, 7, 0x8, 0xD, 0xC);
G(3, 4, 0x9, 0xE, 0xE);
}
//#pragma unroll 16
for (i = 0; i < 16; i++) {
uint32_t idx = i % 8;
h[idx] ^= v[i];
}
}
__global__
void pentablake_gpu_hash_80(int threads, const uint32_t startNounce, void *outputHash)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint64_t h[8];
uint64_t buf[16];
uint32_t nounce = startNounce + thread;
//#pragma unroll 8
for(int i=0; i<8; i++)
h[i] = c_IV512[i];
//#pragma unroll 16
for (int i=0; i < 16; i++)
buf[i] = c_data[i];
// The test Nonce
((uint32_t*)buf)[19] = cuda_swab32(nounce);
pentablake_compress(h, buf, 640ULL);
#if __CUDA_ARCH__ < 300
uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
#pragma unroll 8
for (uint32_t i=0; i < 8; i++) {
outHash[2*i] = cuda_swab32( _HIWORD(h[i]) );
outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
}
#else
uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
for (uint32_t i=0; i < 8; i++) {
outHash[i] = cuda_swab64( h[i] );
}
#endif
}
}
__host__
void pentablake_cpu_hash_80(int thr_id, int threads, const uint32_t startNounce, uint32_t *d_outputHash, int order)
{
const int threadsperblock = TPB;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
size_t shared_size = 0;
pentablake_gpu_hash_80 <<<grid, block, shared_size>>> (threads, startNounce, d_outputHash);
//MyStreamSynchronize(NULL, order, thr_id);
cudaDeviceSynchronize();
}
__global__
void pentablake_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint64_t *inpHash = &g_hash[thread<<3]; // hashPosition * 8
uint64_t buf[16]; // 128 Bytes
uint64_t h[8]; // State
#pragma unroll 8
for (int i=0; i<8; i++)
h[i] = c_IV512[i];
// Message for first round
#pragma unroll 8
for (int i=0; i < 8; ++i)
buf[i] = inpHash[i];
#pragma unroll 8
for (int i=0; i < 8; i++)
buf[i+8] = d_constHashPadding[i];
// Ending round
pentablake_compress(h, buf, 512);
#if __CUDA_ARCH__ < 300
uint32_t *outHash = (uint32_t*)&g_hash[thread<<3];
#pragma unroll 8
for (int i=0; i < 8; i++) {
outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
}
#else
uint64_t *outHash = &g_hash[thread<<3];
for (int i=0; i < 8; i++) {
outHash[i] = cuda_swab64(h[i]);
}
#endif
}
}
__host__
void pentablake_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
{
const int threadsperblock = TPB;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
size_t shared_size = 0;
pentablake_gpu_hash_64 <<<grid, block, shared_size>>> (threads, startNounce, (uint64_t*)d_outputHash);
//MyStreamSynchronize(NULL, order, thr_id);
cudaDeviceSynchronize();
}
#if 0
__host__
uint32_t pentablake_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce)
{
const int threadsperblock = TPB;
uint32_t result = MAXU;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
size_t shared_size = 0;
/* Check error on Ctrl+C or kill to prevent segfaults on exit */
if (cudaMemset(d_resNounce[thr_id], 0xff, 2*sizeof(uint32_t)) != cudaSuccess)
return result;
pentablake_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_resNounce[thr_id]);
cudaDeviceSynchronize();
if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
cudaThreadSynchronize();
result = h_resNounce[thr_id][0];
extra_results[0] = h_resNounce[thr_id][1];
}
return result;
}
#endif
__global__
void pentablake_gpu_check_hash(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *resNounce)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = startNounce + thread;
uint32_t *inpHash = &g_hash[thread<<4];
uint32_t h[8];
#pragma unroll 8
for (int i=0; i < 8; i++)
h[i] = inpHash[i];
for (int i = 7; i >= 0; i--) {
uint32_t hash = h[i]; // cuda_swab32(h[i]);
if (hash > c_Target[i]) {
return;
}
if (hash < c_Target[i]) {
break;
}
}
/* keep the smallest nounce, + extra one if found */
if (resNounce[0] > nounce) {
resNounce[1] = resNounce[0];
resNounce[0] = nounce;
}
else
resNounce[1] = nounce;
}
}
__host__ static
uint32_t pentablake_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, int order)
{
const int threadsperblock = TPB;
uint32_t result = MAXU;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
size_t shared_size = 0;
/* Check error on Ctrl+C or kill to prevent segfaults on exit */
if (cudaMemset(d_resNounce[thr_id], 0xff, 2*sizeof(uint32_t)) != cudaSuccess)
return result;
pentablake_gpu_check_hash <<<grid, block, shared_size>>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]);
CUDA_SAFE_CALL(cudaDeviceSynchronize());
if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
cudaThreadSynchronize();
result = h_resNounce[thr_id][0];
extra_results[0] = h_resNounce[thr_id][1];
}
return result;
}
__host__
void pentablake_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget)
{
uint8_t data[128];
memcpy((void*) data, (void*) pdata, 80);
memset(data+80, 0, 48);
// to swab...
data[80] = 0x80;
data[111] = 1;
data[126] = 0x02;
data[127] = 0x80;
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sigma, host_sigma, sizeof(host_sigma), 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Target, ptarget, 32, 0, cudaMemcpyHostToDevice));
}
extern "C" int scanhash_pentablake(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
uint32_t max_nonce, unsigned long *hashes_done)
{
const uint32_t first_nonce = pdata[19];
static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
uint32_t throughput = min(128 * 2560, max_nonce - first_nonce);
uint32_t endiandata[20];
int rc = 0;
if (extra_results[0] != MAXU) {
// possible extra result found in previous call
if (first_nonce <= extra_results[0] && max_nonce >= extra_results[0]) {
pdata[19] = extra_results[0];
*hashes_done = pdata[19] - first_nonce + 1;
extra_results[0] = MAXU;
rc = 1;
goto exit_scan;
}
}
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x000F;
if (!init[thr_id]) {
if (opt_n_threads > 1) {
CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
}
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64 * throughput));
CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], 2*sizeof(uint32_t)));
CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], 2*sizeof(uint32_t)));
init[thr_id] = true;
}
for (int k=0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
pentablake_cpu_setBlock_80(endiandata, ptarget);
do {
int order = 0;
// GPU HASH
pentablake_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
uint32_t foundNonce = pentablake_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
if (foundNonce != MAXU)
{
uint32_t vhashcpu[8];
uint32_t Htarg = ptarget[7];
be32enc(&endiandata[19], foundNonce);
pentablakehash(vhashcpu, endiandata);
if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget))
{
pdata[19] = foundNonce;
rc = 1;
// Rare but possible if the throughput is big
be32enc(&endiandata[19], extra_results[0]);
pentablakehash(vhashcpu, endiandata);
if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) {
applog(LOG_NOTICE, "GPU found more than one result yippee!");
rc = 2;
} else {
extra_results[0] = MAXU;
}
goto exit_scan;
}
else if (vhashcpu[7] > Htarg) {
applog(LOG_WARNING, "GPU #%d: result for nounce %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[7], Htarg);
}
else if (vhashcpu[6] > ptarget[6]) {
applog(LOG_WARNING, "GPU #%d: hash[6] for nounce %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[6], ptarget[6]);
}
else {
applog(LOG_WARNING, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce);
}
}
pdata[19] += throughput;
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
exit_scan:
*hashes_done = pdata[19] - first_nonce + 1;
#if 0
/* reset the device to allow multiple instances
* could be made in cpu-miner... check later if required */
if (opt_n_threads == 1) {
CUDA_SAFE_CALL(cudaDeviceReset());
init[thr_id] = false;
}
#endif
cudaDeviceSynchronize();
return rc;
}

4
util.c
View File

@ -1457,6 +1457,10 @@ void print_hash_tests(void)
nist5hash(&hash[0], &buf[0]); nist5hash(&hash[0], &buf[0]);
printpfx("nist5", hash); printpfx("nist5", hash);
memset(hash, 0, sizeof hash);
pentablakehash(&hash[0], &buf[0]);
printpfx("pentablake", hash);
memset(hash, 0, sizeof hash); memset(hash, 0, sizeof hash);
quarkhash(&hash[0], &buf[0]); quarkhash(&hash[0], &buf[0]);
printpfx("quark", hash); printpfx("quark", hash);