
Add zr5 algo (for SM 3.5+)

uint4 copy + keccak cleanup, groestl: small uint4 opt

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
master
Tanguy Pruvot, 10 years ago
commit a37e909db9
  1. 281   JHA/cuda_jha_keccak512.cu
  2. 2     Makefile.am
  3. 12    README.txt
  4. 74    ccminer.cpp
  5. 12    ccminer.vcxproj
  6. 9     ccminer.vcxproj.filters
  7. 2     configure.ac
  8. 30    cpuminer-config.h
  9. 5     miner.h
  10. 15   quark/cuda_quark_groestl512.cu
  11. 39   util.cpp
  12. 342  zr5.cu

281
JHA/cuda_jha_keccak512.cu

@@ -3,8 +3,14 @@
#include "cuda_helper.h"
__constant__ uint64_t c_State[25];
// ZR5
__constant__ uint32_t d_OriginalData[20];
__constant__ uint32_t c_PaddedMessage[18];
__constant__ uint64_t c_State[25];
#define POK_DATA_MASK 0xFFFF0000
#define POK_VERSION 0x1
#define U32TO64_LE(p) \
(((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))
@@ -35,7 +41,7 @@ keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_const
uint64_t t[5], u[5], v, w;
/* absorb input */
#pragma unroll 9
#pragma unroll 9
for (i = 0; i < 72 / 8; i++, in += 2)
s[i] ^= U32TO64_LE(in);
@@ -100,52 +106,9 @@ keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_const
}
}
__global__ void jackpot_keccak512_gpu_hash(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = startNounce + thread;
int hashPosition = nounce - startNounce;
// copy the message
uint32_t message[18];
#pragma unroll 18
for(int i=0;i<18;i++)
message[i] = c_PaddedMessage[i];
// insert the per-thread nonce
message[1] = cuda_swab32(nounce);
// initialize the state
uint64_t keccak_gpu_state[25];
#pragma unroll 25
for (int i=0; i<25; i++)
keccak_gpu_state[i] = c_State[i];
// run the keccak permutation on the block
keccak_block(keccak_gpu_state, message, c_keccak_round_constants);
// produce the hash
uint32_t hash[16];
#pragma unroll 8
for (size_t i = 0; i < 64; i += 8) {
U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]);
}
// done
uint32_t *outpHash = (uint32_t*)&g_hash[8 * hashPosition];
#pragma unroll 16
for(int i=0;i<16;i++)
outpHash[i] = hash[i];
}
}
// setup functions
__host__ void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads)
__host__
void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads)
{
// copy the hash tables into GPU memory
cudaMemcpyToSymbol( c_keccak_round_constants,
@@ -174,10 +137,8 @@ __host__ void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads)
#else
#define ROL_mult8(a, offset) ROL(a, offset)
#endif
void KeccakF( tKeccakLane * state, const tKeccakLane *in, int laneCount );
const tKeccakLane KeccakF_RoundConstants[cKeccakNumberOfRounds] =
{
const tKeccakLane KeccakF_RoundConstants[cKeccakNumberOfRounds] = {
(tKeccakLane)0x0000000000000001ULL,
(tKeccakLane)0x0000000000008082ULL,
(tKeccakLane)0x800000000000808aULL,
@@ -196,29 +157,25 @@ const tKeccakLane KeccakF_RoundConstants[cKeccakNumberOfRounds] =
(tKeccakLane)0x8000000000008003ULL,
(tKeccakLane)0x8000000000008002ULL,
(tKeccakLane)0x8000000000000080ULL
#if (cKeccakB >= 400)
#if (cKeccakB >= 400)
, (tKeccakLane)0x000000000000800aULL,
(tKeccakLane)0x800000008000000aULL
#if (cKeccakB >= 800)
#if (cKeccakB >= 800)
, (tKeccakLane)0x8000000080008081ULL,
(tKeccakLane)0x8000000000008080ULL
#if (cKeccakB == 1600)
#if (cKeccakB == 1600)
, (tKeccakLane)0x0000000080000001ULL,
(tKeccakLane)0x8000000080008008ULL
#endif
#endif
#endif
#endif
#endif
#endif
};
void KeccakF( tKeccakLane * state, const tKeccakLane *in, int laneCount )
void KeccakF(tKeccakLane * state, const tKeccakLane *in, int laneCount)
{
{
while ( --laneCount >= 0 )
{
while ( --laneCount >= 0 ) {
state[laneCount] ^= in[laneCount];
}
}
{
tKeccakLane Aba, Abe, Abi, Abo, Abu;
@@ -487,7 +444,8 @@ void KeccakF( tKeccakLane * state, const tKeccakLane *in, int laneCount )
}
// inlen can be 72...143
__host__ void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen)
__host__
void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen)
{
const unsigned char *in = (const unsigned char*)pdata;
@@ -496,43 +454,218 @@ __host__ void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen)
memset( state, 0, sizeof(state) );
for ( /* empty */; inlen >= cKeccakR_SizeInBytes; inlen -= cKeccakR_SizeInBytes, in += cKeccakR_SizeInBytes )
for ( /* empty */; inlen >= cKeccakR_SizeInBytes; inlen -= cKeccakR_SizeInBytes, in += cKeccakR_SizeInBytes)
{
KeccakF( state, (const tKeccakLane*)in, cKeccakR_SizeInBytes / sizeof(tKeccakLane) );
}
// copy the state after the first round (after absorbing 72 bytes of input data)
// into constant memory
// Copy state of the first round (72 Bytes)
// in Constant Memory
cudaMemcpyToSymbol( c_State,
state,
sizeof(state),
0, cudaMemcpyHostToDevice);
// padding
memcpy( temp, in, (size_t)inlen );
// second part
memcpy(temp, in, inlen);
temp[inlen++] = 1;
memset( temp+inlen, 0, cKeccakR_SizeInBytes - (size_t)inlen );
memset(temp + inlen, 0, cKeccakR_SizeInBytes - inlen);
temp[cKeccakR_SizeInBytes-1] |= 0x80;
// copy the rest of the message and the padding into constant memory
// Copy rest of the message in constant memory
cudaMemcpyToSymbol( c_PaddedMessage,
temp,
cKeccakR_SizeInBytes,
0, cudaMemcpyHostToDevice);
}
__host__ void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order)
__global__
void jackpot_keccak512_gpu_hash(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = startNounce + thread;
int hashPosition = nounce - startNounce;
uint32_t message[18];
#pragma unroll 18
for(int i=0;i<18;i++)
message[i] = c_PaddedMessage[i];
message[1] = cuda_swab32(nounce);
// State init
uint64_t keccak_gpu_state[25];
#pragma unroll 25
for (int i=0; i<25; i++)
keccak_gpu_state[i] = c_State[i];
// run the keccak permutation on the block
keccak_block(keccak_gpu_state, message, c_keccak_round_constants);
uint32_t hash[16];
#pragma unroll 8
for (size_t i = 0; i < 64; i += 8) {
U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]);
}
// copy hash
uint32_t *outpHash = (uint32_t*)&g_hash[8 * hashPosition];
#pragma unroll 16
for(int i=0;i<16;i++)
outpHash[i] = hash[i];
}
}
__host__
void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order)
{
const uint32_t threadsperblock = 256;
// compute how many thread blocks we need
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// size of the dynamic shared memory region
size_t shared_size = 0;
jackpot_keccak512_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash);
MyStreamSynchronize(NULL, order, thr_id);
}
/* zr5 keccak, no nonce swab32 */
__global__
void zr5_keccak512_gpu_hash(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = startNounce + thread;
uint32_t message[18];
#pragma unroll 18
for(int i=0; i<18; i++)
message[i] = c_PaddedMessage[i];
message[1] = nounce;
// Get mid-state
uint64_t keccak_gpu_state[25];
#pragma unroll 25
for (int i=0; i<25; i++)
keccak_gpu_state[i] = c_State[i];
keccak_block(keccak_gpu_state, message, c_keccak_round_constants);
uint32_t hash[16];
#pragma unroll 8
for (int i = 0; i < 8; i++) {
U64TO32_LE((&hash[i*2]), keccak_gpu_state[i]);
}
// Output (64 bytes hash required)
uint32_t hashPosition = nounce - startNounce;
//uint32_t *outpHash = (uint32_t*) (&g_hash[hashPosition*8]);
//#pragma unroll 16
//for(int i=0; i<16; i++)
// outpHash[i] = hash[i];
uint4 *outpHash = (uint4*) (&g_hash[hashPosition*8]);
uint4 *psrc = (uint4*) hash;
outpHash[0] = psrc[0];
outpHash[1] = psrc[1];
outpHash[2] = psrc[2];
outpHash[3] = psrc[3];
}
}
__host__
void zr5_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash)
{
const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
zr5_keccak512_gpu_hash<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash);
MyStreamSynchronize(NULL, 0, thr_id);
}
/* required for the second hash part of zr5 */
__global__
void zr5_keccak512_gpu_hash_pok(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint16_t *d_pokh, uint32_t version)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = startNounce + thread;
//uint32_t hashPosition = thread * 16;
uint32_t *prevHash = &g_hash[thread * 16]; // thread * 64 / sizeof(uint32_t)
uint32_t message[18]; /* 72 bytes */
// pok - hash[0] from prev hash
message[0] = version | (prevHash[0] & POK_DATA_MASK);
// save pok
d_pokh[thread] = (uint16_t) (message[0] / 0x10000);
for (int i=1; i<18; i++) {
message[i]=d_OriginalData[i];
}
// first block
uint64_t keccak_gpu_state[25] = { 0 };
keccak_block(keccak_gpu_state, message, c_keccak_round_constants);
// second block
message[0] = d_OriginalData[18];
message[1] = nounce; //cuda_swab32(nounce);
message[2] = 1;
#pragma unroll
for(int i=3; i<17; i++)
message[i] = 0;
message[17] = 0x80000000UL;
keccak_block(keccak_gpu_state, message, c_keccak_round_constants);
uint32_t hash[16];
#pragma unroll 8
for (size_t i = 0; i < 8; i++) {
U64TO32_LE((&hash[i*2]), keccak_gpu_state[i]);
}
//uint32_t *outpHash = &g_hash[thread * 16];
//#pragma unroll 16
//for(int i=0; i<16; i++)
// outpHash[i] = hash[i];
uint4 *outpHash = (uint4*) (&g_hash[thread * 16]);
uint4 *psrc = (uint4*) hash;
outpHash[0] = psrc[0];
outpHash[1] = psrc[1];
outpHash[2] = psrc[2];
outpHash[3] = psrc[3];
}
}
__host__
void zr5_keccak512_cpu_hash_pok(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t* pdata, uint32_t *d_hash, uint16_t *d_poks)
{
const uint32_t threadsperblock = 256;
const uint32_t version = pdata[0] & (~POK_DATA_MASK);
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
cudaMemcpyToSymbol(d_OriginalData, pdata, sizeof(d_OriginalData), 0, cudaMemcpyHostToDevice);
zr5_keccak512_gpu_hash_pok<<<grid, block>>>(threads, startNounce, d_hash, d_poks, version);
MyStreamSynchronize(NULL, 10, thr_id);
}
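
As a reading aid for the pok kernel above: the PoK field lives in the top 16 bits of header word 0, merged with the low version bits. Below is a minimal host-side sketch of that arithmetic in plain C (values are illustrative, the helper is not part of the tree):

/* Sketch of the PoK arithmetic used in zr5_keccak512_gpu_hash_pok above. */
#include <stdio.h>
#include <stdint.h>

#define POK_DATA_MASK 0xFFFF0000u

int main(void)
{
	uint32_t version = 0x00000001u;          /* low bits (cf. POK_VERSION) */
	uint32_t first_pass_hash0 = 0x2ab03251u; /* example hash word 0 */

	/* message[0] = version | (prevHash[0] & POK_DATA_MASK); */
	uint32_t header0 = version | (first_pass_hash0 & POK_DATA_MASK);

	/* d_pokh[thread] = (uint16_t)(message[0] / 0x10000); */
	uint16_t pok16 = (uint16_t)(header0 >> 16);

	/* pok = version | (0x10000UL * h_poks[thr_id][offset]); */
	uint32_t rebuilt = version | ((uint32_t)pok16 << 16);

	printf("header0=%08x pok=%04x rebuilt=%08x\n",
	       (unsigned)header0, (unsigned)pok16, (unsigned)rebuilt);
	return 0;
}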

2
Makefile.am

@@ -41,7 +41,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu \
quark/quarkcoin.cu quark/animecoin.cu \
quark/cuda_quark_compactionTest.cu \
cuda_nist5.cu pentablake.cu \
cuda_nist5.cu pentablake.cu zr5.cu \
sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \
sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \

12
README.txt

@@ -1,5 +1,5 @@
ccMiner release 1.6.0-tpruvot (Mar 2015) - "Pluck & Whirlpoolx"
ccMiner release 1.6.0-tpruvot (Mar 2015) - "ZR5, Pluck & WhirlX"
---------------------------------------------------------------
***************************************************************
@@ -37,8 +37,9 @@ BlakeCoin (256 8-rounds)
Keccak (Maxcoin)
Deep, Doom and Qubit
Pentablake (Blake 512 x5)
S3 (OneCoin)
Lyra2RE (new VertCoin algo)
1Coin Triple S
Vertcoin Lyra2RE
Ziftrcoin (ZR5)
where some of these coins have a VERY NOTABLE nVidia advantage
over competing AMD (OpenCL Only) implementations.
@@ -84,6 +85,7 @@ its command line interface and options.
x14 use to mine X14Coin
x15 use to mine Halcyon
x17 use to mine X17
zr5 use to mine ZiftrCoin
-d, --devices gives a comma separated list of CUDA device IDs
to operate on. Device IDs start counting from 0!
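
For reference, the new algo is selected like any other; a typical command line would look like this (the pool URL, port and credentials are placeholders, not a real endpoint):

        ccminer -a zr5 -o stratum+tcp://pool.example.com:3333 -u <wallet.worker> -p x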
@@ -183,12 +185,12 @@ features.
>>> RELEASE HISTORY <<<
Mar. 2015 v1.6.0 (Note for CryptoMiningBlog: NOT YET RELEASED/FINISHED!)
Mar. 27th 2015 v1.6.0
Add the ZR5 Algo for Ziftrcoin
Import pluck (djm34) and whirlpoolx (alexis78) algos
Hashrate units based on hashing rate values (Hs/kHs/MHs/GHs)
Default config file (also help to debug without command line)
Various small fixes
More to come soon...
Feb. 11th 2015 v1.5.3
Fix anime algo

74
ccminer.cpp

@@ -108,6 +108,7 @@ enum sha_algos {
ALGO_X14,
ALGO_X15,
ALGO_X17,
ALGO_ZR5,
};
static const char *algo_names[] = {
@@ -140,6 +141,7 @@ static const char *algo_names[] = {
"x14",
"x15",
"x17",
"zr5",
};
bool opt_debug = false;
@@ -166,7 +168,7 @@ static const bool opt_time = true;
static enum sha_algos opt_algo = ALGO_X11;
int opt_n_threads = 0;
int opt_affinity = -1;
int opt_priority = 3;
int opt_priority = 0;
static double opt_difficulty = 1; // CH
bool opt_trust_pool = false;
uint16_t opt_vote = 9999;
@@ -193,6 +195,7 @@ int api_thr_id = -1;
bool stratum_need_reset = false;
struct work_restart *work_restart = NULL;
struct stratum_ctx stratum = { 0 };
uint32_t zr5_pok = 0;
pthread_mutex_t applog_lock;
static pthread_mutex_t stats_lock;
@@ -254,6 +257,7 @@ Options:\n\
x17 X17 (peoplecurrency)\n\
whirl Whirlcoin (old whirlpool)\n\
whirlpoolx Vanilla coin\n\
zr5 ZR5 (ZiftrCoin)\n\
-d, --devices Comma separated list of CUDA devices to use.\n\
Device IDs start counting from 0! Alternatively takes\n\
string names of your cards like gtx780ti or gt640#2\n\
@@ -472,6 +476,10 @@ static bool work_decode(const json_t *val, struct work *work)
int adata_sz = ARRAY_SIZE(work->data), atarget_sz = ARRAY_SIZE(work->target);
int i;
if (opt_algo == ALGO_ZR5) {
data_size = 80; adata_sz = 20;
}
if (unlikely(!jobj_binary(val, "data", work->data, data_size))) {
applog(LOG_ERR, "JSON inval data");
return false;
@@ -564,12 +572,12 @@ static int share_result(int result, const char *reason)
if (reason) {
applog(LOG_WARNING, "reject reason: %s", reason);
if (strncmp(reason, "low difficulty share", 20) == 0) {
if (strncasecmp(reason, "low difficulty", 14) == 0) {
opt_difficulty = (opt_difficulty * 2.0) / 3.0;
applog(LOG_WARNING, "factor reduced to : %0.2f", opt_difficulty);
return 0;
}
if (strncmp(reason, "Duplicate share", 15) == 0 && !check_dups) {
if (strncasecmp(reason, "duplicate", 9) == 0 && !check_dups) {
applog(LOG_WARNING, "enabling duplicates check feature");
check_dups = true;
}
@@ -603,7 +611,11 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
}
}
if (stale_work) {
if (opt_algo == ALGO_ZR5 && !stale_work) {
stale_work = (memcmp(&work->data[1], &g_work.data[1], 68));
}
if (!submit_old && stale_work) {
if (opt_debug)
applog(LOG_WARNING, "stale work detected, discarding");
return true;
@@ -616,9 +628,16 @@
uint16_t nvote;
char *ntimestr, *noncestr, *xnonce2str, *nvotestr;
switch (opt_algo) {
case ALGO_ZR5:
check_dups = true;
be32enc(&ntime, work->data[17]);
be32enc(&nonce, work->data[19]);
break;
default:
le32enc(&ntime, work->data[17]);
le32enc(&nonce, work->data[19]);
}
noncestr = bin2hex((const uchar*)(&nonce), 4);
if (check_dups)
@@ -666,14 +685,21 @@
} else {
int data_size = sizeof(work->data);
int adata_sz = ARRAY_SIZE(work->data);
/* build hex string */
char *str = NULL;
if (opt_algo == ALGO_ZR5) {
data_size = 80; adata_sz = 20;
}
if (opt_algo != ALGO_HEAVY && opt_algo != ALGO_MJOLLNIR) {
for (int i = 0; i < ARRAY_SIZE(work->data); i++)
for (int i = 0; i < adata_sz; i++)
le32enc(work->data + i, work->data[i]);
}
str = bin2hex((uchar*)work->data, sizeof(work->data));
str = bin2hex((uchar*)work->data, data_size);
if (unlikely(!str)) {
applog(LOG_ERR, "submit_upstream_work OOM");
return false;
@@ -1098,10 +1124,18 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
work->data[9 + i] = be32dec((uint32_t *)merkle_root + i);
work->data[17] = le32dec(sctx->job.ntime);
work->data[18] = le32dec(sctx->job.nbits);
if (opt_algo == ALGO_MJOLLNIR || opt_algo == ALGO_HEAVY)
{
switch (opt_algo) {
case ALGO_MJOLLNIR:
case ALGO_HEAVY:
// todo: check if 19 is enough
for (i = 0; i < 20; i++)
work->data[i] = be32dec((uint32_t *)&work->data[i]);
break;
case ALGO_ZR5:
for (i = 0; i < 19; i++)
work->data[i] = be32dec((uint32_t *)&work->data[i]);
break;
}
work->data[20] = 0x80000000;
@@ -1227,6 +1261,7 @@ static void *miner_thread(void *userdata)
// &work.data[19]
int wcmplen = 76;
int wcmpoft = 0;
uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
if (have_stratum) {
@@ -1284,7 +1319,14 @@
hashlog_purge_job(work.job_id);
}
}
if (memcmp(work.data, g_work.data, wcmplen)) {
if (opt_algo == ALGO_ZR5) {
// ignore pok/version header
wcmpoft = 1;
wcmplen -= 4;
}
if (memcmp(&work.data[wcmpoft], &g_work.data[wcmpoft], wcmplen)) {
#if 0
if (opt_debug) {
for (int n=0; n <= (wcmplen-8); n+=8) {
@@ -1510,6 +1552,11 @@
max_nonce, &hashes_done);
break;
case ALGO_ZR5: {
rc = scanhash_zr5(thr_id, work.data, work.target,
max_nonce, &hashes_done);
break;
}
default:
/* should never happen */
goto out;
@@ -1606,6 +1653,11 @@
if (rc > 1 && work.data[21]) {
work.data[19] = work.data[21];
work.data[21] = 0;
if (opt_algo == ALGO_ZR5) {
// todo: use + 4..6 index for pok to allow multiple nonces
work.data[0] = work.data[22]; // pok
work.data[22] = 0;
}
if (!submit_work(mythr, &work))
break;
}
@@ -1675,10 +1727,10 @@ start:
submit_old = soval ? json_is_true(soval) : false;
pthread_mutex_lock(&g_work_lock);
if (work_decode(json_object_get(val, "result"), &g_work)) {
restart_threads();
if (!opt_quiet)
applog(LOG_BLUE, "%s detected new block", short_url);
g_work_time = time(NULL);
restart_threads();
}
pthread_mutex_unlock(&g_work_lock);
json_decref(val);
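
As a companion to the ALGO_ZR5 handling in miner_thread above, here is a small sketch (illustrative names, not from the tree) of how a scan result is folded back into the 20-word header before submission: word 0 carries version plus PoK, word 19 the nonce, and words 21/22 may stage a second nonce and its PoK:

/* Sketch only: mirrors the ZR5 result handling shown above. */
#include <stdio.h>
#include <stdint.h>

static void zr5_apply_result(uint32_t *data, uint32_t pok, uint32_t nonce)
{
	data[0]  = pok;    /* version | (hash[0] & 0xFFFF0000) */
	data[19] = nonce;
}

/* promote the optional second candidate staged in data[21]/data[22] */
static int zr5_pop_second_result(uint32_t *data)
{
	if (data[21] == 0)
		return 0;
	data[19] = data[21]; data[21] = 0;
	data[0]  = data[22]; data[22] = 0;
	return 1;
}

int main(void)
{
	uint32_t data[32] = { 0 };
	zr5_apply_result(data, 0x2ab00001u, 0x00231337u);
	printf("data[0]=%08x data[19]=%08x second=%d\n",
	       (unsigned)data[0], (unsigned)data[19], zr5_pop_second_result(data));
	return 0;
}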

12
ccminer.vcxproj

@@ -105,8 +105,9 @@
<CInterleavedPTX>false</CInterleavedPTX>
<MaxRegCount>80</MaxRegCount>
<PtxAsOptionV>true</PtxAsOptionV>
<Keep>false</Keep>
<CodeGeneration>compute_30,sm_30;compute_50,sm_50</CodeGeneration>
<Keep>true</Keep>
<CodeGeneration>compute_50,sm_50</CodeGeneration>
<GenerateLineInfo>true</GenerateLineInfo>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
@@ -176,8 +177,8 @@
<CInterleavedPTX>false</CInterleavedPTX>
<MaxRegCount>80</MaxRegCount>
<PtxAsOptionV>true</PtxAsOptionV>
<Keep>false</Keep>
<CodeGeneration>compute_50,sm_50;</CodeGeneration>
<Keep>true</Keep>
<CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_50,sm_50;compute_52,sm_52</CodeGeneration>
<AdditionalOptions>--ptxas-options="-O2" %(AdditionalOptions)</AdditionalOptions>
<Defines>
</Defines>
@@ -221,7 +222,7 @@
<MaxRegCount>80</MaxRegCount>
<PtxAsOptionV>true</PtxAsOptionV>
<Keep>false</Keep>
<CodeGeneration>compute_50,sm_50</CodeGeneration>
<CodeGeneration>compute_50,sm_50;compute_52,sm_52</CodeGeneration>
<Include>
</Include>
<TargetMachinePlatform>64</TargetMachinePlatform>
@@ -355,6 +356,7 @@
</CudaCompile>
<CudaCompile Include="cuda_nist5.cu">
</CudaCompile>
<CudaCompile Include="zr5.cu" />
<CudaCompile Include="groestl_functions_quad.cu">
<ExcludedFromBuild>true</ExcludedFromBuild>
</CudaCompile>

9
ccminer.vcxproj.filters

@@ -379,9 +379,6 @@
<CudaCompile Include="cuda_groestlcoin.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="groestl_functions_quad.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="bitslice_transformations_quad.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
@@ -571,6 +568,12 @@
<CudaCompile Include="lyra2\lyra2RE.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="zr5.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="groestl_functions_quad.cu">
<Filter>Source Files\CUDA\quark</Filter>
</CudaCompile>
</ItemGroup>
<ItemGroup>
<Image Include="res\ccminer.ico">

2
configure.ac

@@ -1,4 +1,4 @@
AC_INIT([ccminer], [1.6-git])
AC_INIT([ccminer], [1.6])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

30
cpuminer-config.h

@@ -63,7 +63,7 @@
#define HAVE_STRING_H 1
/* Define to 1 if you have the <syslog.h> header file. */
#define HAVE_SYSLOG_H 1
/* #undef HAVE_SYSLOG_H */
/* Define to 1 if you have the <sys/endian.h> header file. */
/* #undef HAVE_SYS_ENDIAN_H */
@@ -108,7 +108,7 @@
/* #undef LIBCURL_FEATURE_SSPI */
/* Defined if libcurl supports DICT */
#define LIBCURL_PROTOCOL_DICT 1
/* #undef LIBCURL_PROTOCOL_DICT */
/* Defined if libcurl supports FILE */
#define LIBCURL_PROTOCOL_FILE 1
@@ -123,28 +123,28 @@
#define LIBCURL_PROTOCOL_HTTP 1
/* Defined if libcurl supports HTTPS */
#define LIBCURL_PROTOCOL_HTTPS 1
/* #undef LIBCURL_PROTOCOL_HTTPS */
/* Defined if libcurl supports IMAP */
#define LIBCURL_PROTOCOL_IMAP 1
/* #undef LIBCURL_PROTOCOL_IMAP */
/* Defined if libcurl supports LDAP */
#define LIBCURL_PROTOCOL_LDAP 1
/* #undef LIBCURL_PROTOCOL_LDAP */
/* Defined if libcurl supports POP3 */
#define LIBCURL_PROTOCOL_POP3 1
/* #undef LIBCURL_PROTOCOL_POP3 */
/* Defined if libcurl supports RTSP */
#define LIBCURL_PROTOCOL_RTSP 1
/* #undef LIBCURL_PROTOCOL_RTSP */
/* Defined if libcurl supports SMTP */
#define LIBCURL_PROTOCOL_SMTP 1
/* #undef LIBCURL_PROTOCOL_SMTP */
/* Defined if libcurl supports TELNET */
#define LIBCURL_PROTOCOL_TELNET 1
/* #undef LIBCURL_PROTOCOL_TELNET */
/* Defined if libcurl supports TFTP */
#define LIBCURL_PROTOCOL_TFTP 1
/* #undef LIBCURL_PROTOCOL_TFTP */
/* Define to 1 if your C compiler doesn't accept -c and -o together. */
/* #undef NO_MINUS_C_MINUS_O */
@@ -159,16 +159,16 @@
#define PACKAGE_NAME "ccminer"
/* Define to the full name and version of this package. */
#define PACKAGE_STRING "ccminer 1.6-git"
#define PACKAGE_STRING "ccminer 1.6"
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "ccminer"
/* Define to the home page for this package. */
#define PACKAGE_URL ""
#define PACKAGE_URL "http://github.com/tpruvot/ccminer"
/* Define to the version of this package. */
#define PACKAGE_VERSION "1.6-git"
#define PACKAGE_VERSION "1.6"
/* If using the C implementation of alloca, define if you know the
direction of stack growth for your system; otherwise it will be
@@ -191,10 +191,10 @@
#define USE_XOP 1
/* Version number of package */
#define VERSION "1.6-git"
#define VERSION "1.6"
/* Define curl_free() as free() if our version of curl lacks curl_free. */
/* #undef curl_free */
/* Define to `unsigned int' if <sys/types.h> does not define. */
/* #undef size_t */
//#define size_t unsigned int

5
miner.h

@@ -378,6 +378,10 @@ extern int scanhash_whirlpoolx(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
extern int scanhash_zr5(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
/* api related */
void *api_thread(void *userdata);
void api_set_throughput(int thr_id, uint32_t throughput);
@@ -679,6 +683,7 @@ void x13hash(void *output, const void *input);
void x14hash(void *output, const void *input);
void x15hash(void *output, const void *input);
void x17hash(void *output, const void *input);
void zr5hash(void *output, const void *input);
#ifdef __cplusplus
}

15
quark/cuda_quark_groestl512.cu

@@ -52,11 +52,22 @@ void quark_groestl512_gpu_hash_64_quad(uint32_t threads, uint32_t startNounce, u
uint32_t hash[16];
from_bitslice_quad(state, hash);
if (thr == 0)
{
// uint4 = 4x4 uint32_t = 16 bytes
if (thr == 0) {
uint4 *phash = (uint4*) hash;
uint4 *outpt = (uint4*) outpHash; /* var kept for hash align */
outpt[0] = phash[0];
outpt[1] = phash[1];
outpt[2] = phash[2];
outpt[3] = phash[3];
}
/*
if (thr == 0) {
#pragma unroll
for(int k=0;k<16;k++) outpHash[k] = hash[k];
}
*/
}
#endif
}
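
The groestl change above swaps a 16-iteration scalar copy for four uint4 stores; below is a standalone sketch of the same pattern (it assumes both buffers are 16-byte aligned, as cudaMalloc'd hash buffers are; the helper name is illustrative):

// Vectorized 64-byte copy: four 16-byte uint4 stores instead of sixteen
// 4-byte stores. The reinterpret casts require 16-byte alignment.
__device__ void copy_hash_64(uint32_t *dst, const uint32_t *src)
{
	uint4 *d = (uint4*) dst;
	const uint4 *s = (const uint4*) src;
	d[0] = s[0];
	d[1] = s[1];
	d[2] = s[2];
	d[3] = s[3];
}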

39
util.cpp

@@ -1660,12 +1660,35 @@ extern void applog_hash(uchar *hash)
#define printpfx(n,h) \
printf("%s%11s%s: %s\n", CL_GRN, n, CL_N, format_hash(s, h))
static uint32_t zrtest[20] = {
swab32(0x01806486),
swab32(0x00000000),
swab32(0x00000000),
swab32(0x00000000),
swab32(0x00000000),
swab32(0x00000000),
swab32(0x00000000),
swab32(0x00000000),
swab32(0x00000000),
swab32(0x2ab03251),
swab32(0x87d4f28b),
swab32(0x6e22f086),
swab32(0x4845ddd5),
swab32(0x0ac4e6aa),
swab32(0x22a1709f),
swab32(0xfb4275d9),
swab32(0x25f26636),
swab32(0x300eed54),
swab32(0xffff0f1e),
swab32(0x2a9e2300),
};
void do_gpu_tests(void)
{
#ifdef _DEBUG
unsigned long done;
char s[128] = { '\0' };
uchar buf[128];
uchar buf[160];
uint32_t tgt[8] = { 0 };
opt_tracegpu = true;
@@ -1674,11 +1697,15 @@ void do_gpu_tests(void)
tgt[7] = 0xffff;
memset(buf, 0, sizeof buf);
scanhash_x11(0, (uint32_t*)buf, tgt, 1, &done);
//memcpy(buf, zrtest, 80);
scanhash_zr5(0, (uint32_t*)buf, tgt, zrtest[19]+1, &done);
//memset(buf, 0, sizeof buf);
//scanhash_x11(0, (uint32_t*)buf, tgt, 1, &done);
memset(buf, 0, sizeof buf);
// buf[0] = 1; buf[64] = 2; // for endian tests
scanhash_blake256(0, (uint32_t*)buf, tgt, 1, &done, 14);
//scanhash_blake256(0, (uint32_t*)buf, tgt, 1, &done, 14);
//memset(buf, 0, sizeof buf);
//scanhash_heavy(0, (uint32_t*)buf, tgt, 1, &done, 1, 84); // HEAVYCOIN_BLKHDR_SZ=84
@@ -1688,6 +1715,7 @@
opt_tracegpu = false;
#endif
}
extern "C" void zr5hash_pok(void *output, uint32_t *pdata);
void print_hash_tests(void)
{
@@ -1782,6 +1810,11 @@ void print_hash_tests(void)
x17hash(&hash[0], &buf[0]);
printpfx("X17", hash);
//memcpy(buf, zrtest, 80);
zr5hash(&hash[0], &buf[0]);
//zr5hash_pok(&hash[0], (uint32_t*) &buf[0]);
printpfx("ZR5", hash);
printf("\n");
do_gpu_tests();

342
zr5.cu

@@ -0,0 +1,342 @@
/* Ziftrcoin ZR5 CUDA Implementation, (c) tpruvot 2015 */
extern "C" {
#include "sph/sph_blake.h"
#include "sph/sph_groestl.h"
#include "sph/sph_skein.h"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
}
#include "miner.h"
#include "cuda_helper.h"
#include <stdio.h>
#include <memory.h>
#define ZR_BLAKE 0
#define ZR_GROESTL 1
#define ZR_JH512 2
#define ZR_SKEIN 3
#define POK_BOOL_MASK 0x00008000
#define POK_DATA_MASK 0xFFFF0000
static uint32_t* d_hash[MAX_GPUS];
static uint16_t* d_pokh[MAX_GPUS];
static uint16_t* h_poks[MAX_GPUS];
static uint32_t* d_blake[MAX_GPUS];
static uint32_t* d_groes[MAX_GPUS];
static uint32_t* d_jh512[MAX_GPUS];
static uint32_t* d_skein[MAX_GPUS];
__constant__ uint8_t d_permut[24][4];
static const uint8_t permut[24][4] = {
{0, 1, 2, 3},
{0, 1, 3, 2},
{0, 2, 1, 3},
{0, 2, 3, 1},
{0, 3, 1, 2},
{0, 3, 2, 1},
{1, 0, 2, 3},
{1, 0, 3, 2},
{1, 2, 0, 3},
{1, 2, 3, 0},
{1, 3, 0, 2},
{1, 3, 2, 0},
{2, 0, 1, 3},
{2, 0, 3, 1},
{2, 1, 0, 3},
{2, 1, 3, 0},
{2, 3, 0, 1},
{2, 3, 1, 0},
{3, 0, 1, 2},
{3, 0, 2, 1},
{3, 1, 0, 2},
{3, 1, 2, 0},
{3, 2, 0, 1},
{3, 2, 1, 0}
};
// CPU HASH
extern "C" void zr5hash(void *output, const void *input)
{
sph_keccak512_context ctx_keccak;
sph_blake512_context ctx_blake;
sph_groestl512_context ctx_groestl;
sph_jh512_context ctx_jh;
sph_skein512_context ctx_skein;
uchar _ALIGN(64) hash[64];
uint32_t *phash = (uint32_t *) hash;
uint32_t norder;
sph_keccak512_init(&ctx_keccak);
sph_keccak512(&ctx_keccak, (const void*) input, 80);
sph_keccak512_close(&ctx_keccak, (void*) phash);
norder = phash[0] % ARRAY_SIZE(permut); /* % 24 */
for(int i = 0; i < 4; i++)
{
switch (permut[norder][i]) {
case ZR_BLAKE:
sph_blake512_init(&ctx_blake);
sph_blake512(&ctx_blake, (const void*) phash, 64);
sph_blake512_close(&ctx_blake, phash);
break;
case ZR_GROESTL:
sph_groestl512_init(&ctx_groestl);
sph_groestl512(&ctx_groestl, (const void*) phash, 64);
sph_groestl512_close(&ctx_groestl, phash);
break;
case ZR_JH512:
sph_jh512_init(&ctx_jh);
sph_jh512(&ctx_jh, (const void*) phash, 64);
sph_jh512_close(&ctx_jh, phash);
break;
case ZR_SKEIN:
sph_skein512_init(&ctx_skein);
sph_skein512(&ctx_skein, (const void*) phash, 64);
sph_skein512_close(&ctx_skein, phash);
break;
default:
break;
}
}
memcpy(output, phash, 32);
}
extern "C" void zr5hash_pok(void *output, uint32_t *pdata)
{
const uint32_t version = pdata[0] & (~POK_DATA_MASK);
uint32_t _ALIGN(64) hash[8];
pdata[0] = version;
zr5hash(hash, pdata);
// fill PoK
pdata[0] = version | (hash[0] & POK_DATA_MASK);
zr5hash(hash, pdata);
memcpy(output, hash, 32);
}
__global__
void zr5_copy_round_data_gpu(uint32_t threads, uint32_t *d_hash, uint32_t* d_blake, uint32_t* d_groes, uint32_t* d_jh512, uint32_t* d_skein, int rnd)
{
// copy the 64-byte hash into the right algo buffer
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint64_t offset = thread * 64 / 4;
uint32_t *phash = &d_hash[offset];
// algos hash order
uint32_t norder = phash[0] % ARRAY_SIZE(permut);
uint32_t algo = d_permut[norder][rnd];
uint32_t* buffers[4] = { d_blake, d_groes, d_jh512, d_skein };
if (rnd > 0) {
int algosrc = d_permut[norder][rnd - 1];
phash = buffers[algosrc] + offset;
}
// uint4 = 4x4 uint32_t = 16 bytes
uint4 *psrc = (uint4*) phash;
uint4 *pdst = (uint4*) (buffers[algo] + offset);
pdst[0] = psrc[0];
pdst[1] = psrc[1];
pdst[2] = psrc[2];
pdst[3] = psrc[3];
}
}
__host__
void zr5_move_data_to_hash(int thr_id, uint32_t threads, int rnd)
{
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
zr5_copy_round_data_gpu <<<grid, block>>> (threads, d_hash[thr_id], d_blake[thr_id], d_groes[thr_id], d_jh512[thr_id], d_skein[thr_id], rnd);
}
__global__
void zr5_final_round_data_gpu(uint32_t threads, uint32_t* d_blake, uint32_t* d_groes, uint32_t* d_jh512, uint32_t* d_skein, uint32_t *d_hash, uint16_t *d_pokh)
{
// after the 4 algo rounds, copy the hash back to d_hash
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint64_t offset = thread * 16; // 64 / 4;
uint32_t *phash = &d_hash[offset];
uint16_t norder = phash[0] % ARRAY_SIZE(permut);
uint16_t algosrc = d_permut[norder][3];
uint32_t* buffers[4] = { d_blake, d_groes, d_jh512, d_skein };
// copy only hash[0] + hash[6..7]
uint2 *psrc = (uint2*) (buffers[algosrc] + offset);
uint2 *pdst = (uint2*) phash;
pdst[0].x = psrc[0].x;
pdst[3] = psrc[3];
//phash[7] = *(buffers[algosrc] + offset + 7);
}
}
__host__
void zr5_final_round(int thr_id, uint32_t threads)
{
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
zr5_final_round_data_gpu <<<grid, block>>> (threads, d_blake[thr_id], d_groes[thr_id], d_jh512[thr_id], d_skein[thr_id], d_hash[thr_id], d_pokh[thr_id]);
}
extern void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads);
extern void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen);
extern void zr5_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
extern void zr5_keccak512_cpu_hash_pok(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t* pdata, uint32_t *d_hash, uint16_t *d_poks);
extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_jh512_cpu_init(int thr_id, uint32_t threads);
extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
static bool init[MAX_GPUS] = { 0 };
extern "C" int scanhash_zr5(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t _ALIGN(64) tmpdata[20];
const uint32_t version = pdata[0] & (~POK_DATA_MASK);
const uint32_t first_nonce = pdata[19];
uint32_t throughput = device_intensity(thr_id, __func__, 1U << 18);
throughput = min(throughput, (1U << 20)-1024);
throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;
memcpy(tmpdata, pdata, 80);
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
// hash buffer: 64-byte keccak hash per thread
cudaMalloc(&d_hash[thr_id], 64 * throughput);
cudaMalloc(&d_pokh[thr_id], 2 * throughput);
cudaMemcpyToSymbol(d_permut, permut, 24*4, 0, cudaMemcpyHostToDevice);
cudaMallocHost(&h_poks[thr_id], 2 * throughput);
// data buffers for the 4 rounds
cudaMalloc(&d_blake[thr_id], 64 * throughput);
cudaMalloc(&d_groes[thr_id], 64 * throughput);
cudaMalloc(&d_jh512[thr_id], 64 * throughput);
cudaMalloc(&d_skein[thr_id], 64 * throughput);
jackpot_keccak512_cpu_init(thr_id, throughput);
quark_blake512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
cuda_check_cpu_init(thr_id, throughput);
CUDA_SAFE_CALL(cudaDeviceSynchronize());
init[thr_id] = true;
}
tmpdata[0] = version;
jackpot_keccak512_cpu_setBlock((void*)tmpdata, 80);
cuda_check_cpu_setTarget(ptarget);
do {
int order = 0;
// Keccak512 Hash with CUDA
zr5_keccak512_cpu_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
for (int rnd=0; rnd<4; rnd++) {
zr5_move_data_to_hash(thr_id, throughput, rnd);
quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_blake[thr_id], order++);
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_groes[thr_id], order++);
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_jh512[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_skein[thr_id], order++);
}
// This generates all pok prefixes
zr5_final_round(thr_id, throughput);
// Keccak512 pok
zr5_keccak512_cpu_hash_pok(thr_id, throughput, pdata[19], pdata, d_hash[thr_id], d_pokh[thr_id]);
for (int rnd=0; rnd<4; rnd++) {
zr5_move_data_to_hash(thr_id, throughput, rnd);
quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_blake[thr_id], order++);
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_groes[thr_id], order++);
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_jh512[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_skein[thr_id], order++);
}
zr5_final_round(thr_id, throughput);
uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (foundNonce != UINT32_MAX)
{
uint32_t vhash64[8];
uint32_t oldp0 = pdata[0];
uint32_t oldp19 = pdata[19];
uint32_t offset = foundNonce - pdata[19];
uint32_t pok = 0;
*hashes_done = pdata[19] - first_nonce + throughput;
cudaMemcpy(h_poks[thr_id], d_pokh[thr_id], 2 * throughput, cudaMemcpyDeviceToHost);
pok = version | (0x10000UL * h_poks[thr_id][offset]);
pdata[0] = pok; pdata[19] = foundNonce;
zr5hash(vhash64, pdata);
if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
int res = 1;
uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
if (secNonce != 0) {
offset = secNonce - oldp19;
pok = version | (0x10000UL * h_poks[thr_id][offset]);
memcpy(tmpdata, pdata, 80);
tmpdata[0] = pok; tmpdata[19] = secNonce;
zr5hash(vhash64, tmpdata);
if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
pdata[21] = secNonce;
pdata[22] = pok;
res++;
}
}
return res;
} else {
applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce);
pdata[19]++;
pdata[0] = oldp0;
}
} else
pdata[19] += throughput;
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
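
Finally, a quick standalone illustration of the round-order selection that drives both the CPU zr5hash and the GPU dispatch above: word 0 of the initial Keccak-512 output, taken modulo 24, indexes the permutation table of the four inner algorithms (the sample word below is arbitrary):

/* Order-selection sketch; same 24-entry table as `permut` in zr5.cu. */
#include <stdio.h>
#include <stdint.h>

static const char *algo_name[4] = { "blake512", "groestl512", "jh512", "skein512" };

static const uint8_t permut[24][4] = {
	{0,1,2,3},{0,1,3,2},{0,2,1,3},{0,2,3,1},{0,3,1,2},{0,3,2,1},
	{1,0,2,3},{1,0,3,2},{1,2,0,3},{1,2,3,0},{1,3,0,2},{1,3,2,0},
	{2,0,1,3},{2,0,3,1},{2,1,0,3},{2,1,3,0},{2,3,0,1},{2,3,1,0},
	{3,0,1,2},{3,0,2,1},{3,1,0,2},{3,1,2,0},{3,2,0,1},{3,2,1,0}
};

int main(void)
{
	uint32_t keccak_word0 = 0x2ab03251u;   /* arbitrary sample */
	uint32_t norder = keccak_word0 % 24;   /* norder = phash[0] % ARRAY_SIZE(permut) */
	printf("order %u:", (unsigned)norder);
	for (int i = 0; i < 4; i++)
		printf(" %s", algo_name[permut[norder][i]]);
	printf("\n");
	return 0;
}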