mirror of https://github.com/GOSTSec/ccminer
synced 2025-03-10 04:21:12 +00:00

handle new cryptonight variants, stellite, aeon

Special thanks for klausT's changes, and to ystarnaud who helped me adapt my kernel variants...

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

This commit is contained in:
parent 968d2ba049
commit 77c4b8724e

algos.h
@@ -72,6 +72,9 @@ enum sha_algos {
 	ALGO_WHIRLPOOLX,
 	ALGO_WILDKECCAK,
 	ALGO_ZR5,
+	ALGO_MONERO,
+	ALGO_GRAFT,
+	ALGO_STELLITE,
 	ALGO_AUTO,
 	ALGO_COUNT
 };
@@ -146,6 +149,9 @@ static const char *algo_names[] = {
 	"whirlpoolx",
 	"wildkeccak",
 	"zr5",
+	"monero",
+	"graft",
+	"stellite",
 	"auto", /* reserved for multi algo */
 	""
 };
@@ -206,4 +212,29 @@ static inline int algo_to_int(char* arg)
 	return i;
 }

+static inline int get_cryptonight_algo(int fork)
+{
+	int algo = ALGO_COUNT;
+
+	switch (fork) {
+	case 8:
+		algo = ALGO_GRAFT;
+		break;
+
+	case 7:
+		algo = ALGO_MONERO;
+		break;
+
+	case 3:
+		algo = ALGO_STELLITE;
+		break;
+
+	default:
+		algo = ALGO_CRYPTONIGHT;
+		break;
+	}
+
+	return algo;
+}
+
 #endif
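
Note: a minimal usage sketch of the new helper (my own example, not part of the commit; print_active_algo is a hypothetical name, algos.h above is assumed included):

    #include <stdio.h>
    #include "algos.h"

    void print_active_algo(int cryptonight_fork)
    {
        /* 7 -> "monero", 8 -> "graft", 3 -> "stellite", else "cryptonight" */
        int algo = get_cryptonight_algo(cryptonight_fork);
        printf("mining %s\n", algo_names[algo]);
    }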

ccminer.cpp
@@ -233,6 +233,8 @@ int opt_api_mcast_port = 4068;

 bool opt_stratum_stats = false;

+int cryptonight_fork = 1;
+
 static char const usage[] = "\
 Usage: " PROGRAM_NAME " [OPTIONS]\n\
 Options:\n\
@@ -245,7 +247,7 @@ Options:\n\
 			blakecoin   Fast Blake 256 (8 rounds)\n\
 			bmw         BMW 256\n\
 			cryptolight AEON cryptonight (MEM/2)\n\
-			cryptonight XMR cryptonight\n\
+			cryptonight XMR cryptonight v1 (old)\n\
 			c11/flax    X11 variant\n\
 			decred      Decred Blake256\n\
 			deep        Deepcoin\n\
@@ -253,6 +255,7 @@ Options:\n\
 			dmd-gr      Diamond-Groestl\n\
 			fresh       Freshcoin (shavite 80)\n\
 			fugue256    Fuguecoin\n\
+			graft       Cryptonight v8\n\
 			groestl     Groestlcoin\n"
 #ifdef WITH_HEAVY_ALGO
 "			heavy       Heavycoin\n"
@@ -267,6 +270,7 @@ Options:\n\
 			lyra2v2     VertCoin\n\
 			lyra2z      ZeroCoin (3rd impl)\n\
 			myr-gr      Myriad-Groestl\n\
+			monero      XMR cryptonight (v7)\n\
 			neoscrypt   FeatherCoin, Phoenix, UFO...\n\
 			nist5       NIST5 (TalkCoin)\n\
 			penta       Pentablake hash (5x Blake 512)\n\
@@ -284,6 +288,7 @@ Options:\n\
 			skein       Skein SHA2 (Skeincoin)\n\
 			skein2      Double Skein (Woodcoin)\n\
 			skunk       Skein Cube Fugue Streebog\n\
+			stellite    Cryptonight v3\n\
 			s3          S3 (1Coin)\n\
 			timetravel  Machinecoin permuted x8\n\
 			tribus      Denarius\n\
@@ -573,7 +578,10 @@ static bool get_blocktemplate(CURL *curl, struct work *work);

 void get_currentalgo(char* buf, int sz)
 {
-	snprintf(buf, sz, "%s", algo_names[opt_algo]);
+	int algo = opt_algo;
+	if (algo == ALGO_CRYPTONIGHT)
+		algo = get_cryptonight_algo(cryptonight_fork);
+	snprintf(buf, sz, "%s", algo_names[algo]);
 }

 void format_hashrate(double hashrate, char *output)
@@ -2372,11 +2380,16 @@ static void *miner_thread(void *userdata)
 			rc = scanhash_c11(thr_id, &work, max_nonce, &hashes_done);
 			break;
 		case ALGO_CRYPTOLIGHT:
-			rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done);
+			rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done, 1);
 			break;
 		case ALGO_CRYPTONIGHT:
-			rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done);
+		{
+			int cn_variant = 0;
+			if (cryptonight_fork > 1 && ((unsigned char*)work.data)[0] >= cryptonight_fork)
+				cn_variant = ((unsigned char*)work.data)[0] - cryptonight_fork + 1;
+			rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done, cn_variant);
 			break;
+		}
 		case ALGO_DECRED:
 			rc = scanhash_decred(thr_id, &work, max_nonce, &hashes_done);
 			break;
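
Note: the cn_variant passed to scanhash_cryptonight above is derived from the block-header major-version byte (work.data[0]); a standalone restatement (hypothetical helper name, same arithmetic as the diff):

    /* cryptonight_fork is the block major version at the coin's fork:
     * 7 (monero v7), 8 (graft), 3 (stellite). Versions below it still
     * hash with the original algorithm (variant 0). */
    static int cn_variant_for(unsigned char major_version, int cryptonight_fork)
    {
        if (cryptonight_fork > 1 && major_version >= cryptonight_fork)
            return major_version - cryptonight_fork + 1;
        return 0; /* e.g. monero: fork 7, block version 7 -> variant 1 */
    }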
@@ -3138,6 +3151,26 @@ void parse_arg(int key, char *arg)
 			case ALGO_SCRYPT_JANE: opt_nfactor = 14; break;
 			}
 		}
+
+		// cryptonight variants
+		switch (opt_algo) {
+		case ALGO_MONERO:
+			opt_algo = ALGO_CRYPTONIGHT;
+			cryptonight_fork = 7;
+			break;
+		case ALGO_GRAFT:
+			opt_algo = ALGO_CRYPTONIGHT;
+			cryptonight_fork = 8;
+			break;
+		case ALGO_STELLITE:
+			opt_algo = ALGO_CRYPTONIGHT;
+			cryptonight_fork = 3;
+			break;
+		case ALGO_CRYPTONIGHT:
+			cryptonight_fork = 1;
+			break;
+		}
+
 		break;
 	case 'b':
 		p = strstr(arg, ":");

@@ -138,6 +138,7 @@ static const __device__ __align__(16) uint32_t d_t_fn[1024] = {
 */

 #define AS_U32(addr)   *((uint32_t*)(addr))
+#define AS_U64(addr)   *((uint64_t*)(addr))
 #define AS_UINT2(addr) *((uint2*)(addr))
 #define AS_UINT4(addr) *((uint4*)(addr))
 #define AS_UL2(addr)   *((ulonglong2*)(addr))
@@ -164,7 +164,7 @@ void cn_blake_final(blake_state * __restrict__ S, uint8_t * __restrict__ digest)
 }

 __device__
-void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint8_t * __restrict__ out)
+void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint32_t * out)
 {
 	blake_state bs;
 	blake_state *S = (blake_state *)&bs;
@@ -274,13 +274,14 @@ void cn_groestl_final(groestlHashState* __restrict__ ctx, BitSequence* __restrict__ output)
 	for (i = GROESTL_SIZE512-hashbytelen; i < GROESTL_SIZE512; i++,j++) {
 		output[j] = s[i];
 	}

+#if 0
 	for (i = 0; i < GROESTL_COLS512; i++) {
 		ctx->chaining[i] = 0;
 	}
 	for (i = 0; i < GROESTL_SIZE512; i++) {
 		ctx->buffer[i] = 0;
 	}
+#endif
 }

 __device__
@@ -336,12 +337,12 @@ void cn_groestl_init(groestlHashState* ctx)
 }

 __device__
-void cn_groestl(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval)
+void cn_groestl(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval)
 {
 	DataLength databitlen = len << 3;
 	groestlHashState context;

 	cn_groestl_init(&context);
-	cn_groestl_update(&context, data, databitlen);
-	cn_groestl_final(&context, hashval);
+	cn_groestl_update(&context, (BitSequence*) data, databitlen);
+	cn_groestl_final(&context, (BitSequence*) hashval);
 }
@@ -198,8 +198,9 @@ void cn_jh_update(jhHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen)
 		databitlen = 0;
 	}

-	if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) ) {
-		memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ) ;
+	if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) )
+	{
+		memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) );
 		index = 64-(state->datasize_in_buffer >> 3);
 		databitlen = databitlen - (512 - state->datasize_in_buffer);
 		cn_jh_F8(state);
@@ -222,7 +223,7 @@ void cn_jh_update(jhHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen)

 /* pad the message, process the padded block(s), truncate the hash value H to obtain the message digest */
 __device__
-void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashval)
+void cn_jh_final(jhHashState * __restrict__ state, uint8_t * __restrict__ hashval)
 {
 	unsigned int i;
 	//uint32_t *bufptr = (uint32_t *)state->buffer;
@@ -244,7 +245,7 @@ void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashval)

 	} else {

-		/*set the rest of the bytes in the buffer to 0*/
+		/* set the rest of the bytes in the buffer to 0 */
 		if ( (state->datasize_in_buffer & 7) == 0) {
 			for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) state->buffer[i] = 0;
 		} else {
@@ -268,7 +269,8 @@ void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashval)
 		cn_jh_F8(state);
 	}

-	MEMCPY4(hashval, ((unsigned char*)state->x) + 64 + 32, 8);
+	memcpy(hashval, ((unsigned char*)state->x) + 64 + 32, 32);
+	//MEMCPY4(hashval, ((unsigned char*)state->x) + 64 + 32, 8);
 }

 __device__
@@ -277,12 +279,12 @@ void cn_jh_init(jhHashState *state, int hashbitlen)
 	state->databitlen = 0;
 	state->datasize_in_buffer = 0;
 	state->hashbitlen = hashbitlen;
-	//memcpy(state->x, d_JH256_H0, 128);
-	MEMCPY8(state->x, d_JH256_H0, 128 / 8);
+	memcpy(state->x, d_JH256_H0, 128);
+	//MEMCPY8(state->x, d_JH256_H0, 128 / 8);
 }

 __device__
-void cn_jh256(const uint8_t * __restrict__ data, DataLength len, uint32_t * __restrict__ hashval)
+void cn_jh(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval)
 {
 	const int hashbitlen = 256;
 	DataLength databitlen = len << 3;
@@ -290,5 +292,5 @@ void cn_jh256(const uint8_t * __restrict__ data, DataLength len, uint32_t * __restrict__ hashval)

 	cn_jh_init(&state, hashbitlen);
 	cn_jh_update(&state, data, databitlen);
-	cn_jh_final(&state, hashval);
+	cn_jh_final(&state, (uint8_t*) hashval);
 }

@@ -195,7 +195,7 @@ void cn_keccakf(uint64_t *s)
 }

 __device__ __forceinline__
-void cn_keccak(const uint8_t * __restrict__ in, uint8_t * __restrict__ md)
+void cn_keccak(const uint32_t * __restrict__ in, uint64_t * __restrict__ md)
 {
 	uint64_t st[25];

@@ -4,19 +4,15 @@ typedef unsigned int uint_t; /* native unsigned integer */

 #define SKEIN_256_STATE_WORDS ( 4)
 #define SKEIN_512_STATE_WORDS ( 8)
 #define SKEIN1024_STATE_WORDS (16)

 #define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS)
 #define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS)
 #define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS)

 #define SKEIN_256_STATE_BITS (64*SKEIN_256_STATE_WORDS)
 #define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS)
 #define SKEIN1024_STATE_BITS (64*SKEIN1024_STATE_WORDS)

 #define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS)
 #define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS)
 #define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS)

 #define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32))
 #define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
@@ -119,7 +115,7 @@ typedef struct {
 } skeinHashState;

 __device__
-void cn_skein256_init(skeinHashState *state, size_t hashBitLen)
+void cn_skein_init(skeinHashState *state, size_t hashBitLen)
 {
 	const uint64_t SKEIN_512_IV_256[] =
 	{
@@ -258,14 +254,12 @@ void cn_skein_block(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__
 }

 __device__
-void cn_skein256_update(skeinHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen)
+void cn_skein_update(skeinHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen)
 {
 	if ((databitlen & 7) == 0) {
-
 		cn_skein_block(&state->u.ctx_512, data, databitlen >> 3);
 	}
 	else {
-
 		size_t bCnt = (databitlen >> 3) + 1;
 		uint8_t b,mask;

@@ -280,7 +274,7 @@ void cn_skein256_update(skeinHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen)
 }

 __device__
-void cn_skein256_final(skeinHashState * __restrict__ state, uint32_t * __restrict__ hashVal)
+void cn_skein_final(skeinHashState * __restrict__ state, uint8_t * __restrict__ hashVal)
 {
 	uint64_t X[SKEIN_512_STATE_WORDS];
 	Skein_512_Ctxt_t *ctx = (Skein_512_Ctxt_t *)&state->u.ctx_512;
@@ -305,13 +299,13 @@ void cn_skein256_final(skeinHashState * __restrict__ state, uint32_t * __restrict__ hashVal)
 		((uint64_t *)ctx->b)[0] = (uint64_t)i;
 		Skein_Start_New_Type(ctx, OUT_FINAL);
 		cn_skein_processblock(ctx, ctx->b, 1, sizeof(uint64_t));
-		memcpy(hashVal + (i*SKEIN_512_BLOCK_BYTES/sizeof(uint32_t)), ctx->X, n);
+		memcpy(hashVal + (i*SKEIN_512_BLOCK_BYTES), ctx->X, n);
 		memcpy(ctx->X, X, sizeof(X)); // restore the counter mode key for next time
 	}
 }

 __device__
-void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * __restrict__ hashval)
+void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval)
 {
 	int hashbitlen = 256;
 	DataLength databitlen = len << 3;
@@ -319,7 +313,7 @@ void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * __restrict__ hashval)

 	state.statebits = 64*SKEIN_512_STATE_WORDS;

-	cn_skein256_init(&state, hashbitlen);
-	cn_skein256_update(&state, data, databitlen);
-	cn_skein256_final(&state, hashval);
+	cn_skein_init(&state, hashbitlen);
+	cn_skein_update(&state, data, databitlen);
+	cn_skein_final(&state, (uint8_t*) hashval);
 }
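
Note on the cn_skein_final change above: hashVal switched from uint32_t* to uint8_t*, so the copy offset loses its /sizeof(uint32_t) scaling — pointer arithmetic advances in elements, not bytes. A tiny illustration (my own, with SKEIN_512_BLOCK_BYTES = 64 inlined):

    #include <stdint.h>
    #include <string.h>

    /* with a byte pointer, offset i*64 is already a byte count */
    static void copy_skein_block(uint8_t *hashVal, const void *src, size_t i, size_t n)
    {
        memcpy(hashVal + i * 64, src, n);
    }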
@@ -36,7 +36,7 @@ void cryptolight_core_gpu_phase1(int threads, uint32_t * long_state, uint32_t * ctx_state, uint32_t * ctx_key1)

 	if(thread < threads)
 	{
-		const int oft = thread * 52 + sub + 16; // not aligned 16!
+		const int oft = thread * 50 + sub + 16; // not aligned 16!
 		const int long_oft = (thread << LONG_SHL_IDX) + sub;
 		uint32_t __align__(16) key[40];
 		uint32_t __align__(16) text[4];
@@ -57,8 +57,10 @@ void cryptolight_core_gpu_phase1(int threads, uint32_t * long_state, uint32_t * ctx_state, uint32_t * ctx_key1)
 	}
 }

+// --------------------------------------------------------------------------------------------------------------
+
 __global__
-void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
+void cryptolight_old_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
 {
 	__shared__ uint32_t __align__(16) sharedMemory[1024];

@@ -209,6 +211,70 @@ void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
 #endif // __CUDA_ARCH__ >= 300
 }

+__device__ __forceinline__ void store_variant1(uint32_t* long_state)
+{
+	uint4* Z = (uint4*) long_state;
+	const uint32_t tmp = (Z->z >> 24); // __byte_perm(src, 0, 0x7773);
+	const uint32_t index = (((tmp >> 3) & 6u) | (tmp & 1u)) << 1;
+	Z->z = (Z->z & 0x00ffffffu) | ((tmp ^ ((0x75310u >> index) & 0x30u)) << 24);
+}
+
+#define MUL_SUM_XOR_DST_1(a,c,dst,tweak) { \
+	uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], ((uint64_t *)dst)[0], &hi) + ((uint64_t *)c)[1]; \
+	hi += ((uint64_t *)c)[0]; \
+	((uint64_t *)c)[0] = ((uint64_t *)dst)[0] ^ hi; \
+	((uint64_t *)c)[1] = ((uint64_t *)dst)[1] ^ lo; \
+	((uint64_t *)dst)[0] = hi; \
+	((uint64_t *)dst)[1] = lo ^ tweak; }
+
+__global__
+void cryptolight_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
+	uint32_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
+	uint64_t * __restrict__ d_tweak)
+{
+	__shared__ __align__(16) uint32_t sharedMemory[1024];
+	cn_aes_gpu_init(sharedMemory);
+	__syncthreads();
+
+	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
+	if (thread < threads)
+	{
+		const uint32_t batchsize = ITER >> (2 + bfactor);
+		const uint32_t start = partidx * batchsize;
+		const uint32_t end = start + batchsize;
+		const uint32_t longptr = thread << LONG_SHL_IDX;
+		uint32_t * long_state = &d_long_state[longptr];
+		uint64_t tweak = d_tweak[thread];
+
+		void * ctx_a = (void*)(&d_ctx_a[thread << 2]);
+		void * ctx_b = (void*)(&d_ctx_b[thread << 2]);
+		uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4
+		uint4 B = AS_UINT4(ctx_b);
+		uint32_t* a = (uint32_t*)&A;
+		uint32_t* b = (uint32_t*)&B;
+
+		for (int i = start; i < end; i++)
+		{
+			uint32_t c[4];
+			uint32_t j = (A.x >> 2) & E2I_MASK2;
+			cn_aes_single_round(sharedMemory, &long_state[j], c, a);
+			XOR_BLOCKS_DST(c, b, &long_state[j]);
+			store_variant1(&long_state[j]);
+			MUL_SUM_XOR_DST_1(c, a, &long_state[(c[0] >> 2) & E2I_MASK2], tweak);
+
+			j = (A.x >> 2) & E2I_MASK2;
+			cn_aes_single_round(sharedMemory, &long_state[j], b, a);
+			XOR_BLOCKS_DST(b, c, &long_state[j]);
+			store_variant1(&long_state[j]);
+			MUL_SUM_XOR_DST_1(b, a, &long_state[(b[0] >> 2) & E2I_MASK2], tweak);
+		}
+		if (bfactor) {
+			AS_UINT4(ctx_a) = A;
+			AS_UINT4(ctx_b) = B;
+		}
+	}
+}
+
 __global__
 void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint32_t * ctx_state, uint32_t * ctx_key2)
 {
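
Note: for readers without CUDA at hand, a plain-C sketch of the MUL_SUM_XOR_DST_1 step introduced above (my own rendering; unsigned __int128 is a GCC/Clang extension standing in for cuda_mul128). The variant tweak only touches the low word written back to the scratchpad:

    #include <stdint.h>

    static void mul_sum_xor_dst_1(uint64_t a0, uint64_t c[2], uint64_t dst[2], uint64_t tweak)
    {
        unsigned __int128 p = (unsigned __int128)a0 * dst[0]; /* 64x64 -> 128 bit */
        uint64_t hi = (uint64_t)(p >> 64) + c[0];
        uint64_t lo = (uint64_t)p + c[1];
        c[0] = dst[0] ^ hi;
        c[1] = dst[1] ^ lo;
        dst[0] = hi;
        dst[1] = lo ^ tweak; /* variant 1: tweak1_2 lands here */
    }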
@@ -222,7 +288,7 @@ void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint32_t * ctx_state, uint32_t * ctx_key2)
 	if(thread < threads)
 	{
 		const int long_oft = (thread << LONG_SHL_IDX) + sub;
-		const int oft = thread * 52 + sub + 16;
+		const int oft = thread * 50 + sub + 16;
 		uint32_t __align__(16) key[40];
 		uint32_t __align__(16) text[4];

@@ -251,8 +317,8 @@ void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint32_t * ctx_state, uint32_t * ctx_key2)
 extern int device_bfactor[MAX_GPUS];

 __host__
-void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint64_t *d_ctx_state,
-	uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
+void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state,
+	uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak)
 {
 	dim3 grid(blocks);
 	dim3 block(threads);
@@ -265,17 +331,21 @@ void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint64_t *d_ctx_state,
 	int i, partcount = 1 << bfactor;
 	int dev_id = device_map[thr_id];

-	cryptolight_core_gpu_phase1 <<<grid, block8 >>>(blocks*threads, d_long_state, (uint32_t*)d_ctx_state, d_ctx_key1);
+	cryptolight_core_gpu_phase1 <<<grid, block8 >>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key1);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 	if(partcount > 1) usleep(bsleep);

 	for(i = 0; i < partcount; i++)
 	{
-		cryptolight_core_gpu_phase2 <<<grid, (device_sm[dev_id] >= 300 ? block4 : block)>>>(blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
+		dim3 b = device_sm[dev_id] >= 300 ? block4 : block;
+		if (variant == 0)
+			cryptolight_old_gpu_phase2 <<<grid, b>>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
+		else
+			cryptolight_gpu_phase2 <<<grid, b>>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 		if(partcount > 1) usleep(bsleep);
 	}

-	cryptolight_core_gpu_phase3 <<<grid, block8 >>>(blocks*threads, d_long_state, (uint32_t*)d_ctx_state, d_ctx_key2);
+	cryptolight_core_gpu_phase3 <<<grid, block8 >>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key2);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 }

@@ -22,6 +22,16 @@ struct cryptonight_ctx {
 	oaes_ctx* aes_ctx;
 };

+static void cryptolight_store_variant(void* state, int variant) {
+	if (variant == 1) {
+		// use variant 1 like monero since june 2018
+		const uint8_t tmp = ((const uint8_t*)(state))[11];
+		const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
+		((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30);
+	}
+}
+
 static void do_blake_hash(const void* input, int len, void* output)
 {
 	uchar hash[32];
@@ -132,14 +142,14 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, uint8_t* dst)
 	((uint64_t*) dst)[0] += ((uint64_t*) c)[0];
 }

-static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
+static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak) {
 	uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
 	hi += ((uint64_t*) c)[0];

 	((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
 	((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
 	((uint64_t*) dst)[0] = hi;
-	((uint64_t*) dst)[1] = lo;
+	((uint64_t*) dst)[1] = variant ? lo ^ tweak : lo;
 }

 static void copy_block(uint8_t* dst, const uint8_t* src) {
@@ -157,13 +167,18 @@ static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
 	((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
 }

-static void cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx)
+static int cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx, const int variant)
 {
 	size_t i, j;
+	if (variant && len < 43)
+		return 0;

 	keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len);
 	ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
 	memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);

+	const uint64_t tweak = variant ? *((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0;
+
 	oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
 	for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
 #undef RND
@@ -186,14 +201,16 @@ static void cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx)
 		j = e2i(ctx->a);
 		aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a);
 		xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]);
+		cryptolight_store_variant(&ctx->long_state[j], variant);

-		mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)]);
+		mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)], variant, tweak);

 		j = e2i(ctx->a);
 		aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a);
 		xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]);
+		cryptolight_store_variant(&ctx->long_state[j], variant);

-		mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)]);
+		mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)], variant, tweak);
 	}

 	memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
@@ -219,11 +236,19 @@ static void cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx)
 	if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo);

 	oaes_free((OAES_CTX **) &ctx->aes_ctx);
+	return 1;
 }

-void cryptolight_hash(void* output, const void* input, int len)
+int cryptolight_hash_variant(void* output, const void* input, int len, int variant)
 {
 	struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
-	cryptolight_hash_ctx(output, input, len, ctx);
+	int rc = cryptolight_hash_ctx(output, input, len, ctx, variant);
 	free(ctx);
+	return rc;
+}
+
+void cryptolight_hash(void* output, const void* input)
+{
+	cryptolight_hash_variant(output, input, 76, 1);
 }
@@ -7,16 +7,17 @@ static __thread uint32_t cn_blocks = 32;
 static __thread uint32_t cn_threads = 16;

 static uint32_t *d_long_state[MAX_GPUS];
-static uint64_t *d_ctx_state[MAX_GPUS];
+static uint32_t *d_ctx_state[MAX_GPUS];
 static uint32_t *d_ctx_key1[MAX_GPUS];
 static uint32_t *d_ctx_key2[MAX_GPUS];
 static uint32_t *d_ctx_text[MAX_GPUS];
+static uint64_t *d_ctx_tweak[MAX_GPUS];
 static uint32_t *d_ctx_a[MAX_GPUS];
 static uint32_t *d_ctx_b[MAX_GPUS];

 static bool init[MAX_GPUS] = { 0 };

-extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant)
 {
 	int res = 0;
 	uint32_t throughput = 0;
@@ -26,6 +27,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 	uint32_t *nonceptr = (uint32_t*) (&pdata[39]);
 	const uint32_t first_nonce = *nonceptr;
 	uint32_t nonce = first_nonce;
+	int dev_id = device_map[thr_id];

 	if(opt_benchmark) {
 		ptarget[7] = 0x00ff;
@@ -33,6 +35,10 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)

 	if(!init[thr_id])
 	{
+		if (!device_config[thr_id] && strcmp(device_name[dev_id], "TITAN V") == 0) {
+			device_config[thr_id] = strdup("80x32");
+		}
+
 		if (device_config[thr_id]) {
 			sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads);
 			throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
@@ -63,11 +69,11 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 		}

 		const size_t alloc = MEMORY * throughput;
-		cryptonight_extra_cpu_init(thr_id, throughput);
+		cryptonight_extra_init(thr_id);

 		cudaMalloc(&d_long_state[thr_id], alloc);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
-		cudaMalloc(&d_ctx_state[thr_id], 26 * sizeof(uint64_t) * throughput);
+		cudaMalloc(&d_ctx_state[thr_id], 25 * sizeof(uint64_t) * throughput);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 		cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
@@ -79,6 +85,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 		cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
+		cudaMalloc(&d_ctx_tweak[thr_id], sizeof(uint64_t) * throughput);

 		init[thr_id] = true;
 	}
@@ -90,10 +97,10 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 		const uint32_t Htarg = ptarget[7];
 		uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };

-		cryptonight_extra_cpu_setData(thr_id, pdata, ptarget);
-		cryptonight_extra_cpu_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
-		cryptolight_core_cpu_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
-		cryptonight_extra_cpu_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
+		cryptonight_extra_setData(thr_id, pdata, ptarget);
+		cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
+		cryptolight_core_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
+		cryptonight_extra_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);

 		*hashes_done = nonce - first_nonce + throughput;

@@ -104,7 +111,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 			uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39);
 			memcpy(tempdata, pdata, 76);
 			*tempnonceptr = resNonces[0];
-			cryptolight_hash(vhash, tempdata, 76);
+			cryptolight_hash_variant(vhash, tempdata, 76, variant);
 			if(vhash[7] <= Htarg && fulltest(vhash, ptarget))
 			{
 				res = 1;
@@ -114,7 +121,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 				if(resNonces[1] != UINT32_MAX)
 				{
 					*tempnonceptr = resNonces[1];
-					cryptolight_hash(vhash, tempdata, 76);
+					cryptolight_hash_variant(vhash, tempdata, 76, variant);
 					if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
 						res++;
 						work->nonces[1] = resNonces[1];
@@ -157,10 +164,11 @@ void free_cryptolight(int thr_id)
 	cudaFree(d_ctx_key1[thr_id]);
 	cudaFree(d_ctx_key2[thr_id]);
 	cudaFree(d_ctx_text[thr_id]);
+	cudaFree(d_ctx_tweak[thr_id]);
 	cudaFree(d_ctx_a[thr_id]);
 	cudaFree(d_ctx_b[thr_id]);

-	cryptonight_extra_cpu_free(thr_id);
+	cryptonight_extra_free(thr_id);

 	cudaDeviceSynchronize();

@@ -134,10 +134,11 @@ static inline void exit_if_cudaerror(int thr_id, const char *src, int line)
 		exit(1);
 	}
 }
-void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
-
-void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn);
-void cryptonight_extra_cpu_init(int thr_id, uint32_t threads);
-void cryptonight_extra_cpu_free(int thr_id);
-void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
-void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *nonce, uint64_t *d_ctx_state);
+void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
+
+void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget);
+void cryptonight_extra_init(int thr_id/*, uint32_t threads*/);
+void cryptonight_extra_free(int thr_id);
+void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
+void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces, uint32_t *d_ctx_state);
@@ -2,47 +2,55 @@
 #include <stdint.h>
 #include <string.h>
 #include <sys/time.h>
 #ifndef _WIN32
 #include <unistd.h>
 #endif

 #include <cuda.h>
 #include <cuda_runtime.h>

 #if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
 #undef __shfl
 #define __shfl(var, srcLane, width) __shfl_sync(0xFFFFFFFFu, var, srcLane, width)
 #endif

 #include "cryptonight.h"

-#define LONG_SHL32 19 // 1<<19
+#define LONG_SHL32 19 // 1<<19 (uint32_t* index)
 #define LONG_SHL64 18 // 1<<18 (uint64_t* index)
 #define LONG_LOOPS32 0x80000U
 #define LONG_LOOPS64 0x40000U

 #include "cn_aes.cuh"

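Note: a quick sanity check on those constants (my own arithmetic; MEMORY is the 2 MiB cryptonight scratchpad):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        const uint32_t MEMORY = 1u << 21;              /* 2 MiB per thread */
        assert(MEMORY / sizeof(uint32_t) == 1u << 19); /* LONG_SHL32, LONG_LOOPS32 = 0x80000 */
        assert(MEMORY / sizeof(uint64_t) == 1u << 18); /* LONG_SHL64, LONG_LOOPS64 = 0x40000 */
        return 0;
    }
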
 __global__
 //__launch_bounds__(128, 9) // 56 registers
-void cryptonight_core_gpu_phase1(const uint32_t threads, uint64_t * long_state, uint64_t * const ctx_state, uint32_t * ctx_key1)
+void cryptonight_gpu_phase1(const uint32_t threads, uint32_t * __restrict__ d_long_state,
+	uint32_t * __restrict__ ctx_state, uint32_t * __restrict__ ctx_key1)
 {
-	__shared__ __align__(16) uint32_t sharedMemory[1024];
-	cn_aes_gpu_init(sharedMemory);
-	__syncthreads();
+	__shared__ uint32_t sharedMemory[1024];

 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3;
-	const uint32_t sub = (threadIdx.x & 7) << 1; // 0 2 .. 14

 	if(thread < threads)
 	{
-		const uint32_t long_oft = (thread << LONG_SHL64) + sub;
+		cn_aes_gpu_init(sharedMemory);
+		__syncthreads();

-		const uint32_t* ctx_key = &ctx_key1[thread * 40U];
-		uint4 keys[10];
-		#pragma unroll 10 // load 160 bytes
-		for (int i = 0; i < 10; i ++)
-			keys[i] = AS_UINT4(&ctx_key[i*4]);
+		const uint32_t sub = (threadIdx.x & 0x7U) << 2;
+		uint32_t *longstate = &d_long_state[(thread << LONG_SHL32) + sub];
+		uint32_t __align__(8) key[40];
+		MEMCPY8(key, &ctx_key1[thread * 40U], 20);
+		uint32_t __align__(8) text[4];
+		MEMCPY8(text, &ctx_state[thread * 50U + sub + 16U], 2);

-		uint4 text = AS_UINT4(&ctx_state[thread * 26U + sub + 8U]);
-
-		for (uint32_t i = 0; i < LONG_LOOPS64; i += 16U) {
-			cn_aes_pseudo_round_mut_uint4(sharedMemory, text, keys);
-			AS_UINT4(&long_state[long_oft + i]) = text;
+		for(int i = 0; i < LONG_LOOPS32; i += 32)
+		{
+			cn_aes_pseudo_round_mut(sharedMemory, text, key);
+			MEMCPY8(&longstate[i], text, 2);
 		}
 	}
 }

 // --------------------------------------------------------------------------------------------------------------

 __device__ __forceinline__ ulonglong2 cuda_mul128(const uint64_t multiplier, const uint64_t multiplicand)
 {
 	ulonglong2 product;
@@ -59,8 +67,7 @@ static __forceinline__ __device__ ulonglong2 operator ^ (const ulonglong2 &a, const ulonglong2 &b)
 	return make_ulonglong2(a.x ^ b.x, a.y ^ b.y);
 }

-#undef MUL_SUM_XOR_DST
-__device__ __forceinline__ void MUL_SUM_XOR_DST(const uint64_t m, uint4 &a, void* far_dst)
+__device__ __forceinline__ void MUL_SUM_XOR_DST_0(const uint64_t m, uint4 &a, void* far_dst)
 {
 	ulonglong2 d = AS_UL2(far_dst);
 	ulonglong2 p = cuda_mul128(m, d.x);
@@ -73,8 +80,8 @@ __global__
 #if __CUDA_ARCH__ >= 500
 //__launch_bounds__(128,12) /* force 40 regs to allow -l ...x32 */
 #endif
-void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor, const uint32_t partidx,
-	uint64_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
+void cryptonight_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
+	uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b)
 {
 	__shared__ __align__(16) uint32_t sharedMemory[1024];
 	cn_aes_gpu_init(sharedMemory);
@@ -84,7 +91,7 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor, const uint32_t partidx,

 	if (thread < threads)
 	{
-		const uint32_t batchsize = ITER >> (2U + bfactor);
+		const uint32_t batchsize = ITER >> (2 + bfactor);
 		const uint32_t start = partidx * batchsize;
 		const uint32_t end = start + batchsize;

@@ -101,12 +108,12 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor, const uint32_t partidx,
 		uint32_t j = (A.x & E2I_MASK) >> 3;
 		cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C);
 		AS_UINT4(&long_state[j]) = C ^ B; // st.global.u32.v4
-		MUL_SUM_XOR_DST((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3]);
+		MUL_SUM_XOR_DST_0((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3]);

 		j = (A.x & E2I_MASK) >> 3;
 		cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B);
 		AS_UINT4(&long_state[j]) = C ^ B;
-		MUL_SUM_XOR_DST((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3]);
+		MUL_SUM_XOR_DST_0((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3]);
 	}

 	if (bfactor) {
@@ -116,71 +123,194 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor,
 	}
 }

 // --------------------------------------------------------------------------------------------------------------

+__device__ __forceinline__ void store_variant1(uint64_t* long_state, uint4 Z)
+{
+	const uint32_t tmp = (Z.z >> 24); // __byte_perm(src, 0, 0x7773);
+	const uint32_t index = (((tmp >> 3) & 6u) | (tmp & 1u)) << 1;
+	Z.z = (Z.z & 0x00ffffffu) | ((tmp ^ ((0x75310u >> index) & 0x30u)) << 24);
+	AS_UINT4(long_state) = Z;
+}
+
+__device__ __forceinline__ void store_variant2(uint64_t* long_state, uint4 Z)
+{
+	const uint32_t tmp = (Z.z >> 24); // __byte_perm(src, 0, 0x7773);
+	const uint32_t index = (((tmp >> 4) & 6u) | (tmp & 1u)) << 1;
+	Z.z = (Z.z & 0x00ffffffu) | ((tmp ^ ((0x75312u >> index) & 0x30u)) << 24);
+	AS_UINT4(long_state) = Z;
+}
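
Note: the two stores differ only in the bit picked for the index (>>3 vs >>4) and the packed nibble table (0x75310 vs 0x75312); a host-side comparison, mirroring the cryptonight_store_variant() CPU code later in this commit:

    #include <stdint.h>

    static uint8_t tweak_high_byte(uint8_t tmp, int variant)
    {
        const unsigned shift = (variant == 2) ? 4u : 3u;              /* stellite vs monero */
        const uint32_t table = (variant == 2) ? 0x75312u : 0x75310u;
        const uint8_t index = (uint8_t)((((tmp >> shift) & 6u) | (tmp & 1u)) << 1);
        return (uint8_t)(tmp ^ ((table >> index) & 0x30u));
    }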
+__device__ __forceinline__ void MUL_SUM_XOR_DST_1(const uint64_t m, uint4 &a, void* far_dst, uint64_t tweak)
+{
+	ulonglong2 d = AS_UL2(far_dst);
+	ulonglong2 p = cuda_mul128(m, d.x);
+	p += AS_UL2(&a);
+	AS_UL2(&a) = p ^ d;
+	p.y = p.y ^ tweak;
+	AS_UL2(far_dst) = p;
+}
+
 __global__
-void cryptonight_core_gpu_phase3(const uint32_t threads, const uint64_t * long_state, uint64_t * ctx_state, uint32_t * __restrict__ ctx_key2)
+void monero_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
+	uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
+	uint64_t * __restrict__ d_tweak)
 {
 	__shared__ __align__(16) uint32_t sharedMemory[1024];
 	cn_aes_gpu_init(sharedMemory);
 	__syncthreads();

-	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3U;
-	const uint32_t sub = (threadIdx.x & 7U) << 1U;
+	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
+	if (thread < threads)
+	{
+		const uint32_t batchsize = ITER >> (2 + bfactor);
+		const uint32_t start = partidx * batchsize;
+		const uint32_t end = start + batchsize;
+		uint64_t tweak = d_tweak[thread];
+
+		void * ctx_a = (void*)(&d_ctx_a[thread << 2]);
+		void * ctx_b = (void*)(&d_ctx_b[thread << 2]);
+		uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4
+		uint4 B = AS_UINT4(ctx_b);
+
+		uint64_t * long_state = &d_long_state[thread << LONG_SHL64];
+		for (int i = start; i < end; i++) // end = 262144
+		{
+			uint4 C;
+			uint32_t j = (A.x & E2I_MASK) >> 3;
+			cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C);
+			store_variant1(&long_state[j], C ^ B); // st.global
+			MUL_SUM_XOR_DST_1((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3], tweak);
+
+			j = (A.x & E2I_MASK) >> 3;
+			cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B);
+			store_variant1(&long_state[j], C ^ B);
+			MUL_SUM_XOR_DST_1((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3], tweak);
+		}
+		if (bfactor) {
+			AS_UINT4(ctx_a) = A;
+			AS_UINT4(ctx_b) = B;
+		}
+	}
+}
+
+// --------------------------------------------------------------------------------------------------------------
+
+__global__
+void stellite_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
+	uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
+	uint64_t * __restrict__ d_tweak)
+{
+	__shared__ __align__(16) uint32_t sharedMemory[1024];
+	cn_aes_gpu_init(sharedMemory);
+	__syncthreads();
+
+	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
+	if (thread < threads)
+	{
+		const uint32_t batchsize = ITER >> (2 + bfactor);
+		const uint32_t start = partidx * batchsize;
+		const uint32_t end = start + batchsize;
+		uint64_t tweak = d_tweak[thread];
+
+		void * ctx_a = (void*)(&d_ctx_a[thread << 2]);
+		void * ctx_b = (void*)(&d_ctx_b[thread << 2]);
+		uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4
+		uint4 B = AS_UINT4(ctx_b);
+
+		uint64_t * long_state = &d_long_state[thread << LONG_SHL64];
+		for (int i = start; i < end; i++) // end = 262144
+		{
+			uint4 C;
+			uint32_t j = (A.x & E2I_MASK) >> 3;
+			cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C);
+			store_variant2(&long_state[j], C ^ B); // st.global
+			MUL_SUM_XOR_DST_1((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3], tweak);
+
+			j = (A.x & E2I_MASK) >> 3;
+			cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B);
+			store_variant2(&long_state[j], C ^ B);
+			MUL_SUM_XOR_DST_1((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3], tweak);
+		}
+		if (bfactor) {
+			AS_UINT4(ctx_a) = A;
+			AS_UINT4(ctx_b) = B;
+		}
+	}
+}
+
+// --------------------------------------------------------------------------------------------------------------
+
+__global__
+void cryptonight_gpu_phase3(const uint32_t threads, const uint32_t * __restrict__ d_long_state,
+	uint32_t * __restrict__ d_ctx_state, const uint32_t * __restrict__ d_ctx_key2)
+{
+	__shared__ uint32_t sharedMemory[1024];

 	cn_aes_gpu_init(sharedMemory);
 	__syncthreads();

 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3;

 	if(thread < threads)
 	{
-		const uint32_t long_oft = (thread << LONG_SHL64) + sub;
-		const uint32_t st_oft = (thread * 26U) + sub + 8U;
+		const int sub = (threadIdx.x & 7) << 2;
+		const uint32_t *longstate = &d_long_state[(thread << LONG_SHL32) + sub];
+		uint32_t key[40], text[4];
+		MEMCPY8(key, d_ctx_key2 + thread * 40, 20);
+		MEMCPY8(text, d_ctx_state + thread * 50 + sub + 16, 2);

-		uint4 key[10];
-		const uint32_t* ctx_key = &ctx_key2[thread * 40U];
-		#pragma unroll 10 // 160 bytes
-		for (int i = 0; i < 10; i++)
-			key[i] = AS_UINT4(&ctx_key[i*4U]);
-
-		uint4 text = AS_UINT4(&ctx_state[st_oft]);
-
-		for(uint32_t i = 0; i < LONG_LOOPS64; i += 16U)
+		for(int i = 0; i < LONG_LOOPS32; i += 32)
 		{
-			uint4 st = AS_UINT4(&long_state[long_oft + i]);
-			text = text ^ st;
-			cn_aes_pseudo_round_mut_uint4(sharedMemory, text, key);
+			#pragma unroll
+			for(int j = 0; j < 4; ++j)
+				text[j] ^= longstate[i + j];
+
+			cn_aes_pseudo_round_mut(sharedMemory, text, key);
 		}

-		AS_UINT4(&ctx_state[st_oft]) = text;
+		MEMCPY8(d_ctx_state + thread * 50 + sub + 16, text, 2);
 	}
 }

 // --------------------------------------------------------------------------------------------------------------

 extern int device_bfactor[MAX_GPUS];

 __host__
-void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint64_t *d_long_state, uint64_t *d_ctx_state,
-	uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
+void cryptonight_core_cuda(int thr_id, uint32_t blocks, uint32_t threads, uint64_t *d_long_state, uint32_t *d_ctx_state,
+	uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak)
 {
 	dim3 grid(blocks);
 	dim3 block(threads);
 	//dim3 block2(threads << 1);
 	dim3 block4(threads << 2);
 	dim3 block8(threads << 3);

-	const uint32_t bfactor = (uint32_t) device_bfactor[thr_id];
-	const uint32_t partcount = 1 << bfactor;
+	const uint16_t bfactor = (uint16_t) device_bfactor[thr_id];
+	const uint32_t partcount = 1U << bfactor;
 	const uint32_t throughput = (uint32_t) (blocks*threads);

 	const int bsleep = bfactor ? 100 : 0;
 	const int dev_id = device_map[thr_id];

-	cryptonight_core_gpu_phase1 <<<grid, block8>>> (throughput, d_long_state, d_ctx_state, d_ctx_key1);
+	cryptonight_gpu_phase1 <<<grid, block8>>> (throughput, (uint32_t*) d_long_state, d_ctx_state, d_ctx_key1);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 	if(partcount > 1) usleep(bsleep);

 	for (uint32_t i = 0; i < partcount; i++)
 	{
 		dim3 b = device_sm[dev_id] >= 300 ? block4 : block;
-		cryptonight_core_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
+		if (variant == 0)
+			cryptonight_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
+		else if (variant == 1 || cryptonight_fork == 8)
+			monero_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
+		else if (variant == 2 && cryptonight_fork == 3)
+			stellite_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 		if(partcount > 1) usleep(bsleep);
 	}

-	cryptonight_core_gpu_phase3 <<<grid, block8>>> (throughput, d_long_state, d_ctx_state, d_ctx_key2);
-	//cudaDeviceSynchronize();
-	//exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
+	cryptonight_gpu_phase3 <<<grid, block8>>> (throughput, (uint32_t*) d_long_state, d_ctx_state, d_ctx_key2);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 }

@@ -12,6 +12,20 @@ extern "C" {
 #include "cpu/c_keccak.h"
 }

+static void cryptonight_store_variant(void* state, int variant) {
+	if (variant == 1 || cryptonight_fork == 8) {
+		// monero, and graft ?
+		const uint8_t tmp = ((const uint8_t*)(state))[11];
+		const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
+		((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30);
+	} else if (variant == 2 && cryptonight_fork == 3) {
+		// stellite
+		const uint8_t tmp = ((const uint8_t*)(state))[11];
+		const uint8_t index = (((tmp >> 4) & 6) | (tmp & 1)) << 1;
+		((uint8_t*)(state))[11] = tmp ^ ((0x75312 >> index) & 0x30);
+	}
+}
+
 struct cryptonight_ctx {
 	uint8_t long_state[MEMORY];
 	union cn_slow_hash_state state;
@@ -130,14 +144,14 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, uint8_t* dst)
 	((uint64_t*) dst)[0] += ((uint64_t*) c)[0];
 }

-static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
+static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak1_2) {
 	uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
 	hi += ((uint64_t*) c)[0];

 	((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
 	((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
 	((uint64_t*) dst)[0] = hi;
-	((uint64_t*) dst)[1] = lo;
+	((uint64_t*) dst)[1] = variant ? lo ^ tweak1_2 : lo;
 }

 static void copy_block(uint8_t* dst, const uint8_t* src) {
@@ -155,13 +169,18 @@ static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
 	((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
 }

-static void cryptonight_hash_ctx(void* output, const void* input, size_t len, struct cryptonight_ctx* ctx)
+static int cryptonight_hash_ctx(void* output, const void* input, const size_t len, struct cryptonight_ctx* ctx, const int variant)
 {
 	size_t i, j;
+	if (variant && len < 43)
+		return 0;

 	keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len);
 	ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
 	memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);

+	const uint64_t tweak1_2 = variant ? *((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0;
+
 	oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
 	for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
 #undef RND
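
Note on the tweak1_2 line above: variant 1 XORs 8 header bytes starting at offset 35 (these include the 4-byte nonce at offset 39) with the last keccak state word. An equivalent sketch that sidesteps the unaligned cast (my own variation, not the commit's code):

    #include <stdint.h>
    #include <string.h>

    static uint64_t cn_v1_tweak(const uint8_t *input, uint64_t keccak_w24)
    {
        uint64_t t;
        memcpy(&t, input + 35, sizeof(t)); /* bytes 35..42 of the block header */
        return t ^ keccak_w24;
    }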
@@ -184,14 +203,16 @@ static void cryptonight_hash_ctx(void* output, const void* input, size_t len, struct cryptonight_ctx* ctx)
 		j = e2i(ctx->a) * AES_BLOCK_SIZE;
 		aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a);
 		xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]);
+		cryptonight_store_variant(&ctx->long_state[j], variant);

-		mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE]);
+		mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE], variant, tweak1_2);

 		j = e2i(ctx->a) * AES_BLOCK_SIZE;
 		aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a);
 		xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]);
+		cryptonight_store_variant(&ctx->long_state[j], variant);

-		mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE]);
+		mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE], variant, tweak1_2);
 	}

 	memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
@@ -217,11 +238,38 @@ static void cryptonight_hash_ctx(void* output, const void* input, size_t len, struct cryptonight_ctx* ctx)
 	if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo);

 	oaes_free((OAES_CTX **) &ctx->aes_ctx);
+	return 1;
 }

-void cryptonight_hash(void* output, const void* input, size_t len)
+int cryptonight_hash_variant(void* output, const void* input, size_t len, int variant)
 {
 	struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
-	cryptonight_hash_ctx(output, input, len, ctx);
+	int rc = cryptonight_hash_ctx(output, input, len, ctx, variant);
 	free(ctx);
+	return rc;
+}
+
+void cryptonight_hash(void* output, const void* input)
+{
+	cryptonight_fork = 1;
+	cryptonight_hash_variant(output, input, 76, 0);
+}
+
+void graft_hash(void* output, const void* input)
+{
+	cryptonight_fork = 8;
+	cryptonight_hash_variant(output, input, 76, 1);
+}
+
+void monero_hash(void* output, const void* input)
+{
+	cryptonight_fork = 7;
+	cryptonight_hash_variant(output, input, 76, 1);
+}
+
+void stellite_hash(void* output, const void* input)
+{
+	cryptonight_fork = 3;
+	cryptonight_hash_variant(output, input, 76, 2);
 }
@@ -7,15 +7,15 @@

 #include <miner.h>
 #include <cuda_helper.h>

 #include "cryptonight.h"

-typedef uint8_t BitSequence;
-typedef uint64_t DataLength;
-
-static uint32_t *d_input[MAX_GPUS] = { 0 };
+static uint32_t *d_input[MAX_GPUS];
 static uint32_t *d_target[MAX_GPUS];
 static uint32_t *d_result[MAX_GPUS];

+typedef uint8_t BitSequence;
+typedef uint32_t DataLength;
 #include "cn_keccak.cuh"
 #include "cn_blake.cuh"
 #include "cn_groestl.cuh"
@@ -44,13 +44,11 @@ __constant__ uint8_t d_sub_byte[16][16] = {
 __device__ __forceinline__
 void cryptonight_aes_set_key(uint32_t * __restrict__ key, const uint32_t * __restrict__ data)
 {
-	const uint32_t aes_gf[] = {
+	const uint32_t aes_gf[10] = {
 		0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
 	};

 	MEMSET4(key, 0, 40);
 	MEMCPY4(key, data, 8);

 	#pragma unroll
 	for(int i = 8; i < 40; i++)
 	{
@@ -74,15 +72,14 @@ void cryptonight_aes_set_key(uint32_t * __restrict__ key, const uint32_t * __restrict__ data)
 }

 __global__
-void cryptonight_extra_gpu_prepare(const uint32_t threads, uint32_t * __restrict__ d_input, uint32_t startNonce,
-	uint64_t * d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
-	uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2)
+void cryptonight_extra_gpu_prepare(const uint32_t threads, const uint32_t * __restrict__ d_input, uint32_t startNonce,
+	uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
+	uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2, int variant, uint64_t * d_ctx_tweak)
 {
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);

 	if(thread < threads)
 	{
-		uint32_t ctx_state[50];
+		uint64_t ctx_state[25];
 		uint32_t ctx_a[4];
 		uint32_t ctx_b[4];
 		uint32_t ctx_key1[40];
@ -90,92 +87,62 @@ void cryptonight_extra_gpu_prepare(const uint32_t threads, uint32_t * __restrict
|
||||
uint32_t input[19];
|
||||
|
||||
MEMCPY4(input, d_input, 19);
|
||||
*((uint32_t *)(((char *)input) + 39)) = startNonce + thread;
|
||||
|
||||
cn_keccak((uint8_t *)input, (uint8_t *)ctx_state);
|
||||
cryptonight_aes_set_key(ctx_key1, ctx_state);
|
||||
cryptonight_aes_set_key(ctx_key2, ctx_state + 8);
|
||||
XOR_BLOCKS_DST(ctx_state, ctx_state + 8, ctx_a);
|
||||
XOR_BLOCKS_DST(ctx_state + 4, ctx_state + 12, ctx_b);
|
||||
uint32_t nonce = startNonce + thread;
|
||||
*(((uint8_t *)input) + 39) = nonce & 0xff;
|
||||
*(((uint8_t *)input) + 40) = (nonce >> 8) & 0xff;
|
||||
*(((uint8_t *)input) + 41) = (nonce >> 16) & 0xff;
|
||||
*(((uint8_t *)input) + 42) = (nonce >> 24) & 0xff;
|
||||
|
||||
MEMCPY8(&d_ctx_state[thread * 26], ctx_state, 25);
|
||||
MEMCPY4(d_ctx_a + thread * 4, ctx_a, 4);
|
||||
MEMCPY4(d_ctx_b + thread * 4, ctx_b, 4);
|
||||
MEMCPY4(d_ctx_key1 + thread * 40, ctx_key1, 40);
|
||||
MEMCPY4(d_ctx_key2 + thread * 40, ctx_key2, 40);
|
||||
}
|
||||
}
|
||||
cn_keccak(input, ctx_state);
|
||||
MEMCPY4(&d_ctx_state[thread * 50U], ctx_state, 50);
|
||||
|
||||
__global__
void cryptonight_extra_gpu_keccak(uint32_t threads, uint32_t * d_ctx_state)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if(thread < threads)
	{
		uint64_t* ctx_state = (uint64_t*) (&d_ctx_state[thread * 52U]);
		uint64_t state[25];
		#pragma unroll
		for(int i = 0; i < 25; i++)
			state[i] = ctx_state[i];
		cryptonight_aes_set_key(ctx_key1, (uint32_t*)(&ctx_state[0]));
		cryptonight_aes_set_key(ctx_key2, (uint32_t*)(&ctx_state[4]));
		MEMCPY4(&d_ctx_key1[thread * 40U], ctx_key1, 40);
		MEMCPY4(&d_ctx_key2[thread * 40U], ctx_key2, 40);

		cn_keccakf2(state);
		XOR_BLOCKS_DST(&ctx_state[0], &ctx_state[4], ctx_a);
		XOR_BLOCKS_DST(&ctx_state[2], &ctx_state[6], ctx_b);
		MEMCPY4(&d_ctx_a[thread * 4U], ctx_a, 4);
		MEMCPY4(&d_ctx_b[thread * 4U], ctx_b, 4);

		// to reduce the final kernel stack frame, cut algos in 2 kernels
		// ps: these 2 final kernels are not important for the overall xmr hashrate (< 1%)
		switch (((uint8_t*)state)[0] & 0x03)
		{
		case 0: {
			uint32_t hash[8];
			cn_blake((uint8_t*)state, 200, (uint8_t*)hash);
			((uint32_t*)ctx_state)[0] = 0;
			((uint32_t*)ctx_state)[6] = hash[6];
			((uint32_t*)ctx_state)[7] = hash[7];
			break;
		}
		case 1: {
			uint32_t hash[8];
			cn_groestl((BitSequence*)state, 200, (BitSequence*)hash);
			((uint32_t*)ctx_state)[0] = 0;
			((uint32_t*)ctx_state)[6] = hash[6];
			((uint32_t*)ctx_state)[7] = hash[7];
			break;
		}
		default: {
			#pragma unroll
			for(int i = 0; i < 25; i++)
				ctx_state[i] = state[i];
		}
		if (variant) {
			uint2 tweak = AS_UINT2(&ctx_state[24]);
			//tweak.x ^= (input[8] >> 24) | (input[9] << 8);
			tweak.x ^= __byte_perm(input[8], input[ 9], 0x6543);
			tweak.y ^= __byte_perm(input[9], input[10], 0x6543);
			MEMCPY4(&d_ctx_tweak[thread], &tweak, 2);
		}
	}
}
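For variant 1, the prepare code derives the per-nonce tweak from the last 8 bytes of the 200-byte Keccak state (ctx_state[24]) XORed with bytes 35..42 of the input blob, a window that includes the just-written nonce; the __byte_perm(a, b, 0x6543) calls exist only to read that unaligned 32-bit window. A CPU-side sketch of the same value, assuming a little-endian host (the helper name is hypothetical):

#include <stdint.h>
#include <string.h>

// Hypothetical helper: tweak1_2 = Keccak state bytes 192..199 XOR input
// blob bytes 35..42, as the kernel computes with AS_UINT2/__byte_perm.
static uint64_t compute_tweak1_2(const uint8_t state[200], const uint8_t *blob)
{
	uint64_t s24, in35;
	memcpy(&s24, state + 192, 8); // state[24]
	memcpy(&in35, blob + 35, 8);  // unaligned offset, hence memcpy
	return s24 ^ in35;
}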
__global__
void cryptonight_extra_gpu_final(uint32_t threads, const uint32_t startNonce, uint64_t * __restrict__ d_ctx_state,
	const uint32_t* d_target, uint32_t * resNonces)
void cryptonight_extra_gpu_final(const uint32_t threads, uint32_t startNonce, const uint32_t * __restrict__ d_target,
	uint32_t * __restrict__ resNonces, uint32_t * __restrict__ d_ctx_state)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
	if(thread < threads)
	{
		uint64_t* const state = &d_ctx_state[thread * 26U];

		uint32_t *ctx_state = &d_ctx_state[thread * 50U];
		uint32_t hash[8];
		switch(((uint8_t *)state)[0] & 0x03)
		{
		case 0: {
			uint32_t* h32 = (uint32_t*)state;
			hash[6] = h32[6];
			hash[7] = h32[7];
			break;
		}
		case 2: {
			cn_jh256((uint8_t*)state, 200, hash);
			break;
		}
		case 3: {
			cn_skein((uint8_t*)state, 200, hash);
			break;
		}
		}
		uint32_t state[50];

		#pragma unroll 25
		for(int i = 0; i < 50; i+=2)
			AS_UINT2(&state[i]) = AS_UINT2(&ctx_state[i]);

		cn_keccakf2((uint64_t *)state);

		int branch = ((uint8_t *)state)[0] & 0x03;
		if(branch == 0)
			cn_blake((const uint8_t *)state, 200, hash);
		if(branch == 1)
			cn_groestl((const uint8_t *)state, 200, hash);
		if(branch == 2)
			cn_jh((const uint8_t *)state, 200, hash);
		if(branch == 3)
			cn_skein((const uint8_t *)state, 200, hash);

		if(hash[7] <= d_target[1] && hash[6] <= d_target[0])
		{
@ -188,55 +155,53 @@ void cryptonight_extra_gpu_final(uint32_t threads, const uint32_t startNonce, ui
		}
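Old and new code share the same CryptoNight finalization rule: the two low bits of the first Keccak-state byte select one of four 256-bit finalizers. A condensed restatement of the dispatch in the new kernel, assuming the cn_* prototypes declared elsewhere in this file:

// Sketch only: same branch logic as the final kernel above.
static void cn_final_dispatch(const uint64_t *state /* 25 words */, uint32_t hash[8])
{
	const int branch = ((const uint8_t *)state)[0] & 0x03;
	if (branch == 0) cn_blake((const uint8_t *)state, 200, hash);   // Blake-256
	if (branch == 1) cn_groestl((const uint8_t *)state, 200, hash); // Groestl-256
	if (branch == 2) cn_jh((const uint8_t *)state, 200, hash);      // JH-256
	if (branch == 3) cn_skein((const uint8_t *)state, 200, hash);   // Skein-256
}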
__host__
void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *ptarget)
void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget)
{
	uint32_t *pTargetIn = (uint32_t*) ptarget;
	cudaMemcpy(d_input[thr_id], data, 19 * sizeof(uint32_t), cudaMemcpyHostToDevice);
	cudaMemcpy(d_target[thr_id], &pTargetIn[6], 2*sizeof(uint32_t), cudaMemcpyHostToDevice);
	cudaMemcpy(d_input[thr_id], data, 20 * sizeof(uint32_t), cudaMemcpyHostToDevice);
	cudaMemcpy(d_target[thr_id], &pTargetIn[6], 2 * sizeof(uint32_t), cudaMemcpyHostToDevice);
	cudaMemset(d_result[thr_id], 0xFF, 2 * sizeof(uint32_t));
	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}

__host__
void cryptonight_extra_cpu_init(int thr_id, uint32_t threads)
void cryptonight_extra_init(int thr_id)
{
	cudaMalloc(&d_input[thr_id], 19 * sizeof(uint32_t));
	cudaMalloc(&d_target[thr_id], 2*sizeof(uint32_t));
	cudaMalloc(&d_result[thr_id], 2*sizeof(uint32_t));
	cudaMalloc(&d_input[thr_id], 20 * sizeof(uint32_t));
	cudaMalloc(&d_target[thr_id], 2 * sizeof(uint32_t));
	cudaMalloc(&d_result[thr_id], 2 * sizeof(uint32_t));
	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}

__host__
void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak)
{
	uint32_t threadsperblock = 128;

	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);

	cryptonight_extra_gpu_prepare <<<grid, block >>> (threads, d_input[thr_id], startNonce, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1, d_ctx_key2);
	cryptonight_extra_gpu_prepare <<<grid, block>>> (threads, d_input[thr_id], startNonce, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1, d_ctx_key2, variant, d_ctx_tweak);
	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}

__host__
void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resnonce, uint64_t *d_ctx_state)
void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces, uint32_t *d_ctx_state)
{
	uint32_t threadsperblock = 128;

	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);

	cudaMemset(d_result[thr_id], 0xFF, 2*sizeof(uint32_t));
	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
	cryptonight_extra_gpu_keccak <<<grid, block >>> (threads, (uint32_t*)d_ctx_state);
	cryptonight_extra_gpu_final <<<grid, block>>> (threads, startNonce, d_target[thr_id], d_result[thr_id], d_ctx_state);
	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
	cryptonight_extra_gpu_final <<<grid, block >>> (threads, startNonce, d_ctx_state, d_target[thr_id], d_result[thr_id]);
	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
	cudaMemcpy(resnonce, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
	cudaMemcpy(resNonces, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}

__host__
void cryptonight_extra_cpu_free(int thr_id)
void cryptonight_extra_free(int thr_id)
{
	if (d_input[thr_id]) {
		cudaFree(d_input[thr_id]);
@ -244,4 +209,4 @@ void cryptonight_extra_cpu_free(int thr_id)
		cudaFree(d_result[thr_id]);
		d_input[thr_id] = NULL;
	}
}
}
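Both host wrappers launch with a fixed 128 threads per block and a grid rounded up so grid * block covers every work item; the kernels guard the overshoot with if (thread < threads). For example, threads = 1000 gives ceil(1000 / 128) = 8 blocks, i.e. 1024 launched threads with the last 24 masked off. A small sketch of that rounding (the helper name is hypothetical):

#include <cuda_runtime.h>
#include <stdint.h>

// Hypothetical helper: grid size covering `threads` items at 128 per block,
// the same arithmetic done inline in cryptonight_extra_prepare/final.
static inline dim3 cn_grid(uint32_t threads, uint32_t threadsperblock = 128)
{
	return dim3((threads + threadsperblock - 1) / threadsperblock);
}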
@ -12,16 +12,17 @@ static __thread bool gpu_init_shown = false;
	gpulog(p, thr, fmt, ##__VA_ARGS__)

static uint64_t *d_long_state[MAX_GPUS];
static uint64_t *d_ctx_state[MAX_GPUS];
static uint32_t *d_ctx_state[MAX_GPUS];
static uint32_t *d_ctx_key1[MAX_GPUS];
static uint32_t *d_ctx_key2[MAX_GPUS];
static uint32_t *d_ctx_text[MAX_GPUS];
static uint64_t *d_ctx_tweak[MAX_GPUS];
static uint32_t *d_ctx_a[MAX_GPUS];
static uint32_t *d_ctx_b[MAX_GPUS];

static bool init[MAX_GPUS] = { 0 };

extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant)
{
	int res = 0;
	uint32_t throughput = 0;
@ -49,6 +50,10 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
	gpulog_init(LOG_INFO, thr_id, "%s, %d MB available, %hd SMX", device_name[dev_id],
		mem, device_mpcount[dev_id]);

	if (!device_config[thr_id] && strcmp(device_name[dev_id], "TITAN V") == 0) {
		device_config[thr_id] = strdup("80x24");
	}

	if (device_config[thr_id]) {
		int res = sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads);
		throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
@ -70,7 +75,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
		exit(1);
	}

	cudaSetDevice(device_map[thr_id]);
	cudaSetDevice(dev_id);
	if (opt_cudaschedule == -1 && gpu_threads == 1) {
		cudaDeviceReset();
		cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
@ -79,11 +84,11 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
	}

	const size_t alloc = MEMORY * throughput;
	cryptonight_extra_cpu_init(thr_id, throughput);
	cryptonight_extra_init(thr_id);

	cudaMalloc(&d_long_state[thr_id], alloc);
	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
	cudaMalloc(&d_ctx_state[thr_id], 208 * throughput); // 52*4 (200 is not aligned 16)
	cudaMalloc(&d_ctx_state[thr_id], 50 * sizeof(uint32_t) * throughput);
	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
	cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput);
	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
@ -95,6 +100,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
	cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput);
	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
	cudaMalloc(&d_ctx_tweak[thr_id], sizeof(uint64_t) * throughput);
	exit_if_cudaerror(thr_id, __FILE__, __LINE__);
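The allocations above amount to the 2 MiB MEMORY scratchpad plus 560 bytes of context per concurrent hash, which is why throughput (and any blocks x threads override) is bounded by free device memory. A back-of-the-envelope tally, with a hypothetical function name:

#include <stddef.h>
#include <stdint.h>

// Hypothetical tally of the per-thread device buffers allocated above.
static size_t cn_bytes_per_thread(void)
{
	size_t b = (size_t)1 << 21;     // d_long_state: MEMORY, the 2 MiB scratchpad
	b += 50 * sizeof(uint32_t);     // d_ctx_state: 200-byte Keccak state
	b += 2 * 40 * sizeof(uint32_t); // d_ctx_key1/key2: expanded AES keys
	b += 2 * 4 * sizeof(uint32_t);  // d_ctx_a/d_ctx_b: 16-byte working blocks
	b += sizeof(uint64_t);          // d_ctx_tweak: variant-1 tweak
	return b;                       // 2 MiB + 560 bytes
}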
	gpu_init_shown = true;
	init[thr_id] = true;
@ -107,10 +114,10 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
	const uint32_t Htarg = ptarget[7];
	uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };

	cryptonight_extra_cpu_setData(thr_id, pdata, ptarget);
	cryptonight_extra_cpu_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
	cryptonight_core_cuda(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
	cryptonight_extra_cpu_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
	cryptonight_extra_setData(thr_id, pdata, ptarget);
	cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
	cryptonight_core_cuda(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
	cryptonight_extra_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);

	*hashes_done = nonce - first_nonce + throughput;

@ -121,8 +128,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
	uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39);
	memcpy(tempdata, pdata, 76);
	*tempnonceptr = resNonces[0];
	cryptonight_hash(vhash, tempdata, 76);
	if(vhash[7] <= Htarg && fulltest(vhash, ptarget))
	const int rc = cryptonight_hash_variant(vhash, tempdata, 76, variant);
	if(rc && (vhash[7] <= Htarg) && fulltest(vhash, ptarget))
	{
		res = 1;
		work->nonces[0] = resNonces[0];
@ -131,8 +138,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
		if(resNonces[1] != UINT32_MAX)
		{
			*tempnonceptr = resNonces[1];
			cryptonight_hash(vhash, tempdata, 76);
			if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
			const int rc = cryptonight_hash_variant(vhash, tempdata, 76, variant);
			if(rc && (vhash[7] <= Htarg) && fulltest(vhash, ptarget)) {
				res++;
				work->nonces[1] = resNonces[1];
			} else {
@ -174,10 +181,11 @@ void free_cryptonight(int thr_id)
	cudaFree(d_ctx_key1[thr_id]);
	cudaFree(d_ctx_key2[thr_id]);
	cudaFree(d_ctx_text[thr_id]);
	cudaFree(d_ctx_tweak[thr_id]);
	cudaFree(d_ctx_a[thr_id]);
	cudaFree(d_ctx_b[thr_id]);

	cryptonight_extra_cpu_free(thr_id);
	cryptonight_extra_free(thr_id);

	cudaDeviceSynchronize();
@ -20,7 +20,6 @@ struct uint3 blockDim;
#define __umul64hi(a,b) a*b
#endif

#define MEMORY (1U << 21) // 2 MiB / 2097152 B
#define ITER (1U << 20) // 1048576
#define E2I_MASK 0x1FFFF0u
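These constants describe the memory-hard loop: a 2 MiB per-hash scratchpad, 2^20 iterations, and a mask that turns a 64-bit register into a 16-byte-aligned offset inside the scratchpad. A sketch of the masking arithmetic, with a hypothetical function name (the kernels may fold the shift into their indexing):

#include <stdint.h>

// Hypothetical illustration: E2I_MASK keeps bits 4..20, so the result is a
// multiple of 16 strictly below MEMORY (1 << 21), a valid scratchpad offset.
static uint32_t cn_scratch_offset(uint64_t a)
{
	return (uint32_t)(a & 0x1FFFF0u); // == a & E2I_MASK
}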
@ -136,10 +135,10 @@ static inline void exit_if_cudaerror(int thr_id, const char *src, int line)
		exit(1);
	}
}
void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint64_t *d_long_state, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
void cryptonight_core_cuda(int thr_id, uint32_t blocks, uint32_t threads, uint64_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);

void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn);
void cryptonight_extra_cpu_init(int thr_id, uint32_t threads);
void cryptonight_extra_cpu_free(int thr_id);
void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *nonce, uint64_t *d_ctx_state);
void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget);
void cryptonight_extra_init(int thr_id);
void cryptonight_extra_free(int thr_id);
void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resnonce, uint32_t *d_ctx_state);
@ -550,18 +550,24 @@ bool rpc2_stratum_submit(struct pool_infos *pool, struct work *work)
	}

	else if (opt_algo == ALGO_CRYPTOLIGHT) {
		int variant = 1;
		uint32_t nonce = work->nonces[idnonce];
		noncestr = bin2hex((unsigned char*) &nonce, 4);
		last_found_nonce = nonce;
		cryptolight_hash(hash, data, 76);
		//if (cryptonight_fork > 1 && ((unsigned char*)work->data)[0] >= cryptonight_fork)
		//	variant = ((unsigned char*)work->data)[0] - cryptonight_fork + 1;
		cryptolight_hash_variant(hash, data, 76, variant);
		work_set_target_ratio(work, (uint32_t*) hash);
	}

	else if (opt_algo == ALGO_CRYPTONIGHT) {
		int variant = 0;
		uint32_t nonce = work->nonces[idnonce];
		noncestr = bin2hex((unsigned char*) &nonce, 4);
		last_found_nonce = nonce;
		cryptonight_hash(hash, data, 76);
		if (cryptonight_fork > 1 && ((unsigned char*)work->data)[0] >= cryptonight_fork)
			variant = ((unsigned char*)work->data)[0] - cryptonight_fork + 1;
		cryptonight_hash_variant(hash, data, 76, variant);
		work_set_target_ratio(work, (uint32_t*) hash);
	}
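The cryptonight branch reads the variant off the first blob byte (the block major version): once it reaches the configured cryptonight_fork level, the variant counts up from 1, so with fork 7 a version-7 Monero blob hashes as variant 1 and a version-8 one as variant 2. A sketch of that rule, with a hypothetical helper name:

// Hypothetical restatement of the inline variant rule above.
static int cn_variant_from_blob(const unsigned char *blob, int fork)
{
	if (fork > 1 && blob[0] >= fork)
		return blob[0] - fork + 1;
	return 0;
}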
14
miner.h
@ -279,8 +279,8 @@ extern int scanhash_blake256(int thr_id, struct work* work, uint32_t max_nonce,
extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant);
extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant);
extern int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_equihash(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@ -575,6 +575,8 @@ extern uint32_t device_plimit[MAX_GPUS];
extern uint32_t gpus_intensity[MAX_GPUS];
extern int opt_cudaschedule;

extern int cryptonight_fork;

// cuda.cpp
int cuda_num_devices();
void cuda_devicenames();
@ -898,8 +900,12 @@ void blake2b_hash(void *output, const void *input);
void blake2s_hash(void *output, const void *input);
void bmw_hash(void *state, const void *input);
void c11hash(void *output, const void *input);
void cryptolight_hash(void* output, const void* input, int len);
void cryptonight_hash(void* output, const void* input, size_t len);
int cryptolight_hash_variant(void* output, const void* input, int len, int variant);
void cryptolight_hash(void* output, const void* input);
int cryptonight_hash_variant(void* output, const void* input, size_t len, int variant);
void cryptonight_hash(void* output, const void* input);
void monero_hash(void* output, const void* input);
void stellite_hash(void* output, const void* input);
void decred_hash(void *state, const void *input);
void deephash(void *state, const void *input);
void luffa_hash(void *state, const void *input);
10
util.cpp
@ -2193,10 +2193,10 @@ void print_hash_tests(void)
	c11hash(&hash[0], &buf[0]);
	printpfx("c11", hash);

	cryptolight_hash(&hash[0], &buf[0], 76);
	cryptolight_hash(&hash[0], &buf[0]);
	printpfx("cryptolight", hash);

	cryptonight_hash(&hash[0], &buf[0], 76);
	cryptonight_hash(&hash[0], &buf[0]);
	printpfx("cryptonight", hash);

	memset(buf, 0, 180);
@ -2246,6 +2246,9 @@ void print_hash_tests(void)
	lyra2Z_hash(&hash[0], &buf[0]);
	printpfx("lyra2z", hash);

	monero_hash(&hash[0], &buf[0]);
	printpfx("monero", hash);

	myriadhash(&hash[0], &buf[0]);
	printpfx("myriad", hash);

@ -2297,6 +2300,9 @@ void print_hash_tests(void)
	skunk_hash(&hash[0], &buf[0]);
	printpfx("skunk", hash);

	stellite_hash(&hash[0], &buf[0]);
	printpfx("stellite", hash);

	s3hash(&hash[0], &buf[0]);
	printpfx("S3", hash);