
handle new cryptonight variants: monero (v7), graft (v8), stellite (v3) and aeon (cryptolight)

Special thanks to klausT for his changes, and to ystarnaud who helped me adapt my kernel variants...

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Tanguy Pruvot, 6 years ago, commit 77c4b8724e (pull/5/head)
20 files changed (changed line count in parentheses):

  1. algos.h (31)
  2. ccminer.cpp (41)
  3. crypto/cn_aes.cuh (1)
  4. crypto/cn_blake.cuh (2)
  5. crypto/cn_groestl.cuh (9)
  6. crypto/cn_jh.cuh (20)
  7. crypto/cn_keccak.cuh (2)
  8. crypto/cn_skein.cuh (22)
  9. crypto/cryptolight-core.cu (86)
 10. crypto/cryptolight-cpu.cpp (39)
 11. crypto/cryptolight.cu (30)
 12. crypto/cryptolight.h (13)
 13. crypto/cryptonight-core.cu (236)
 14. crypto/cryptonight-cpu.cpp (62)
 15. crypto/cryptonight-extra.cu (175)
 16. crypto/cryptonight.cu (36)
 17. crypto/cryptonight.h (13)
 18. crypto/xmr-rpc.cpp (10)
 19. miner.h (14)
 20. util.cpp (10)

algos.h (31)

@@ -72,6 +72,9 @@ enum sha_algos {
 	ALGO_WHIRLPOOLX,
 	ALGO_WILDKECCAK,
 	ALGO_ZR5,
+	ALGO_MONERO,
+	ALGO_GRAFT,
+	ALGO_STELLITE,
 	ALGO_AUTO,
 	ALGO_COUNT
 };
@@ -146,6 +149,9 @@ static const char *algo_names[] = {
 	"whirlpoolx",
 	"wildkeccak",
 	"zr5",
+	"monero",
+	"graft",
+	"stellite",
 	"auto", /* reserved for multi algo */
 	""
 };
@@ -206,4 +212,29 @@ static inline int algo_to_int(char* arg)
 	return i;
 }

+static inline int get_cryptonight_algo(int fork)
+{
+	int algo = ALGO_COUNT;
+	switch (fork) {
+	case 8:
+		algo = ALGO_GRAFT;
+		break;
+	case 7:
+		algo = ALGO_MONERO;
+		break;
+	case 3:
+		algo = ALGO_STELLITE;
+		break;
+	default:
+		algo = ALGO_CRYPTONIGHT;
+		break;
+	}
+	return algo;
+}
+
 #endif

ccminer.cpp (41)

@@ -233,6 +233,8 @@ int opt_api_mcast_port = 4068;
 bool opt_stratum_stats = false;

+int cryptonight_fork = 1;
+
 static char const usage[] = "\
 Usage: " PROGRAM_NAME " [OPTIONS]\n\
 Options:\n\
@@ -245,7 +247,7 @@ Options:\n\
 			blakecoin   Fast Blake 256 (8 rounds)\n\
 			bmw         BMW 256\n\
 			cryptolight AEON cryptonight (MEM/2)\n\
-			cryptonight XMR cryptonight\n\
+			cryptonight XMR cryptonight v1 (old)\n\
 			c11/flax    X11 variant\n\
 			decred      Decred Blake256\n\
 			deep        Deepcoin\n\
@@ -253,6 +255,7 @@ Options:\n\
 			dmd-gr      Diamond-Groestl\n\
 			fresh       Freshcoin (shavite 80)\n\
 			fugue256    Fuguecoin\n\
+			graft       Cryptonight v8\n\
 			groestl     Groestlcoin\n"
 #ifdef WITH_HEAVY_ALGO
 "			heavy       Heavycoin\n"
@@ -267,6 +270,7 @@ Options:\n\
 			lyra2v2     VertCoin\n\
 			lyra2z      ZeroCoin (3rd impl)\n\
 			myr-gr      Myriad-Groestl\n\
+			monero      XMR cryptonight (v7)\n\
 			neoscrypt   FeatherCoin, Phoenix, UFO...\n\
 			nist5       NIST5 (TalkCoin)\n\
 			penta       Pentablake hash (5x Blake 512)\n\
@@ -284,6 +288,7 @@ Options:\n\
 			skein       Skein SHA2 (Skeincoin)\n\
 			skein2      Double Skein (Woodcoin)\n\
 			skunk       Skein Cube Fugue Streebog\n\
+			stellite    Cryptonight v3\n\
 			s3          S3 (1Coin)\n\
 			timetravel  Machinecoin permuted x8\n\
 			tribus      Denarius\n\
@@ -573,7 +578,10 @@ static bool get_blocktemplate(CURL *curl, struct work *work);
 void get_currentalgo(char* buf, int sz)
 {
-	snprintf(buf, sz, "%s", algo_names[opt_algo]);
+	int algo = opt_algo;
+	if (algo == ALGO_CRYPTONIGHT)
+		algo = get_cryptonight_algo(cryptonight_fork);
+	snprintf(buf, sz, "%s", algo_names[algo]);
 }

 void format_hashrate(double hashrate, char *output)
@@ -2372,11 +2380,16 @@ static void *miner_thread(void *userdata)
 		rc = scanhash_c11(thr_id, &work, max_nonce, &hashes_done);
 		break;
 	case ALGO_CRYPTOLIGHT:
-		rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done);
+		rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done, 1);
 		break;
 	case ALGO_CRYPTONIGHT:
-		rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done);
+	{
+		int cn_variant = 0;
+		if (cryptonight_fork > 1 && ((unsigned char*)work.data)[0] >= cryptonight_fork)
+			cn_variant = ((unsigned char*)work.data)[0] - cryptonight_fork + 1;
+		rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done, cn_variant);
 		break;
+	}
 	case ALGO_DECRED:
 		rc = scanhash_decred(thr_id, &work, max_nonce, &hashes_done);
 		break;
@@ -3138,6 +3151,26 @@ void parse_arg(int key, char *arg)
 			case ALGO_SCRYPT_JANE: opt_nfactor = 14; break;
 			}
 		}
+		// cryptonight variants
+		switch (opt_algo) {
+		case ALGO_MONERO:
+			opt_algo = ALGO_CRYPTONIGHT;
+			cryptonight_fork = 7;
+			break;
+		case ALGO_GRAFT:
+			opt_algo = ALGO_CRYPTONIGHT;
+			cryptonight_fork = 8;
+			break;
+		case ALGO_STELLITE:
+			opt_algo = ALGO_CRYPTONIGHT;
+			cryptonight_fork = 3;
+			break;
+		case ALGO_CRYPTONIGHT:
+			cryptonight_fork = 1;
+			break;
+		}
 		break;
 	case 'b':
 		p = strstr(arg, ":");
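A side note on the variant byte: at run time the miner reads the block's major version (the first byte of the work data) and rebases it on the coin's fork version, as in the miner_thread hunk above. A minimal stand-alone sketch of that rule follows; the helper name and the main() harness are illustrative additions, not part of the commit:

#include <stdio.h>

/* Mirrors the ALGO_CRYPTONIGHT case above: cryptonight_fork is the coin's
 * base block version (7 = monero, 8 = graft, 3 = stellite, 1 = generic). */
static int cn_variant_for(unsigned char blk_version, int cryptonight_fork)
{
    int cn_variant = 0;
    if (cryptonight_fork > 1 && blk_version >= cryptonight_fork)
        cn_variant = blk_version - cryptonight_fork + 1;
    return cn_variant;
}

int main(void)
{
    printf("%d\n", cn_variant_for(7, 7)); /* monero v7 block -> variant 1 */
    printf("%d\n", cn_variant_for(6, 7)); /* pre-fork block  -> variant 0 */
    printf("%d\n", cn_variant_for(4, 3)); /* stellite v4     -> variant 2 */
    return 0;
}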

crypto/cn_aes.cuh (1)

@@ -138,6 +138,7 @@ static const __device__ __align__(16) uint32_t d_t_fn[1024] = {
 */

 #define AS_U32(addr)   *((uint32_t*)(addr))
+#define AS_U64(addr)   *((uint64_t*)(addr))
 #define AS_UINT2(addr) *((uint2*)(addr))
 #define AS_UINT4(addr) *((uint4*)(addr))
 #define AS_UL2(addr)   *((ulonglong2*)(addr))

crypto/cn_blake.cuh (2)

@@ -164,7 +164,7 @@ void cn_blake_final(blake_state * __restrict__ S, uint8_t * __restrict__ digest)
 }

 __device__
-void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint8_t * __restrict__ out)
+void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint32_t * out)
 {
 	blake_state bs;
 	blake_state *S = (blake_state *)&bs;

crypto/cn_groestl.cuh (9)

@@ -274,13 +274,14 @@ void cn_groestl_final(groestlHashState* __restrict__ ctx, BitSequence* __restri
 	for (i = GROESTL_SIZE512-hashbytelen; i < GROESTL_SIZE512; i++,j++) {
 		output[j] = s[i];
 	}
+#if 0
 	for (i = 0; i < GROESTL_COLS512; i++) {
 		ctx->chaining[i] = 0;
 	}
 	for (i = 0; i < GROESTL_SIZE512; i++) {
 		ctx->buffer[i] = 0;
 	}
+#endif
 }

 __device__
@@ -336,12 +337,12 @@ void cn_groestl_init(groestlHashState* ctx)
 }

 __device__
-void cn_groestl(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval)
+void cn_groestl(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval)
 {
 	DataLength databitlen = len << 3;
 	groestlHashState context;

 	cn_groestl_init(&context);
-	cn_groestl_update(&context, data, databitlen);
-	cn_groestl_final(&context, hashval);
+	cn_groestl_update(&context, (BitSequence*) data, databitlen);
+	cn_groestl_final(&context, (BitSequence*) hashval);
 }

crypto/cn_jh.cuh (20)

@@ -198,8 +198,9 @@ void cn_jh_update(jhHashState * __restrict__ state, const uint8_t * __restrict__
 		databitlen = 0;
 	}

-	if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) ) {
-		memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ) ;
+	if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) )
+	{
+		memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) );
 		index = 64-(state->datasize_in_buffer >> 3);
 		databitlen = databitlen - (512 - state->datasize_in_buffer);
 		cn_jh_F8(state);
@@ -222,7 +223,7 @@ void cn_jh_update(jhHashState * __restrict__ state, const uint8_t * __restrict__
 /* pad the message, process the padded block(s), truncate the hash value H to obtain the message digest */
 __device__
-void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashval)
+void cn_jh_final(jhHashState * __restrict__ state, uint8_t * __restrict__ hashval)
 {
 	unsigned int i;
 	//uint32_t *bufptr = (uint32_t *)state->buffer;
@@ -244,7 +245,7 @@ void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashv
 	} else {
-		/*set the rest of the bytes in the buffer to 0*/
+		/* set the rest of the bytes in the buffer to 0 */
 		if ( (state->datasize_in_buffer & 7) == 0) {
 			for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) state->buffer[i] = 0;
 		} else {
@@ -268,7 +269,8 @@ void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashv
 		cn_jh_F8(state);
 	}

-	MEMCPY4(hashval, ((unsigned char*)state->x) + 64 + 32, 8);
+	memcpy(hashval, ((unsigned char*)state->x) + 64 + 32, 32);
+	//MEMCPY4(hashval, ((unsigned char*)state->x) + 64 + 32, 8);
 }

 __device__
@@ -277,12 +279,12 @@ void cn_jh_init(jhHashState *state, int hashbitlen)
 	state->databitlen = 0;
 	state->datasize_in_buffer = 0;
 	state->hashbitlen = hashbitlen;
-	//memcpy(state->x, d_JH256_H0, 128);
-	MEMCPY8(state->x, d_JH256_H0, 128 / 8);
+	memcpy(state->x, d_JH256_H0, 128);
+	//MEMCPY8(state->x, d_JH256_H0, 128 / 8);
 }

 __device__
-void cn_jh256(const uint8_t * __restrict__ data, DataLength len, uint32_t * __restrict__ hashval)
+void cn_jh(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval)
 {
 	const int hashbitlen = 256;
 	DataLength databitlen = len << 3;
@@ -290,5 +292,5 @@ void cn_jh256(const uint8_t * __restrict__ data, DataLength len, uint32_t * __re
 	cn_jh_init(&state, hashbitlen);
 	cn_jh_update(&state, data, databitlen);
-	cn_jh_final(&state, hashval);
+	cn_jh_final(&state, (uint8_t*) hashval);
 }

crypto/cn_keccak.cuh (2)

@@ -195,7 +195,7 @@ void cn_keccakf(uint64_t *s)
 }

 __device__ __forceinline__
-void cn_keccak(const uint8_t * __restrict__ in, uint8_t * __restrict__ md)
+void cn_keccak(const uint32_t * __restrict__ in, uint64_t * __restrict__ md)
 {
 	uint64_t st[25];

crypto/cn_skein.cuh (22)

@@ -4,19 +4,15 @@ typedef unsigned int uint_t; /* native unsigned integer */

 #define SKEIN_256_STATE_WORDS ( 4)
 #define SKEIN_512_STATE_WORDS ( 8)
-#define SKEIN1024_STATE_WORDS (16)

 #define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS)
 #define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS)
-#define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS)

 #define SKEIN_256_STATE_BITS (64*SKEIN_256_STATE_WORDS)
 #define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS)
-#define SKEIN1024_STATE_BITS (64*SKEIN1024_STATE_WORDS)

 #define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS)
 #define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS)
-#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS)

 #define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32))
 #define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
@@ -119,7 +115,7 @@ typedef struct {
 } skeinHashState;

 __device__
-void cn_skein256_init(skeinHashState *state, size_t hashBitLen)
+void cn_skein_init(skeinHashState *state, size_t hashBitLen)
 {
 	const uint64_t SKEIN_512_IV_256[] =
 	{
@@ -258,14 +254,12 @@ void cn_skein_block(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restr
 }

 __device__
-void cn_skein256_update(skeinHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen)
+void cn_skein_update(skeinHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen)
 {
 	if ((databitlen & 7) == 0) {
 		cn_skein_block(&state->u.ctx_512, data, databitlen >> 3);
 	}
 	else {
 		size_t bCnt = (databitlen >> 3) + 1;
 		uint8_t b,mask;
@@ -280,7 +274,7 @@ void cn_skein256_update(skeinHashState * __restrict__ state, const uint8_t * __r
 }

 __device__
-void cn_skein256_final(skeinHashState * __restrict__ state, uint32_t * __restrict__ hashVal)
+void cn_skein_final(skeinHashState * __restrict__ state, uint8_t * __restrict__ hashVal)
 {
 	uint64_t X[SKEIN_512_STATE_WORDS];
 	Skein_512_Ctxt_t *ctx = (Skein_512_Ctxt_t *)&state->u.ctx_512;
@@ -305,13 +299,13 @@ void cn_skein256_final(skeinHashState * __restrict__ state, uint32_t * __restric
 		((uint64_t *)ctx->b)[0] = (uint64_t)i;
 		Skein_Start_New_Type(ctx, OUT_FINAL);
 		cn_skein_processblock(ctx, ctx->b, 1, sizeof(uint64_t));
-		memcpy(hashVal + (i*SKEIN_512_BLOCK_BYTES/sizeof(uint32_t)), ctx->X, n);
+		memcpy(hashVal + (i*SKEIN_512_BLOCK_BYTES), ctx->X, n);
 		memcpy(ctx->X, X, sizeof(X)); // restore the counter mode key for next time
 	}
 }

 __device__
-void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * __restrict__ hashval)
+void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval)
 {
 	int hashbitlen = 256;
 	DataLength databitlen = len << 3;
@@ -319,7 +313,7 @@ void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * __re
 	state.statebits = 64*SKEIN_512_STATE_WORDS;

-	cn_skein256_init(&state, hashbitlen);
-	cn_skein256_update(&state, data, databitlen);
-	cn_skein256_final(&state, hashval);
+	cn_skein_init(&state, hashbitlen);
+	cn_skein_update(&state, data, databitlen);
+	cn_skein_final(&state, (uint8_t*) hashval);
 }

crypto/cryptolight-core.cu (86)

@@ -36,7 +36,7 @@ void cryptolight_core_gpu_phase1(int threads, uint32_t * long_state, uint32_t *
 	if(thread < threads)
 	{
-		const int oft = thread * 52 + sub + 16; // not aligned 16!
+		const int oft = thread * 50 + sub + 16; // not aligned 16!
 		const int long_oft = (thread << LONG_SHL_IDX) + sub;
 		uint32_t __align__(16) key[40];
 		uint32_t __align__(16) text[4];
@@ -57,8 +57,10 @@ void cryptolight_core_gpu_phase1(int threads, uint32_t * long_state, uint32_t *
 	}
 }

+// --------------------------------------------------------------------------------------------------------------
+
 __global__
-void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
+void cryptolight_old_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
 {
 	__shared__ uint32_t __align__(16) sharedMemory[1024];
@@ -209,6 +211,70 @@ void cryptolight_old_gpu_phase2(const int threads, const int bfactor, const int
 #endif // __CUDA_ARCH__ >= 300
 }

+__device__ __forceinline__ void store_variant1(uint32_t* long_state)
+{
+	uint4* Z = (uint4*) long_state;
+	const uint32_t tmp = (Z->z >> 24); // __byte_perm(src, 0, 0x7773);
+	const uint32_t index = (((tmp >> 3) & 6u) | (tmp & 1u)) << 1;
+	Z->z = (Z->z & 0x00ffffffu) | ((tmp ^ ((0x75310u >> index) & 0x30u)) << 24);
+}
+
+#define MUL_SUM_XOR_DST_1(a,c,dst,tweak) { \
+	uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], ((uint64_t *)dst)[0], &hi) + ((uint64_t *)c)[1]; \
+	hi += ((uint64_t *)c)[0]; \
+	((uint64_t *)c)[0] = ((uint64_t *)dst)[0] ^ hi; \
+	((uint64_t *)c)[1] = ((uint64_t *)dst)[1] ^ lo; \
+	((uint64_t *)dst)[0] = hi; \
+	((uint64_t *)dst)[1] = lo ^ tweak; }
+
+__global__
+void cryptolight_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
+	uint32_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
+	uint64_t * __restrict__ d_tweak)
+{
+	__shared__ __align__(16) uint32_t sharedMemory[1024];
+	cn_aes_gpu_init(sharedMemory);
+	__syncthreads();
+
+	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
+	if (thread < threads)
+	{
+		const uint32_t batchsize = ITER >> (2 + bfactor);
+		const uint32_t start = partidx * batchsize;
+		const uint32_t end = start + batchsize;
+		const uint32_t longptr = thread << LONG_SHL_IDX;
+		uint32_t * long_state = &d_long_state[longptr];
+		uint64_t tweak = d_tweak[thread];
+
+		void * ctx_a = (void*)(&d_ctx_a[thread << 2]);
+		void * ctx_b = (void*)(&d_ctx_b[thread << 2]);
+		uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4
+		uint4 B = AS_UINT4(ctx_b);
+		uint32_t* a = (uint32_t*)&A;
+		uint32_t* b = (uint32_t*)&B;
+
+		for (int i = start; i < end; i++)
+		{
+			uint32_t c[4];
+			uint32_t j = (A.x >> 2) & E2I_MASK2;
+			cn_aes_single_round(sharedMemory, &long_state[j], c, a);
+			XOR_BLOCKS_DST(c, b, &long_state[j]);
+			store_variant1(&long_state[j]);
+			MUL_SUM_XOR_DST_1(c, a, &long_state[(c[0] >> 2) & E2I_MASK2], tweak);
+
+			j = (A.x >> 2) & E2I_MASK2;
+			cn_aes_single_round(sharedMemory, &long_state[j], b, a);
+			XOR_BLOCKS_DST(b, c, &long_state[j]);
+			store_variant1(&long_state[j]);
+			MUL_SUM_XOR_DST_1(b, a, &long_state[(b[0] >> 2) & E2I_MASK2], tweak);
+		}
+
+		if (bfactor) {
+			AS_UINT4(ctx_a) = A;
+			AS_UINT4(ctx_b) = B;
+		}
+	}
+}
+
 __global__
 void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint32_t * ctx_state, uint32_t * ctx_key2)
 {
@@ -222,7 +288,7 @@ void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint3
 	if(thread < threads)
 	{
 		const int long_oft = (thread << LONG_SHL_IDX) + sub;
-		const int oft = thread * 52 + sub + 16;
+		const int oft = thread * 50 + sub + 16;
 		uint32_t __align__(16) key[40];
 		uint32_t __align__(16) text[4];
@@ -251,8 +317,8 @@ void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint3
 extern int device_bfactor[MAX_GPUS];

 __host__
-void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint64_t *d_ctx_state,
-	uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
+void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state,
+	uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak)
 {
 	dim3 grid(blocks);
 	dim3 block(threads);
@@ -265,17 +331,21 @@ void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_
 	int i, partcount = 1 << bfactor;
 	int dev_id = device_map[thr_id];

-	cryptolight_core_gpu_phase1 <<<grid, block8 >>>(blocks*threads, d_long_state, (uint32_t*)d_ctx_state, d_ctx_key1);
+	cryptolight_core_gpu_phase1 <<<grid, block8 >>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key1);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 	if(partcount > 1) usleep(bsleep);

 	for(i = 0; i < partcount; i++)
 	{
-		cryptolight_core_gpu_phase2 <<<grid, (device_sm[dev_id] >= 300 ? block4 : block)>>>(blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
+		dim3 b = device_sm[dev_id] >= 300 ? block4 : block;
+		if (variant == 0)
+			cryptolight_old_gpu_phase2 <<<grid, b>>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
+		else
+			cryptolight_gpu_phase2 <<<grid, b>>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 		if(partcount > 1) usleep(bsleep);
 	}

-	cryptolight_core_gpu_phase3 <<<grid, block8 >>>(blocks*threads, d_long_state, (uint32_t*)d_ctx_state, d_ctx_key2);
+	cryptolight_core_gpu_phase3 <<<grid, block8 >>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key2);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 }

crypto/cryptolight-cpu.cpp (39)

@@ -22,6 +22,16 @@ struct cryptonight_ctx {
 	oaes_ctx* aes_ctx;
 };

+static void cryptolight_store_variant(void* state, int variant) {
+	if (variant == 1) {
+		// use variant 1 like monero since june 2018
+		const uint8_t tmp = ((const uint8_t*)(state))[11];
+		const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
+		((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30);
+	}
+}
+
 static void do_blake_hash(const void* input, int len, void* output)
 {
 	uchar hash[32];
@@ -132,14 +142,14 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, ui
 	((uint64_t*) dst)[0] += ((uint64_t*) c)[0];
 }

-static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
+static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak) {
 	uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
 	hi += ((uint64_t*) c)[0];

 	((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
 	((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
 	((uint64_t*) dst)[0] = hi;
-	((uint64_t*) dst)[1] = lo;
+	((uint64_t*) dst)[1] = variant ? lo ^ tweak : lo;
 }

 static void copy_block(uint8_t* dst, const uint8_t* src) {
@@ -157,13 +167,18 @@ static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
 	((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
 }

-static void cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx)
+static int cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx, const int variant)
 {
 	size_t i, j;

+	if (variant && len < 43)
+		return 0;
+
 	keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len);
 	ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
 	memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);

+	const uint64_t tweak = variant ? *((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0;
+
 	oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
 	for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
 #undef RND
@@ -186,14 +201,16 @@ static void cryptolight_hash_ctx(void* output, const void* input, const int len,
 		j = e2i(ctx->a);
 		aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a);
 		xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]);
-		mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)]);
+		cryptolight_store_variant(&ctx->long_state[j], variant);
+		mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)], variant, tweak);

 		j = e2i(ctx->a);
 		aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a);
 		xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]);
-		mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)]);
+		cryptolight_store_variant(&ctx->long_state[j], variant);
+		mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)], variant, tweak);
 	}

 	memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
@@ -219,11 +236,19 @@ static void cryptolight_hash_ctx(void* output, const void* input, const int len,
 	if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo);

 	oaes_free((OAES_CTX **) &ctx->aes_ctx);
+	return 1;
 }

-void cryptolight_hash(void* output, const void* input, int len)
+int cryptolight_hash_variant(void* output, const void* input, int len, int variant)
 {
 	struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
-	cryptolight_hash_ctx(output, input, len, ctx);
+	int rc = cryptolight_hash_ctx(output, input, len, ctx, variant);
 	free(ctx);
+	return rc;
 }
+
+void cryptolight_hash(void* output, const void* input)
+{
+	cryptolight_hash_variant(output, input, 76, 1);
+}
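For reference, the variant-1 scratchpad tweak added above rewrites only byte 11 of each stored 16-byte block: bits 0, 4 and 5 of that byte select a 4-bit slice of the packed table 0x75310, and the slice (masked to 0x30) is XORed back into the byte. A stand-alone sketch of the same transform on a single byte; the helper name and the main() harness are illustrative, not part of the commit:

#include <stdint.h>
#include <stdio.h>

/* Same math as cryptolight_store_variant() / store_variant1() above. */
static uint8_t tweak_byte_v1(uint8_t tmp)
{
    const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
    return tmp ^ ((0x75310 >> index) & 0x30);
}

int main(void)
{
    for (unsigned b = 0; b < 256; b += 0x55) /* a few sample bytes */
        printf("0x%02x -> 0x%02x\n", b, tweak_byte_v1((uint8_t) b));
    return 0;
}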

crypto/cryptolight.cu (30)

@@ -7,16 +7,17 @@ static __thread uint32_t cn_blocks = 32;
 static __thread uint32_t cn_threads = 16;

 static uint32_t *d_long_state[MAX_GPUS];
-static uint64_t *d_ctx_state[MAX_GPUS];
+static uint32_t *d_ctx_state[MAX_GPUS];
 static uint32_t *d_ctx_key1[MAX_GPUS];
 static uint32_t *d_ctx_key2[MAX_GPUS];
 static uint32_t *d_ctx_text[MAX_GPUS];
+static uint64_t *d_ctx_tweak[MAX_GPUS];
 static uint32_t *d_ctx_a[MAX_GPUS];
 static uint32_t *d_ctx_b[MAX_GPUS];

 static bool init[MAX_GPUS] = { 0 };

-extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant)
 {
 	int res = 0;
 	uint32_t throughput = 0;
@@ -26,6 +27,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 	uint32_t *nonceptr = (uint32_t*) (&pdata[39]);
 	const uint32_t first_nonce = *nonceptr;
 	uint32_t nonce = first_nonce;
+	int dev_id = device_map[thr_id];

 	if(opt_benchmark) {
 		ptarget[7] = 0x00ff;
@@ -33,6 +35,10 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 	if(!init[thr_id])
 	{
+		if (!device_config[thr_id] && strcmp(device_name[dev_id], "TITAN V") == 0) {
+			device_config[thr_id] = strdup("80x32");
+		}
+
 		if (device_config[thr_id]) {
 			sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads);
 			throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
@@ -63,11 +69,11 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 		}

 		const size_t alloc = MEMORY * throughput;
-		cryptonight_extra_cpu_init(thr_id, throughput);
+		cryptonight_extra_init(thr_id);

 		cudaMalloc(&d_long_state[thr_id], alloc);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
-		cudaMalloc(&d_ctx_state[thr_id], 26 * sizeof(uint64_t) * throughput);
+		cudaMalloc(&d_ctx_state[thr_id], 25 * sizeof(uint64_t) * throughput);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 		cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
@@ -79,6 +85,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 		cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
+		cudaMalloc(&d_ctx_tweak[thr_id], sizeof(uint64_t) * throughput);

 		init[thr_id] = true;
 	}
@@ -90,10 +97,10 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 		const uint32_t Htarg = ptarget[7];
 		uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };

-		cryptonight_extra_cpu_setData(thr_id, pdata, ptarget);
-		cryptonight_extra_cpu_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
-		cryptolight_core_cpu_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
-		cryptonight_extra_cpu_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
+		cryptonight_extra_setData(thr_id, pdata, ptarget);
+		cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
+		cryptolight_core_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
+		cryptonight_extra_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);

 		*hashes_done = nonce - first_nonce + throughput;
@@ -104,7 +111,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 			uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39);
 			memcpy(tempdata, pdata, 76);
 			*tempnonceptr = resNonces[0];
-			cryptolight_hash(vhash, tempdata, 76);
+			cryptolight_hash_variant(vhash, tempdata, 76, variant);
 			if(vhash[7] <= Htarg && fulltest(vhash, ptarget))
 			{
 				res = 1;
@@ -114,7 +121,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 			if(resNonces[1] != UINT32_MAX)
 			{
 				*tempnonceptr = resNonces[1];
-				cryptolight_hash(vhash, tempdata, 76);
+				cryptolight_hash_variant(vhash, tempdata, 76, variant);
 				if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
 					res++;
 					work->nonces[1] = resNonces[1];
@@ -157,10 +164,11 @@ void free_cryptolight(int thr_id)
 	cudaFree(d_ctx_key1[thr_id]);
 	cudaFree(d_ctx_key2[thr_id]);
 	cudaFree(d_ctx_text[thr_id]);
+	cudaFree(d_ctx_tweak[thr_id]);
 	cudaFree(d_ctx_a[thr_id]);
 	cudaFree(d_ctx_b[thr_id]);

-	cryptonight_extra_cpu_free(thr_id);
+	cryptonight_extra_free(thr_id);

 	cudaDeviceSynchronize();

crypto/cryptolight.h (13)

@@ -134,10 +134,11 @@ static inline void exit_if_cudaerror(int thr_id, const char *src, int line)
 		exit(1);
 	}
 }
-void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
-void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn);
-void cryptonight_extra_cpu_init(int thr_id, uint32_t threads);
-void cryptonight_extra_cpu_free(int thr_id);
-void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
-void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *nonce, uint64_t *d_ctx_state);
+
+void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
+
+void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget);
+void cryptonight_extra_init(int thr_id/*, uint32_t threads*/);
+void cryptonight_extra_free(int thr_id);
+void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
+void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces, uint32_t *d_ctx_state);
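For orientation, these renamed entry points are driven in a fixed order each scan round; below is a condensed sketch of the sequence used by scanhash_cryptolight() in the cryptolight.cu diff above (error checks and the result-validation path omitted; this is not a drop-in function):

/* once per work update: upload block header and target */
cryptonight_extra_setData(thr_id, pdata, ptarget);
/* per round: keccak state, AES round keys and the variant-1 tweak */
cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id],
    d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id],
    variant, d_ctx_tweak[thr_id]);
/* memory-hard main loop, split into partcount launches by bfactor */
cryptolight_core_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id],
    d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id],
    d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
/* final hash (blake/groestl/jh/skein) and target test */
cryptonight_extra_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);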

crypto/cryptonight-core.cu (236)

@@ -2,47 +2,55 @@
 #include <stdint.h>
 #include <string.h>
 #include <sys/time.h>
+#ifndef _WIN32
 #include <unistd.h>
+#endif
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
+#undef __shfl
+#define __shfl(var, srcLane, width) __shfl_sync(0xFFFFFFFFu, var, srcLane, width)
+#endif

 #include "cryptonight.h"

-#define LONG_SHL32 19 // 1<<19
+#define LONG_SHL32 19 // 1<<19 (uint32_t* index)
 #define LONG_SHL64 18 // 1<<18 (uint64_t* index)
 #define LONG_LOOPS32 0x80000U
-#define LONG_LOOPS64 0x40000U

 #include "cn_aes.cuh"

 __global__
-//__launch_bounds__(128, 9) // 56 registers
-void cryptonight_core_gpu_phase1(const uint32_t threads, uint64_t * long_state, uint64_t * const ctx_state, uint32_t * ctx_key1)
+void cryptonight_gpu_phase1(const uint32_t threads, uint32_t * __restrict__ d_long_state,
+	uint32_t * __restrict__ ctx_state, uint32_t * __restrict__ ctx_key1)
 {
-	__shared__ __align__(16) uint32_t sharedMemory[1024];
-	cn_aes_gpu_init(sharedMemory);
-	__syncthreads();
+	__shared__ uint32_t sharedMemory[1024];

 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3;
-	const uint32_t sub = (threadIdx.x & 7) << 1; // 0 2 .. 14

 	if(thread < threads)
 	{
-		const uint32_t long_oft = (thread << LONG_SHL64) + sub;
-		const uint32_t* ctx_key = &ctx_key1[thread * 40U];
-		uint4 keys[10];
-		#pragma unroll 10 // load 160 bytes
-		for (int i = 0; i < 10; i ++)
-			keys[i] = AS_UINT4(&ctx_key[i*4]);
-
-		uint4 text = AS_UINT4(&ctx_state[thread * 26U + sub + 8U]);
-
-		for (uint32_t i = 0; i < LONG_LOOPS64; i += 16U) {
-			cn_aes_pseudo_round_mut_uint4(sharedMemory, text, keys);
-			AS_UINT4(&long_state[long_oft + i]) = text;
+		cn_aes_gpu_init(sharedMemory);
+		__syncthreads();
+
+		const uint32_t sub = (threadIdx.x & 0x7U) << 2;
+		uint32_t *longstate = &d_long_state[(thread << LONG_SHL32) + sub];
+		uint32_t __align__(8) key[40];
+		MEMCPY8(key, &ctx_key1[thread * 40U], 20);
+		uint32_t __align__(8) text[4];
+		MEMCPY8(text, &ctx_state[thread * 50U + sub + 16U], 2);
+
+		for(int i = 0; i < LONG_LOOPS32; i += 32)
+		{
+			cn_aes_pseudo_round_mut(sharedMemory, text, key);
+			MEMCPY8(&longstate[i], text, 2);
 		}
 	}
 }

+// --------------------------------------------------------------------------------------------------------------
+
 __device__ __forceinline__ ulonglong2 cuda_mul128(const uint64_t multiplier, const uint64_t multiplicand)
 {
 	ulonglong2 product;
@@ -59,8 +67,7 @@ static __forceinline__ __device__ ulonglong2 operator ^ (const ulonglong2 &a, co
 	return make_ulonglong2(a.x ^ b.x, a.y ^ b.y);
 }

-#undef MUL_SUM_XOR_DST
-__device__ __forceinline__ void MUL_SUM_XOR_DST(const uint64_t m, uint4 &a, void* far_dst)
+__device__ __forceinline__ void MUL_SUM_XOR_DST_0(const uint64_t m, uint4 &a, void* far_dst)
 {
 	ulonglong2 d = AS_UL2(far_dst);
 	ulonglong2 p = cuda_mul128(m, d.x);
@@ -73,8 +80,8 @@ __global__
 #if __CUDA_ARCH__ >= 500
 //__launch_bounds__(128,12) /* force 40 regs to allow -l ...x32 */
 #endif
-void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor, const uint32_t partidx,
-	uint64_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
+void cryptonight_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
+	uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b)
 {
 	__shared__ __align__(16) uint32_t sharedMemory[1024];
 	cn_aes_gpu_init(sharedMemory);
@@ -84,7 +91,7 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor,
 	if (thread < threads)
 	{
-		const uint32_t batchsize = ITER >> (2U + bfactor);
+		const uint32_t batchsize = ITER >> (2 + bfactor);
 		const uint32_t start = partidx * batchsize;
 		const uint32_t end = start + batchsize;
@@ -101,12 +108,12 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor,
 			uint32_t j = (A.x & E2I_MASK) >> 3;
 			cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C);
 			AS_UINT4(&long_state[j]) = C ^ B; // st.global.u32.v4
-			MUL_SUM_XOR_DST((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3]);
+			MUL_SUM_XOR_DST_0((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3]);

 			j = (A.x & E2I_MASK) >> 3;
 			cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B);
 			AS_UINT4(&long_state[j]) = C ^ B;
-			MUL_SUM_XOR_DST((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3]);
+			MUL_SUM_XOR_DST_0((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3]);
 		}

 		if (bfactor) {
@@ -116,71 +123,194 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor,
 	}
 }

+// --------------------------------------------------------------------------------------------------------------
+
+__device__ __forceinline__ void store_variant1(uint64_t* long_state, uint4 Z)
+{
+	const uint32_t tmp = (Z.z >> 24); // __byte_perm(src, 0, 0x7773);
+	const uint32_t index = (((tmp >> 3) & 6u) | (tmp & 1u)) << 1;
+	Z.z = (Z.z & 0x00ffffffu) | ((tmp ^ ((0x75310u >> index) & 0x30u)) << 24);
+	AS_UINT4(long_state) = Z;
+}
+
+__device__ __forceinline__ void store_variant2(uint64_t* long_state, uint4 Z)
+{
+	const uint32_t tmp = (Z.z >> 24); // __byte_perm(src, 0, 0x7773);
+	const uint32_t index = (((tmp >> 4) & 6u) | (tmp & 1u)) << 1;
+	Z.z = (Z.z & 0x00ffffffu) | ((tmp ^ ((0x75312u >> index) & 0x30u)) << 24);
+	AS_UINT4(long_state) = Z;
+}
+
+__device__ __forceinline__ void MUL_SUM_XOR_DST_1(const uint64_t m, uint4 &a, void* far_dst, uint64_t tweak)
+{
+	ulonglong2 d = AS_UL2(far_dst);
+	ulonglong2 p = cuda_mul128(m, d.x);
+	p += AS_UL2(&a);
+	AS_UL2(&a) = p ^ d;
+	p.y = p.y ^ tweak;
+	AS_UL2(far_dst) = p;
+}
+
+__global__
+void monero_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
+	uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
+	uint64_t * __restrict__ d_tweak)
+{
+	__shared__ __align__(16) uint32_t sharedMemory[1024];
+	cn_aes_gpu_init(sharedMemory);
+	__syncthreads();
+
+	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
+	if (thread < threads)
+	{
+		const uint32_t batchsize = ITER >> (2 + bfactor);
+		const uint32_t start = partidx * batchsize;
+		const uint32_t end = start + batchsize;
+		uint64_t tweak = d_tweak[thread];
+
+		void * ctx_a = (void*)(&d_ctx_a[thread << 2]);
+		void * ctx_b = (void*)(&d_ctx_b[thread << 2]);
+		uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4
+		uint4 B = AS_UINT4(ctx_b);
+
+		uint64_t * long_state = &d_long_state[thread << LONG_SHL64];
+		for (int i = start; i < end; i++) // end = 262144
+		{
+			uint4 C;
+			uint32_t j = (A.x & E2I_MASK) >> 3;
+			cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C);
+			store_variant1(&long_state[j], C ^ B); // st.global
+			MUL_SUM_XOR_DST_1((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3], tweak);
+
+			j = (A.x & E2I_MASK) >> 3;
+			cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B);
+			store_variant1(&long_state[j], C ^ B);
+			MUL_SUM_XOR_DST_1((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3], tweak);
+		}
+		if (bfactor) {
+			AS_UINT4(ctx_a) = A;
+			AS_UINT4(ctx_b) = B;
+		}
+	}
+}
+
+// --------------------------------------------------------------------------------------------------------------
+
+__global__
+void stellite_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
+	uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
+	uint64_t * __restrict__ d_tweak)
+{
+	__shared__ __align__(16) uint32_t sharedMemory[1024];
+	cn_aes_gpu_init(sharedMemory);
+	__syncthreads();
+
+	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
+	if (thread < threads)
+	{
+		const uint32_t batchsize = ITER >> (2 + bfactor);
+		const uint32_t start = partidx * batchsize;
+		const uint32_t end = start + batchsize;
+		uint64_t tweak = d_tweak[thread];
+
+		void * ctx_a = (void*)(&d_ctx_a[thread << 2]);
+		void * ctx_b = (void*)(&d_ctx_b[thread << 2]);
+		uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4
+		uint4 B = AS_UINT4(ctx_b);
+
+		uint64_t * long_state = &d_long_state[thread << LONG_SHL64];
+		for (int i = start; i < end; i++) // end = 262144
+		{
+			uint4 C;
+			uint32_t j = (A.x & E2I_MASK) >> 3;
+			cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C);
+			store_variant2(&long_state[j], C ^ B); // st.global
+			MUL_SUM_XOR_DST_1((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3], tweak);

+			j = (A.x & E2I_MASK) >> 3;
+			cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B);
+			store_variant2(&long_state[j], C ^ B);
+			MUL_SUM_XOR_DST_1((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3], tweak);
+		}
+		if (bfactor) {
+			AS_UINT4(ctx_a) = A;
+			AS_UINT4(ctx_b) = B;
+		}
+	}
+}
+
+// --------------------------------------------------------------------------------------------------------------
+
 __global__
-void cryptonight_core_gpu_phase3(const uint32_t threads, const uint64_t * long_state, uint64_t * ctx_state, uint32_t * __restrict__ ctx_key2)
+void cryptonight_gpu_phase3(const uint32_t threads, const uint32_t * __restrict__ d_long_state,
+	uint32_t * __restrict__ d_ctx_state, const uint32_t * __restrict__ d_ctx_key2)
 {
-	__shared__ __align__(16) uint32_t sharedMemory[1024];
+	__shared__ uint32_t sharedMemory[1024];
 	cn_aes_gpu_init(sharedMemory);
 	__syncthreads();

-	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3U;
-	const uint32_t sub = (threadIdx.x & 7U) << 1U;
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3;

 	if(thread < threads)
 	{
-		const uint32_t long_oft = (thread << LONG_SHL64) + sub;
-		const uint32_t st_oft = (thread * 26U) + sub + 8U;
-
-		uint4 key[10];
-		const uint32_t* ctx_key = &ctx_key2[thread * 40U];
-		#pragma unroll 10 // 160 bytes
-		for (int i = 0; i < 10; i++)
-			key[i] = AS_UINT4(&ctx_key[i*4U]);
-
-		uint4 text = AS_UINT4(&ctx_state[st_oft]);
-
-		for(uint32_t i = 0; i < LONG_LOOPS64; i += 16U)
-		{
-			uint4 st = AS_UINT4(&long_state[long_oft + i]);
-			text = text ^ st;
-			cn_aes_pseudo_round_mut_uint4(sharedMemory, text, key);
-		}
-
-		AS_UINT4(&ctx_state[st_oft]) = text;
+		const int sub = (threadIdx.x & 7) << 2;
+		const uint32_t *longstate = &d_long_state[(thread << LONG_SHL32) + sub];
+		uint32_t key[40], text[4];
+		MEMCPY8(key, d_ctx_key2 + thread * 40, 20);
+		MEMCPY8(text, d_ctx_state + thread * 50 + sub + 16, 2);
+
+		for(int i = 0; i < LONG_LOOPS32; i += 32)
+		{
+			#pragma unroll
+			for(int j = 0; j < 4; ++j)
+				text[j] ^= longstate[i + j];
+
+			cn_aes_pseudo_round_mut(sharedMemory, text, key);
+		}
+
+		MEMCPY8(d_ctx_state + thread * 50 + sub + 16, text, 2);
 	}
 }

+// --------------------------------------------------------------------------------------------------------------
+
 extern int device_bfactor[MAX_GPUS];

 __host__
-void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint64_t *d_long_state, uint64_t *d_ctx_state,
-	uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
+void cryptonight_core_cuda(int thr_id, uint32_t blocks, uint32_t threads, uint64_t *d_long_state, uint32_t *d_ctx_state,
+	uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak)
 {
 	dim3 grid(blocks);
 	dim3 block(threads);
-	//dim3 block2(threads << 1);
 	dim3 block4(threads << 2);
 	dim3 block8(threads << 3);

-	const uint32_t bfactor = (uint32_t) device_bfactor[thr_id];
-	const uint32_t partcount = 1 << bfactor;
+	const uint16_t bfactor = (uint16_t) device_bfactor[thr_id];
+	const uint32_t partcount = 1U << bfactor;
 	const uint32_t throughput = (uint32_t) (blocks*threads);
 	const int bsleep = bfactor ? 100 : 0;
 	const int dev_id = device_map[thr_id];

-	cryptonight_core_gpu_phase1 <<<grid, block8>>> (throughput, d_long_state, d_ctx_state, d_ctx_key1);
+	cryptonight_gpu_phase1 <<<grid, block8>>> (throughput, (uint32_t*) d_long_state, d_ctx_state, d_ctx_key1);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 	if(partcount > 1) usleep(bsleep);

 	for (uint32_t i = 0; i < partcount; i++)
 	{
 		dim3 b = device_sm[dev_id] >= 300 ? block4 : block;
-		cryptonight_core_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
+		if (variant == 0)
+			cryptonight_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
+		else if (variant == 1 || cryptonight_fork == 8)
+			monero_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
+		else if (variant == 2 && cryptonight_fork == 3)
+			stellite_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 		if(partcount > 1) usleep(bsleep);
 	}

-	cryptonight_core_gpu_phase3 <<<grid, block8>>> (throughput, d_long_state, d_ctx_state, d_ctx_key2);
+	//cudaDeviceSynchronize();
+	//exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
+	cryptonight_gpu_phase3 <<<grid, block8>>> (throughput, (uint32_t*) d_long_state, d_ctx_state, d_ctx_key2);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 }

crypto/cryptonight-cpu.cpp (62)

@ -12,6 +12,20 @@ extern "C" {
#include "cpu/c_keccak.h" #include "cpu/c_keccak.h"
} }
static void cryptonight_store_variant(void* state, int variant) {
if (variant == 1 || cryptonight_fork == 8) {
// monero, and graft ?
const uint8_t tmp = ((const uint8_t*)(state))[11];
const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30);
} else if (variant == 2 && cryptonight_fork == 3) {
// stellite
const uint8_t tmp = ((const uint8_t*)(state))[11];
const uint8_t index = (((tmp >> 4) & 6) | (tmp & 1)) << 1;
((uint8_t*)(state))[11] = tmp ^ ((0x75312 >> index) & 0x30);
}
}
struct cryptonight_ctx { struct cryptonight_ctx {
uint8_t long_state[MEMORY]; uint8_t long_state[MEMORY];
union cn_slow_hash_state state; union cn_slow_hash_state state;
@ -130,14 +144,14 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, ui
((uint64_t*) dst)[0] += ((uint64_t*) c)[0]; ((uint64_t*) dst)[0] += ((uint64_t*) c)[0];
} }
static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) { static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak1_2) {
uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1]; uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
hi += ((uint64_t*) c)[0]; hi += ((uint64_t*) c)[0];
((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi; ((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo; ((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
((uint64_t*) dst)[0] = hi; ((uint64_t*) dst)[0] = hi;
((uint64_t*) dst)[1] = lo; ((uint64_t*) dst)[1] = variant ? lo ^ tweak1_2 : lo;
} }
static void copy_block(uint8_t* dst, const uint8_t* src) { static void copy_block(uint8_t* dst, const uint8_t* src) {
@ -155,13 +169,18 @@ static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1]; ((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
} }
static void cryptonight_hash_ctx(void* output, const void* input, size_t len, struct cryptonight_ctx* ctx) static int cryptonight_hash_ctx(void* output, const void* input, const size_t len, struct cryptonight_ctx* ctx, const int variant)
{ {
size_t i, j; size_t i, j;
if (variant && len < 43)
return 0;
keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len); keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len);
ctx->aes_ctx = (oaes_ctx*) oaes_alloc(); ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
const uint64_t tweak1_2 = variant ? *((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0;
oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE); oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) { for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
#undef RND #undef RND
@ -184,14 +203,16 @@ static void cryptonight_hash_ctx(void* output, const void* input, size_t len, st
j = e2i(ctx->a) * AES_BLOCK_SIZE; j = e2i(ctx->a) * AES_BLOCK_SIZE;
aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a); aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a);
xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]); xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]);
cryptonight_store_variant(&ctx->long_state[j], variant);
mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE]); mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE], variant, tweak1_2);
j = e2i(ctx->a) * AES_BLOCK_SIZE; j = e2i(ctx->a) * AES_BLOCK_SIZE;
aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a); aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a);
xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]); xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]);
cryptonight_store_variant(&ctx->long_state[j], variant);
mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE]); mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE], variant, tweak1_2);
} }
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
@ -217,11 +238,38 @@ static void cryptonight_hash_ctx(void* output, const void* input, size_t len, st
if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo); if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo);
oaes_free((OAES_CTX **) &ctx->aes_ctx); oaes_free((OAES_CTX **) &ctx->aes_ctx);
return 1;
} }
void cryptonight_hash(void* output, const void* input, size_t len) int cryptonight_hash_variant(void* output, const void* input, size_t len, int variant)
{ {
struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx)); struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
cryptonight_hash_ctx(output, input, len, ctx); int rc = cryptonight_hash_ctx(output, input, len, ctx, variant);
free(ctx); free(ctx);
return rc;
}
void cryptonight_hash(void* output, const void* input)
{
cryptonight_fork = 1;
cryptonight_hash_variant(output, input, 76, 0);
}
void graft_hash(void* output, const void* input)
{
cryptonight_fork = 8;
cryptonight_hash_variant(output, input, 76, 1);
}
void monero_hash(void* output, const void* input)
{
cryptonight_fork = 7;
cryptonight_hash_variant(output, input, 76, 1);
} }
void stellite_hash(void* output, const void* input)
{
cryptonight_fork = 3;
cryptonight_hash_variant(output, input, 76, 2);
}
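The tweak1_2 value consumed by these wrappers is computed once per hash from the variant-1 nonce extension: the 8 input bytes at offsets 35..42 (hence the len < 43 guard above), XORed with word 24 of the keccak state. A minimal CPU-side sketch; the helper name is illustrative, and memcpy stands in for the unaligned pointer cast used in the commit:

#include <stdint.h>
#include <string.h>

/* Mirrors: tweak1_2 = *((uint64_t*)(input + 35)) ^ ctx->state.hs.w[24] */
static uint64_t make_tweak1_2(const uint8_t *input, size_t len, uint64_t keccak_w24)
{
    uint64_t t = 0;
    if (len < 43) return 0;            /* variant 1 needs >= 43 input bytes */
    memcpy(&t, input + 35, sizeof(t)); /* offset 35 is not 8-byte aligned   */
    return t ^ keccak_w24;
}

The GPU path in the cryptonight-extra.cu diff below assembles the same bytes from 32-bit words with __byte_perm(input[8], input[9], 0x6543) and the following word pair, since offset 35 falls in the middle of a word.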

crypto/cryptonight-extra.cu (175)

@ -7,15 +7,15 @@
#include <miner.h> #include <miner.h>
#include <cuda_helper.h> #include <cuda_helper.h>
#include "cryptonight.h"
typedef uint8_t BitSequence; #include "cryptonight.h"
typedef uint64_t DataLength;
static uint32_t *d_input[MAX_GPUS] = { 0 }; static uint32_t *d_input[MAX_GPUS];
static uint32_t *d_target[MAX_GPUS]; static uint32_t *d_target[MAX_GPUS];
static uint32_t *d_result[MAX_GPUS]; static uint32_t *d_result[MAX_GPUS];
typedef uint8_t BitSequence;
typedef uint32_t DataLength;
#include "cn_keccak.cuh" #include "cn_keccak.cuh"
#include "cn_blake.cuh" #include "cn_blake.cuh"
#include "cn_groestl.cuh" #include "cn_groestl.cuh"
@@ -44,13 +44,11 @@ __constant__ uint8_t d_sub_byte[16][16] = {
 __device__ __forceinline__
 void cryptonight_aes_set_key(uint32_t * __restrict__ key, const uint32_t * __restrict__ data)
 {
-	const uint32_t aes_gf[] = {
+	const uint32_t aes_gf[10] = {
 		0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
 	};
 
-	MEMSET4(key, 0, 40);
 	MEMCPY4(key, data, 8);
 #pragma unroll
 	for(int i = 8; i < 40; i++)
 	{
@@ -74,15 +72,14 @@ void cryptonight_aes_set_key(uint32_t * __restrict__ key, const uint32_t * __restrict__ data)
 }
 
 __global__
-void cryptonight_extra_gpu_prepare(const uint32_t threads, uint32_t * __restrict__ d_input, uint32_t startNonce,
-	uint64_t * d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
-	uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2)
+void cryptonight_extra_gpu_prepare(const uint32_t threads, const uint32_t * __restrict__ d_input, uint32_t startNonce,
+	uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
+	uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2, int variant, uint64_t * d_ctx_tweak)
 {
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if(thread < threads)
 	{
-		uint32_t ctx_state[50];
+		uint64_t ctx_state[25];
 		uint32_t ctx_a[4];
 		uint32_t ctx_b[4];
 		uint32_t ctx_key1[40];
@@ -90,92 +87,62 @@ void cryptonight_extra_gpu_prepare(const uint32_t threads, uint32_t * __restrict__ d_input, uint32_t startNonce,
 		uint32_t input[19];
 
 		MEMCPY4(input, d_input, 19);
-		*((uint32_t *)(((char *)input) + 39)) = startNonce + thread;
-		cn_keccak((uint8_t *)input, (uint8_t *)ctx_state);
-		cryptonight_aes_set_key(ctx_key1, ctx_state);
-		cryptonight_aes_set_key(ctx_key2, ctx_state + 8);
-		XOR_BLOCKS_DST(ctx_state, ctx_state + 8, ctx_a);
-		XOR_BLOCKS_DST(ctx_state + 4, ctx_state + 12, ctx_b);
-		MEMCPY8(&d_ctx_state[thread * 26], ctx_state, 25);
-		MEMCPY4(d_ctx_a + thread * 4, ctx_a, 4);
-		MEMCPY4(d_ctx_b + thread * 4, ctx_b, 4);
-		MEMCPY4(d_ctx_key1 + thread * 40, ctx_key1, 40);
-		MEMCPY4(d_ctx_key2 + thread * 40, ctx_key2, 40);
-	}
-}
-
-__global__
-void cryptonight_extra_gpu_keccak(uint32_t threads, uint32_t * d_ctx_state)
-{
-	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if(thread < threads)
-	{
-		uint64_t* ctx_state = (uint64_t*) (&d_ctx_state[thread * 52U]);
-		uint64_t state[25];
-		#pragma unroll
-		for(int i = 0; i < 25; i++)
-			state[i] = ctx_state[i];
-
-		cn_keccakf2(state);
-
-		// to reduce the final kernel stack frame, cut algos in 2 kernels
-		// ps: these 2 final kernels are not important for the overall xmr hashrate (< 1%)
-		switch (((uint8_t*)state)[0] & 0x03)
-		{
-		case 0: {
-			uint32_t hash[8];
-			cn_blake((uint8_t*)state, 200, (uint8_t*)hash);
-			((uint32_t*)ctx_state)[0] = 0;
-			((uint32_t*)ctx_state)[6] = hash[6];
-			((uint32_t*)ctx_state)[7] = hash[7];
-			break;
-		}
-		case 1: {
-			uint32_t hash[8];
-			cn_groestl((BitSequence*)state, 200, (BitSequence*)hash);
-			((uint32_t*)ctx_state)[0] = 0;
-			((uint32_t*)ctx_state)[6] = hash[6];
-			((uint32_t*)ctx_state)[7] = hash[7];
-			break;
-		}
-		default: {
-			#pragma unroll
-			for(int i = 0; i < 25; i++)
-				ctx_state[i] = state[i];
-		}
-		}
+		uint32_t nonce = startNonce + thread;
+		*(((uint8_t *)input) + 39) = nonce & 0xff;
+		*(((uint8_t *)input) + 40) = (nonce >> 8) & 0xff;
+		*(((uint8_t *)input) + 41) = (nonce >> 16) & 0xff;
+		*(((uint8_t *)input) + 42) = (nonce >> 24) & 0xff;
+
+		cn_keccak(input, ctx_state);
+		MEMCPY4(&d_ctx_state[thread * 50U], ctx_state, 50);
+
+		cryptonight_aes_set_key(ctx_key1, (uint32_t*)(&ctx_state[0]));
+		cryptonight_aes_set_key(ctx_key2, (uint32_t*)(&ctx_state[4]));
+		MEMCPY4(&d_ctx_key1[thread * 40U], ctx_key1, 40);
+		MEMCPY4(&d_ctx_key2[thread * 40U], ctx_key2, 40);
+
+		XOR_BLOCKS_DST(&ctx_state[0], &ctx_state[4], ctx_a);
+		XOR_BLOCKS_DST(&ctx_state[2], &ctx_state[6], ctx_b);
+		MEMCPY4(&d_ctx_a[thread * 4U], ctx_a, 4);
+		MEMCPY4(&d_ctx_b[thread * 4U], ctx_b, 4);
+
+		if (variant) {
+			uint2 tweak = AS_UINT2(&ctx_state[24]);
+			//tweak.x ^= (input[8] >> 24) | (input[9] << 8);
+			tweak.x ^= __byte_perm(input[8], input[ 9], 0x6543);
+			tweak.y ^= __byte_perm(input[9], input[10], 0x6543);
+			MEMCPY4(&d_ctx_tweak[thread], &tweak, 2);
+		}
 	}
 }
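
The variant branch above derives tweak1_2 the same way the CPU reference does: the last Keccak state word XORed with input bytes 35..42, the range that covers the nonce just written. A little-endian host-side restatement, where keccak1600() is a hypothetical stand-in for the absorb cn_keccak performs:

    uint64_t st[25];
    keccak1600((const uint8_t *)input, 76, st);        // hypothetical helper
    uint64_t tweak1_2;
    memcpy(&tweak1_2, (const uint8_t *)input + 35, 8); // bytes 35..42
    tweak1_2 ^= st[24];  // same value as AS_UINT2(&ctx_state[24]) in the kernel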
 __global__
-void cryptonight_extra_gpu_final(uint32_t threads, const uint32_t startNonce, uint64_t * __restrict__ d_ctx_state,
-	const uint32_t* d_target, uint32_t * resNonces)
+void cryptonight_extra_gpu_final(const uint32_t threads, uint32_t startNonce, const uint32_t * __restrict__ d_target,
+	uint32_t * __restrict__ resNonces, uint32_t * __restrict__ d_ctx_state)
 {
-	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
 	if(thread < threads)
 	{
-		uint64_t* const state = &d_ctx_state[thread * 26U];
+		uint32_t *ctx_state = &d_ctx_state[thread * 50U];
 		uint32_t hash[8];
-		switch(((uint8_t *)state)[0] & 0x03)
-		{
-		case 0: {
-			uint32_t* h32 = (uint32_t*)state;
-			hash[6] = h32[6];
-			hash[7] = h32[7];
-			break;
-		}
-		case 2: {
-			cn_jh256((uint8_t*)state, 200, hash);
-			break;
-		}
-		case 3: {
-			cn_skein((uint8_t*)state, 200, hash);
-			break;
-		}
-		}
+		uint32_t state[50];
+
+		#pragma unroll 25
+		for(int i = 0; i < 50; i+=2)
+			AS_UINT2(&state[i]) = AS_UINT2(&ctx_state[i]);
+
+		cn_keccakf2((uint64_t *)state);
+
+		int branch = ((uint8_t *)state)[0] & 0x03;
+		if(branch == 0)
+			cn_blake((const uint8_t *)state, 200, hash);
+		if(branch == 1)
+			cn_groestl((const uint8_t *)state, 200, hash);
+		if(branch == 2)
+			cn_jh((const uint8_t *)state, 200, hash);
+		if(branch == 3)
+			cn_skein((const uint8_t *)state, 200, hash);
+
 		if(hash[7] <= d_target[1] && hash[6] <= d_target[0])
 		{
@@ -188,55 +155,53 @@ void cryptonight_extra_gpu_final(uint32_t threads, const uint32_t startNonce, uint64_t * __restrict__ d_ctx_state,
 }
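
The two low bits of the re-permuted state select the 256-bit finalizer (0 = Blake, 1 = Groestl, 2 = JH, 3 = Skein), which replaces the old two-kernel split. The same dispatch in table form, sketched in plain C (a real CUDA build would need __device__ function pointers, and the cn_* signatures are assumed from the calls above):

    typedef void (*cn_final_t)(const uint8_t *, int, uint32_t *);
    const cn_final_t finalizers[4] = { cn_blake, cn_groestl, cn_jh, cn_skein };
    finalizers[((uint8_t *)state)[0] & 0x03]((const uint8_t *)state, 200, hash);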
 __host__
-void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *ptarget)
+void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget)
 {
 	uint32_t *pTargetIn = (uint32_t*) ptarget;
-	cudaMemcpy(d_input[thr_id], data, 19 * sizeof(uint32_t), cudaMemcpyHostToDevice);
-	cudaMemcpy(d_target[thr_id], &pTargetIn[6], 2*sizeof(uint32_t), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_input[thr_id], data, 20 * sizeof(uint32_t), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_target[thr_id], &pTargetIn[6], 2 * sizeof(uint32_t), cudaMemcpyHostToDevice);
+	cudaMemset(d_result[thr_id], 0xFF, 2 * sizeof(uint32_t));
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 }
 
 __host__
-void cryptonight_extra_cpu_init(int thr_id, uint32_t threads)
+void cryptonight_extra_init(int thr_id)
 {
-	cudaMalloc(&d_input[thr_id], 19 * sizeof(uint32_t));
-	cudaMalloc(&d_target[thr_id], 2*sizeof(uint32_t));
-	cudaMalloc(&d_result[thr_id], 2*sizeof(uint32_t));
+	cudaMalloc(&d_input[thr_id], 20 * sizeof(uint32_t));
+	cudaMalloc(&d_target[thr_id], 2 * sizeof(uint32_t));
+	cudaMalloc(&d_result[thr_id], 2 * sizeof(uint32_t));
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 }
 
 __host__
-void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
+void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak)
 {
 	uint32_t threadsperblock = 128;
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 
-	cryptonight_extra_gpu_prepare <<<grid, block >>> (threads, d_input[thr_id], startNonce, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1, d_ctx_key2);
+	cryptonight_extra_gpu_prepare <<<grid, block>>> (threads, d_input[thr_id], startNonce, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1, d_ctx_key2, variant, d_ctx_tweak);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 }
 
 __host__
-void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resnonce, uint64_t *d_ctx_state)
+void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces, uint32_t *d_ctx_state)
 {
 	uint32_t threadsperblock = 128;
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 
-	cudaMemset(d_result[thr_id], 0xFF, 2*sizeof(uint32_t));
-	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
-	cryptonight_extra_gpu_keccak <<<grid, block >>> (threads, (uint32_t*)d_ctx_state);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
-	cryptonight_extra_gpu_final <<<grid, block >>> (threads, startNonce, d_ctx_state, d_target[thr_id], d_result[thr_id]);
+	cryptonight_extra_gpu_final <<<grid, block>>> (threads, startNonce, d_target[thr_id], d_result[thr_id], d_ctx_state);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
-	cudaMemcpy(resnonce, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	cudaMemcpy(resNonces, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 }
 
 __host__
-void cryptonight_extra_cpu_free(int thr_id)
+void cryptonight_extra_free(int thr_id)
 {
 	if (d_input[thr_id]) {
 		cudaFree(d_input[thr_id]);
@@ -244,4 +209,4 @@ void cryptonight_extra_cpu_free(int thr_id)
 		cudaFree(d_result[thr_id]);
 		d_input[thr_id] = NULL;
 	}
 }

36
crypto/cryptonight.cu

@@ -12,16 +12,17 @@ static __thread bool gpu_init_shown = false;
 	gpulog(p, thr, fmt, ##__VA_ARGS__)
 
 static uint64_t *d_long_state[MAX_GPUS];
-static uint64_t *d_ctx_state[MAX_GPUS];
+static uint32_t *d_ctx_state[MAX_GPUS];
 static uint32_t *d_ctx_key1[MAX_GPUS];
 static uint32_t *d_ctx_key2[MAX_GPUS];
 static uint32_t *d_ctx_text[MAX_GPUS];
+static uint64_t *d_ctx_tweak[MAX_GPUS];
 static uint32_t *d_ctx_a[MAX_GPUS];
 static uint32_t *d_ctx_b[MAX_GPUS];
 static bool init[MAX_GPUS] = { 0 };
 
-extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant)
 {
 	int res = 0;
 	uint32_t throughput = 0;
@@ -49,6 +50,10 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 		gpulog_init(LOG_INFO, thr_id, "%s, %d MB available, %hd SMX", device_name[dev_id],
 			mem, device_mpcount[dev_id]);
 
+		if (!device_config[thr_id] && strcmp(device_name[dev_id], "TITAN V") == 0) {
+			device_config[thr_id] = strdup("80x24");
+		}
+
 		if (device_config[thr_id]) {
 			int res = sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads);
 			throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
@@ -70,7 +75,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 			exit(1);
 		}
 
-		cudaSetDevice(device_map[thr_id]);
+		cudaSetDevice(dev_id);
 		if (opt_cudaschedule == -1 && gpu_threads == 1) {
 			cudaDeviceReset();
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
@@ -79,11 +84,11 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 		}
 
 		const size_t alloc = MEMORY * throughput;
-		cryptonight_extra_cpu_init(thr_id, throughput);
+		cryptonight_extra_init(thr_id);
 
 		cudaMalloc(&d_long_state[thr_id], alloc);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
-		cudaMalloc(&d_ctx_state[thr_id], 208 * throughput); // 52*4 (200 is not aligned 16)
+		cudaMalloc(&d_ctx_state[thr_id], 50 * sizeof(uint32_t) * throughput);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 		cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
@@ -95,6 +100,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 		cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
+		cudaMalloc(&d_ctx_tweak[thr_id], sizeof(uint64_t) * throughput);
+		exit_if_cudaerror(thr_id, __FILE__, __LINE__);
 
 		gpu_init_shown = true;
 		init[thr_id] = true;
@@ -107,10 +114,10 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 	const uint32_t Htarg = ptarget[7];
 	uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
 
-	cryptonight_extra_cpu_setData(thr_id, pdata, ptarget);
-	cryptonight_extra_cpu_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
-	cryptonight_core_cuda(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
-	cryptonight_extra_cpu_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
+	cryptonight_extra_setData(thr_id, pdata, ptarget);
+	cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
+	cryptonight_core_cuda(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
+	cryptonight_extra_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
 
 	*hashes_done = nonce - first_nonce + throughput;
@@ -121,8 +128,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 		uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39);
 		memcpy(tempdata, pdata, 76);
 		*tempnonceptr = resNonces[0];
-		cryptonight_hash(vhash, tempdata, 76);
-		if(vhash[7] <= Htarg && fulltest(vhash, ptarget))
+		const int rc = cryptonight_hash_variant(vhash, tempdata, 76, variant);
+		if(rc && (vhash[7] <= Htarg) && fulltest(vhash, ptarget))
 		{
 			res = 1;
 			work->nonces[0] = resNonces[0];
@@ -131,8 +138,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 			if(resNonces[1] != UINT32_MAX)
 			{
 				*tempnonceptr = resNonces[1];
-				cryptonight_hash(vhash, tempdata, 76);
-				if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+				const int rc = cryptonight_hash_variant(vhash, tempdata, 76, variant);
+				if(rc && (vhash[7] <= Htarg) && fulltest(vhash, ptarget)) {
 					res++;
 					work->nonces[1] = resNonces[1];
 				} else {
@@ -174,10 +181,11 @@ void free_cryptonight(int thr_id)
 	cudaFree(d_ctx_key1[thr_id]);
 	cudaFree(d_ctx_key2[thr_id]);
 	cudaFree(d_ctx_text[thr_id]);
+	cudaFree(d_ctx_tweak[thr_id]);
 	cudaFree(d_ctx_a[thr_id]);
 	cudaFree(d_ctx_b[thr_id]);
 
-	cryptonight_extra_cpu_free(thr_id);
+	cryptonight_extra_free(thr_id);
 
 	cudaDeviceSynchronize();
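
For sizing intuition: every GPU thread owns a full MEMORY = 2 MiB scratchpad in d_long_state, so the launch config bounds VRAM use almost directly. A rough budget for the TITAN V default above, as a sketch:

    // "80x24" => 80 * 24 = 1920 threads; scratchpads alone take
    // 1920 * 2 MiB = 3.75 GiB, plus the small per-thread ctx buffers.
    size_t scratch_bytes = (size_t) MEMORY * 80 * 24;  // 4,026,531,840 bytes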

13
crypto/cryptonight.h

@@ -20,7 +20,6 @@ struct uint3 blockDim;
 #define __umul64hi(a,b) a*b
 #endif
 
-
 #define MEMORY (1U << 21) // 2 MiB / 2097152 B
 #define ITER (1U << 20) // 1048576
 #define E2I_MASK 0x1FFFF0u
@@ -136,10 +135,10 @@ static inline void exit_if_cudaerror(int thr_id, const char *src, int line)
 		exit(1);
 	}
 }
 
-void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint64_t *d_long_state, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
+void cryptonight_core_cuda(int thr_id, uint32_t blocks, uint32_t threads, uint64_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
 
-void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn);
-void cryptonight_extra_cpu_init(int thr_id, uint32_t threads);
-void cryptonight_extra_cpu_free(int thr_id);
-void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
-void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *nonce, uint64_t *d_ctx_state);
+void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget);
+void cryptonight_extra_init(int thr_id);
+void cryptonight_extra_free(int thr_id);
+void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
+void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resnonce, uint32_t *d_ctx_state);
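
These constants fit together: the scratchpad holds MEMORY = 2^21 bytes, i.e. 2^17 AES blocks of 16 bytes, and E2I_MASK keeps any 32-bit word inside it as a 16-byte-aligned byte offset. A one-line check, with x a hypothetical address word:

    uint32_t off = x & E2I_MASK;  // 0 <= off < (1u << 21) and (off % 16) == 0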

10
crypto/xmr-rpc.cpp

@@ -550,18 +550,24 @@ bool rpc2_stratum_submit(struct pool_infos *pool, struct work *work)
 	}
 
 	else if (opt_algo == ALGO_CRYPTOLIGHT) {
+		int variant = 1;
 		uint32_t nonce = work->nonces[idnonce];
 		noncestr = bin2hex((unsigned char*) &nonce, 4);
 		last_found_nonce = nonce;
-		cryptolight_hash(hash, data, 76);
+		//if (cryptonight_fork > 1 && ((unsigned char*)work->data)[0] >= cryptonight_fork)
+		//	variant = ((unsigned char*)work->data)[0] - cryptonight_fork + 1;
+		cryptolight_hash_variant(hash, data, 76, variant);
 		work_set_target_ratio(work, (uint32_t*) hash);
 	}
 
 	else if (opt_algo == ALGO_CRYPTONIGHT) {
+		int variant = 0;
 		uint32_t nonce = work->nonces[idnonce];
 		noncestr = bin2hex((unsigned char*) &nonce, 4);
 		last_found_nonce = nonce;
-		cryptonight_hash(hash, data, 76);
+		if (cryptonight_fork > 1 && ((unsigned char*)work->data)[0] >= cryptonight_fork)
+			variant = ((unsigned char*)work->data)[0] - cryptonight_fork + 1;
+		cryptonight_hash_variant(hash, data, 76, variant);
 		work_set_target_ratio(work, (uint32_t*) hash);
 	}
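
Written out, the mapping takes the blob's first byte (the block major version) and the per-algo fork height: with cryptonight_fork = 7 (monero), a v7 block yields variant 1, a v8 block variant 2, and anything older falls back to variant 0. A standalone restatement of the logic above:

    int block_major = ((unsigned char *)work->data)[0];
    int variant = (cryptonight_fork > 1 && block_major >= cryptonight_fork)
                ? block_major - cryptonight_fork + 1 : 0;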

14
miner.h

@@ -279,8 +279,8 @@ extern int scanhash_blake256(int thr_id, struct work* work, uint32_t max_nonce,
 extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
-extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
-extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
+extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant);
+extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant);
 extern int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_equihash(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@@ -575,6 +575,8 @@ extern uint32_t device_plimit[MAX_GPUS];
 extern uint32_t gpus_intensity[MAX_GPUS];
 extern int opt_cudaschedule;
 
+extern int cryptonight_fork;
+
 // cuda.cpp
 int cuda_num_devices();
 void cuda_devicenames();
@@ -898,8 +900,12 @@ void blake2b_hash(void *output, const void *input);
 void blake2s_hash(void *output, const void *input);
 void bmw_hash(void *state, const void *input);
 void c11hash(void *output, const void *input);
-void cryptolight_hash(void* output, const void* input, int len);
-void cryptonight_hash(void* output, const void* input, size_t len);
+int cryptolight_hash_variant(void* output, const void* input, int len, int variant);
+void cryptolight_hash(void* output, const void* input);
+int cryptonight_hash_variant(void* output, const void* input, size_t len, int variant);
+void cryptonight_hash(void* output, const void* input);
+void monero_hash(void* output, const void* input);
+void stellite_hash(void* output, const void* input);
 void decred_hash(void *state, const void *input);
 void deephash(void *state, const void *input);
 void luffa_hash(void *state, const void *input);

10
util.cpp

@@ -2193,10 +2193,10 @@ void print_hash_tests(void)
 	c11hash(&hash[0], &buf[0]);
 	printpfx("c11", hash);
 
-	cryptolight_hash(&hash[0], &buf[0], 76);
+	cryptolight_hash(&hash[0], &buf[0]);
 	printpfx("cryptolight", hash);
 
-	cryptonight_hash(&hash[0], &buf[0], 76);
+	cryptonight_hash(&hash[0], &buf[0]);
 	printpfx("cryptonight", hash);
 
 	memset(buf, 0, 180);
@@ -2246,6 +2246,9 @@ void print_hash_tests(void)
 	lyra2Z_hash(&hash[0], &buf[0]);
 	printpfx("lyra2z", hash);
 
+	monero_hash(&hash[0], &buf[0]);
+	printpfx("monero", hash);
+
 	myriadhash(&hash[0], &buf[0]);
 	printpfx("myriad", hash);
 
@@ -2297,6 +2300,9 @@ void print_hash_tests(void)
 	skunk_hash(&hash[0], &buf[0]);
 	printpfx("skunk", hash);
 
+	stellite_hash(&hash[0], &buf[0]);
+	printpfx("stellite", hash);
+
 	s3hash(&hash[0], &buf[0]);
 	printpfx("S3", hash);
