
handle new cryptonight variants (monero v7, graft, stellite) and aeon cryptolight

Special thanks to klausT for his changes and to ystarnaud, who helped me adapt my kernel variants...

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
commit 77c4b8724e (Tanguy Pruvot, 6 years ago)
  1. algos.h (31)
  2. ccminer.cpp (41)
  3. crypto/cn_aes.cuh (1)
  4. crypto/cn_blake.cuh (2)
  5. crypto/cn_groestl.cuh (9)
  6. crypto/cn_jh.cuh (20)
  7. crypto/cn_keccak.cuh (2)
  8. crypto/cn_skein.cuh (22)
  9. crypto/cryptolight-core.cu (86)
 10. crypto/cryptolight-cpu.cpp (39)
 11. crypto/cryptolight.cu (30)
 12. crypto/cryptolight.h (13)
 13. crypto/cryptonight-core.cu (236)
 14. crypto/cryptonight-cpu.cpp (62)
 15. crypto/cryptonight-extra.cu (175)
 16. crypto/cryptonight.cu (36)
 17. crypto/cryptonight.h (13)
 18. crypto/xmr-rpc.cpp (10)
 19. miner.h (14)
 20. util.cpp (10)

algos.h (31)

@@ -72,6 +72,9 @@ enum sha_algos {
ALGO_WHIRLPOOLX,
ALGO_WILDKECCAK,
ALGO_ZR5,
ALGO_MONERO,
ALGO_GRAFT,
ALGO_STELLITE,
ALGO_AUTO,
ALGO_COUNT
};
@@ -146,6 +149,9 @@ static const char *algo_names[] = {
"whirlpoolx",
"wildkeccak",
"zr5",
"monero",
"graft",
"stellite",
"auto", /* reserved for multi algo */
""
};
@@ -206,4 +212,29 @@ static inline int algo_to_int(char* arg)
return i;
}
static inline int get_cryptonight_algo(int fork)
{
int algo = ALGO_COUNT;
switch (fork) {
case 8:
algo = ALGO_GRAFT;
break;
case 7:
algo = ALGO_MONERO;
break;
case 3:
algo = ALGO_STELLITE;
break;
default:
algo = ALGO_CRYPTONIGHT;
break;
}
return algo;
}
#endif
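
A quick note on the new helper: the fork argument is each coin's first CryptoNight-v1 block major version (7 for monero, 8 for graft, 3 for stellite), so this reverse mapping pairs with the parse_arg changes in ccminer.cpp below. A minimal usage sketch (illustrative calls, not part of the diff):

    /* Sketch: resolving the display algo from the configured fork base. */
    int algo = get_cryptonight_algo(7);  /* -> ALGO_MONERO      */
    algo     = get_cryptonight_algo(3);  /* -> ALGO_STELLITE    */
    algo     = get_cryptonight_algo(2);  /* -> ALGO_CRYPTONIGHT (default) */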

ccminer.cpp (41)

@@ -233,6 +233,8 @@ int opt_api_mcast_port = 4068;
bool opt_stratum_stats = false;
int cryptonight_fork = 1;
static char const usage[] = "\
Usage: " PROGRAM_NAME " [OPTIONS]\n\
Options:\n\
@@ -245,7 +247,7 @@ Options:\n\
blakecoin Fast Blake 256 (8 rounds)\n\
bmw BMW 256\n\
cryptolight AEON cryptonight (MEM/2)\n\
cryptonight XMR cryptonight\n\
cryptonight XMR cryptonight v1 (old)\n\
c11/flax X11 variant\n\
decred Decred Blake256\n\
deep Deepcoin\n\
@@ -253,6 +255,7 @@ Options:\n\
dmd-gr Diamond-Groestl\n\
fresh Freshcoin (shavite 80)\n\
fugue256 Fuguecoin\n\
graft Cryptonight v8\n\
groestl Groestlcoin\n"
#ifdef WITH_HEAVY_ALGO
" heavy Heavycoin\n"
@@ -267,6 +270,7 @@ Options:\n\
lyra2v2 VertCoin\n\
lyra2z ZeroCoin (3rd impl)\n\
myr-gr Myriad-Groestl\n\
monero XMR cryptonight (v7)\n\
neoscrypt FeatherCoin, Phoenix, UFO...\n\
nist5 NIST5 (TalkCoin)\n\
penta Pentablake hash (5x Blake 512)\n\
@@ -284,6 +288,7 @@ Options:\n\
skein Skein SHA2 (Skeincoin)\n\
skein2 Double Skein (Woodcoin)\n\
skunk Skein Cube Fugue Streebog\n\
stellite Cryptonight v3\n\
s3 S3 (1Coin)\n\
timetravel Machinecoin permuted x8\n\
tribus Denarius\n\
@@ -573,7 +578,10 @@ static bool get_blocktemplate(CURL *curl, struct work *work);
void get_currentalgo(char* buf, int sz)
{
snprintf(buf, sz, "%s", algo_names[opt_algo]);
int algo = opt_algo;
if (algo == ALGO_CRYPTONIGHT)
algo = get_cryptonight_algo(cryptonight_fork);
snprintf(buf, sz, "%s", algo_names[algo]);
}
void format_hashrate(double hashrate, char *output)
@@ -2372,11 +2380,16 @@ static void *miner_thread(void *userdata)
rc = scanhash_c11(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_CRYPTOLIGHT:
rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done);
rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done, 1);
break;
case ALGO_CRYPTONIGHT:
rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done);
{
int cn_variant = 0;
if (cryptonight_fork > 1 && ((unsigned char*)work.data)[0] >= cryptonight_fork)
cn_variant = ((unsigned char*)work.data)[0] - cryptonight_fork + 1;
rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done, cn_variant);
break;
}
case ALGO_DECRED:
rc = scanhash_decred(thr_id, &work, max_nonce, &hashes_done);
break;
@@ -3138,6 +3151,26 @@ void parse_arg(int key, char *arg)
case ALGO_SCRYPT_JANE: opt_nfactor = 14; break;
}
}
// cryptonight variants
switch (opt_algo) {
case ALGO_MONERO:
opt_algo = ALGO_CRYPTONIGHT;
cryptonight_fork = 7;
break;
case ALGO_GRAFT:
opt_algo = ALGO_CRYPTONIGHT;
cryptonight_fork = 8;
break;
case ALGO_STELLITE:
opt_algo = ALGO_CRYPTONIGHT;
cryptonight_fork = 3;
break;
case ALGO_CRYPTONIGHT:
cryptonight_fork = 1;
break;
}
break;
case 'b':
p = strstr(arg, ":");
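
A worked example of the variant selection added in miner_thread above: the first byte of work.data is the block major version, so with cryptonight_fork = 7 (monero) a 0x07 blob yields variant 1 and a 0x08 blob variant 2, while pre-fork blobs stay on variant 0. A standalone sketch of the same logic:

    /* Sketch: variant derivation from the blob's major version byte. */
    static int cn_variant_for(unsigned char major_version, int fork)
    {
        if (fork > 1 && major_version >= fork)
            return major_version - fork + 1; /* v7 blob on fork 7 -> variant 1 */
        return 0; /* original cryptonight */
    }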

crypto/cn_aes.cuh (1)

@@ -138,6 +138,7 @@ static const __device__ __align__(16) uint32_t d_t_fn[1024] = {
*/
#define AS_U32(addr) *((uint32_t*)(addr))
#define AS_U64(addr) *((uint64_t*)(addr))
#define AS_UINT2(addr) *((uint2*)(addr))
#define AS_UINT4(addr) *((uint4*)(addr))
#define AS_UL2(addr) *((ulonglong2*)(addr))
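
The single addition here is the AS_UINT2 cast, matching its AS_U64/AS_UINT4 siblings; a minimal usage sketch:

    /* Sketch: one vectorized 64-bit access instead of two 32-bit ones. */
    uint32_t words[4];
    uint2 v = AS_UINT2(&words[0]);
    AS_UINT2(&words[2]) = v;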

crypto/cn_blake.cuh (2)

@@ -164,7 +164,7 @@ void cn_blake_final(blake_state * __restrict__ S, uint8_t * __restrict__ digest)
}
__device__
void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint8_t * __restrict__ out)
void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint32_t * out)
{
blake_state bs;
blake_state *S = (blake_state *)&bs;

crypto/cn_groestl.cuh (9)

@@ -274,13 +274,14 @@ void cn_groestl_final(groestlHashState* __restrict__ ctx, BitSequence* __restri
for (i = GROESTL_SIZE512-hashbytelen; i < GROESTL_SIZE512; i++,j++) {
output[j] = s[i];
}
#if 0
for (i = 0; i < GROESTL_COLS512; i++) {
ctx->chaining[i] = 0;
}
for (i = 0; i < GROESTL_SIZE512; i++) {
ctx->buffer[i] = 0;
}
#endif
}
__device__
@@ -336,12 +337,12 @@ void cn_groestl_init(groestlHashState* ctx)
}
__device__
void cn_groestl(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval)
void cn_groestl(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval)
{
DataLength databitlen = len << 3;
groestlHashState context;
cn_groestl_init(&context);
cn_groestl_update(&context, data, databitlen);
cn_groestl_final(&context, hashval);
cn_groestl_update(&context, (BitSequence*) data, databitlen);
cn_groestl_final(&context, (BitSequence*) hashval);
}

crypto/cn_jh.cuh (20)

@@ -198,8 +198,9 @@ void cn_jh_update(jhHashState * __restrict__ state, const uint8_t * __restrict__
databitlen = 0;
}
if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) ) {
memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ) ;
if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) )
{
memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) );
index = 64-(state->datasize_in_buffer >> 3);
databitlen = databitlen - (512 - state->datasize_in_buffer);
cn_jh_F8(state);
@@ -222,7 +223,7 @@ void cn_jh_update(jhHashState * __restrict__ state, const uint8_t * __restrict__
/* pad the message, process the padded block(s), truncate the hash value H to obtain the message digest */
__device__
void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashval)
void cn_jh_final(jhHashState * __restrict__ state, uint8_t * __restrict__ hashval)
{
unsigned int i;
//uint32_t *bufptr = (uint32_t *)state->buffer;
@@ -244,7 +245,7 @@ void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashv
} else {
/*set the rest of the bytes in the buffer to 0*/
/* set the rest of the bytes in the buffer to 0 */
if ( (state->datasize_in_buffer & 7) == 0) {
for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) state->buffer[i] = 0;
} else {
@@ -268,7 +269,8 @@ void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashv
cn_jh_F8(state);
}
MEMCPY4(hashval, ((unsigned char*)state->x) + 64 + 32, 8);
memcpy(hashval, ((unsigned char*)state->x) + 64 + 32, 32);
//MEMCPY4(hashval, ((unsigned char*)state->x) + 64 + 32, 8);
}
__device__
@@ -277,12 +279,12 @@ void cn_jh_init(jhHashState *state, int hashbitlen)
state->databitlen = 0;
state->datasize_in_buffer = 0;
state->hashbitlen = hashbitlen;
//memcpy(state->x, d_JH256_H0, 128);
MEMCPY8(state->x, d_JH256_H0, 128 / 8);
memcpy(state->x, d_JH256_H0, 128);
//MEMCPY8(state->x, d_JH256_H0, 128 / 8);
}
__device__
void cn_jh256(const uint8_t * __restrict__ data, DataLength len, uint32_t * __restrict__ hashval)
void cn_jh(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval)
{
const int hashbitlen = 256;
DataLength databitlen = len << 3;
@@ -290,5 +292,5 @@ void cn_jh256(const uint8_t * __restrict__ data, DataLength len, uint32_t * __re
cn_jh_init(&state, hashbitlen);
cn_jh_update(&state, data, databitlen);
cn_jh_final(&state, hashval);
cn_jh_final(&state, (uint8_t*) hashval);
}

crypto/cn_keccak.cuh (2)

@@ -195,7 +195,7 @@ void cn_keccakf(uint64_t *s)
}
__device__ __forceinline__
void cn_keccak(const uint8_t * __restrict__ in, uint8_t * __restrict__ md)
void cn_keccak(const uint32_t * __restrict__ in, uint64_t * __restrict__ md)
{
uint64_t st[25];

crypto/cn_skein.cuh (22)

@@ -4,19 +4,15 @@ typedef unsigned int uint_t; /* native unsigned integer */
#define SKEIN_256_STATE_WORDS ( 4)
#define SKEIN_512_STATE_WORDS ( 8)
#define SKEIN1024_STATE_WORDS (16)
#define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS)
#define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS)
#define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS)
#define SKEIN_256_STATE_BITS (64*SKEIN_256_STATE_WORDS)
#define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS)
#define SKEIN1024_STATE_BITS (64*SKEIN1024_STATE_WORDS)
#define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS)
#define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS)
#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS)
#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32))
#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
@@ -119,7 +115,7 @@ typedef struct {
} skeinHashState;
__device__
void cn_skein256_init(skeinHashState *state, size_t hashBitLen)
void cn_skein_init(skeinHashState *state, size_t hashBitLen)
{
const uint64_t SKEIN_512_IV_256[] =
{
@@ -258,14 +254,12 @@ void cn_skein_block(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restr
}
__device__
void cn_skein256_update(skeinHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen)
void cn_skein_update(skeinHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen)
{
if ((databitlen & 7) == 0) {
cn_skein_block(&state->u.ctx_512, data, databitlen >> 3);
}
else {
size_t bCnt = (databitlen >> 3) + 1;
uint8_t b,mask;
@@ -280,7 +274,7 @@ void cn_skein256_update(skeinHashState * __restrict__ state, const uint8_t * __r
}
__device__
void cn_skein256_final(skeinHashState * __restrict__ state, uint32_t * __restrict__ hashVal)
void cn_skein_final(skeinHashState * __restrict__ state, uint8_t * __restrict__ hashVal)
{
uint64_t X[SKEIN_512_STATE_WORDS];
Skein_512_Ctxt_t *ctx = (Skein_512_Ctxt_t *)&state->u.ctx_512;
@@ -305,13 +299,13 @@ void cn_skein256_final(skeinHashState * __restrict__ state, uint32_t * __restric
((uint64_t *)ctx->b)[0] = (uint64_t)i;
Skein_Start_New_Type(ctx, OUT_FINAL);
cn_skein_processblock(ctx, ctx->b, 1, sizeof(uint64_t));
memcpy(hashVal + (i*SKEIN_512_BLOCK_BYTES/sizeof(uint32_t)), ctx->X, n);
memcpy(hashVal + (i*SKEIN_512_BLOCK_BYTES), ctx->X, n);
memcpy(ctx->X, X, sizeof(X)); // restore the counter mode key for next time
}
}
__device__
void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * __restrict__ hashval)
void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval)
{
int hashbitlen = 256;
DataLength databitlen = len << 3;
@@ -319,7 +313,7 @@ void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * __re
state.statebits = 64*SKEIN_512_STATE_WORDS;
cn_skein256_init(&state, hashbitlen);
cn_skein256_update(&state, data, databitlen);
cn_skein256_final(&state, hashval);
cn_skein_init(&state, hashbitlen);
cn_skein_update(&state, data, databitlen);
cn_skein_final(&state, (uint8_t*) hashval);
}
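
One subtle change above: cn_skein_final now receives hashVal as uint8_t*, so the per-block output offset is expressed in bytes instead of uint32_t words; both forms address the same location. A sketch of the equivalence:

    /* Sketch: identical addresses, different pointer-arithmetic units. */
    uint8_t  *bytes = hashVal;              /* new: byte pointer */
    uint32_t *words = (uint32_t*) hashVal;  /* old: word pointer */
    /* &bytes[i * SKEIN_512_BLOCK_BYTES]
       == (uint8_t*) &words[i * SKEIN_512_BLOCK_BYTES / sizeof(uint32_t)] */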

crypto/cryptolight-core.cu (86)

@@ -36,7 +36,7 @@ void cryptolight_core_gpu_phase1(int threads, uint32_t * long_state, uint32_t *
if(thread < threads)
{
const int oft = thread * 52 + sub + 16; // not aligned 16!
const int oft = thread * 50 + sub + 16; // not aligned 16!
const int long_oft = (thread << LONG_SHL_IDX) + sub;
uint32_t __align__(16) key[40];
uint32_t __align__(16) text[4];
@@ -57,8 +57,10 @@ void cryptolight_core_gpu_phase1(int threads, uint32_t * long_state, uint32_t *
}
}
// --------------------------------------------------------------------------------------------------------------
__global__
void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
void cryptolight_old_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
{
__shared__ uint32_t __align__(16) sharedMemory[1024];
@@ -209,6 +211,70 @@ void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int
#endif // __CUDA_ARCH__ >= 300
}
__device__ __forceinline__ void store_variant1(uint32_t* long_state)
{
uint4* Z = (uint4*) long_state;
const uint32_t tmp = (Z->z >> 24); // __byte_perm(src, 0, 0x7773);
const uint32_t index = (((tmp >> 3) & 6u) | (tmp & 1u)) << 1;
Z->z = (Z->z & 0x00ffffffu) | ((tmp ^ ((0x75310u >> index) & 0x30u)) << 24);
}
#define MUL_SUM_XOR_DST_1(a,c,dst,tweak) { \
uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], ((uint64_t *)dst)[0], &hi) + ((uint64_t *)c)[1]; \
hi += ((uint64_t *)c)[0]; \
((uint64_t *)c)[0] = ((uint64_t *)dst)[0] ^ hi; \
((uint64_t *)c)[1] = ((uint64_t *)dst)[1] ^ lo; \
((uint64_t *)dst)[0] = hi; \
((uint64_t *)dst)[1] = lo ^ tweak; }
__global__
void cryptolight_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
uint32_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
uint64_t * __restrict__ d_tweak)
{
__shared__ __align__(16) uint32_t sharedMemory[1024];
cn_aes_gpu_init(sharedMemory);
__syncthreads();
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
if (thread < threads)
{
const uint32_t batchsize = ITER >> (2 + bfactor);
const uint32_t start = partidx * batchsize;
const uint32_t end = start + batchsize;
const uint32_t longptr = thread << LONG_SHL_IDX;
uint32_t * long_state = &d_long_state[longptr];
uint64_t tweak = d_tweak[thread];
void * ctx_a = (void*)(&d_ctx_a[thread << 2]);
void * ctx_b = (void*)(&d_ctx_b[thread << 2]);
uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4
uint4 B = AS_UINT4(ctx_b);
uint32_t* a = (uint32_t*)&A;
uint32_t* b = (uint32_t*)&B;
for (int i = start; i < end; i++)
{
uint32_t c[4];
uint32_t j = (A.x >> 2) & E2I_MASK2;
cn_aes_single_round(sharedMemory, &long_state[j], c, a);
XOR_BLOCKS_DST(c, b, &long_state[j]);
store_variant1(&long_state[j]);
MUL_SUM_XOR_DST_1(c, a, &long_state[(c[0] >> 2) & E2I_MASK2], tweak);
j = (A.x >> 2) & E2I_MASK2;
cn_aes_single_round(sharedMemory, &long_state[j], b, a);
XOR_BLOCKS_DST(b, c, &long_state[j]);
store_variant1(&long_state[j]);
MUL_SUM_XOR_DST_1(b, a, &long_state[(b[0] >> 2) & E2I_MASK2], tweak);
}
if (bfactor) {
AS_UINT4(ctx_a) = A;
AS_UINT4(ctx_b) = B;
}
}
}
__global__
void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint32_t * ctx_state, uint32_t * ctx_key2)
{
@@ -222,7 +288,7 @@ void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint3
if(thread < threads)
{
const int long_oft = (thread << LONG_SHL_IDX) + sub;
const int oft = thread * 52 + sub + 16;
const int oft = thread * 50 + sub + 16;
uint32_t __align__(16) key[40];
uint32_t __align__(16) text[4];
@@ -251,8 +317,8 @@ void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint3
extern int device_bfactor[MAX_GPUS];
__host__
void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint64_t *d_ctx_state,
uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state,
uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak)
{
dim3 grid(blocks);
dim3 block(threads);
@@ -265,17 +331,21 @@ void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_
int i, partcount = 1 << bfactor;
int dev_id = device_map[thr_id];
cryptolight_core_gpu_phase1 <<<grid, block8 >>>(blocks*threads, d_long_state, (uint32_t*)d_ctx_state, d_ctx_key1);
cryptolight_core_gpu_phase1 <<<grid, block8 >>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key1);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
if(partcount > 1) usleep(bsleep);
for(i = 0; i < partcount; i++)
{
cryptolight_core_gpu_phase2 <<<grid, (device_sm[dev_id] >= 300 ? block4 : block)>>>(blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
dim3 b = device_sm[dev_id] >= 300 ? block4 : block;
if (variant == 0)
cryptolight_old_gpu_phase2 <<<grid, b>>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
else
cryptolight_gpu_phase2 <<<grid, b>>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
if(partcount > 1) usleep(bsleep);
}
cryptolight_core_gpu_phase3 <<<grid, block8 >>>(blocks*threads, d_long_state, (uint32_t*)d_ctx_state, d_ctx_key2);
cryptolight_core_gpu_phase3 <<<grid, block8 >>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key2);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}
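
store_variant1 above is the CryptoNight-v1 single-byte tweak, applied to byte 11 of each 16-byte block as it is written back (the high byte of Z->z). A worked CPU-side sketch of the packed-table trick:

    /* Sketch: byte tweak used by store_variant1 (CryptoNight variant 1).
     * Bits 0, 4 and 5 of the byte select a nibble of the packed table
     * 0x75310; that nibble, masked to 0x30, is XORed back in.
     * Example: 0x10 -> index ((2 & 6) | 0) << 1 = 4,
     *          (0x75310 >> 4) & 0x30 = 0x30, result 0x10 ^ 0x30 = 0x20. */
    static unsigned char cn_v1_tweak_byte(unsigned char tmp)
    {
        const unsigned index = (((tmp >> 3) & 6u) | (tmp & 1u)) << 1;
        return tmp ^ ((0x75310u >> index) & 0x30u);
    }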

crypto/cryptolight-cpu.cpp (39)

@@ -22,6 +22,16 @@ struct cryptonight_ctx {
oaes_ctx* aes_ctx;
};
static void cryptolight_store_variant(void* state, int variant) {
if (variant == 1) {
// use variant 1 like monero since june 2018
const uint8_t tmp = ((const uint8_t*)(state))[11];
const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30);
}
}
static void do_blake_hash(const void* input, int len, void* output)
{
uchar hash[32];
@@ -132,14 +142,14 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, ui
((uint64_t*) dst)[0] += ((uint64_t*) c)[0];
}
static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak) {
uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
hi += ((uint64_t*) c)[0];
((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
((uint64_t*) dst)[0] = hi;
((uint64_t*) dst)[1] = lo;
((uint64_t*) dst)[1] = variant ? lo ^ tweak : lo;
}
static void copy_block(uint8_t* dst, const uint8_t* src) {
@@ -157,13 +167,18 @@ static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
}
static void cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx)
static int cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx, const int variant)
{
size_t i, j;
if (variant && len < 43)
return 0;
keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len);
ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
const uint64_t tweak = variant ? *((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0;
oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
#undef RND
@@ -186,14 +201,16 @@ static void cryptolight_hash_ctx(void* output, const void* input, const int len,
j = e2i(ctx->a);
aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a);
xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]);
cryptolight_store_variant(&ctx->long_state[j], variant);
mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)]);
mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)], variant, tweak);
j = e2i(ctx->a);
aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a);
xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]);
cryptolight_store_variant(&ctx->long_state[j], variant);
mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)]);
mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)], variant, tweak);
}
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
@@ -219,11 +236,19 @@ static void cryptolight_hash_ctx(void* output, const void* input, const int len,
if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo);
oaes_free((OAES_CTX **) &ctx->aes_ctx);
return 1;
}
void cryptolight_hash(void* output, const void* input, int len)
int cryptolight_hash_variant(void* output, const void* input, int len, int variant)
{
struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
cryptolight_hash_ctx(output, input, len, ctx);
int rc = cryptolight_hash_ctx(output, input, len, ctx, variant);
free(ctx);
return rc;
}
void cryptolight_hash(void* output, const void* input)
{
cryptolight_hash_variant(output, input, 76, 1);
}
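
The new len < 43 guard exists because the variant-1 tweak reads eight bytes at blob offset 35; in a standard 76-byte work blob those span the tail of the previous block hash plus the 32-bit nonce at offset 39. A sketch of the extraction (memcpy used for the unaligned read; the helper name is illustrative):

    /* Sketch: tweak1_2 = blob bytes 35..42 XOR keccak state word 24. */
    static uint64_t cn_tweak1_2(const uint8_t *input, uint64_t keccak_w24)
    {
        uint64_t t;
        memcpy(&t, input + 35, sizeof(t)); /* unaligned-safe read */
        return t ^ keccak_w24;
    }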

crypto/cryptolight.cu (30)

@@ -7,16 +7,17 @@ static __thread uint32_t cn_blocks = 32;
static __thread uint32_t cn_threads = 16;
static uint32_t *d_long_state[MAX_GPUS];
static uint64_t *d_ctx_state[MAX_GPUS];
static uint32_t *d_ctx_state[MAX_GPUS];
static uint32_t *d_ctx_key1[MAX_GPUS];
static uint32_t *d_ctx_key2[MAX_GPUS];
static uint32_t *d_ctx_text[MAX_GPUS];
static uint64_t *d_ctx_tweak[MAX_GPUS];
static uint32_t *d_ctx_a[MAX_GPUS];
static uint32_t *d_ctx_b[MAX_GPUS];
static bool init[MAX_GPUS] = { 0 };
extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant)
{
int res = 0;
uint32_t throughput = 0;
@@ -26,6 +27,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
uint32_t *nonceptr = (uint32_t*) (&pdata[39]);
const uint32_t first_nonce = *nonceptr;
uint32_t nonce = first_nonce;
int dev_id = device_map[thr_id];
if(opt_benchmark) {
ptarget[7] = 0x00ff;
@@ -33,6 +35,10 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
if(!init[thr_id])
{
if (!device_config[thr_id] && strcmp(device_name[dev_id], "TITAN V") == 0) {
device_config[thr_id] = strdup("80x32");
}
if (device_config[thr_id]) {
sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads);
throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
@@ -63,11 +69,11 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
}
const size_t alloc = MEMORY * throughput;
cryptonight_extra_cpu_init(thr_id, throughput);
cryptonight_extra_init(thr_id);
cudaMalloc(&d_long_state[thr_id], alloc);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_state[thr_id], 26 * sizeof(uint64_t) * throughput);
cudaMalloc(&d_ctx_state[thr_id], 25 * sizeof(uint64_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
@@ -79,6 +85,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_tweak[thr_id], sizeof(uint64_t) * throughput);
init[thr_id] = true;
}
@@ -90,10 +97,10 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
const uint32_t Htarg = ptarget[7];
uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
cryptonight_extra_cpu_setData(thr_id, pdata, ptarget);
cryptonight_extra_cpu_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
cryptolight_core_cpu_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
cryptonight_extra_cpu_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
cryptonight_extra_setData(thr_id, pdata, ptarget);
cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
cryptolight_core_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
cryptonight_extra_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
*hashes_done = nonce - first_nonce + throughput;
@@ -104,7 +111,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39);
memcpy(tempdata, pdata, 76);
*tempnonceptr = resNonces[0];
cryptolight_hash(vhash, tempdata, 76);
cryptolight_hash_variant(vhash, tempdata, 76, variant);
if(vhash[7] <= Htarg && fulltest(vhash, ptarget))
{
res = 1;
@@ -114,7 +121,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
if(resNonces[1] != UINT32_MAX)
{
*tempnonceptr = resNonces[1];
cryptolight_hash(vhash, tempdata, 76);
cryptolight_hash_variant(vhash, tempdata, 76, variant);
if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
res++;
work->nonces[1] = resNonces[1];
@@ -157,10 +164,11 @@ void free_cryptolight(int thr_id)
cudaFree(d_ctx_key1[thr_id]);
cudaFree(d_ctx_key2[thr_id]);
cudaFree(d_ctx_text[thr_id]);
cudaFree(d_ctx_tweak[thr_id]);
cudaFree(d_ctx_a[thr_id]);
cudaFree(d_ctx_b[thr_id]);
cryptonight_extra_cpu_free(thr_id);
cryptonight_extra_free(thr_id);
cudaDeviceSynchronize();
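
For scale, a rough sketch of the per-thread device memory this init path now allocates (MEMORY is the 1 MiB cryptolight scratchpad; the state buffer shrank from 26 to 25 uint64 words and the 8-byte tweak buffer is new):

    /* Sketch: approximate device bytes per hashing thread after this change. */
    size_t per_thread = MEMORY                    /* long_state scratchpad */
                      + 25 * sizeof(uint64_t)     /* keccak state (was 26) */
                      + 2 * 40 * sizeof(uint32_t) /* ctx_key1 + ctx_key2   */
                      + 2 *  4 * sizeof(uint32_t) /* ctx_a + ctx_b         */
                      + sizeof(uint64_t);         /* ctx_tweak             */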

crypto/cryptolight.h (13)

@@ -134,10 +134,11 @@ static inline void exit_if_cudaerror(int thr_id, const char *src, int line)
exit(1);
}
}
void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn);
void cryptonight_extra_cpu_init(int thr_id, uint32_t threads);
void cryptonight_extra_cpu_free(int thr_id);
void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *nonce, uint64_t *d_ctx_state);
void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget);
void cryptonight_extra_init(int thr_id/*, uint32_t threads*/);
void cryptonight_extra_free(int thr_id);
void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces, uint32_t *d_ctx_state);

crypto/cryptonight-core.cu (236)

@@ -2,47 +2,55 @@
#include <stdint.h>
#include <string.h>
#include <sys/time.h>
#ifndef _WIN32
#include <unistd.h>
#endif
#include <cuda.h>
#include <cuda_runtime.h>
#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
#undef __shfl
#define __shfl(var, srcLane, width) __shfl_sync(0xFFFFFFFFu, var, srcLane, width)
#endif
#include "cryptonight.h"
#define LONG_SHL32 19 // 1<<19
#define LONG_SHL32 19 // 1<<19 (uint32_t* index)
#define LONG_SHL64 18 // 1<<18 (uint64_t* index)
#define LONG_LOOPS32 0x80000U
#define LONG_LOOPS64 0x40000U
#include "cn_aes.cuh"
__global__
//__launch_bounds__(128, 9) // 56 registers
void cryptonight_core_gpu_phase1(const uint32_t threads, uint64_t * long_state, uint64_t * const ctx_state, uint32_t * ctx_key1)
void cryptonight_gpu_phase1(const uint32_t threads, uint32_t * __restrict__ d_long_state,
uint32_t * __restrict__ ctx_state, uint32_t * __restrict__ ctx_key1)
{
__shared__ __align__(16) uint32_t sharedMemory[1024];
cn_aes_gpu_init(sharedMemory);
__syncthreads();
__shared__ uint32_t sharedMemory[1024];
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3;
const uint32_t sub = (threadIdx.x & 7) << 1; // 0 2 .. 14
if(thread < threads)
{
const uint32_t long_oft = (thread << LONG_SHL64) + sub;
const uint32_t* ctx_key = &ctx_key1[thread * 40U];
uint4 keys[10];
#pragma unroll 10 // load 160 bytes
for (int i = 0; i < 10; i ++)
keys[i] = AS_UINT4(&ctx_key[i*4]);
cn_aes_gpu_init(sharedMemory);
__syncthreads();
uint4 text = AS_UINT4(&ctx_state[thread * 26U + sub + 8U]);
const uint32_t sub = (threadIdx.x & 0x7U) << 2;
uint32_t *longstate = &d_long_state[(thread << LONG_SHL32) + sub];
uint32_t __align__(8) key[40];
MEMCPY8(key, &ctx_key1[thread * 40U], 20);
uint32_t __align__(8) text[4];
MEMCPY8(text, &ctx_state[thread * 50U + sub + 16U], 2);
for (uint32_t i = 0; i < LONG_LOOPS64; i += 16U) {
cn_aes_pseudo_round_mut_uint4(sharedMemory, text, keys);
AS_UINT4(&long_state[long_oft + i]) = text;
for(int i = 0; i < LONG_LOOPS32; i += 32)
{
cn_aes_pseudo_round_mut(sharedMemory, text, key);
MEMCPY8(&longstate[i], text, 2);
}
}
}
// --------------------------------------------------------------------------------------------------------------
__device__ __forceinline__ ulonglong2 cuda_mul128(const uint64_t multiplier, const uint64_t multiplicand)
{
ulonglong2 product;
@@ -59,8 +67,7 @@ static __forceinline__ __device__ ulonglong2 operator ^ (const ulonglong2 &a, co
return make_ulonglong2(a.x ^ b.x, a.y ^ b.y);
}
#undef MUL_SUM_XOR_DST
__device__ __forceinline__ void MUL_SUM_XOR_DST(const uint64_t m, uint4 &a, void* far_dst)
__device__ __forceinline__ void MUL_SUM_XOR_DST_0(const uint64_t m, uint4 &a, void* far_dst)
{
ulonglong2 d = AS_UL2(far_dst);
ulonglong2 p = cuda_mul128(m, d.x);
@@ -73,8 +80,8 @@ __global__
#if __CUDA_ARCH__ >= 500
//__launch_bounds__(128,12) /* force 40 regs to allow -l ...x32 */
#endif
void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor, const uint32_t partidx,
uint64_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
void cryptonight_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b)
{
__shared__ __align__(16) uint32_t sharedMemory[1024];
cn_aes_gpu_init(sharedMemory);
@@ -84,7 +91,7 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor,
if (thread < threads)
{
const uint32_t batchsize = ITER >> (2U + bfactor);
const uint32_t batchsize = ITER >> (2 + bfactor);
const uint32_t start = partidx * batchsize;
const uint32_t end = start + batchsize;
@@ -101,12 +108,12 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor,
uint32_t j = (A.x & E2I_MASK) >> 3;
cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C);
AS_UINT4(&long_state[j]) = C ^ B; // st.global.u32.v4
MUL_SUM_XOR_DST((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3]);
MUL_SUM_XOR_DST_0((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3]);
j = (A.x & E2I_MASK) >> 3;
cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B);
AS_UINT4(&long_state[j]) = C ^ B;
MUL_SUM_XOR_DST((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3]);
MUL_SUM_XOR_DST_0((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3]);
}
if (bfactor) {
@@ -116,71 +123,194 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor,
}
}
// --------------------------------------------------------------------------------------------------------------
__device__ __forceinline__ void store_variant1(uint64_t* long_state, uint4 Z)
{
const uint32_t tmp = (Z.z >> 24); // __byte_perm(src, 0, 0x7773);
const uint32_t index = (((tmp >> 3) & 6u) | (tmp & 1u)) << 1;
Z.z = (Z.z & 0x00ffffffu) | ((tmp ^ ((0x75310u >> index) & 0x30u)) << 24);
AS_UINT4(long_state) = Z;
}
__device__ __forceinline__ void store_variant2(uint64_t* long_state, uint4 Z)
{
const uint32_t tmp = (Z.z >> 24); // __byte_perm(src, 0, 0x7773);
const uint32_t index = (((tmp >> 4) & 6u) | (tmp & 1u)) << 1;
Z.z = (Z.z & 0x00ffffffu) | ((tmp ^ ((0x75312u >> index) & 0x30u)) << 24);
AS_UINT4(long_state) = Z;
}
__device__ __forceinline__ void MUL_SUM_XOR_DST_1(const uint64_t m, uint4 &a, void* far_dst, uint64_t tweak)
{
ulonglong2 d = AS_UL2(far_dst);
ulonglong2 p = cuda_mul128(m, d.x);
p += AS_UL2(&a);
AS_UL2(&a) = p ^ d;
p.y = p.y ^ tweak;
AS_UL2(far_dst) = p;
}
__global__
void cryptonight_core_gpu_phase3(const uint32_t threads, const uint64_t * long_state, uint64_t * ctx_state, uint32_t * __restrict__ ctx_key2)
void monero_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
uint64_t * __restrict__ d_tweak)
{
__shared__ __align__(16) uint32_t sharedMemory[1024];
cn_aes_gpu_init(sharedMemory);
__syncthreads();
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3U;
const uint32_t sub = (threadIdx.x & 7U) << 1U;
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
if (thread < threads)
{
const uint32_t batchsize = ITER >> (2 + bfactor);
const uint32_t start = partidx * batchsize;
const uint32_t end = start + batchsize;
uint64_t tweak = d_tweak[thread];
void * ctx_a = (void*)(&d_ctx_a[thread << 2]);
void * ctx_b = (void*)(&d_ctx_b[thread << 2]);
uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4
uint4 B = AS_UINT4(ctx_b);
if(thread < threads)
uint64_t * long_state = &d_long_state[thread << LONG_SHL64];
for (int i = start; i < end; i++) // end = 262144
{
uint4 C;
uint32_t j = (A.x & E2I_MASK) >> 3;
cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C);
store_variant1(&long_state[j], C ^ B); // st.global
MUL_SUM_XOR_DST_1((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3], tweak);
j = (A.x & E2I_MASK) >> 3;
cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B);
store_variant1(&long_state[j], C ^ B);
MUL_SUM_XOR_DST_1((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3], tweak);
}
if (bfactor) {
AS_UINT4(ctx_a) = A;
AS_UINT4(ctx_b) = B;
}
}
}
// --------------------------------------------------------------------------------------------------------------
__global__
void stellite_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
uint64_t * __restrict__ d_tweak)
{
__shared__ __align__(16) uint32_t sharedMemory[1024];
cn_aes_gpu_init(sharedMemory);
__syncthreads();
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
if (thread < threads)
{
const uint32_t long_oft = (thread << LONG_SHL64) + sub;
const uint32_t st_oft = (thread * 26U) + sub + 8U;
const uint32_t batchsize = ITER >> (2 + bfactor);
const uint32_t start = partidx * batchsize;
const uint32_t end = start + batchsize;
uint64_t tweak = d_tweak[thread];
void * ctx_a = (void*)(&d_ctx_a[thread << 2]);
void * ctx_b = (void*)(&d_ctx_b[thread << 2]);
uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4
uint4 B = AS_UINT4(ctx_b);
uint64_t * long_state = &d_long_state[thread << LONG_SHL64];
for (int i = start; i < end; i++) // end = 262144
{
uint4 C;
uint32_t j = (A.x & E2I_MASK) >> 3;
cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C);
store_variant2(&long_state[j], C ^ B); // st.global
MUL_SUM_XOR_DST_1((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3], tweak);
j = (A.x & E2I_MASK) >> 3;
cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B);
store_variant2(&long_state[j], C ^ B);
MUL_SUM_XOR_DST_1((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3], tweak);
}
if (bfactor) {
AS_UINT4(ctx_a) = A;
AS_UINT4(ctx_b) = B;
}
}
}
// --------------------------------------------------------------------------------------------------------------
uint4 key[10];
const uint32_t* ctx_key = &ctx_key2[thread * 40U];
#pragma unroll 10 // 160 bytes
for (int i = 0; i < 10; i++)
key[i] = AS_UINT4(&ctx_key[i*4U]);
__global__
void cryptonight_gpu_phase3(const uint32_t threads, const uint32_t * __restrict__ d_long_state,
uint32_t * __restrict__ d_ctx_state, const uint32_t * __restrict__ d_ctx_key2)
{
__shared__ uint32_t sharedMemory[1024];
cn_aes_gpu_init(sharedMemory);
__syncthreads();
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3;
uint4 text = AS_UINT4(&ctx_state[st_oft]);
if(thread < threads)
{
const int sub = (threadIdx.x & 7) << 2;
const uint32_t *longstate = &d_long_state[(thread << LONG_SHL32) + sub];
uint32_t key[40], text[4];
MEMCPY8(key, d_ctx_key2 + thread * 40, 20);
MEMCPY8(text, d_ctx_state + thread * 50 + sub + 16, 2);
for(uint32_t i = 0; i < LONG_LOOPS64; i += 16U)
for(int i = 0; i < LONG_LOOPS32; i += 32)
{
uint4 st = AS_UINT4(&long_state[long_oft + i]);
text = text ^ st;
cn_aes_pseudo_round_mut_uint4(sharedMemory, text, key);
#pragma unroll
for(int j = 0; j < 4; ++j)
text[j] ^= longstate[i + j];
cn_aes_pseudo_round_mut(sharedMemory, text, key);
}
AS_UINT4(&ctx_state[st_oft]) = text;
MEMCPY8(d_ctx_state + thread * 50 + sub + 16, text, 2);
}
}
// --------------------------------------------------------------------------------------------------------------
extern int device_bfactor[MAX_GPUS];
__host__
void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint64_t *d_long_state, uint64_t *d_ctx_state,
uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
void cryptonight_core_cuda(int thr_id, uint32_t blocks, uint32_t threads, uint64_t *d_long_state, uint32_t *d_ctx_state,
uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak)
{
dim3 grid(blocks);
dim3 block(threads);
//dim3 block2(threads << 1);
dim3 block4(threads << 2);
dim3 block8(threads << 3);
const uint32_t bfactor = (uint32_t) device_bfactor[thr_id];
const uint32_t partcount = 1 << bfactor;
const uint16_t bfactor = (uint16_t) device_bfactor[thr_id];
const uint32_t partcount = 1U << bfactor;
const uint32_t throughput = (uint32_t) (blocks*threads);
const int bsleep = bfactor ? 100 : 0;
const int dev_id = device_map[thr_id];
cryptonight_core_gpu_phase1 <<<grid, block8>>> (throughput, d_long_state, d_ctx_state, d_ctx_key1);
cryptonight_gpu_phase1 <<<grid, block8>>> (throughput, (uint32_t*) d_long_state, d_ctx_state, d_ctx_key1);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
if(partcount > 1) usleep(bsleep);
for (uint32_t i = 0; i < partcount; i++)
{
dim3 b = device_sm[dev_id] >= 300 ? block4 : block;
cryptonight_core_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
if (variant == 0)
cryptonight_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
else if (variant == 1 || cryptonight_fork == 8)
monero_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
else if (variant == 2 && cryptonight_fork == 3)
stellite_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
if(partcount > 1) usleep(bsleep);
}
cryptonight_core_gpu_phase3 <<<grid, block8>>> (throughput, d_long_state, d_ctx_state, d_ctx_key2);
//cudaDeviceSynchronize();
//exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cryptonight_gpu_phase3 <<<grid, block8>>> (throughput, (uint32_t*) d_long_state, d_ctx_state, d_ctx_key2);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}
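
monero_gpu_phase2 and stellite_gpu_phase2 above differ only in the write-back byte tweak: stellite (variant 2) shifts by 4 instead of 3 and packs 0x75312 instead of 0x75310. A side-by-side CPU sketch of the two tweaks:

    /* Sketch: the two write-back byte tweaks, scalar form. */
    static unsigned char tweak_v1(unsigned char t) /* monero / graft */
    {
        return t ^ ((0x75310u >> ((((t >> 3) & 6u) | (t & 1u)) << 1)) & 0x30u);
    }
    static unsigned char tweak_v2(unsigned char t) /* stellite */
    {
        return t ^ ((0x75312u >> ((((t >> 4) & 6u) | (t & 1u)) << 1)) & 0x30u);
    }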

crypto/cryptonight-cpu.cpp (62)

@@ -12,6 +12,20 @@ extern "C" {
#include "cpu/c_keccak.h"
}
static void cryptonight_store_variant(void* state, int variant) {
if (variant == 1 || cryptonight_fork == 8) {
// monero, and graft ?
const uint8_t tmp = ((const uint8_t*)(state))[11];
const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30);
} else if (variant == 2 && cryptonight_fork == 3) {
// stellite
const uint8_t tmp = ((const uint8_t*)(state))[11];
const uint8_t index = (((tmp >> 4) & 6) | (tmp & 1)) << 1;
((uint8_t*)(state))[11] = tmp ^ ((0x75312 >> index) & 0x30);
}
}
struct cryptonight_ctx {
uint8_t long_state[MEMORY];
union cn_slow_hash_state state;
@@ -130,14 +144,14 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, ui
((uint64_t*) dst)[0] += ((uint64_t*) c)[0];
}
static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak1_2) {
uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
hi += ((uint64_t*) c)[0];
((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
((uint64_t*) dst)[0] = hi;
((uint64_t*) dst)[1] = lo;
((uint64_t*) dst)[1] = variant ? lo ^ tweak1_2 : lo;
}
static void copy_block(uint8_t* dst, const uint8_t* src) {
@@ -155,13 +169,18 @@ static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
}
static void cryptonight_hash_ctx(void* output, const void* input, size_t len, struct cryptonight_ctx* ctx)
static int cryptonight_hash_ctx(void* output, const void* input, const size_t len, struct cryptonight_ctx* ctx, const int variant)
{
size_t i, j;
if (variant && len < 43)
return 0;
keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len);
ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
const uint64_t tweak1_2 = variant ? *((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0;
oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
#undef RND
@@ -184,14 +203,16 @@ static void cryptonight_hash_ctx(void* output, const void* input, size_t len, st
j = e2i(ctx->a) * AES_BLOCK_SIZE;
aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a);
xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]);
cryptonight_store_variant(&ctx->long_state[j], variant);
mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE]);
mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE], variant, tweak1_2);
j = e2i(ctx->a) * AES_BLOCK_SIZE;
aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a);
xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]);
cryptonight_store_variant(&ctx->long_state[j], variant);
mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE]);
mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE], variant, tweak1_2);
}
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
@@ -217,11 +238,38 @@ static void cryptonight_hash_ctx(void* output, const void* input, size_t len, st
if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo);
oaes_free((OAES_CTX **) &ctx->aes_ctx);
return 1;
}
void cryptonight_hash(void* output, const void* input, size_t len)
int cryptonight_hash_variant(void* output, const void* input, size_t len, int variant)
{
struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
cryptonight_hash_ctx(output, input, len, ctx);
int rc = cryptonight_hash_ctx(output, input, len, ctx, variant);
free(ctx);
return rc;
}
void cryptonight_hash(void* output, const void* input)
{
cryptonight_fork = 1;
cryptonight_hash_variant(output, input, 76, 0);
}
void graft_hash(void* output, const void* input)
{
cryptonight_fork = 8;
cryptonight_hash_variant(output, input, 76, 1);
}
void monero_hash(void* output, const void* input)
{
cryptonight_fork = 7;
cryptonight_hash_variant(output, input, 76, 1);
}
void stellite_hash(void* output, const void* input)
{
cryptonight_fork = 3;
cryptonight_hash_variant(output, input, 76, 2);
}
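
The per-coin wrappers above pin the fork base in the cryptonight_fork global before hashing a fixed 76-byte blob. A usage sketch (hypothetical buffers):

    /* Sketch: hashing one 76-byte work blob per coin. */
    uint8_t  blob[76] = { 0x07 /* major version; rest of header ... */ };
    uint32_t hash[8];             /* 32-byte result */
    monero_hash(hash, blob);      /* fork 7, variant 1 */
    stellite_hash(hash, blob);    /* fork 3, variant 2 */
    cryptonight_hash(hash, blob); /* fork 1, variant 0 (legacy) */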

crypto/cryptonight-extra.cu (175)

@@ -7,15 +7,15 @@
#include <miner.h>
#include <cuda_helper.h>
#include "cryptonight.h"
typedef uint8_t BitSequence;
typedef uint64_t DataLength;
#include "cryptonight.h"
static uint32_t *d_input[MAX_GPUS] = { 0 };
static uint32_t *d_input[MAX_GPUS];
static uint32_t *d_target[MAX_GPUS];
static uint32_t *d_result[MAX_GPUS];
typedef uint8_t BitSequence;
typedef uint32_t DataLength;
#include "cn_keccak.cuh"
#include "cn_blake.cuh"
#include "cn_groestl.cuh"
@@ -44,13 +44,11 @@ __constant__ uint8_t d_sub_byte[16][16] = {
__device__ __forceinline__
void cryptonight_aes_set_key(uint32_t * __restrict__ key, const uint32_t * __restrict__ data)
{
const uint32_t aes_gf[] = {
const uint32_t aes_gf[10] = {
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
};
MEMSET4(key, 0, 40);
MEMCPY4(key, data, 8);
#pragma unroll
for(int i = 8; i < 40; i++)
{
@@ -74,15 +72,14 @@ void cryptonight_aes_set_key(uint32_t * __restrict__ key, const uint32_t * __res
}
__global__
void cryptonight_extra_gpu_prepare(const uint32_t threads, uint32_t * __restrict__ d_input, uint32_t startNonce,
uint64_t * d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2)
void cryptonight_extra_gpu_prepare(const uint32_t threads, const uint32_t * __restrict__ d_input, uint32_t startNonce,
uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2, int variant, uint64_t * d_ctx_tweak)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if(thread < threads)
{
uint32_t ctx_state[50];
uint64_t ctx_state[25];
uint32_t ctx_a[4];
uint32_t ctx_b[4];
uint32_t ctx_key1[40];
@@ -90,92 +87,62 @@ void cryptonight_extra_gpu_prepare(const uint32_t threads, uint32_t * __restrict
uint32_t input[19];
MEMCPY4(input, d_input, 19);
*((uint32_t *)(((char *)input) + 39)) = startNonce + thread;
cn_keccak((uint8_t *)input, (uint8_t *)ctx_state);
cryptonight_aes_set_key(ctx_key1, ctx_state);
cryptonight_aes_set_key(ctx_key2, ctx_state + 8);
XOR_BLOCKS_DST(ctx_state, ctx_state + 8, ctx_a);
XOR_BLOCKS_DST(ctx_state + 4, ctx_state + 12, ctx_b);
MEMCPY8(&d_ctx_state[thread * 26], ctx_state, 25);
MEMCPY4(d_ctx_a + thread * 4, ctx_a, 4);
MEMCPY4(d_ctx_b + thread * 4, ctx_b, 4);
MEMCPY4(d_ctx_key1 + thread * 40, ctx_key1, 40);
MEMCPY4(d_ctx_key2 + thread * 40, ctx_key2, 40);
}
}
__global__
void cryptonight_extra_gpu_keccak(uint32_t threads, uint32_t * d_ctx_state)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if(thread < threads)
{
uint64_t* ctx_state = (uint64_t*) (&d_ctx_state[thread * 52U]);
uint64_t state[25];
#pragma unroll
for(int i = 0; i < 25; i++)
state[i] = ctx_state[i];
cn_keccakf2(state);
// to reduce the final kernel stack frame, cut algos in 2 kernels
// ps: these 2 final kernels are not important for the overall xmr hashrate (< 1%)
switch (((uint8_t*)state)[0] & 0x03)
{
case 0: {
uint32_t hash[8];
cn_blake((uint8_t*)state, 200, (uint8_t*)hash);
((uint32_t*)ctx_state)[0] = 0;
((uint32_t*)ctx_state)[6] = hash[6];
((uint32_t*)ctx_state)[7] = hash[7];
break;
}
case 1: {
uint32_t hash[8];
cn_groestl((BitSequence*)state, 200, (BitSequence*)hash);
((uint32_t*)ctx_state)[0] = 0;
((uint32_t*)ctx_state)[6] = hash[6];
((uint32_t*)ctx_state)[7] = hash[7];
break;
}
default: {
#pragma unroll
for(int i = 0; i < 25; i++)
ctx_state[i] = state[i];
}
uint32_t nonce = startNonce + thread;
*(((uint8_t *)input) + 39) = nonce & 0xff;
*(((uint8_t *)input) + 40) = (nonce >> 8) & 0xff;
*(((uint8_t *)input) + 41) = (nonce >> 16) & 0xff;
*(((uint8_t *)input) + 42) = (nonce >> 24) & 0xff;
cn_keccak(input, ctx_state);
MEMCPY4(&d_ctx_state[thread * 50U], ctx_state, 50);
cryptonight_aes_set_key(ctx_key1, (uint32_t*)(&ctx_state[0]));
cryptonight_aes_set_key(ctx_key2, (uint32_t*)(&ctx_state[4]));
MEMCPY4(&d_ctx_key1[thread * 40U], ctx_key1, 40);
MEMCPY4(&d_ctx_key2[thread * 40U], ctx_key2, 40);
XOR_BLOCKS_DST(&ctx_state[0], &ctx_state[4], ctx_a);
XOR_BLOCKS_DST(&ctx_state[2], &ctx_state[6], ctx_b);
MEMCPY4(&d_ctx_a[thread * 4U], ctx_a, 4);
MEMCPY4(&d_ctx_b[thread * 4U], ctx_b, 4);
if (variant) {
uint2 tweak = AS_UINT2(&ctx_state[24]);
//tweak.x ^= (input[8] >> 24) | (input[9] << 8);
tweak.x ^= __byte_perm(input[8], input[ 9], 0x6543);
tweak.y ^= __byte_perm(input[9], input[10], 0x6543);
MEMCPY4(&d_ctx_tweak[thread], &tweak, 2);
}
}
}
__global__
void cryptonight_extra_gpu_final(uint32_t threads, const uint32_t startNonce, uint64_t * __restrict__ d_ctx_state,
const uint32_t* d_target, uint32_t * resNonces)
void cryptonight_extra_gpu_final(const uint32_t threads, uint32_t startNonce, const uint32_t * __restrict__ d_target,
uint32_t * __restrict__ resNonces, uint32_t * __restrict__ d_ctx_state)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
if(thread < threads)
{
uint64_t* const state = &d_ctx_state[thread * 26U];
uint32_t *ctx_state = &d_ctx_state[thread * 50U];
uint32_t hash[8];
switch(((uint8_t *)state)[0] & 0x03)
{
case 0: {
uint32_t* h32 = (uint32_t*)state;
hash[6] = h32[6];
hash[7] = h32[7];
break;
}
case 2: {
cn_jh256((uint8_t*)state, 200, hash);
break;
}
case 3: {
cn_skein((uint8_t*)state, 200, hash);
break;
}
}
uint32_t state[50];
#pragma unroll 25
for(int i = 0; i < 50; i+=2)
AS_UINT2(&state[i]) = AS_UINT2(&ctx_state[i]);
cn_keccakf2((uint64_t *)state);
int branch = ((uint8_t *)state)[0] & 0x03;
if(branch == 0)
cn_blake((const uint8_t *)state, 200, hash);
if(branch == 1)
cn_groestl((const uint8_t *)state, 200, hash);
if(branch == 2)
cn_jh((const uint8_t *)state, 200, hash);
if(branch == 3)
cn_skein((const uint8_t *)state, 200, hash);
if(hash[7] <= d_target[1] && hash[6] <= d_target[0])
{
@@ -188,55 +155,53 @@ void cryptonight_extra_gpu_final(uint32_t threads, const uint32_t startNonce, ui
}
__host__
void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *ptarget)
void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget)
{
uint32_t *pTargetIn = (uint32_t*) ptarget;
cudaMemcpy(d_input[thr_id], data, 19 * sizeof(uint32_t), cudaMemcpyHostToDevice);
cudaMemcpy(d_target[thr_id], &pTargetIn[6], 2*sizeof(uint32_t), cudaMemcpyHostToDevice);
cudaMemcpy(d_input[thr_id], data, 20 * sizeof(uint32_t), cudaMemcpyHostToDevice);
cudaMemcpy(d_target[thr_id], &pTargetIn[6], 2 * sizeof(uint32_t), cudaMemcpyHostToDevice);
cudaMemset(d_result[thr_id], 0xFF, 2 * sizeof(uint32_t));
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}
__host__
void cryptonight_extra_cpu_init(int thr_id, uint32_t threads)
void cryptonight_extra_init(int thr_id)
{
cudaMalloc(&d_input[thr_id], 19 * sizeof(uint32_t));
cudaMalloc(&d_target[thr_id], 2*sizeof(uint32_t));
cudaMalloc(&d_result[thr_id], 2*sizeof(uint32_t));
cudaMalloc(&d_input[thr_id], 20 * sizeof(uint32_t));
cudaMalloc(&d_target[thr_id], 2 * sizeof(uint32_t));
cudaMalloc(&d_result[thr_id], 2 * sizeof(uint32_t));
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}
__host__
void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak)
{
uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
cryptonight_extra_gpu_prepare <<<grid, block >>> (threads, d_input[thr_id], startNonce, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1, d_ctx_key2);
cryptonight_extra_gpu_prepare <<<grid, block>>> (threads, d_input[thr_id], startNonce, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1, d_ctx_key2, variant, d_ctx_tweak);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}
__host__
void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resnonce, uint64_t *d_ctx_state)
void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces, uint32_t *d_ctx_state)
{
uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
cudaMemset(d_result[thr_id], 0xFF, 2*sizeof(uint32_t));
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cryptonight_extra_gpu_keccak <<<grid, block >>> (threads, (uint32_t*)d_ctx_state);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cryptonight_extra_gpu_final <<<grid, block >>> (threads, startNonce, d_ctx_state, d_target[thr_id], d_result[thr_id]);
cryptonight_extra_gpu_final <<<grid, block>>> (threads, startNonce, d_target[thr_id], d_result[thr_id], d_ctx_state);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMemcpy(resnonce, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
cudaMemcpy(resNonces, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}
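d_result is reset to 0xFF before each pass, so UINT32_MAX doubles as the "no candidate" sentinel in the two returned slots. The consuming side looks like this (sketch mirroring the scanhash loop further down):

uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
cryptonight_extra_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
if (resNonces[0] != UINT32_MAX) {
	/* candidate: re-hash on the CPU and fulltest() before submitting */
}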
__host__
void cryptonight_extra_cpu_free(int thr_id)
void cryptonight_extra_free(int thr_id)
{
if (d_input[thr_id]) {
cudaFree(d_input[thr_id]);
@@ -244,4 +209,4 @@ void cryptonight_extra_cpu_free(int thr_id)
cudaFree(d_result[thr_id]);
d_input[thr_id] = NULL;
}
}
}

36
crypto/cryptonight.cu

@@ -12,16 +12,17 @@ static __thread bool gpu_init_shown = false;
gpulog(p, thr, fmt, ##__VA_ARGS__)
static uint64_t *d_long_state[MAX_GPUS];
static uint64_t *d_ctx_state[MAX_GPUS];
static uint32_t *d_ctx_state[MAX_GPUS];
static uint32_t *d_ctx_key1[MAX_GPUS];
static uint32_t *d_ctx_key2[MAX_GPUS];
static uint32_t *d_ctx_text[MAX_GPUS];
static uint64_t *d_ctx_tweak[MAX_GPUS];
static uint32_t *d_ctx_a[MAX_GPUS];
static uint32_t *d_ctx_b[MAX_GPUS];
static bool init[MAX_GPUS] = { 0 };
extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant)
{
int res = 0;
uint32_t throughput = 0;
@@ -49,6 +50,10 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
gpulog_init(LOG_INFO, thr_id, "%s, %d MB available, %hd SMX", device_name[dev_id],
mem, device_mpcount[dev_id]);
if (!device_config[thr_id] && strcmp(device_name[dev_id], "TITAN V") == 0) {
device_config[thr_id] = strdup("80x24");
}
if (device_config[thr_id]) {
int res = sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads);
throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
@@ -70,7 +75,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
exit(1);
}
cudaSetDevice(device_map[thr_id]);
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
@@ -79,11 +84,11 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
}
const size_t alloc = MEMORY * throughput;
cryptonight_extra_cpu_init(thr_id, throughput);
cryptonight_extra_init(thr_id);
cudaMalloc(&d_long_state[thr_id], alloc);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_state[thr_id], 208 * throughput); // 52*4 (200 is not aligned 16)
cudaMalloc(&d_ctx_state[thr_id], 50 * sizeof(uint32_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
@@ -95,6 +100,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_tweak[thr_id], sizeof(uint64_t) * throughput);
exit_if_cudaerror(thr_id, __FILE__, __LINE__);
gpu_init_shown = true;
init[thr_id] = true;
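The 2 MiB scratchpad (MEMORY in cryptonight.h) dwarfs every per-nonce context buffer, so it is effectively the throughput limiter. Worked example with the TITAN V default of 80x24 set above (arithmetic only; sizes taken from this diff):

/* 80 blocks x 24 threads = 1920 nonces in flight
 * scratchpads: 1920 * 2 MiB = 3840 MiB of device memory
 * contexts:    ~560 B each (200 B state, 2x160 B keys,
 *              2x16 B a/b, 8 B tweak) plus the text buffer -> ~1 MiB */
const size_t alloc = (size_t)MEMORY * throughput; /* MEMORY = 1U << 21 */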
@@ -107,10 +114,10 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
const uint32_t Htarg = ptarget[7];
uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
cryptonight_extra_cpu_setData(thr_id, pdata, ptarget);
cryptonight_extra_cpu_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
cryptonight_core_cuda(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
cryptonight_extra_cpu_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
cryptonight_extra_setData(thr_id, pdata, ptarget);
cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
cryptonight_core_cuda(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
cryptonight_extra_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
*hashes_done = nonce - first_nonce + throughput;
@@ -121,8 +128,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39);
memcpy(tempdata, pdata, 76);
*tempnonceptr = resNonces[0];
cryptonight_hash(vhash, tempdata, 76);
if(vhash[7] <= Htarg && fulltest(vhash, ptarget))
const int rc = cryptonight_hash_variant(vhash, tempdata, 76, variant);
if(rc && (vhash[7] <= Htarg) && fulltest(vhash, ptarget))
{
res = 1;
work->nonces[0] = resNonces[0];
@@ -131,8 +138,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
if(resNonces[1] != UINT32_MAX)
{
*tempnonceptr = resNonces[1];
cryptonight_hash(vhash, tempdata, 76);
if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
const int rc = cryptonight_hash_variant(vhash, tempdata, 76, variant);
if(rc && (vhash[7] <= Htarg) && fulltest(vhash, ptarget)) {
res++;
work->nonces[1] = resNonces[1];
} else {
@@ -174,10 +181,11 @@ void free_cryptonight(int thr_id)
cudaFree(d_ctx_key1[thr_id]);
cudaFree(d_ctx_key2[thr_id]);
cudaFree(d_ctx_text[thr_id]);
cudaFree(d_ctx_tweak[thr_id]);
cudaFree(d_ctx_a[thr_id]);
cudaFree(d_ctx_b[thr_id]);
cryptonight_extra_cpu_free(thr_id);
cryptonight_extra_free(thr_id);
cudaDeviceSynchronize();

13
crypto/cryptonight.h

@@ -20,7 +20,6 @@ struct uint3 blockDim;
#define __umul64hi(a,b) a*b
#endif
#define MEMORY (1U << 21) // 2 MiB / 2097152 B
#define ITER (1U << 20) // 1048576
#define E2I_MASK 0x1FFFF0u
@@ -136,10 +135,10 @@ static inline void exit_if_cudaerror(int thr_id, const char *src, int line)
exit(1);
}
}
void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint64_t *d_long_state, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
void cryptonight_core_cuda(int thr_id, uint32_t blocks, uint32_t threads, uint64_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn);
void cryptonight_extra_cpu_init(int thr_id, uint32_t threads);
void cryptonight_extra_cpu_free(int thr_id);
void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *nonce, uint64_t *d_ctx_state);
void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget);
void cryptonight_extra_init(int thr_id);
void cryptonight_extra_free(int thr_id);
void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resnonce, uint32_t *d_ctx_state);

10
crypto/xmr-rpc.cpp

@@ -550,18 +550,24 @@ bool rpc2_stratum_submit(struct pool_infos *pool, struct work *work)
}
else if (opt_algo == ALGO_CRYPTOLIGHT) {
int variant = 1;
uint32_t nonce = work->nonces[idnonce];
noncestr = bin2hex((unsigned char*) &nonce, 4);
last_found_nonce = nonce;
cryptolight_hash(hash, data, 76);
//if (cryptonight_fork > 1 && ((unsigned char*)work->data)[0] >= cryptonight_fork)
// variant = ((unsigned char*)work->data)[0] - cryptonight_fork + 1;
cryptolight_hash_variant(hash, data, 76, variant);
work_set_target_ratio(work, (uint32_t*) hash);
}
else if (opt_algo == ALGO_CRYPTONIGHT) {
int variant = 0;
uint32_t nonce = work->nonces[idnonce];
noncestr = bin2hex((unsigned char*) &nonce, 4);
last_found_nonce = nonce;
cryptonight_hash(hash, data, 76);
if (cryptonight_fork > 1 && ((unsigned char*)work->data)[0] >= cryptonight_fork)
variant = ((unsigned char*)work->data)[0] - cryptonight_fork + 1;
cryptonight_hash_variant(hash, data, 76, variant);
work_set_target_ratio(work, (uint32_t*) hash);
}
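The variant rule above reads the block major version from the first blob byte: anything at or past the configured fork level maps to variant (version - fork + 1), older blocks stay variant 0. Worked example (e.g. with cryptonight_fork set to 7 for a monero-style chain, a v7 block hashes as variant 1 and a v8 block as variant 2):

static inline int cn_variant(const unsigned char *blob, int fork)
{
	return (fork > 1 && blob[0] >= fork) ? blob[0] - fork + 1 : 0;
}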

14
miner.h

@@ -279,8 +279,8 @@ extern int scanhash_blake256(int thr_id, struct work* work, uint32_t max_nonce,
extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant);
extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant);
extern int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_equihash(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@@ -575,6 +575,8 @@ extern uint32_t device_plimit[MAX_GPUS];
extern uint32_t gpus_intensity[MAX_GPUS];
extern int opt_cudaschedule;
extern int cryptonight_fork;
// cuda.cpp
int cuda_num_devices();
void cuda_devicenames();
@@ -898,8 +900,12 @@ void blake2b_hash(void *output, const void *input);
void blake2s_hash(void *output, const void *input);
void bmw_hash(void *state, const void *input);
void c11hash(void *output, const void *input);
void cryptolight_hash(void* output, const void* input, int len);
void cryptonight_hash(void* output, const void* input, size_t len);
int cryptolight_hash_variant(void* output, const void* input, int len, int variant);
void cryptolight_hash(void* output, const void* input);
int cryptonight_hash_variant(void* output, const void* input, size_t len, int variant);
void cryptonight_hash(void* output, const void* input);
void monero_hash(void* output, const void* input);
void stellite_hash(void* output, const void* input);
void decred_hash(void *state, const void *input);
void deephash(void *state, const void *input);
void luffa_hash(void *state, const void *input);
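The new fixed-length wrappers declared here presumably delegate to cryptonight_hash_variant with a hard-coded variant (assumption — their bodies live in crypto/cryptonight-cpu.cpp, which this view truncates). Expected shape for monero:

void monero_hash(void* output, const void* input)
{
	/* 76-byte blob; variant 1 per the fork rule in xmr-rpc.cpp above */
	cryptonight_hash_variant(output, input, 76, 1);
}
/* stellite_hash would have the same shape with its chain's variant value */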

10
util.cpp

@@ -2193,10 +2193,10 @@ void print_hash_tests(void)
c11hash(&hash[0], &buf[0]);
printpfx("c11", hash);
cryptolight_hash(&hash[0], &buf[0], 76);
cryptolight_hash(&hash[0], &buf[0]);
printpfx("cryptolight", hash);
cryptonight_hash(&hash[0], &buf[0], 76);
cryptonight_hash(&hash[0], &buf[0]);
printpfx("cryptonight", hash);
memset(buf, 0, 180);
@@ -2246,6 +2246,9 @@ void print_hash_tests(void)
lyra2Z_hash(&hash[0], &buf[0]);
printpfx("lyra2z", hash);
monero_hash(&hash[0], &buf[0]);
printpfx("monero", hash);
myriadhash(&hash[0], &buf[0]);
printpfx("myriad", hash);
@@ -2297,6 +2300,9 @@ void print_hash_tests(void)
skunk_hash(&hash[0], &buf[0]);
printpfx("skunk", hash);
stellite_hash(&hash[0], &buf[0]);
printpfx("stelitte", hash);
s3hash(&hash[0], &buf[0]);
printpfx("S3", hash);
