diff --git a/algos.h b/algos.h
index 229d8e9..c484bcc 100644
--- a/algos.h
+++ b/algos.h
@@ -72,6 +72,9 @@ enum sha_algos {
 	ALGO_WHIRLPOOLX,
 	ALGO_WILDKECCAK,
 	ALGO_ZR5,
+	ALGO_MONERO,
+	ALGO_GRAFT,
+	ALGO_STELLITE,
 	ALGO_AUTO,
 	ALGO_COUNT
 };
@@ -146,6 +149,9 @@ static const char *algo_names[] = {
 	"whirlpoolx",
 	"wildkeccak",
 	"zr5",
+	"monero",
+	"graft",
+	"stellite",
 	"auto", /* reserved for multi algo */
 	""
 };
@@ -206,4 +212,29 @@ static inline int algo_to_int(char* arg)
 	return i;
 }
 
+static inline int get_cryptonight_algo(int fork)
+{
+	int algo = ALGO_COUNT;
+
+	switch (fork) {
+	case 8:
+		algo = ALGO_GRAFT;
+		break;
+
+	case 7:
+		algo = ALGO_MONERO;
+		break;
+
+	case 3:
+		algo = ALGO_STELLITE;
+		break;
+
+	default:
+		algo = ALGO_CRYPTONIGHT;
+		break;
+	}
+
+	return algo;
+}
+
 #endif
diff --git a/ccminer.cpp b/ccminer.cpp
index 00fe1cd..6521284 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -233,6 +233,8 @@ int opt_api_mcast_port = 4068;
 
 bool opt_stratum_stats = false;
 
+int cryptonight_fork = 1;
+
 static char const usage[] = "\
 Usage: " PROGRAM_NAME " [OPTIONS]\n\
 Options:\n\
@@ -245,7 +247,7 @@ Options:\n\
 			blakecoin   Fast Blake 256 (8 rounds)\n\
 			bmw         BMW 256\n\
 			cryptolight AEON cryptonight (MEM/2)\n\
-			cryptonight XMR cryptonight\n\
+			cryptonight XMR cryptonight v1 (old)\n\
 			c11/flax    X11 variant\n\
 			decred      Decred Blake256\n\
 			deep        Deepcoin\n\
@@ -253,6 +255,7 @@ Options:\n\
 			dmd-gr      Diamond-Groestl\n\
 			fresh       Freshcoin (shavite 80)\n\
 			fugue256    Fuguecoin\n\
+			graft       Cryptonight v8\n\
 			groestl     Groestlcoin\n"
 #ifdef WITH_HEAVY_ALGO
 "			heavy       Heavycoin\n"
@@ -267,6 +270,7 @@ Options:\n\
 			lyra2v2     VertCoin\n\
 			lyra2z      ZeroCoin (3rd impl)\n\
 			myr-gr      Myriad-Groestl\n\
+			monero      XMR cryptonight (v7)\n\
 			neoscrypt   FeatherCoin, Phoenix, UFO...\n\
 			nist5       NIST5 (TalkCoin)\n\
 			penta       Pentablake hash (5x Blake 512)\n\
@@ -284,6 +288,7 @@ Options:\n\
 			skein       Skein SHA2 (Skeincoin)\n\
 			skein2      Double Skein (Woodcoin)\n\
 			skunk       Skein Cube Fugue Streebog\n\
+			stellite    Cryptonight v3\n\
 			s3          S3 (1Coin)\n\
 			timetravel  Machinecoin permuted x8\n\
 			tribus      Denarius\n\
@@ -573,7 +578,10 @@ static bool get_blocktemplate(CURL *curl, struct work *work);
 
 void get_currentalgo(char* buf, int sz)
 {
-	snprintf(buf, sz, "%s", algo_names[opt_algo]);
+	int algo = opt_algo;
+	if (algo == ALGO_CRYPTONIGHT)
+		algo = get_cryptonight_algo(cryptonight_fork);
+	snprintf(buf, sz, "%s", algo_names[algo]);
 }
 
 void format_hashrate(double hashrate, char *output)
@@ -2372,11 +2380,16 @@ static void *miner_thread(void *userdata)
 			rc = scanhash_c11(thr_id, &work, max_nonce, &hashes_done);
 			break;
 		case ALGO_CRYPTOLIGHT:
-			rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done);
+			rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done, 1);
 			break;
 		case ALGO_CRYPTONIGHT:
-			rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done);
+		{
+			int cn_variant = 0;
+			if (cryptonight_fork > 1 && ((unsigned char*)work.data)[0] >= cryptonight_fork)
+				cn_variant = ((unsigned char*)work.data)[0] - cryptonight_fork + 1;
+			rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done, cn_variant);
 			break;
+		}
 		case ALGO_DECRED:
 			rc = scanhash_decred(thr_id, &work, max_nonce, &hashes_done);
 			break;
@@ -3138,6 +3151,26 @@ void parse_arg(int key, char *arg)
 			case ALGO_SCRYPT_JANE: opt_nfactor = 14; break;
 			}
 		}
+
+		// cryptonight variants
+		switch (opt_algo) {
+		case ALGO_MONERO:
+			opt_algo = ALGO_CRYPTONIGHT;
+			cryptonight_fork = 7;
+			break;
+		case ALGO_GRAFT:
+			opt_algo = ALGO_CRYPTONIGHT;
+			cryptonight_fork = 8;
+			break;
+		case ALGO_STELLITE:
+			opt_algo = ALGO_CRYPTONIGHT;
+			cryptonight_fork = 3;
+			break;
+		case ALGO_CRYPTONIGHT:
+			cryptonight_fork = 1;
+			break;
+		}
+
 		break;
 	case 'b':
 		p = strstr(arg, ":");
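A quick sanity check of the cn_variant math added to miner_thread above (a minimal sketch, not part of the patch; the fork values mirror parse_arg):

/* Sketch: how cn_variant falls out of the block-major-version byte
 * for each cryptonight_fork setting. */
#include <stdio.h>

static int cn_variant(int fork, unsigned char major_version)
{
	/* mirrors miner_thread: blocks below the fork level keep variant 0 */
	if (fork > 1 && major_version >= fork)
		return major_version - fork + 1;
	return 0;
}

int main(void)
{
	printf("%d\n", cn_variant(7, 7)); /* monero:   v7 block -> variant 1 */
	printf("%d\n", cn_variant(8, 8)); /* graft:    v8 block -> variant 1 */
	printf("%d\n", cn_variant(3, 4)); /* stellite: v4 block -> variant 2 */
	printf("%d\n", cn_variant(1, 7)); /* plain cryptonight  -> variant 0 */
	return 0;
}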
diff --git a/crypto/cn_aes.cuh b/crypto/cn_aes.cuh
index df419b3..99ad212 100644
--- a/crypto/cn_aes.cuh
+++ b/crypto/cn_aes.cuh
@@ -138,6 +138,7 @@ static const __device__ __align__(16) uint32_t d_t_fn[1024] = {
 */
 
 #define AS_U32(addr)   *((uint32_t*)(addr))
+#define AS_U64(addr)   *((uint64_t*)(addr))
 #define AS_UINT2(addr) *((uint2*)(addr))
 #define AS_UINT4(addr) *((uint4*)(addr))
 #define AS_UL2(addr)   *((ulonglong2*)(addr))
diff --git a/crypto/cn_blake.cuh b/crypto/cn_blake.cuh
index 5c0d09f..bd2ba43 100644
--- a/crypto/cn_blake.cuh
+++ b/crypto/cn_blake.cuh
@@ -164,7 +164,7 @@ void cn_blake_final(blake_state * __restrict__ S, uint8_t * __restrict__ digest)
 }
 
 __device__
-void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint8_t * __restrict__ out)
+void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint32_t * out)
 {
 	blake_state bs;
 	blake_state *S = (blake_state *)&bs;
diff --git a/crypto/cn_groestl.cuh b/crypto/cn_groestl.cuh
index 62530d4..425e062 100644
--- a/crypto/cn_groestl.cuh
+++ b/crypto/cn_groestl.cuh
@@ -274,13 +274,14 @@ void cn_groestl_final(groestlHashState* __restrict__ ctx, BitSequence* __restri
 	for (i = GROESTL_SIZE512-hashbytelen; i < GROESTL_SIZE512; i++,j++) {
 		output[j] = s[i];
 	}
-
+#if 0
 	for (i = 0; i < GROESTL_COLS512; i++) {
 		ctx->chaining[i] = 0;
 	}
 	for (i = 0; i < GROESTL_SIZE512; i++) {
 		ctx->buffer[i] = 0;
 	}
+#endif
 }
 
 __device__
@@ -336,12 +337,12 @@ void cn_groestl_init(groestlHashState* ctx)
 }
 
 __device__
-void cn_groestl(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval)
+void cn_groestl(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval)
 {
 	DataLength databitlen = len << 3;
 	groestlHashState context;
 
 	cn_groestl_init(&context);
-	cn_groestl_update(&context, data, databitlen);
-	cn_groestl_final(&context, hashval);
+	cn_groestl_update(&context, (BitSequence*) data, databitlen);
+	cn_groestl_final(&context, (BitSequence*) hashval);
 }
diff --git a/crypto/cn_jh.cuh b/crypto/cn_jh.cuh
index c2df763..b05380d 100644
--- a/crypto/cn_jh.cuh
+++ b/crypto/cn_jh.cuh
@@ -198,8 +198,9 @@ void cn_jh_update(jhHashState * __restrict__ state, const uint8_t * __restrict__
 		databitlen = 0;
 	}
 
-	if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) ) {
-		memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ) ;
+	if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) )
+	{
+		memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) );
 		index = 64-(state->datasize_in_buffer >> 3);
 		databitlen = databitlen - (512 - state->datasize_in_buffer);
 		cn_jh_F8(state);
@@ -222,7 +223,7 @@ void cn_jh_update(jhHashState * __restrict__ state, const uint8_t * __restrict__
 
 /* pad the message, process the padded block(s), truncate the hash value H to obtain the message digest */
 __device__
-void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashval)
+void cn_jh_final(jhHashState * __restrict__ state, uint8_t * __restrict__ hashval)
 {
 	unsigned int i;
 	//uint32_t *bufptr = (uint32_t *)state->buffer;
@@ -244,7 +245,7 @@ void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashv
 	}
 	else
 	{
-		/*set the rest of the bytes in the buffer to 0*/
+		/* set the rest of the bytes in the buffer to 0 */
 		if ( (state->datasize_in_buffer & 7) == 0) {
 			for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) state->buffer[i] = 0;
 		} else {
@@ -268,7 +269,8 @@ void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashv
 		cn_jh_F8(state);
 	}
 
-	MEMCPY4(hashval, ((unsigned char*)state->x) + 64 + 32, 8);
+	memcpy(hashval, ((unsigned char*)state->x) + 64 + 32, 32);
+	//MEMCPY4(hashval, ((unsigned char*)state->x) + 64 + 32, 8);
 }
 
 __device__
@@ -277,12 +279,12 @@ void cn_jh_init(jhHashState *state, int hashbitlen)
 	state->databitlen = 0;
 	state->datasize_in_buffer = 0;
 	state->hashbitlen = hashbitlen;
-	//memcpy(state->x, d_JH256_H0, 128);
-	MEMCPY8(state->x, d_JH256_H0, 128 / 8);
+	memcpy(state->x, d_JH256_H0, 128);
+	//MEMCPY8(state->x, d_JH256_H0, 128 / 8);
 }
 
 __device__
-void cn_jh256(const uint8_t * __restrict__ data, DataLength len, uint32_t * __restrict__ hashval)
+void cn_jh(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval)
 {
 	const int hashbitlen = 256;
 	DataLength databitlen = len << 3;
@@ -290,5 +292,5 @@ void cn_jh256(const uint8_t * __restrict__ data, DataLength len, uint32_t * __re
 
 	cn_jh_init(&state, hashbitlen);
 	cn_jh_update(&state, data, databitlen);
-	cn_jh_final(&state, hashval);
+	cn_jh_final(&state, (uint8_t*) hashval);
 }
diff --git a/crypto/cn_keccak.cuh b/crypto/cn_keccak.cuh
index 3acef7a..c6f5908 100644
--- a/crypto/cn_keccak.cuh
+++ b/crypto/cn_keccak.cuh
@@ -195,7 +195,7 @@ void cn_keccakf(uint64_t *s)
 }
 
 __device__ __forceinline__
-void cn_keccak(const uint8_t * __restrict__ in, uint8_t * __restrict__ md)
+void cn_keccak(const uint32_t * __restrict__ in, uint64_t * __restrict__ md)
 {
 	uint64_t st[25];
 
diff --git a/crypto/cn_skein.cuh b/crypto/cn_skein.cuh
index 2096467..0e68143 100644
--- a/crypto/cn_skein.cuh
+++ b/crypto/cn_skein.cuh
@@ -4,19 +4,15 @@ typedef unsigned int uint_t; /* native unsigned integer */
 
 #define SKEIN_256_STATE_WORDS ( 4)
 #define SKEIN_512_STATE_WORDS ( 8)
-#define SKEIN1024_STATE_WORDS (16)
 
 #define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS)
 #define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS)
-#define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS)
 
 #define SKEIN_256_STATE_BITS (64*SKEIN_256_STATE_WORDS)
 #define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS)
-#define SKEIN1024_STATE_BITS (64*SKEIN1024_STATE_WORDS)
 
 #define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS)
 #define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS)
-#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS)
 
 #define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32))
 #define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
@@ -119,7 +115,7 @@ typedef struct {
 } skeinHashState;
 
 __device__
-void cn_skein256_init(skeinHashState *state, size_t hashBitLen)
+void cn_skein_init(skeinHashState *state, size_t hashBitLen)
 {
 	const uint64_t SKEIN_512_IV_256[] =
 	{
@@ -258,14 +254,12 @@ void cn_skein_block(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restr
 }
 
 __device__
-void cn_skein256_update(skeinHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen)
+void cn_skein_update(skeinHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen)
 {
 	if ((databitlen & 7) == 0) {
-
 		cn_skein_block(&state->u.ctx_512, data, databitlen >> 3);
 	}
 	else {
-
 		size_t bCnt = (databitlen >> 3) + 1;
 		uint8_t b,mask;
 
@@ -280,7 +274,7 @@ void cn_skein256_update(skeinHashState * __restrict__ state, const uint8_t * __r
 }
 
 __device__
-void cn_skein256_final(skeinHashState * __restrict__ state, uint32_t * __restrict__ hashVal)
+void cn_skein_final(skeinHashState * __restrict__ state, uint8_t * __restrict__ hashVal)
 {
 	uint64_t X[SKEIN_512_STATE_WORDS];
 	Skein_512_Ctxt_t *ctx = (Skein_512_Ctxt_t *)&state->u.ctx_512;
@@ -305,13 +299,13 @@ void cn_skein256_final(skeinHashState * __restrict__ state, uint32_t * __restric
 		((uint64_t *)ctx->b)[0] = (uint64_t)i;
 		Skein_Start_New_Type(ctx, OUT_FINAL);
 		cn_skein_processblock(ctx, ctx->b, 1, sizeof(uint64_t));
-		memcpy(hashVal + (i*SKEIN_512_BLOCK_BYTES/sizeof(uint32_t)), ctx->X, n);
+		memcpy(hashVal + (i*SKEIN_512_BLOCK_BYTES), ctx->X, n);
 		memcpy(ctx->X, X, sizeof(X)); // restore the counter mode key for next time
 	}
 }
 
 __device__
-void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * __restrict__ hashval)
+void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval)
 {
 	int hashbitlen = 256;
 	DataLength databitlen = len << 3;
@@ -319,7 +313,7 @@ void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * __re
 
 	state.statebits = 64*SKEIN_512_STATE_WORDS;
 
-	cn_skein256_init(&state, hashbitlen);
-	cn_skein256_update(&state, data, databitlen);
-	cn_skein256_final(&state, hashval);
+	cn_skein_init(&state, hashbitlen);
+	cn_skein_update(&state, data, databitlen);
+	cn_skein_final(&state, (uint8_t*) hashval);
 }
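The four cn_* device helpers retouched above (blake, groestl, jh, skein) all serve the CryptoNight finalizer, which picks one of them from the low two bits of the final Keccak state. A minimal host-side sketch of that dispatch; the *_hash256 wrappers are hypothetical stand-ins, not functions from this tree:

/* Sketch of the finalizer branch (hypothetical host-side wrappers). */
static void cn_final_branch(const unsigned char state[200], unsigned char hash[32])
{
	switch (state[0] & 0x03) {
	case 0: blake_hash256(hash, state, 200);   break; /* cn_blake   */
	case 1: groestl_hash256(hash, state, 200); break; /* cn_groestl */
	case 2: jh_hash256(hash, state, 200);      break; /* cn_jh      */
	case 3: skein_hash256(hash, state, 200);   break; /* cn_skein   */
	}
}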
diff --git a/crypto/cryptolight-core.cu b/crypto/cryptolight-core.cu
index 3891768..8f0bb75 100644
--- a/crypto/cryptolight-core.cu
+++ b/crypto/cryptolight-core.cu
@@ -36,7 +36,7 @@ void cryptolight_core_gpu_phase1(int threads, uint32_t * long_state, uint32_t *
 
 	if(thread < threads)
 	{
-		const int oft = thread * 52 + sub + 16; // not aligned 16!
+		const int oft = thread * 50 + sub + 16; // not aligned 16!
 		const int long_oft = (thread << LONG_SHL_IDX) + sub;
 		uint32_t __align__(16) key[40];
 		uint32_t __align__(16) text[4];
@@ -57,8 +57,10 @@ void cryptolight_core_gpu_phase1(int threads, uint32_t * long_state, uint32_t *
 	}
 }
 
+// --------------------------------------------------------------------------------------------------------------
+
 __global__
-void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
+void cryptolight_old_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
 {
 	__shared__ uint32_t __align__(16) sharedMemory[1024];
 
@@ -209,6 +211,70 @@ void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int
 #endif // __CUDA_ARCH__ >= 300
 }
 
+__device__ __forceinline__ void store_variant1(uint32_t* long_state)
+{
+	uint4* Z = (uint4*) long_state;
+	const uint32_t tmp = (Z->z >> 24); // __byte_perm(src, 0, 0x7773);
+	const uint32_t index = (((tmp >> 3) & 6u) | (tmp & 1u)) << 1;
+	Z->z = (Z->z & 0x00ffffffu) | ((tmp ^ ((0x75310u >> index) & 0x30u)) << 24);
+}
+
+#define MUL_SUM_XOR_DST_1(a,c,dst,tweak) { \
+	uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], ((uint64_t *)dst)[0], &hi) + ((uint64_t *)c)[1]; \
+	hi += ((uint64_t *)c)[0]; \
+	((uint64_t *)c)[0] = ((uint64_t *)dst)[0] ^ hi; \
+	((uint64_t *)c)[1] = ((uint64_t *)dst)[1] ^ lo; \
+	((uint64_t *)dst)[0] = hi; \
+	((uint64_t *)dst)[1] = lo ^ tweak; }
+
+__global__
+void cryptolight_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
+	uint32_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
+	uint64_t * __restrict__ d_tweak)
+{
+	__shared__ __align__(16) uint32_t sharedMemory[1024];
+	cn_aes_gpu_init(sharedMemory);
+	__syncthreads();
+
+	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
+	if (thread < threads)
+	{
+		const uint32_t batchsize = ITER >> (2 + bfactor);
+		const uint32_t start = partidx * batchsize;
+		const uint32_t end = start + batchsize;
+		const uint32_t longptr = thread << LONG_SHL_IDX;
+		uint32_t * long_state = &d_long_state[longptr];
+		uint64_t tweak = d_tweak[thread];
+
+		void * ctx_a = (void*)(&d_ctx_a[thread << 2]);
+		void * ctx_b = (void*)(&d_ctx_b[thread << 2]);
+		uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4
+		uint4 B = AS_UINT4(ctx_b);
+		uint32_t* a = (uint32_t*)&A;
+		uint32_t* b = (uint32_t*)&B;
+
+		for (int i = start; i < end; i++)
+		{
+			uint32_t c[4];
+			uint32_t j = (A.x >> 2) & E2I_MASK2;
+			cn_aes_single_round(sharedMemory, &long_state[j], c, a);
+			XOR_BLOCKS_DST(c, b, &long_state[j]);
+			store_variant1(&long_state[j]);
+			MUL_SUM_XOR_DST_1(c, a, &long_state[(c[0] >> 2) & E2I_MASK2], tweak);
+
+			j = (A.x >> 2) & E2I_MASK2;
+			cn_aes_single_round(sharedMemory, &long_state[j], b, a);
+			XOR_BLOCKS_DST(b, c, &long_state[j]);
+			store_variant1(&long_state[j]);
+			MUL_SUM_XOR_DST_1(b, a, &long_state[(b[0] >> 2) & E2I_MASK2], tweak);
+		}
+		if (bfactor) {
+			AS_UINT4(ctx_a) = A;
+			AS_UINT4(ctx_b) = B;
+		}
+	}
+}
+
 __global__
 void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint32_t * ctx_state, uint32_t * ctx_key2)
 {
@@ -222,7 +288,7 @@ void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint3
 	if(thread < threads)
 	{
 		const int long_oft = (thread << LONG_SHL_IDX) + sub;
-		const int oft = thread * 52 + sub + 16;
+		const int oft = thread * 50 + sub + 16;
 		uint32_t __align__(16) key[40];
 		uint32_t __align__(16) text[4];
@@ -251,8 +317,8 @@ void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint3
 extern int device_bfactor[MAX_GPUS];
 
 __host__
-void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint64_t *d_ctx_state,
-	uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
+void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state,
+	uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak)
 {
 	dim3 grid(blocks);
 	dim3 block(threads);
@@ -265,17 +331,21 @@ void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_
 	int i, partcount = 1 << bfactor;
 	int dev_id = device_map[thr_id];
 
-	cryptolight_core_gpu_phase1 <<<grid, block8>>>(blocks*threads, d_long_state, (uint32_t*)d_ctx_state, d_ctx_key1);
+	cryptolight_core_gpu_phase1 <<<grid, block8>>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key1);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 	if(partcount > 1) usleep(bsleep);
 
 	for(i = 0; i < partcount; i++)
 	{
-		cryptolight_core_gpu_phase2 <<<grid, (device_sm[dev_id] >= 300 ? block4 : block)>>>(blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
+		dim3 b = device_sm[dev_id] >= 300 ? block4 : block;
+		if (variant == 0)
+			cryptolight_old_gpu_phase2 <<<grid, b>>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
+		else
+			cryptolight_gpu_phase2 <<<grid, b>>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 		if(partcount > 1) usleep(bsleep);
 	}
 
-	cryptolight_core_gpu_phase3 <<<grid, block8>>>(blocks*threads, d_long_state, (uint32_t*)d_ctx_state, d_ctx_key2);
+	cryptolight_core_gpu_phase3 <<<grid, block8>>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key2);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 }
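The store_variant1() helper above only rewrites the high byte of the third 32-bit word of each stored block; the nibble table 0x75310 encodes the variant-1 substitution. A minimal plain-C sketch with one worked value (not part of the patch):

/* Sketch: the variant-1 byte transform, with one worked input. */
#include <stdint.h>
#include <stdio.h>

static uint8_t variant1_byte(uint8_t tmp)
{
	const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
	return tmp ^ ((0x75310 >> index) & 0x30);
}

int main(void)
{
	/* tmp = 0x8c: index = ((0x11 & 6) | 0) << 1 = 0,
	 * table nibble = (0x75310 >> 0) & 0x30 = 0x10, so 0x8c -> 0x9c */
	printf("0x%02x -> 0x%02x\n", 0x8c, variant1_byte(0x8c));
	return 0;
}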
diff --git a/crypto/cryptolight-cpu.cpp b/crypto/cryptolight-cpu.cpp
index b0ee386..f995b4c 100644
--- a/crypto/cryptolight-cpu.cpp
+++ b/crypto/cryptolight-cpu.cpp
@@ -22,6 +22,16 @@ struct cryptonight_ctx {
 	oaes_ctx* aes_ctx;
 };
 
+
+static void cryptolight_store_variant(void* state, int variant) {
+	if (variant == 1) {
+		// use variant 1 like monero since june 2018
+		const uint8_t tmp = ((const uint8_t*)(state))[11];
+		const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
+		((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30);
+	}
+}
+
 static void do_blake_hash(const void* input, int len, void* output)
 {
 	uchar hash[32];
@@ -132,14 +142,14 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, ui
 	((uint64_t*) dst)[0] += ((uint64_t*) c)[0];
 }
 
-static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
+static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak) {
 	uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
 	hi += ((uint64_t*) c)[0];
 
 	((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
 	((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
 	((uint64_t*) dst)[0] = hi;
-	((uint64_t*) dst)[1] = lo;
+	((uint64_t*) dst)[1] = variant ? lo ^ tweak : lo;
 }
 
 static void copy_block(uint8_t* dst, const uint8_t* src) {
@@ -157,13 +167,18 @@ static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
 	((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
 }
 
-static void cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx)
+static int cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx, const int variant)
 {
 	size_t i, j;
 
+	if (variant && len < 43)
+		return 0;
+
 	keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len);
 	ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
 	memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
 
+	const uint64_t tweak = variant ? *((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0;
+
 	oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
 	for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
 #undef RND
@@ -186,14 +201,16 @@ static void cryptolight_hash_ctx(void* output, const void* input, const int len,
 		j = e2i(ctx->a);
 		aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a);
 		xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]);
+		cryptolight_store_variant(&ctx->long_state[j], variant);
 
-		mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)]);
+		mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)], variant, tweak);
 
 		j = e2i(ctx->a);
 		aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a);
 		xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]);
+		cryptolight_store_variant(&ctx->long_state[j], variant);
 
-		mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)]);
+		mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)], variant, tweak);
 	}
 
 	memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
@@ -219,11 +236,19 @@ static void cryptolight_hash_ctx(void* output, const void* input, const int len,
 	if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo);
 
 	oaes_free((OAES_CTX **) &ctx->aes_ctx);
+	return 1;
 }
 
-void cryptolight_hash(void* output, const void* input, int len)
+int cryptolight_hash_variant(void* output, const void* input, int len, int variant)
 {
 	struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
-	cryptolight_hash_ctx(output, input, len, ctx);
+	int rc = cryptolight_hash_ctx(output, input, len, ctx, variant);
 	free(ctx);
+	return rc;
 }
+
+void cryptolight_hash(void* output, const void* input)
+{
+	cryptolight_hash_variant(output, input, 76, 1);
+}
+
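The tweak derivation above explains the new "variant && len < 43" guard: the 8 tweak bytes run from blob offset 35 through 42, spanning the nonce at offset 39, so shorter inputs cannot carry a variant-1 tweak. A minimal sketch (assuming the usual 76-byte hashing blob):

/* Sketch: variant-1 tweak derivation, mirroring the code above. */
#include <stdint.h>
#include <string.h>

static uint64_t tweak1_2(const uint8_t *blob, size_t len,
                         const uint64_t keccak_state[25])
{
	uint64_t t;
	if (len < 43)
		return 0; /* blob too short for variant 1 */
	memcpy(&t, blob + 35, sizeof(t)); /* unaligned-safe read */
	return t ^ keccak_state[24]; /* hs.w[24] in the code above */
}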
diff --git a/crypto/cryptolight.cu b/crypto/cryptolight.cu
index c8ab8ea..c2a10e4 100644
--- a/crypto/cryptolight.cu
+++ b/crypto/cryptolight.cu
@@ -7,16 +7,17 @@ static __thread uint32_t cn_blocks = 32;
 static __thread uint32_t cn_threads = 16;
 
 static uint32_t *d_long_state[MAX_GPUS];
-static uint64_t *d_ctx_state[MAX_GPUS];
+static uint32_t *d_ctx_state[MAX_GPUS];
 static uint32_t *d_ctx_key1[MAX_GPUS];
 static uint32_t *d_ctx_key2[MAX_GPUS];
 static uint32_t *d_ctx_text[MAX_GPUS];
+static uint64_t *d_ctx_tweak[MAX_GPUS];
 static uint32_t *d_ctx_a[MAX_GPUS];
 static uint32_t *d_ctx_b[MAX_GPUS];
 
 static bool init[MAX_GPUS] = { 0 };
 
-extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant)
 {
 	int res = 0;
 	uint32_t throughput = 0;
@@ -26,6 +27,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 	uint32_t *nonceptr = (uint32_t*) (&pdata[39]);
 	const uint32_t first_nonce = *nonceptr;
 	uint32_t nonce = first_nonce;
+	int dev_id = device_map[thr_id];
 
 	if(opt_benchmark) {
 		ptarget[7] = 0x00ff;
@@ -33,6 +35,10 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 
 	if(!init[thr_id])
 	{
+		if (!device_config[thr_id] && strcmp(device_name[dev_id], "TITAN V") == 0) {
+			device_config[thr_id] = strdup("80x32");
+		}
+
 		if (device_config[thr_id]) {
 			sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads);
 			throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
@@ -63,11 +69,11 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 		}
 
 		const size_t alloc = MEMORY * throughput;
-		cryptonight_extra_cpu_init(thr_id, throughput);
+		cryptonight_extra_init(thr_id);
 
 		cudaMalloc(&d_long_state[thr_id], alloc);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
-		cudaMalloc(&d_ctx_state[thr_id], 26 * sizeof(uint64_t) * throughput);
+		cudaMalloc(&d_ctx_state[thr_id], 25 * sizeof(uint64_t) * throughput);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 		cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
@@ -79,6 +85,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 		cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
+		cudaMalloc(&d_ctx_tweak[thr_id], sizeof(uint64_t) * throughput);
 
 		init[thr_id] = true;
 	}
@@ -90,10 +97,10 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 		const uint32_t Htarg = ptarget[7];
 		uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
 
-		cryptonight_extra_cpu_setData(thr_id, pdata, ptarget);
-		cryptonight_extra_cpu_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
-		cryptolight_core_cpu_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
-		cryptonight_extra_cpu_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
+		cryptonight_extra_setData(thr_id, pdata, ptarget);
+		cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
+		cryptolight_core_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
+		cryptonight_extra_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
 
 		*hashes_done = nonce - first_nonce + throughput;
 
@@ -104,7 +111,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 			uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39);
 			memcpy(tempdata, pdata, 76);
 			*tempnonceptr = resNonces[0];
-			cryptolight_hash(vhash, tempdata, 76);
+			cryptolight_hash_variant(vhash, tempdata, 76, variant);
 			if(vhash[7] <= Htarg && fulltest(vhash, ptarget))
 			{
 				res = 1;
@@ -114,7 +121,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
 				if(resNonces[1] != UINT32_MAX)
 				{
 					*tempnonceptr = resNonces[1];
-					cryptolight_hash(vhash, tempdata, 76);
+					cryptolight_hash_variant(vhash, tempdata, 76, variant);
 					if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
 						res++;
 						work->nonces[1] = resNonces[1];
@@ -157,10 +164,11 @@ void free_cryptolight(int thr_id)
 	cudaFree(d_ctx_key1[thr_id]);
 	cudaFree(d_ctx_key2[thr_id]);
 	cudaFree(d_ctx_text[thr_id]);
+	cudaFree(d_ctx_tweak[thr_id]);
 	cudaFree(d_ctx_a[thr_id]);
 	cudaFree(d_ctx_b[thr_id]);
 
-	cryptonight_extra_cpu_free(thr_id);
+	cryptonight_extra_free(thr_id);
 
 	cudaDeviceSynchronize();
diff --git a/crypto/cryptolight.h b/crypto/cryptolight.h
index 443cf5b..482d0f8 100644
--- a/crypto/cryptolight.h
+++ b/crypto/cryptolight.h
@@ -134,10 +134,11 @@ static inline void exit_if_cudaerror(int thr_id, const char *src, int line)
 		exit(1);
 	}
 }
-void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
-void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn);
-void cryptonight_extra_cpu_init(int thr_id, uint32_t threads);
-void cryptonight_extra_cpu_free(int thr_id);
-void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
-void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *nonce, uint64_t *d_ctx_state);
+void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
+
+void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget);
+void cryptonight_extra_init(int thr_id/*, uint32_t threads*/);
+void cryptonight_extra_free(int thr_id);
+void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
+void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces, uint32_t *d_ctx_state);
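For reference, these renamed host entry points are driven in a fixed order per scan iteration. A condensed usage sketch from scanhash_cryptolight (the d_* names stand for the per-thread device buffers allocated at init):

/* Usage sketch: call order implied by the prototypes above. */
cryptonight_extra_setData(thr_id, pdata, ptarget);        /* upload work + target */
cryptonight_extra_prepare(thr_id, throughput, nonce,
	d_state, d_a, d_b, d_key1, d_key2, variant, d_tweak); /* keccak, keys, tweak */
cryptolight_core_hash(thr_id, cn_blocks, cn_threads, d_long,
	d_state, d_a, d_b, d_key1, d_key2, variant, d_tweak); /* memory-hard phases */
cryptonight_extra_final(thr_id, throughput, nonce,
	resNonces, d_state);                                  /* finalize + target test */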
diff --git a/crypto/cryptonight-core.cu b/crypto/cryptonight-core.cu
index 4780f37..90f024f 100644
--- a/crypto/cryptonight-core.cu
+++ b/crypto/cryptonight-core.cu
@@ -2,47 +2,55 @@
 #include <stdio.h>
 #include <stdint.h>
 #include <memory.h>
+#ifndef _WIN32
 #include <unistd.h>
+#endif
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
+#undef __shfl
+#define __shfl(var, srcLane, width) __shfl_sync(0xFFFFFFFFu, var, srcLane, width)
+#endif
 
 #include "cryptonight.h"
 
-#define LONG_SHL32 19 // 1<<19
+#define LONG_SHL32 19 // 1<<19 (uint32_t* index)
 #define LONG_SHL64 18 // 1<<18 (uint64_t* index)
 #define LONG_LOOPS32 0x80000U
-#define LONG_LOOPS64 0x40000U
 
 #include "cn_aes.cuh"
 
 __global__
-//__launch_bounds__(128, 9) // 56 registers
-void cryptonight_core_gpu_phase1(const uint32_t threads, uint64_t * long_state, uint64_t * const ctx_state, uint32_t * ctx_key1)
+void cryptonight_gpu_phase1(const uint32_t threads, uint32_t * __restrict__ d_long_state,
+	uint32_t * __restrict__ ctx_state, uint32_t * __restrict__ ctx_key1)
 {
-	__shared__ __align__(16) uint32_t sharedMemory[1024];
-	cn_aes_gpu_init(sharedMemory);
-	__syncthreads();
+	__shared__ uint32_t sharedMemory[1024];
 
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3;
-	const uint32_t sub = (threadIdx.x & 7) << 1; // 0 2 .. 14
 
-	if(thread < threads)
-	{
-		const uint32_t long_oft = (thread << LONG_SHL64) + sub;
-
-		const uint32_t* ctx_key = &ctx_key1[thread * 40U];
-		uint4 keys[10];
-		#pragma unroll 10 // load 160 bytes
-		for (int i = 0; i < 10; i ++)
-			keys[i] = AS_UINT4(&ctx_key[i*4]);
+	cn_aes_gpu_init(sharedMemory);
+	__syncthreads();
 
-		uint4 text = AS_UINT4(&ctx_state[thread * 26U + sub + 8U]);
+	if(thread < threads)
+	{
+		const uint32_t sub = (threadIdx.x & 0x7U) << 2;
+		uint32_t *longstate = &d_long_state[(thread << LONG_SHL32) + sub];
+		uint32_t __align__(8) key[40];
+		MEMCPY8(key, &ctx_key1[thread * 40U], 20);
+		uint32_t __align__(8) text[4];
+		MEMCPY8(text, &ctx_state[thread * 50U + sub + 16U], 2);
 
-		for (uint32_t i = 0; i < LONG_LOOPS64; i += 16U) {
-			cn_aes_pseudo_round_mut_uint4(sharedMemory, text, keys);
-			AS_UINT4(&long_state[long_oft + i]) = text;
+		for(int i = 0; i < LONG_LOOPS32; i += 32)
+		{
+			cn_aes_pseudo_round_mut(sharedMemory, text, key);
+			MEMCPY8(&longstate[i], text, 2);
 		}
 	}
 }
 
+// --------------------------------------------------------------------------------------------------------------
+
 __device__ __forceinline__ ulonglong2 cuda_mul128(const uint64_t multiplier, const uint64_t multiplicand)
 {
 	ulonglong2 product;
@@ -59,8 +67,7 @@ static __forceinline__ __device__ ulonglong2 operator ^ (const ulonglong2 &a, co
 	return make_ulonglong2(a.x ^ b.x, a.y ^ b.y);
 }
 
-#undef MUL_SUM_XOR_DST
-__device__ __forceinline__ void MUL_SUM_XOR_DST(const uint64_t m, uint4 &a, void* far_dst)
+__device__ __forceinline__ void MUL_SUM_XOR_DST_0(const uint64_t m, uint4 &a, void* far_dst)
 {
 	ulonglong2 d = AS_UL2(far_dst);
 	ulonglong2 p = cuda_mul128(m, d.x);
@@ -73,8 +80,8 @@ __global__
 #if __CUDA_ARCH__ >= 500
 //__launch_bounds__(128,12) /* force 40 regs to allow -l ...x32 */
 #endif
-void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor, const uint32_t partidx,
-	uint64_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
+void cryptonight_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
+	uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b)
 {
 	__shared__ __align__(16) uint32_t sharedMemory[1024];
 	cn_aes_gpu_init(sharedMemory);
@@ -84,7 +91,7 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor,
 
 	if (thread < threads)
 	{
-		const uint32_t batchsize = ITER >> (2U + bfactor);
+		const uint32_t batchsize = ITER >> (2 + bfactor);
 		const uint32_t start = partidx * batchsize;
 		const uint32_t end = start + batchsize;
 
@@ -101,12 +108,12 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor,
 			uint32_t j = (A.x & E2I_MASK) >> 3;
 			cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C);
 			AS_UINT4(&long_state[j]) = C ^ B; // st.global.u32.v4
-			MUL_SUM_XOR_DST((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3]);
+			MUL_SUM_XOR_DST_0((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3]);
 
 			j = (A.x & E2I_MASK) >> 3;
 			cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B);
 			AS_UINT4(&long_state[j]) = C ^ B;
-			MUL_SUM_XOR_DST((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3]);
+			MUL_SUM_XOR_DST_0((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3]);
 		}
 
 		if (bfactor) {
@@ -116,71 +123,194 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor,
 	}
 }
 
+// --------------------------------------------------------------------------------------------------------------
+
+__device__ __forceinline__ void store_variant1(uint64_t* long_state, uint4 Z)
+{
+	const uint32_t tmp = (Z.z >> 24); // __byte_perm(src, 0, 0x7773);
+	const uint32_t index = (((tmp >> 3) & 6u) | (tmp & 1u)) << 1;
+	Z.z = (Z.z & 0x00ffffffu) | ((tmp ^ ((0x75310u >> index) & 0x30u)) << 24);
+	AS_UINT4(long_state) = Z;
+}
+
+__device__ __forceinline__ void store_variant2(uint64_t* long_state, uint4 Z)
+{
+	const uint32_t tmp = (Z.z >> 24); // __byte_perm(src, 0, 0x7773);
+	const uint32_t index = (((tmp >> 4) & 6u) | (tmp & 1u)) << 1;
+	Z.z = (Z.z & 0x00ffffffu) | ((tmp ^ ((0x75312u >> index) & 0x30u)) << 24);
+	AS_UINT4(long_state) = Z;
+}
+
+__device__ __forceinline__ void MUL_SUM_XOR_DST_1(const uint64_t m, uint4 &a, void* far_dst, uint64_t tweak)
+{
+	ulonglong2 d = AS_UL2(far_dst);
+	ulonglong2 p = cuda_mul128(m, d.x);
+	p += AS_UL2(&a);
+	AS_UL2(&a) = p ^ d;
+	p.y = p.y ^ tweak;
+	AS_UL2(far_dst) = p;
+}
+
 __global__
-void cryptonight_core_gpu_phase3(const uint32_t threads, const uint64_t * long_state, uint64_t * ctx_state, uint32_t * __restrict__ ctx_key2)
+void monero_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
+	uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
+	uint64_t * __restrict__ d_tweak)
 {
 	__shared__ __align__(16) uint32_t sharedMemory[1024];
 	cn_aes_gpu_init(sharedMemory);
 	__syncthreads();
 
-	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3U;
-	const uint32_t sub = (threadIdx.x & 7U) << 1U;
+	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
+	if (thread < threads)
+	{
+		const uint32_t batchsize = ITER >> (2 + bfactor);
+		const uint32_t start = partidx * batchsize;
+		const uint32_t end = start + batchsize;
+		uint64_t tweak = d_tweak[thread];
+
+		void * ctx_a = (void*)(&d_ctx_a[thread << 2]);
+		void * ctx_b = (void*)(&d_ctx_b[thread << 2]);
+		uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4
+		uint4 B = AS_UINT4(ctx_b);
 
-	if(thread < threads)
+		uint64_t * long_state = &d_long_state[thread << LONG_SHL64];
+		for (int i = start; i < end; i++) // end = 262144
+		{
+			uint4 C;
+			uint32_t j = (A.x & E2I_MASK) >> 3;
+			cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C);
+			store_variant1(&long_state[j], C ^ B); // st.global
+			MUL_SUM_XOR_DST_1((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3], tweak);
+
+			j = (A.x & E2I_MASK) >> 3;
+			cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B);
+			store_variant1(&long_state[j], C ^ B);
+			MUL_SUM_XOR_DST_1((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3], tweak);
+		}
+		if (bfactor) {
+			AS_UINT4(ctx_a) = A;
+			AS_UINT4(ctx_b) = B;
+		}
+	}
+}
+
+// --------------------------------------------------------------------------------------------------------------
+
+__global__
+void stellite_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
+	uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
+	uint64_t * __restrict__ d_tweak)
+{
+	__shared__ __align__(16) uint32_t sharedMemory[1024];
+	cn_aes_gpu_init(sharedMemory);
+	__syncthreads();
+
+	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
+	if (thread < threads)
 	{
-		const uint32_t long_oft = (thread << LONG_SHL64) + sub;
-		const uint32_t st_oft = (thread * 26U) + sub + 8U;
+		const uint32_t batchsize = ITER >> (2 + bfactor);
+		const uint32_t start = partidx * batchsize;
+		const uint32_t end = start + batchsize;
+		uint64_t tweak = d_tweak[thread];
+
+		void * ctx_a = (void*)(&d_ctx_a[thread << 2]);
+		void * ctx_b = (void*)(&d_ctx_b[thread << 2]);
+		uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4
+		uint4 B = AS_UINT4(ctx_b);
+
+		uint64_t * long_state = &d_long_state[thread << LONG_SHL64];
+		for (int i = start; i < end; i++) // end = 262144
+		{
+			uint4 C;
+			uint32_t j = (A.x & E2I_MASK) >> 3;
+			cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C);
+			store_variant2(&long_state[j], C ^ B); // st.global
+			MUL_SUM_XOR_DST_1((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3], tweak);
+
+			j = (A.x & E2I_MASK) >> 3;
+			cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B);
+			store_variant2(&long_state[j], C ^ B);
+			MUL_SUM_XOR_DST_1((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3], tweak);
+		}
+		if (bfactor) {
+			AS_UINT4(ctx_a) = A;
+			AS_UINT4(ctx_b) = B;
+		}
+	}
+}
+
+// --------------------------------------------------------------------------------------------------------------
 
-		uint4 key[10];
-		const uint32_t* ctx_key = &ctx_key2[thread * 40U];
-		#pragma unroll 10 // 160 bytes
-		for (int i = 0; i < 10; i++)
-			key[i] = AS_UINT4(&ctx_key[i*4U]);
+__global__
+void cryptonight_gpu_phase3(const uint32_t threads, const uint32_t * __restrict__ d_long_state,
+	uint32_t * __restrict__ d_ctx_state, const uint32_t * __restrict__ d_ctx_key2)
+{
+	__shared__ uint32_t sharedMemory[1024];
+
+	cn_aes_gpu_init(sharedMemory);
+	__syncthreads();
+
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3;
 
-	uint4 text = AS_UINT4(&ctx_state[st_oft]);
+	if(thread < threads)
+	{
+		const int sub = (threadIdx.x & 7) << 2;
+		const uint32_t *longstate = &d_long_state[(thread << LONG_SHL32) + sub];
+		uint32_t key[40], text[4];
+		MEMCPY8(key, d_ctx_key2 + thread * 40, 20);
+		MEMCPY8(text, d_ctx_state + thread * 50 + sub + 16, 2);
 
-	for(uint32_t i = 0; i < LONG_LOOPS64; i += 16U)
+		for(int i = 0; i < LONG_LOOPS32; i += 32)
 		{
-			uint4 st = AS_UINT4(&long_state[long_oft + i]);
-			text = text ^ st;
-			cn_aes_pseudo_round_mut_uint4(sharedMemory, text, key);
+			#pragma unroll
+			for(int j = 0; j < 4; ++j)
+				text[j] ^= longstate[i + j];
+
+			cn_aes_pseudo_round_mut(sharedMemory, text, key);
 		}
 
-		AS_UINT4(&ctx_state[st_oft]) = text;
+		MEMCPY8(d_ctx_state + thread * 50 + sub + 16, text, 2);
 	}
 }
 
+// --------------------------------------------------------------------------------------------------------------
+
 extern int device_bfactor[MAX_GPUS];
 
 __host__
-void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint64_t *d_long_state, uint64_t *d_ctx_state,
-	uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
+void cryptonight_core_cuda(int thr_id, uint32_t blocks, uint32_t threads, uint64_t *d_long_state, uint32_t *d_ctx_state,
+	uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak)
 {
 	dim3 grid(blocks);
 	dim3 block(threads);
-	//dim3 block2(threads << 1);
 	dim3 block4(threads << 2);
 	dim3 block8(threads << 3);
 
-	const uint32_t bfactor = (uint32_t) device_bfactor[thr_id];
-	const uint32_t partcount = 1 << bfactor;
+	const uint16_t bfactor = (uint16_t) device_bfactor[thr_id];
+	const uint32_t partcount = 1U << bfactor;
 	const uint32_t throughput = (uint32_t) (blocks*threads);
 
 	const int bsleep = bfactor ? 100 : 0;
 	const int dev_id = device_map[thr_id];
 
-	cryptonight_core_gpu_phase1 <<<grid, block8>>> (throughput, d_long_state, d_ctx_state, d_ctx_key1);
+	cryptonight_gpu_phase1 <<<grid, block8>>> (throughput, (uint32_t*) d_long_state, d_ctx_state, d_ctx_key1);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 	if(partcount > 1) usleep(bsleep);
 
 	for (uint32_t i = 0; i < partcount; i++)
 	{
 		dim3 b = device_sm[dev_id] >= 300 ? block4 : block;
-		cryptonight_core_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
+		if (variant == 0)
+			cryptonight_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
+		else if (variant == 1 || cryptonight_fork == 8)
+			monero_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
+		else if (variant == 2 && cryptonight_fork == 3)
+			stellite_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
 		exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 		if(partcount > 1) usleep(bsleep);
 	}
 
-	cryptonight_core_gpu_phase3 <<<grid, block8>>> (throughput, d_long_state, d_ctx_state, d_ctx_key2);
+	//cudaDeviceSynchronize();
+	//exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
+	cryptonight_gpu_phase3 <<<grid, block8>>> (throughput, (uint32_t*) d_long_state, d_ctx_state, d_ctx_key2);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 }
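The bfactor parameter above splits phase 2 into several shorter launches, with a usleep(bsleep) between them to keep the GPU responsive for attached displays. A worked sketch of the arithmetic (assumption: ITER is 0x100000 here, consistent with the "end = 262144" comment in monero_gpu_phase2):

/* Sketch: how bfactor trades one long phase-2 launch for 2^bfactor short ones. */
#include <stdio.h>

#define ITER 0x100000u /* assumption, matches "end = 262144" above */

int main(void)
{
	for (unsigned bfactor = 0; bfactor <= 3; bfactor++) {
		unsigned partcount = 1u << bfactor;
		unsigned batchsize = ITER >> (2 + bfactor);
		printf("bfactor %u: %u launch(es) of %u iterations\n",
		       bfactor, partcount, batchsize);
	}
	return 0;
}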
diff --git a/crypto/cryptonight-cpu.cpp b/crypto/cryptonight-cpu.cpp
index 66b3cf4..b60798f 100644
--- a/crypto/cryptonight-cpu.cpp
+++ b/crypto/cryptonight-cpu.cpp
@@ -12,6 +12,20 @@ extern "C" {
 #include "cpu/c_keccak.h"
 }
 
+static void cryptonight_store_variant(void* state, int variant) {
+	if (variant == 1 || cryptonight_fork == 8) {
+		// monero, and graft ?
+		const uint8_t tmp = ((const uint8_t*)(state))[11];
+		const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
+		((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30);
+	} else if (variant == 2 && cryptonight_fork == 3) {
+		// stellite
+		const uint8_t tmp = ((const uint8_t*)(state))[11];
+		const uint8_t index = (((tmp >> 4) & 6) | (tmp & 1)) << 1;
+		((uint8_t*)(state))[11] = tmp ^ ((0x75312 >> index) & 0x30);
+	}
+}
+
 struct cryptonight_ctx {
 	uint8_t long_state[MEMORY];
 	union cn_slow_hash_state state;
@@ -130,14 +144,14 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, ui
 	((uint64_t*) dst)[0] += ((uint64_t*) c)[0];
 }
 
-static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
+static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak1_2) {
 	uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
 	hi += ((uint64_t*) c)[0];
 
 	((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
 	((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
 	((uint64_t*) dst)[0] = hi;
-	((uint64_t*) dst)[1] = lo;
+	((uint64_t*) dst)[1] = variant ? lo ^ tweak1_2 : lo;
 }
 
 static void copy_block(uint8_t* dst, const uint8_t* src) {
@@ -155,13 +169,18 @@ static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
 	((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
 }
 
-static void cryptonight_hash_ctx(void* output, const void* input, size_t len, struct cryptonight_ctx* ctx)
+static int cryptonight_hash_ctx(void* output, const void* input, const size_t len, struct cryptonight_ctx* ctx, const int variant)
 {
 	size_t i, j;
 
+	if (variant && len < 43)
+		return 0;
+
 	keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len);
 	ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
 	memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
 
+	const uint64_t tweak1_2 = variant ? *((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0;
+
 	oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
 	for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
 #undef RND
@@ -184,14 +203,16 @@ static void cryptonight_hash_ctx(void* output, const void* input, size_t len, st
 		j = e2i(ctx->a) * AES_BLOCK_SIZE;
 		aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a);
 		xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]);
+		cryptonight_store_variant(&ctx->long_state[j], variant);
 
-		mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE]);
+		mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE], variant, tweak1_2);
 
 		j = e2i(ctx->a) * AES_BLOCK_SIZE;
 		aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a);
 		xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]);
+		cryptonight_store_variant(&ctx->long_state[j], variant);
 
-		mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE]);
+		mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE], variant, tweak1_2);
 	}
 
 	memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
@@ -217,11 +238,38 @@ static void cryptonight_hash_ctx(void* output, const void* input, size_t len, st
 	if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo);
 
 	oaes_free((OAES_CTX **) &ctx->aes_ctx);
+	return 1;
 }
 
-void cryptonight_hash(void* output, const void* input, size_t len)
+int cryptonight_hash_variant(void* output, const void* input, size_t len, int variant)
 {
 	struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
-	cryptonight_hash_ctx(output, input, len, ctx);
+	int rc = cryptonight_hash_ctx(output, input, len, ctx, variant);
 	free(ctx);
+	return rc;
+}
+
+void cryptonight_hash(void* output, const void* input)
+{
+	cryptonight_fork = 1;
+	cryptonight_hash_variant(output, input, 76, 0);
+}
+
+void graft_hash(void* output, const void* input)
+{
+	cryptonight_fork = 8;
+	cryptonight_hash_variant(output, input, 76, 1);
+}
+
+void monero_hash(void* output, const void* input)
+{
+	cryptonight_fork = 7;
+	cryptonight_hash_variant(output, input, 76, 1);
+}
+
+void stellite_hash(void* output, const void* input)
+{
+	cryptonight_fork = 3;
+	cryptonight_hash_variant(output, input, 76, 2);
+}
+
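mul_sum_xor_dst above relies on mul128() returning the low 64 bits of a 64x64 product while storing the high half through a pointer. A portable sketch of that split (an assumption-level equivalent for platforms without a 128-bit integer type, not the project's own implementation):

/* Sketch: portable 64x64 -> 128-bit multiply. */
#include <stdint.h>

static uint64_t mul128_portable(uint64_t a, uint64_t b, uint64_t *hi)
{
	uint64_t a_lo = (uint32_t) a, a_hi = a >> 32;
	uint64_t b_lo = (uint32_t) b, b_hi = b >> 32;

	uint64_t p0 = a_lo * b_lo;
	uint64_t p1 = a_lo * b_hi;
	uint64_t p2 = a_hi * b_lo;
	uint64_t p3 = a_hi * b_hi;

	uint64_t mid  = p1 + (p0 >> 32);      /* cannot overflow */
	uint64_t mid2 = p2 + (uint32_t) mid;  /* may carry into the high part */

	*hi = p3 + (mid >> 32) + (mid2 >> 32);
	return (mid2 << 32) | (uint32_t) p0;
}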
diff --git a/crypto/cryptonight-extra.cu b/crypto/cryptonight-extra.cu
index 6d3c131..c55c518 100644
--- a/crypto/cryptonight-extra.cu
+++ b/crypto/cryptonight-extra.cu
@@ -7,15 +7,15 @@
 #include <unistd.h>
 #include <memory.h>
-#include "cryptonight.h"
 
-typedef uint8_t BitSequence;
-typedef uint64_t DataLength;
+#include "cryptonight.h"
 
-static uint32_t *d_input[MAX_GPUS] = { 0 };
+static uint32_t *d_input[MAX_GPUS];
 static uint32_t *d_target[MAX_GPUS];
 static uint32_t *d_result[MAX_GPUS];
 
+typedef uint8_t BitSequence;
+typedef uint32_t DataLength;
+
 #include "cn_keccak.cuh"
 #include "cn_blake.cuh"
 #include "cn_groestl.cuh"
@@ -44,13 +44,11 @@ __constant__ uint8_t d_sub_byte[16][16] = {
 __device__ __forceinline__
 void cryptonight_aes_set_key(uint32_t * __restrict__ key, const uint32_t * __restrict__ data)
 {
-	const uint32_t aes_gf[] = {
+	const uint32_t aes_gf[10] = {
 		0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
 	};
 
-	MEMSET4(key, 0, 40);
 	MEMCPY4(key, data, 8);
-
 	#pragma unroll
 	for(int i = 8; i < 40; i++)
 	{
@@ -74,15 +72,14 @@ void cryptonight_aes_set_key(uint32_t * __restrict__ key, const uint32_t * __res
 }
 
 __global__
-void cryptonight_extra_gpu_prepare(const uint32_t threads, uint32_t * __restrict__ d_input, uint32_t startNonce,
-	uint64_t * d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
-	uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2)
+void cryptonight_extra_gpu_prepare(const uint32_t threads, const uint32_t * __restrict__ d_input, uint32_t startNonce,
+	uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
+	uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2, int variant, uint64_t * d_ctx_tweak)
 {
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-
 	if(thread < threads)
 	{
-		uint32_t ctx_state[50];
+		uint64_t ctx_state[25];
 		uint32_t ctx_a[4];
 		uint32_t ctx_b[4];
 		uint32_t ctx_key1[40];
@@ -90,92 +87,62 @@ void cryptonight_extra_gpu_prepare(const uint32_t threads, uint32_t * __restrict
 		uint32_t input[19];
 
 		MEMCPY4(input, d_input, 19);
-		*((uint32_t *)(((char *)input) + 39)) = startNonce + thread;
-
-		cn_keccak((uint8_t *)input, (uint8_t *)ctx_state);
-		cryptonight_aes_set_key(ctx_key1, ctx_state);
-		cryptonight_aes_set_key(ctx_key2, ctx_state + 8);
-		XOR_BLOCKS_DST(ctx_state, ctx_state + 8, ctx_a);
-		XOR_BLOCKS_DST(ctx_state + 4, ctx_state + 12, ctx_b);
-
-		MEMCPY8(&d_ctx_state[thread * 26], ctx_state, 25);
-		MEMCPY4(d_ctx_a + thread * 4, ctx_a, 4);
-		MEMCPY4(d_ctx_b + thread * 4, ctx_b, 4);
-		MEMCPY4(d_ctx_key1 + thread * 40, ctx_key1, 40);
-		MEMCPY4(d_ctx_key2 + thread * 40, ctx_key2, 40);
-	}
-}
-
-__global__
-void cryptonight_extra_gpu_keccak(uint32_t threads, uint32_t * d_ctx_state)
-{
-	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if(thread < threads)
-	{
-		uint64_t* ctx_state = (uint64_t*) (&d_ctx_state[thread * 52U]);
-		uint64_t state[25];
-		#pragma unroll
-		for(int i = 0; i < 25; i++)
-			state[i] = ctx_state[i];
-
-		cn_keccakf2(state);
-
-		// to reduce the final kernel stack frame, cut algos in 2 kernels
-		// ps: these 2 final kernels are not important for the overall xmr hashrate (< 1%)
-		switch (((uint8_t*)state)[0] & 0x03)
-		{
-			case 0: {
-				uint32_t hash[8];
-				cn_blake((uint8_t*)state, 200, (uint8_t*)hash);
-				((uint32_t*)ctx_state)[0] = 0;
-				((uint32_t*)ctx_state)[6] = hash[6];
-				((uint32_t*)ctx_state)[7] = hash[7];
-				break;
-			}
-			case 1: {
-				uint32_t hash[8];
-				cn_groestl((BitSequence*)state, 200, (BitSequence*)hash);
-				((uint32_t*)ctx_state)[0] = 0;
-				((uint32_t*)ctx_state)[6] = hash[6];
-				((uint32_t*)ctx_state)[7] = hash[7];
-				break;
-			}
-			default: {
-				#pragma unroll
-				for(int i = 0; i < 25; i++)
-					ctx_state[i] = state[i];
-			}
+		uint32_t nonce = startNonce + thread;
+		*(((uint8_t *)input) + 39) = nonce & 0xff;
+		*(((uint8_t *)input) + 40) = (nonce >> 8) & 0xff;
+		*(((uint8_t *)input) + 41) = (nonce >> 16) & 0xff;
+		*(((uint8_t *)input) + 42) = (nonce >> 24) & 0xff;
+
+		cn_keccak(input, ctx_state);
+		MEMCPY4(&d_ctx_state[thread * 50U], ctx_state, 50);
+
+		cryptonight_aes_set_key(ctx_key1, (uint32_t*)(&ctx_state[0]));
+		cryptonight_aes_set_key(ctx_key2, (uint32_t*)(&ctx_state[4]));
+		MEMCPY4(&d_ctx_key1[thread * 40U], ctx_key1, 40);
+		MEMCPY4(&d_ctx_key2[thread * 40U], ctx_key2, 40);
+
+		XOR_BLOCKS_DST(&ctx_state[0], &ctx_state[4], ctx_a);
+		XOR_BLOCKS_DST(&ctx_state[2], &ctx_state[6], ctx_b);
+		MEMCPY4(&d_ctx_a[thread * 4U], ctx_a, 4);
+		MEMCPY4(&d_ctx_b[thread * 4U], ctx_b, 4);
+
+		if (variant) {
+			uint2 tweak = AS_UINT2(&ctx_state[24]);
+			//tweak.x ^= (input[8] >> 24) | (input[9] << 8);
+			tweak.x ^= __byte_perm(input[8], input[ 9], 0x6543);
+			tweak.y ^= __byte_perm(input[9], input[10], 0x6543);
+			MEMCPY4(&d_ctx_tweak[thread], &tweak, 2);
 		}
 	}
 }
 
 __global__
-void cryptonight_extra_gpu_final(uint32_t threads, const uint32_t startNonce, uint64_t * __restrict__ d_ctx_state,
-	const uint32_t* d_target, uint32_t * resNonces)
+void cryptonight_extra_gpu_final(const uint32_t threads, uint32_t startNonce, const uint32_t * __restrict__ d_target,
+	uint32_t * __restrict__ resNonces, uint32_t * __restrict__ d_ctx_state)
 {
-	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
 	if(thread < threads)
 	{
-		uint64_t* const state = &d_ctx_state[thread * 26U];
-
+		uint32_t *ctx_state = &d_ctx_state[thread * 50U];
 		uint32_t hash[8];
-		switch(((uint8_t *)state)[0] & 0x03)
-		{
-			case 0: {
-				uint32_t* h32 = (uint32_t*)state;
-				hash[6] = h32[6];
-				hash[7] = h32[7];
-				break;
-			}
-			case 2: {
-				cn_jh256((uint8_t*)state, 200, hash);
-				break;
-			}
-			case 3: {
-				cn_skein((uint8_t*)state, 200, hash);
-				break;
-			}
-		}
+		uint32_t state[50];
+
+		#pragma unroll 25
+		for(int i = 0; i < 50; i+=2)
+			AS_UINT2(&state[i]) = AS_UINT2(&ctx_state[i]);
+
+		cn_keccakf2((uint64_t *)state);
+
+		int branch = ((uint8_t *)state)[0] & 0x03;
+		if(branch == 0)
+			cn_blake((const uint8_t *)state, 200, hash);
+		if(branch == 1)
+			cn_groestl((const uint8_t *)state, 200, hash);
+		if(branch == 2)
+			cn_jh((const uint8_t *)state, 200, hash);
+		if(branch == 3)
+			cn_skein((const uint8_t *)state, 200, hash);
 
 		if(hash[7] <= d_target[1] && hash[6] <= d_target[0])
 		{
@@ -188,55 +155,53 @@ void cryptonight_extra_gpu_final(uint32_t threads, const uint32_t startNonce, ui
 }
 
 __host__
-void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *ptarget)
+void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget)
 {
 	uint32_t *pTargetIn = (uint32_t*) ptarget;
-	cudaMemcpy(d_input[thr_id], data, 19 * sizeof(uint32_t), cudaMemcpyHostToDevice);
-	cudaMemcpy(d_target[thr_id], &pTargetIn[6], 2*sizeof(uint32_t), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_input[thr_id], data, 20 * sizeof(uint32_t), cudaMemcpyHostToDevice);
+	cudaMemcpy(d_target[thr_id], &pTargetIn[6], 2 * sizeof(uint32_t), cudaMemcpyHostToDevice);
+	cudaMemset(d_result[thr_id], 0xFF, 2 * sizeof(uint32_t));
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 }
 
 __host__
-void cryptonight_extra_cpu_init(int thr_id, uint32_t threads)
+void cryptonight_extra_init(int thr_id)
 {
-	cudaMalloc(&d_input[thr_id], 19 * sizeof(uint32_t));
-	cudaMalloc(&d_target[thr_id], 2*sizeof(uint32_t));
-	cudaMalloc(&d_result[thr_id], 2*sizeof(uint32_t));
+	cudaMalloc(&d_input[thr_id], 20 * sizeof(uint32_t));
+	cudaMalloc(&d_target[thr_id], 2 * sizeof(uint32_t));
+	cudaMalloc(&d_result[thr_id], 2 * sizeof(uint32_t));
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 }
 
 __host__
-void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
+void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak)
 {
 	uint32_t threadsperblock = 128;
 
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 
-	cryptonight_extra_gpu_prepare <<<grid, block>>> (threads, d_input[thr_id], startNonce, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1, d_ctx_key2);
+	cryptonight_extra_gpu_prepare <<<grid, block>>> (threads, d_input[thr_id], startNonce, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1, d_ctx_key2, variant, d_ctx_tweak);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 }
 
 __host__
-void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resnonce, uint64_t *d_ctx_state)
+void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces, uint32_t *d_ctx_state)
 {
 	uint32_t threadsperblock = 128;
 
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 
-	cudaMemset(d_result[thr_id], 0xFF, 2*sizeof(uint32_t));
-	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
-	cryptonight_extra_gpu_keccak <<<grid, block>>> (threads, (uint32_t*)d_ctx_state);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
-	cryptonight_extra_gpu_final <<<grid, block>>> (threads, startNonce, d_ctx_state, d_target[thr_id], d_result[thr_id]);
+	cryptonight_extra_gpu_final <<<grid, block>>> (threads, startNonce, d_target[thr_id], d_result[thr_id], d_ctx_state);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
-	cudaMemcpy(resnonce, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	cudaMemcpy(resNonces, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 }
 
 __host__
-void cryptonight_extra_cpu_free(int thr_id)
+void cryptonight_extra_free(int thr_id)
 {
 	if (d_input[thr_id]) {
 		cudaFree(d_input[thr_id]);
@@ -244,4 +209,4 @@ void cryptonight_extra_cpu_free(int thr_id)
 		cudaFree(d_result[thr_id]);
 		d_input[thr_id] = NULL;
 	}
-}
\ No newline at end of file
+}
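The __byte_perm(x, y, 0x6543) calls in the prepare kernel splice byte 3 of one input word with bytes 0..2 of the next, i.e. an unaligned 32-bit read across a word boundary (the commented-out shift expression in the diff says the same thing). A plain-C sketch with a worked value, not part of the patch:

/* Sketch: plain-C equivalent of __byte_perm(x, y, 0x6543). */
#include <stdint.h>
#include <stdio.h>

static uint32_t byte_perm_6543(uint32_t x, uint32_t y)
{
	return (x >> 24) | (y << 8);
}

int main(void)
{
	uint32_t input8 = 0xddccbbaa, input9 = 0x44332211;
	/* selector 0x6543: byte 3 of x (0xdd), then bytes 0,1,2 of y */
	printf("0x%08x\n", byte_perm_6543(input8, input9)); /* 0x332211dd */
	return 0;
}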
diff --git a/crypto/cryptonight.cu b/crypto/cryptonight.cu
index 0214ce4..5f92972 100644
--- a/crypto/cryptonight.cu
+++ b/crypto/cryptonight.cu
@@ -12,16 +12,17 @@ static __thread bool gpu_init_shown = false;
 	gpulog(p, thr, fmt, ##__VA_ARGS__)
 
 static uint64_t *d_long_state[MAX_GPUS];
-static uint64_t *d_ctx_state[MAX_GPUS];
+static uint32_t *d_ctx_state[MAX_GPUS];
 static uint32_t *d_ctx_key1[MAX_GPUS];
 static uint32_t *d_ctx_key2[MAX_GPUS];
 static uint32_t *d_ctx_text[MAX_GPUS];
+static uint64_t *d_ctx_tweak[MAX_GPUS];
 static uint32_t *d_ctx_a[MAX_GPUS];
 static uint32_t *d_ctx_b[MAX_GPUS];
 
 static bool init[MAX_GPUS] = { 0 };
 
-extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant)
 {
 	int res = 0;
 	uint32_t throughput = 0;
@@ -49,6 +50,10 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
 		gpulog_init(LOG_INFO, thr_id, "%s, %d MB available, %hd SMX", device_name[dev_id],
 			mem, device_mpcount[dev_id]);
 
+		if (!device_config[thr_id] && strcmp(device_name[dev_id], "TITAN V") == 0) {
+			device_config[thr_id] = strdup("80x24");
+		}
+
 		if (device_config[thr_id]) {
 			int res = sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads);
 			throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
"%ux%u", &cn_blocks, &cn_threads); throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads); @@ -70,7 +75,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ exit(1); } - cudaSetDevice(device_map[thr_id]); + cudaSetDevice(dev_id); if (opt_cudaschedule == -1 && gpu_threads == 1) { cudaDeviceReset(); cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); @@ -79,11 +84,11 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ } const size_t alloc = MEMORY * throughput; - cryptonight_extra_cpu_init(thr_id, throughput); + cryptonight_extra_init(thr_id); cudaMalloc(&d_long_state[thr_id], alloc); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); - cudaMalloc(&d_ctx_state[thr_id], 208 * throughput); // 52*4 (200 is not aligned 16) + cudaMalloc(&d_ctx_state[thr_id], 50 * sizeof(uint32_t) * throughput); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); @@ -95,6 +100,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput); exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_tweak[thr_id], sizeof(uint64_t) * throughput); + exit_if_cudaerror(thr_id, __FILE__, __LINE__); gpu_init_shown = true; init[thr_id] = true; @@ -107,10 +114,10 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ const uint32_t Htarg = ptarget[7]; uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX }; - cryptonight_extra_cpu_setData(thr_id, pdata, ptarget); - cryptonight_extra_cpu_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]); - cryptonight_core_cuda(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]); - cryptonight_extra_cpu_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]); + cryptonight_extra_setData(thr_id, pdata, ptarget); + cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]); + cryptonight_core_cuda(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]); + cryptonight_extra_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]); *hashes_done = nonce - first_nonce + throughput; @@ -121,8 +128,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39); memcpy(tempdata, pdata, 76); *tempnonceptr = resNonces[0]; - cryptonight_hash(vhash, tempdata, 76); - if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) + const int rc = cryptonight_hash_variant(vhash, tempdata, 76, variant); + if(rc && (vhash[7] <= Htarg) && fulltest(vhash, ptarget)) { res = 1; work->nonces[0] = resNonces[0]; @@ -131,8 +138,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_ if(resNonces[1] != UINT32_MAX) { *tempnonceptr = resNonces[1]; - cryptonight_hash(vhash, tempdata, 76); - if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + const int rc = cryptonight_hash_variant(vhash, tempdata, 76, variant); + if(rc && (vhash[7] 
<= Htarg) && fulltest(vhash, ptarget)) { res++; work->nonces[1] = resNonces[1]; } else { @@ -174,10 +181,11 @@ void free_cryptonight(int thr_id) cudaFree(d_ctx_key1[thr_id]); cudaFree(d_ctx_key2[thr_id]); cudaFree(d_ctx_text[thr_id]); + cudaFree(d_ctx_tweak[thr_id]); cudaFree(d_ctx_a[thr_id]); cudaFree(d_ctx_b[thr_id]); - cryptonight_extra_cpu_free(thr_id); + cryptonight_extra_free(thr_id); cudaDeviceSynchronize(); diff --git a/crypto/cryptonight.h b/crypto/cryptonight.h index 4a31832..00417b9 100644 --- a/crypto/cryptonight.h +++ b/crypto/cryptonight.h @@ -20,7 +20,6 @@ struct uint3 blockDim; #define __umul64hi(a,b) a*b #endif - #define MEMORY (1U << 21) // 2 MiB / 2097152 B #define ITER (1U << 20) // 1048576 #define E2I_MASK 0x1FFFF0u @@ -136,10 +135,10 @@ static inline void exit_if_cudaerror(int thr_id, const char *src, int line) exit(1); } } -void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint64_t *d_long_state, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2); +void cryptonight_core_cuda(int thr_id, uint32_t blocks, uint32_t threads, uint64_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak); -void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn); -void cryptonight_extra_cpu_init(int thr_id, uint32_t threads); -void cryptonight_extra_cpu_free(int thr_id); -void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2); -void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *nonce, uint64_t *d_ctx_state); +void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget); +void cryptonight_extra_init(int thr_id); +void cryptonight_extra_free(int thr_id); +void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak); +void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resnonce, uint32_t *d_ctx_state); diff --git a/crypto/xmr-rpc.cpp b/crypto/xmr-rpc.cpp index 82b7845..433caa7 100644 --- a/crypto/xmr-rpc.cpp +++ b/crypto/xmr-rpc.cpp @@ -550,18 +550,24 @@ bool rpc2_stratum_submit(struct pool_infos *pool, struct work *work) } else if (opt_algo == ALGO_CRYPTOLIGHT) { + int variant = 1; uint32_t nonce = work->nonces[idnonce]; noncestr = bin2hex((unsigned char*) &nonce, 4); last_found_nonce = nonce; - cryptolight_hash(hash, data, 76); + //if (cryptonight_fork > 1 && ((unsigned char*)work->data)[0] >= cryptonight_fork) + // variant = ((unsigned char*)work->data)[0] - cryptonight_fork + 1; + cryptolight_hash_variant(hash, data, 76, variant); work_set_target_ratio(work, (uint32_t*) hash); } else if (opt_algo == ALGO_CRYPTONIGHT) { + int variant = 0; uint32_t nonce = work->nonces[idnonce]; noncestr = bin2hex((unsigned char*) &nonce, 4); last_found_nonce = nonce; - cryptonight_hash(hash, data, 76); + if (cryptonight_fork > 1 && ((unsigned char*)work->data)[0] >= cryptonight_fork) + variant = ((unsigned char*)work->data)[0] - cryptonight_fork + 1; + cryptonight_hash_variant(hash, data, 76, variant); work_set_target_ratio(work, (uint32_t*) hash); } diff --git a/miner.h b/miner.h index 2853906..86088cb 100644 --- 
diff --git a/miner.h b/miner.h
index 2853906..86088cb 100644
--- a/miner.h
+++ b/miner.h
@@ -279,8 +279,8 @@ extern int scanhash_blake256(int thr_id, struct work* work, uint32_t max_nonce,
 extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
-extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
-extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
+extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant);
+extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant);
 extern int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_equihash(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@@ -575,6 +575,8 @@ extern uint32_t device_plimit[MAX_GPUS];
 extern uint32_t gpus_intensity[MAX_GPUS];
 extern int opt_cudaschedule;
 
+extern int cryptonight_fork;
+
 // cuda.cpp
 int cuda_num_devices();
 void cuda_devicenames();
@@ -898,8 +900,12 @@ void blake2b_hash(void *output, const void *input);
 void blake2s_hash(void *output, const void *input);
 void bmw_hash(void *state, const void *input);
 void c11hash(void *output, const void *input);
-void cryptolight_hash(void* output, const void* input, int len);
-void cryptonight_hash(void* output, const void* input, size_t len);
+int cryptolight_hash_variant(void* output, const void* input, int len, int variant);
+void cryptolight_hash(void* output, const void* input);
+int cryptonight_hash_variant(void* output, const void* input, size_t len, int variant);
+void cryptonight_hash(void* output, const void* input);
+void monero_hash(void* output, const void* input);
+void stellite_hash(void* output, const void* input);
 void decred_hash(void *state, const void *input);
 void deephash(void *state, const void *input);
 void luffa_hash(void *state, const void *input);
diff --git a/util.cpp b/util.cpp
index 49cd854..9c2194d 100644
--- a/util.cpp
+++ b/util.cpp
@@ -2193,10 +2193,10 @@ void print_hash_tests(void)
 	c11hash(&hash[0], &buf[0]);
 	printpfx("c11", hash);
 
-	cryptolight_hash(&hash[0], &buf[0], 76);
+	cryptolight_hash(&hash[0], &buf[0]);
 	printpfx("cryptolight", hash);
 
-	cryptonight_hash(&hash[0], &buf[0], 76);
+	cryptonight_hash(&hash[0], &buf[0]);
 	printpfx("cryptonight", hash);
 
 	memset(buf, 0, 180);
@@ -2246,6 +2246,9 @@
 	lyra2Z_hash(&hash[0], &buf[0]);
 	printpfx("lyra2z", hash);
 
+	monero_hash(&hash[0], &buf[0]);
+	printpfx("monero", hash);
+
 	myriadhash(&hash[0], &buf[0]);
 	printpfx("myriad", hash);
 
@@ -2297,6 +2300,9 @@
 	skunk_hash(&hash[0], &buf[0]);
 	printpfx("skunk", hash);
 
+	stellite_hash(&hash[0], &buf[0]);
+	printpfx("stellite", hash);
+
 	s3hash(&hash[0], &buf[0]);
 	printpfx("S3", hash);
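Finally, the new length-free cryptonight_hash/monero_hash/stellite_hash entry points declared in miner.h are presumably thin wrappers over cryptonight_hash_variant; their bodies are not part of this diff, so the sketch below is a guess, and the hard-coded variant numbers in particular are our assumption:

    #include <stddef.h>

    /* Declaration as added to miner.h by this patch. */
    extern int cryptonight_hash_variant(void* output, const void* input, size_t len, int variant);

    /* Assumed wrapper shapes; 76 bytes matches the blob length used
     * throughout the patch (scanhash verification and the self-tests). */
    void cryptonight_hash(void* output, const void* input)
    {
        cryptonight_hash_variant(output, input, 76, 0);  /* legacy variant 0 */
    }

    void monero_hash(void* output, const void* input)
    {
        cryptonight_hash_variant(output, input, 76, 1);  /* assumed: v7 tweak = variant 1 */
    }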