diff --git a/Makefile.am b/Makefile.am index fc7f075..a607f52 100644 --- a/Makefile.am +++ b/Makefile.am @@ -40,7 +40,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \ Algo256/blake2s.cu sph/blake2s.c \ Algo256/bmw.cu Algo256/cuda_bmw.cu \ crypto/xmr-rpc.cpp crypto/wildkeccak-cpu.cpp crypto/wildkeccak.cu \ - crypto/cryptonight.cu crypto/cuda_cryptonight_core.cu crypto/cuda_cryptonight_extra.cu \ + crypto/cryptolight.cu crypto/cryptolight-core.cu crypto/cryptolight-cpu.cpp \ + crypto/cryptonight.cu crypto/cryptonight-core.cu crypto/cryptonight-extra.cu \ crypto/cryptonight-cpu.cpp crypto/oaes_lib.cpp crypto/aesb.cpp crypto/cpu/c_keccak.c \ JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \ JHA/cuda_jha_compactionTest.cu cuda_checkhash.cu \ @@ -107,7 +108,10 @@ Algo256/blake256.o: Algo256/blake256.cu Algo256/cuda_bmw.o: Algo256/cuda_bmw.cu $(NVCC) $(nvcc_FLAGS) --maxrregcount=76 -o $@ -c $< -crypto/cuda_cryptonight_extra.o: crypto/cuda_cryptonight_extra.cu +crypto/cryptonight-core.o: crypto/cryptonight-core.cu + $(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $< + +crypto/cryptonight-extra.o: crypto/cryptonight-extra.cu $(NVCC) $(nvcc_FLAGS) -o $@ -c $< heavy/cuda_hefty1.o: heavy/cuda_hefty1.cu diff --git a/README.txt b/README.txt index 74557df..9202260 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,5 @@ -ccminer 2.0 (December 2016) "Boolberry's WildKeccak RPC 2.0" +ccminer 2.0 (January 2016) "Cryptonight & other RPC 2.0 algos" --------------------------------------------------------------- *************************************************************** @@ -8,8 +8,12 @@ If you find this tool useful and like to support its continuous tpruvot@github: BTC : 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo - DRK : XeVrkPrWB7pDbdFLfKhF1Z3xpqhsx6wkH3 - ZRC : ZEcubH2xp2mpuwxMjy7wZThr5AzLGu3mqT + DCR : DsUCcACGcyP8McNMRXQwbtpDxaVUYLDQDeU + LBC : bKe6pLqELL3HHSbpJXxSdn5RrY2bfrkRhF + +Alexis: + BTC : 14EgXD7fPYD4sHBXWUi46VeiTVXNq765B8 + XVC : Vr5oCen8NrY6ekBWFaaWjCUFBH4dyiS57W DJM34: BTC donation address: 1NENYmxwZGHsKFmyjTc5WferTn5VTFb7Ze @@ -52,6 +56,8 @@ Vanilla (Blake256 8-rounds - double sha256) Vertcoin Lyra2RE Ziftrcoin (ZR5) Boolberry (Wild Keccak) +Monero (Cryptonight) +Aeon (Cryptonight-light) where some of these coins have a VERY NOTABLE nVidia advantage over competing AMD (OpenCL Only) implementations. @@ -74,6 +80,8 @@ its command line interface and options. blakecoin use to mine Old Blake 256 blake2s use to mine Nevacoin (Blake2-S 256) bmw use to mine Midnight + cryptolight use to mine AEON cryptonight (MEM/2) + cryptonight use to mine XMR cryptonight c11/flax use to mine Chaincoin and Flax decred use to mine Decred 180 bytes Blake256-14 deep use to mine Deepcoin @@ -186,9 +194,10 @@ Scrypt specific options: --no-autotune disable auto-tuning of kernel launch parameters -Boolberry specific: +XMR and Wildkeccak specific: -l, --launch-config gives the launch configuration for each kernel in a comma separated list, one per device. +Wildkeccak specific: -k, --scratchpad url Url used to download the scratchpad cache. diff --git a/algos.h b/algos.h index 9db1a13..f90aa92 100644 --- a/algos.h +++ b/algos.h @@ -10,6 +10,7 @@ enum sha_algos { ALGO_BLAKE2S, ALGO_BMW, ALGO_C11, + ALGO_CRYPTOLIGHT, ALGO_CRYPTONIGHT, ALGO_DEEP, ALGO_DECRED, @@ -63,6 +64,7 @@ static const char *algo_names[] = { "blake2s", "bmw", "c11", + "cryptolight", "cryptonight", "deep", "decred", @@ -123,6 +125,8 @@ static inline int algo_to_int(char* arg) // some aliases... if (!strcasecmp("all", arg)) i = ALGO_AUTO; + else if (!strcasecmp("cryptonight-light", arg)) + i = ALGO_CRYPTOLIGHT; else if (!strcasecmp("flax", arg)) i = ALGO_C11; else if (!strcasecmp("diamond", arg)) diff --git a/bench.cpp b/bench.cpp index 7e045dc..4c5203d 100644 --- a/bench.cpp +++ b/bench.cpp @@ -48,6 +48,7 @@ void algo_free_all(int thr_id) free_blake2s(thr_id); free_bmw(thr_id); free_c11(thr_id); + free_cryptolight(thr_id); free_cryptonight(thr_id); free_decred(thr_id); free_deep(thr_id); diff --git a/ccminer.cpp b/ccminer.cpp index 1e6d104..02ad780 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -223,6 +223,8 @@ Options:\n\ blake2s Blake2-S 256 (NEVA)\n\ blakecoin Fast Blake 256 (8 rounds)\n\ bmw BMW 256\n\ + cryptolight AEON cryptonight (MEM/2)\n\ + cryptonight XMR cryptonight\n\ c11/flax X11 variant\n\ decred Decred Blake256\n\ deep Deepcoin\n\ @@ -620,6 +622,7 @@ static bool work_decode(const json_t *val, struct work *work) data_size = 80; adata_sz = data_size / 4; break; + case ALGO_CRYPTOLIGHT: case ALGO_CRYPTONIGHT: case ALGO_WILDKECCAK: return rpc2_job_decode(val, work); @@ -1767,7 +1770,7 @@ static void *miner_thread(void *userdata) nonceptr = (uint32_t*) (((char*)work.data) + 1); wcmpoft = 2; wcmplen = 32; - } else if (opt_algo == ALGO_CRYPTONIGHT) { + } else if (opt_algo == ALGO_CRYPTOLIGHT || opt_algo == ALGO_CRYPTONIGHT) { nonceptr = (uint32_t*) (((char*)work.data) + 39); wcmplen = 39; } @@ -1802,7 +1805,7 @@ static void *miner_thread(void *userdata) extrajob = false; if (stratum_gen_work(&stratum, &g_work)) g_work_time = time(NULL); - if (opt_algo == ALGO_CRYPTONIGHT) + if (opt_algo == ALGO_CRYPTONIGHT || opt_algo == ALGO_CRYPTOLIGHT) nonceptr[0] += 0x100000; } } else { @@ -1845,7 +1848,7 @@ static void *miner_thread(void *userdata) wcmplen -= 4; } - if (opt_algo == ALGO_CRYPTONIGHT) { + if (opt_algo == ALGO_CRYPTONIGHT || opt_algo == ALGO_CRYPTOLIGHT) { uint32_t oldpos = nonceptr[0]; if (memcmp(&work.data[wcmpoft], &g_work.data[wcmpoft], wcmplen)) { memcpy(&work, &g_work, sizeof(struct work)); @@ -2097,6 +2100,7 @@ static void *miner_thread(void *userdata) case ALGO_VELTOR: minmax = 0x80000; break; + case ALGO_CRYPTOLIGHT: case ALGO_CRYPTONIGHT: case ALGO_SCRYPT_JANE: minmax = 0x1000; @@ -2160,6 +2164,9 @@ static void *miner_thread(void *userdata) case ALGO_C11: rc = scanhash_c11(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_CRYPTOLIGHT: + rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_CRYPTONIGHT: rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done); break; @@ -2306,6 +2313,7 @@ static void *miner_thread(void *userdata) // todo: update all algos to use work->nonces and pdata[19] as counter switch (opt_algo) { case ALGO_BLAKE2S: + case ALGO_CRYPTOLIGHT: case ALGO_CRYPTONIGHT: case ALGO_DECRED: case ALGO_LBRY: @@ -2830,7 +2838,7 @@ static void show_usage_and_exit(int status) if (opt_algo == ALGO_SCRYPT || opt_algo == ALGO_SCRYPT_JANE) { printf(scrypt_usage); } - if (opt_algo == ALGO_WILDKECCAK || opt_algo == ALGO_CRYPTONIGHT) { + if (opt_algo == ALGO_CRYPTONIGHT || opt_algo == ALGO_CRYPTOLIGHT || opt_algo == ALGO_WILDKECCAK) { printf(xmr_usage); } proper_exit(status); @@ -3685,7 +3693,7 @@ int main(int argc, char *argv[]) allow_mininginfo = false; } - if (opt_algo == ALGO_CRYPTONIGHT) { + if (opt_algo == ALGO_CRYPTONIGHT || opt_algo == ALGO_CRYPTOLIGHT) { rpc2_init(); if (!opt_quiet) applog(LOG_INFO, "Using JSON-RPC 2.0"); } diff --git a/ccminer.vcxproj b/ccminer.vcxproj index ed7c026..1e6d96a 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -235,6 +235,7 @@ + @@ -265,13 +266,15 @@ 76 - - 128 + + + 64 - - 128 + + + 64 - + 255 @@ -335,8 +338,9 @@ - + + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 9dc7dc2..9e943b3 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -823,13 +823,19 @@ Source Files\sia + + Source Files\CUDA\xmr + + + Source Files\CUDA\xmr + Source Files\CUDA\xmr - + Source Files\CUDA\xmr - + Source Files\CUDA\xmr @@ -851,4 +857,4 @@ Ressources - \ No newline at end of file + diff --git a/crypto/cryptolight-core.cu b/crypto/cryptolight-core.cu new file mode 100644 index 0000000..b8ccc6d --- /dev/null +++ b/crypto/cryptolight-core.cu @@ -0,0 +1,274 @@ +#include +#include +#include +#include +#include + +#include +#include + +#include "cryptolight.h" +#define LONG_SHL_IDX 18 +#define LONG_LOOPS32 0x40000 + +#ifdef WIN32 /* todo: --interactive */ +static __thread int cn_bfactor = 8; +static __thread int cn_bsleep = 100; +#else +static __thread int cn_bfactor = 0; +static __thread int cn_bsleep = 0; +#endif + +#include "cn_aes.cuh" + +#define MUL_SUM_XOR_DST(a,c,dst) { \ + uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], ((uint64_t *)dst)[0], &hi) + ((uint64_t *)c)[1]; \ + hi += ((uint64_t *)c)[0]; \ + ((uint64_t *)c)[0] = ((uint64_t *)dst)[0] ^ hi; \ + ((uint64_t *)c)[1] = ((uint64_t *)dst)[1] ^ lo; \ + ((uint64_t *)dst)[0] = hi; \ + ((uint64_t *)dst)[1] = lo; } + +__device__ __forceinline__ uint64_t cuda_mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi) +{ + *product_hi = __umul64hi(multiplier, multiplicand); + return(multiplier * multiplicand); +} + +__global__ +void cryptolight_core_gpu_phase1(int threads, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state, uint32_t * __restrict__ ctx_key1) +{ + __shared__ uint32_t sharedMemory[1024]; + + cn_aes_gpu_init(sharedMemory); + + const int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3; + const int sub = (threadIdx.x & 7) << 2; + + if(thread < threads) + { + uint32_t key[40], text[4]; + + MEMCPY8(key, ctx_key1 + thread * 40, 20); + MEMCPY8(text, ctx_state + thread * 50 + sub + 16, 2); + + __syncthreads(); + for(int i = 0; i < LONG_LOOPS32; i += 32) + { + cn_aes_pseudo_round_mut(sharedMemory, text, key); + MEMCPY8(&long_state[(thread << LONG_SHL_IDX) + sub + i], text, 2); + } + } +} + +__global__ +void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b) +{ + __shared__ uint32_t sharedMemory[1024]; + + cn_aes_gpu_init(sharedMemory); + + __syncthreads(); + +#if 0 && __CUDA_ARCH__ >= 300 + + const int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2; + const int sub = threadIdx.x & 3; + + if(thread < threads) + { + const int batchsize = ITER >> (2 + bfactor); + const int start = partidx * batchsize; + const int end = start + batchsize; + uint32_t * __restrict__ long_state = &d_long_state[thread << LONG_SHL_IDX]; + uint32_t * __restrict__ ctx_a = d_ctx_a + thread * 4; + uint32_t * __restrict__ ctx_b = d_ctx_b + thread * 4; + uint32_t a, b, c, x[4]; + uint32_t t1[4], t2[4], res; + uint64_t reshi, reslo; + int j; + + a = ctx_a[sub]; + b = ctx_b[sub]; + + #pragma unroll 8 + for(int i = start; i < end; ++i) + { + //j = ((uint32_t *)a)[0] & 0xFFFF0; + j = (__shfl((int)a, 0, 4) & E2I_MASK1) >> 2; + + //cn_aes_single_round(sharedMemory, &long_state[j], c, a); + x[0] = long_state[j + sub]; + x[1] = __shfl((int)x[0], sub + 1, 4); + x[2] = __shfl((int)x[0], sub + 2, 4); + x[3] = __shfl((int)x[0], sub + 3, 4); + c = a ^ + t_fn0(x[0] & 0xff) ^ + t_fn1((x[1] >> 8) & 0xff) ^ + t_fn2((x[2] >> 16) & 0xff) ^ + t_fn3((x[3] >> 24) & 0xff); + + //XOR_BLOCKS_DST(c, b, &long_state[j]); + long_state[j + sub] = c ^ b; + + //MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & 0xFFFF0]); + j = (__shfl((int)c, 0, 4) & E2I_MASK1) >> 2; + #pragma unroll + for(int k = 0; k < 2; k++) + t1[k] = __shfl((int)c, k, 4); + #pragma unroll + for(int k = 0; k < 4; k++) + t2[k] = __shfl((int)a, k, 4); + asm( + "mad.lo.u64 %0, %2, %3, %4;\n\t" + "mad.hi.u64 %1, %2, %3, %5;\n\t" + : "=l"(reslo), "=l"(reshi) + : "l"(((uint64_t *)t1)[0]), "l"(((uint64_t *)long_state)[j >> 1]), "l"(((uint64_t *)t2)[1]), "l"(((uint64_t *)t2)[0])); + res = (sub & 2 ? reslo : reshi) >> (sub & 1 ? 32 : 0); + a = long_state[j + sub] ^ res; + long_state[j + sub] = res; + + //j = ((uint32_t *)a)[0] & 0xFFFF0; + j = (__shfl((int)a, 0, 4) & E2I_MASK1) >> 2; + + //cn_aes_single_round(sharedMemory, &long_state[j], b, a); + x[0] = long_state[j + sub]; + x[1] = __shfl((int)x[0], sub + 1, 4); + x[2] = __shfl((int)x[0], sub + 2, 4); + x[3] = __shfl((int)x[0], sub + 3, 4); + b = a ^ + t_fn0(x[0] & 0xff) ^ + t_fn1((x[1] >> 8) & 0xff) ^ + t_fn2((x[2] >> 16) & 0xff) ^ + t_fn3((x[3] >> 24) & 0xff); + + //XOR_BLOCKS_DST(b, c, &long_state[j]); + long_state[j + sub] = c ^ b; + + //MUL_SUM_XOR_DST(b, a, &long_state[((uint32_t *)b)[0] & 0xFFFF0]); + j = (__shfl((int)b, 0, 4) & E2I_MASK1) >> 2; + + #pragma unroll + for(int k = 0; k < 2; k++) + t1[k] = __shfl((int)b, k, 4); + + #pragma unroll + for(int k = 0; k < 4; k++) + t2[k] = __shfl((int)a, k, 4); + asm( + "mad.lo.u64 %0, %2, %3, %4;\n\t" + "mad.hi.u64 %1, %2, %3, %5;\n\t" + : "=l"(reslo), "=l"(reshi) + : "l"(((uint64_t *)t1)[0]), "l"(((uint64_t *)long_state)[j >> 1]), "l"(((uint64_t *)t2)[1]), "l"(((uint64_t *)t2)[0])); + res = (sub & 2 ? reslo : reshi) >> (sub & 1 ? 32 : 0); + a = long_state[j + sub] ^ res; + long_state[j + sub] = res; + } + + if(bfactor > 0) + { + ctx_a[sub] = a; + ctx_b[sub] = b; + } + } + +#else // __CUDA_ARCH__ < 300 + + const int thread = blockDim.x * blockIdx.x + threadIdx.x; + + if(thread < threads) + { + const int batchsize = ITER >> (2 + bfactor); + const int start = partidx * batchsize; + const int end = start + batchsize; + const off_t longptr = (off_t) thread << LONG_SHL_IDX; + uint32_t * long_state = &d_long_state[longptr]; + uint32_t * ctx_a = &d_ctx_a[thread * 4]; + uint32_t * ctx_b = &d_ctx_b[thread * 4]; + uint32_t a[4], b[4]; + + MEMCPY8(a, ctx_a, 2); + MEMCPY8(b, ctx_b, 2); + + for(int i = start; i < end; i++) // end = 262144 + { + uint32_t c[4]; + uint32_t j = (a[0] >> 2) & E2I_MASK2; + cn_aes_single_round(sharedMemory, &long_state[j], c, a); + XOR_BLOCKS_DST(c, b, &long_state[j]); + MUL_SUM_XOR_DST(c, a, &long_state[(c[0] >> 2) & E2I_MASK2]); + + j = (a[0] >> 2) & E2I_MASK2; + cn_aes_single_round(sharedMemory, &long_state[j], b, a); + XOR_BLOCKS_DST(b, c, &long_state[j]); + MUL_SUM_XOR_DST(b, a, &long_state[(b[0] >> 2) & E2I_MASK2]); + } + + if(bfactor > 0) + { + MEMCPY8(ctx_a, a, 2); + MEMCPY8(ctx_b, b, 2); + } + } + +#endif // __CUDA_ARCH__ >= 300 +} + +__global__ +void cryptolight_core_gpu_phase3(int threads, const uint32_t * __restrict__ long_state, uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_key2) +{ + __shared__ uint32_t sharedMemory[1024]; + + cn_aes_gpu_init(sharedMemory); + + int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3; + int sub = (threadIdx.x & 7) << 2; + + if(thread < threads) + { + uint32_t key[40], text[4]; + MEMCPY8(key, d_ctx_key2 + thread * 40, 20); + MEMCPY8(text, d_ctx_state + thread * 50 + sub + 16, 2); + + __syncthreads(); + for(int i = 0; i < LONG_LOOPS32; i += 32) + { + #pragma unroll + for(int j = 0; j < 4; j++) + text[j] ^= long_state[(thread << LONG_SHL_IDX) + sub + i + j]; + + cn_aes_pseudo_round_mut(sharedMemory, text, key); + } + + MEMCPY8(d_ctx_state + thread * 50 + sub + 16, text, 2); + } +} + +__host__ +void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2) +{ + dim3 grid(blocks); + dim3 block(threads); + dim3 block4(threads << 2); + dim3 block8(threads << 3); + + const int bfactor = cn_bfactor; // device_bfactor[thr_id]; + const int bsleep = cn_bsleep; //device_bsleep[thr_id]; + + int i, partcount = 1 << bfactor; + int dev_id = device_map[thr_id]; + + cryptolight_core_gpu_phase1 <<>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key1); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + if(partcount > 1) usleep(bsleep); + + for(i = 0; i < partcount; i++) + { + cryptolight_core_gpu_phase2 <<= 300 ? block4 : block)>>>(blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + if(partcount > 1) usleep(bsleep); + } + + cryptolight_core_gpu_phase3 <<>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key2); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); +} diff --git a/crypto/cryptolight-cpu.cpp b/crypto/cryptolight-cpu.cpp new file mode 100644 index 0000000..b0ee386 --- /dev/null +++ b/crypto/cryptolight-cpu.cpp @@ -0,0 +1,229 @@ +#include +#include + +#include "oaes_lib.h" +#include "cryptolight.h" + +extern "C" { +#include +#include +#include +#include +#include "cpu/c_keccak.h" +} + +struct cryptonight_ctx { + uint8_t long_state[MEMORY]; + union cn_slow_hash_state state; + uint8_t text[INIT_SIZE_BYTE]; + uint8_t a[AES_BLOCK_SIZE]; + uint8_t b[AES_BLOCK_SIZE]; + uint8_t c[AES_BLOCK_SIZE]; + oaes_ctx* aes_ctx; +}; + +static void do_blake_hash(const void* input, int len, void* output) +{ + uchar hash[32]; + sph_blake256_context ctx; + sph_blake256_set_rounds(14); + sph_blake256_init(&ctx); + sph_blake256(&ctx, input, len); + sph_blake256_close(&ctx, hash); + memcpy(output, hash, 32); +} + +static void do_groestl_hash(const void* input, int len, void* output) +{ + uchar hash[32]; + sph_groestl256_context ctx; + sph_groestl256_init(&ctx); + sph_groestl256(&ctx, input, len); + sph_groestl256_close(&ctx, hash); + memcpy(output, hash, 32); +} + +static void do_jh_hash(const void* input, int len, void* output) +{ + uchar hash[64]; + sph_jh256_context ctx; + sph_jh256_init(&ctx); + sph_jh256(&ctx, input, len); + sph_jh256_close(&ctx, hash); + memcpy(output, hash, 32); +} + +static void do_skein_hash(const void* input, int len, void* output) +{ + uchar hash[32]; + sph_skein256_context ctx; + sph_skein256_init(&ctx); + sph_skein256(&ctx, input, len); + sph_skein256_close(&ctx, hash); + memcpy(output, hash, 32); +} + +// todo: use sph if possible +static void keccak_hash_permutation(union hash_state *state) { + keccakf((uint64_t*)state, 24); +} + +static void keccak_hash_process(union hash_state *state, const uint8_t *buf, int count) { + keccak1600(buf, (int)count, (uint8_t*)state); +} + +extern "C" int fast_aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey); +extern "C" int aesb_single_round(const uint8_t *in, uint8_t*out, const uint8_t *expandedKey); +extern "C" int aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey); +extern "C" int fast_aesb_pseudo_round_mut(uint8_t *val, uint8_t *expandedKey); + +static void (* const extra_hashes[4])(const void*, int, void *) = { + do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash +}; + +static uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi) +{ + // multiplier = ab = a * 2^32 + b + // multiplicand = cd = c * 2^32 + d + // ab * cd = a * c * 2^64 + (a * d + b * c) * 2^32 + b * d + uint64_t a = hi_dword(multiplier); + uint64_t b = lo_dword(multiplier); + uint64_t c = hi_dword(multiplicand); + uint64_t d = lo_dword(multiplicand); + + uint64_t ac = a * c; + uint64_t ad = a * d; + uint64_t bc = b * c; + uint64_t bd = b * d; + + uint64_t adbc = ad + bc; + uint64_t adbc_carry = adbc < ad ? 1 : 0; + + // multiplier * multiplicand = product_hi * 2^64 + product_lo + uint64_t product_lo = bd + (adbc << 32); + uint64_t product_lo_carry = product_lo < bd ? 1 : 0; + *product_hi = ac + (adbc >> 32) + (adbc_carry << 32) + product_lo_carry; + + return product_lo; +} + +static size_t e2i(const uint8_t* a) { + //const uint32_t mask = (MEMORY / AES_BLOCK_SIZE - 1); + //return (*((uint64_t*) a) / AES_BLOCK_SIZE) & mask; + return *((uint64_t*) a) & 0xFFFF0; /* mask * AES_BLOCK_SIZE */ +} + +static void mul(const uint8_t* a, const uint8_t* b, uint8_t* res) { + ((uint64_t*) res)[1] = mul128(((uint64_t*) a)[0], ((uint64_t*) b)[0], (uint64_t*) res); +} + +static void sum_half_blocks(uint8_t* a, const uint8_t* b) { + ((uint64_t*) a)[0] += ((uint64_t*) b)[0]; + ((uint64_t*) a)[1] += ((uint64_t*) b)[1]; +} + +static void sum_half_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) { + ((uint64_t*) dst)[0] = ((uint64_t*) a)[0] + ((uint64_t*) b)[0]; + ((uint64_t*) dst)[1] = ((uint64_t*) a)[1] + ((uint64_t*) b)[1]; +} + +static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, uint8_t* dst) { + ((uint64_t*) dst)[1] = mul128(((uint64_t*) a)[0], ((uint64_t*) b)[0], (uint64_t*) dst) + ((uint64_t*) c)[1]; + ((uint64_t*) dst)[0] += ((uint64_t*) c)[0]; +} + +static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) { + uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1]; + hi += ((uint64_t*) c)[0]; + + ((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi; + ((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo; + ((uint64_t*) dst)[0] = hi; + ((uint64_t*) dst)[1] = lo; +} + +static void copy_block(uint8_t* dst, const uint8_t* src) { + ((uint64_t*) dst)[0] = ((uint64_t*) src)[0]; + ((uint64_t*) dst)[1] = ((uint64_t*) src)[1]; +} + +static void xor_blocks(uint8_t* a, const uint8_t* b) { + ((uint64_t*) a)[0] ^= ((uint64_t*) b)[0]; + ((uint64_t*) a)[1] ^= ((uint64_t*) b)[1]; +} + +static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) { + ((uint64_t*) dst)[0] = ((uint64_t*) a)[0] ^ ((uint64_t*) b)[0]; + ((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1]; +} + +static void cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx) +{ + size_t i, j; + keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len); + ctx->aes_ctx = (oaes_ctx*) oaes_alloc(); + memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); + + oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE); + for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) { +#undef RND +#define RND(p) aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * p], ctx->aes_ctx->key->exp_data); + RND(0); + RND(1); + RND(2); + RND(3); + RND(4); + RND(5); + RND(6); + RND(7); + memcpy(&ctx->long_state[i], ctx->text, INIT_SIZE_BYTE); + } + + xor_blocks_dst(&ctx->state.k[0], &ctx->state.k[32], ctx->a); + xor_blocks_dst(&ctx->state.k[16], &ctx->state.k[48], ctx->b); + + for (i = 0; likely(i < ITER / 4); ++i) { + j = e2i(ctx->a); + aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a); + xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]); + + mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)]); + + j = e2i(ctx->a); + aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a); + xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]); + + mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)]); + } + + memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE); + oaes_key_import_data(ctx->aes_ctx, &ctx->state.hs.b[32], AES_KEY_SIZE); + for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) { +#undef RND +#define RND(p) xor_blocks(&ctx->text[p * AES_BLOCK_SIZE], &ctx->long_state[i + p * AES_BLOCK_SIZE]); \ + aesb_pseudo_round_mut(&ctx->text[p * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data); + RND(0); + RND(1); + RND(2); + RND(3); + RND(4); + RND(5); + RND(6); + RND(7); + } + memcpy(ctx->state.init, ctx->text, INIT_SIZE_BYTE); + keccak_hash_permutation(&ctx->state.hs); + + int extra_algo = ctx->state.hs.b[0] & 3; + extra_hashes[extra_algo](&ctx->state, 200, output); + if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo); + + oaes_free((OAES_CTX **) &ctx->aes_ctx); +} + +void cryptolight_hash(void* output, const void* input, int len) +{ + struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx)); + cryptolight_hash_ctx(output, input, len, ctx); + free(ctx); +} diff --git a/crypto/cryptolight.cu b/crypto/cryptolight.cu new file mode 100644 index 0000000..cb1008a --- /dev/null +++ b/crypto/cryptolight.cu @@ -0,0 +1,166 @@ + +#include "cryptolight.h" + +extern char *device_config[MAX_GPUS]; // -l 32x16 + +static __thread uint32_t cn_blocks = 32; +static __thread uint32_t cn_threads = 16; + +static uint32_t *d_long_state[MAX_GPUS]; +static uint32_t *d_ctx_state[MAX_GPUS]; +static uint32_t *d_ctx_key1[MAX_GPUS]; +static uint32_t *d_ctx_key2[MAX_GPUS]; +static uint32_t *d_ctx_text[MAX_GPUS]; +static uint32_t *d_ctx_a[MAX_GPUS]; +static uint32_t *d_ctx_b[MAX_GPUS]; + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + int res = 0; + uint32_t throughput = 0; + + uint32_t *ptarget = work->target; + uint8_t *pdata = (uint8_t*) work->data; + uint32_t *nonceptr = (uint32_t*) (&pdata[39]); + const uint32_t first_nonce = *nonceptr; + uint32_t nonce = first_nonce; + + if(opt_benchmark) { + ptarget[7] = 0x00ff; + } + + if(!init[thr_id]) + { + if (device_config[thr_id]) { + sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads); + throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads); + gpulog(LOG_INFO, thr_id, "Using %u x %u kernel launch config, %u threads", + cn_blocks, cn_threads, throughput); + } else { + throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads); + if (throughput != cn_blocks*cn_threads && cn_threads) { + cn_blocks = throughput / cn_threads; + throughput = cn_threads * cn_blocks; + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u threads (%ux%u)", + throughput2intensity(throughput), throughput, cn_blocks, cn_threads); + } + + if(sizeof(size_t) == 4 && throughput > UINT32_MAX / MEMORY) { + gpulog(LOG_ERR, thr_id, "THE 32bit VERSION CAN'T ALLOCATE MORE THAN 4GB OF MEMORY!"); + gpulog(LOG_ERR, thr_id, "PLEASE REDUCE THE NUMBER OF THREADS OR BLOCKS"); + exit(1); + } + + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_LOG_ERROR(); + } + + const size_t alloc = MEMORY * throughput; + cryptonight_extra_cpu_init(thr_id, throughput); + + cudaMalloc(&d_long_state[thr_id], alloc); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_state[thr_id], 50 * sizeof(uint32_t) * throughput); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_key2[thr_id], 40 * sizeof(uint32_t) * throughput); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_text[thr_id], 32 * sizeof(uint32_t) * throughput); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_a[thr_id], 4 * sizeof(uint32_t) * throughput); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput); + exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__); + + init[thr_id] = true; + } + + throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads); + + do + { + const uint32_t Htarg = ptarget[7]; + uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX }; + + cryptonight_extra_cpu_setData(thr_id, pdata, ptarget); + cryptonight_extra_cpu_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]); + cryptolight_core_cpu_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]); + cryptonight_extra_cpu_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]); + + *hashes_done = nonce - first_nonce + throughput; + + if(resNonces[0] != UINT32_MAX) + { + uint32_t vhash[8]; + uint32_t tempdata[19]; + uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39); + memcpy(tempdata, pdata, 76); + *tempnonceptr = resNonces[0]; + cryptolight_hash(vhash, tempdata, 76); + if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) + { + res = 1; + work->nonces[0] = resNonces[0]; + work_set_target_ratio(work, vhash); + // second nonce + if(resNonces[1] != UINT32_MAX) + { + *tempnonceptr = resNonces[1]; + cryptolight_hash(vhash, tempdata, 76); + if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + res++; + work->nonces[1] = resNonces[1]; + } else if (vhash[7] > Htarg) { + gpulog(LOG_WARNING, thr_id, "result for second nonce %08x does not validate on CPU!", resNonces[1]); + } + } + goto done; + } else if (vhash[7] > Htarg) { + gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", resNonces[0]); + } + } + + if ((uint64_t) throughput + nonce >= max_nonce - 127) { + nonce = max_nonce; + break; + } + + nonce += throughput; + gpulog(LOG_DEBUG, thr_id, "nonce %08x", nonce); + + } while (!work_restart[thr_id].restart && max_nonce > (uint64_t)throughput + nonce); + +done: + gpulog(LOG_DEBUG, thr_id, "nonce %08x exit", nonce); + work->valid_nonces = res; + *nonceptr = nonce; + return res; +} + +void free_cryptolight(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaFree(d_long_state[thr_id]); + cudaFree(d_ctx_state[thr_id]); + cudaFree(d_ctx_key1[thr_id]); + cudaFree(d_ctx_key2[thr_id]); + cudaFree(d_ctx_text[thr_id]); + cudaFree(d_ctx_a[thr_id]); + cudaFree(d_ctx_b[thr_id]); + + cryptonight_extra_cpu_free(thr_id); + + cudaDeviceSynchronize(); + + init[thr_id] = false; +} diff --git a/crypto/cryptolight.h b/crypto/cryptolight.h new file mode 100644 index 0000000..415e9a8 --- /dev/null +++ b/crypto/cryptolight.h @@ -0,0 +1,141 @@ +#pragma once +#include +#include + +#ifdef __INTELLISENSE__ +/* avoid red underlining */ +#define __CUDA_ARCH__ 520 +struct uint3 { + unsigned int x, y, z; +}; +struct uint3 threadIdx; +struct uint3 blockIdx; +struct uint3 blockDim; +#define atomicExch(p,y) (*p) = y +#define __funnelshift_r(a,b,c) 1 +#define __syncthreads() +#define asm(x) +#define __shfl(a,b,c) 1 +#endif + +#define MEMORY (1UL << 20) /* 1 MiB - 1048576 */ +#define ITER (1UL << 19) /* 512k */ +#define E2I_MASK1 0xFFFF0 /* MEMORY / AES_BLOCK_SIZE - 1 = 0xFFFF */ +#define E2I_MASK2 0x3FFFC /* 0xFFFF0 >> 2 */ + +#define AES_BLOCK_SIZE 16 +#define AES_KEY_SIZE 32 +#define INIT_SIZE_BLK 8 +#define INIT_SIZE_BYTE (INIT_SIZE_BLK * AES_BLOCK_SIZE) // 128 B + +#define AES_RKEY_LEN 4 +#define AES_COL_LEN 4 +#define AES_ROUND_BASE 7 + +#ifndef HASH_SIZE +#define HASH_SIZE 32 +#endif + +#ifndef HASH_DATA_AREA +#define HASH_DATA_AREA 136 +#endif + +#define hi_dword(x) (x >> 32) +#define lo_dword(x) (x & 0xFFFFFFFF) + +#define C32(x) ((uint32_t)(x ## U)) +#define T32(x) ((x) & C32(0xFFFFFFFF)) + +#ifndef ROTL64 + #if __CUDA_ARCH__ >= 350 + __forceinline__ __device__ uint64_t cuda_ROTL64(const uint64_t value, const int offset) { + uint2 result; + if(offset >= 32) { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + } else { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); + } + return __double_as_longlong(__hiloint2double(result.y, result.x)); + } + #define ROTL64(x, n) (cuda_ROTL64(x, n)) + #else + #define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) + #endif +#endif + +#ifndef ROTL32 + #if __CUDA_ARCH__ < 350 + #define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n)))) + #else + #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) ) + #endif +#endif + +#ifndef ROTR32 + #if __CUDA_ARCH__ < 350 + #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) + #else + #define ROTR32(x, n) __funnelshift_r( (x), (x), (n) ) + #endif +#endif + +#define MEMSET8(dst,what,cnt) { \ + int i_memset8; \ + uint64_t *out_memset8 = (uint64_t *)(dst); \ + for( i_memset8 = 0; i_memset8 < cnt; i_memset8++ ) \ + out_memset8[i_memset8] = (what); } + +#define MEMSET4(dst,what,cnt) { \ + int i_memset4; \ + uint32_t *out_memset4 = (uint32_t *)(dst); \ + for( i_memset4 = 0; i_memset4 < cnt; i_memset4++ ) \ + out_memset4[i_memset4] = (what); } + +#define MEMCPY8(dst,src,cnt) { \ + int i_memcpy8; \ + uint64_t *in_memcpy8 = (uint64_t *)(src); \ + uint64_t *out_memcpy8 = (uint64_t *)(dst); \ + for( i_memcpy8 = 0; i_memcpy8 < cnt; i_memcpy8++ ) \ + out_memcpy8[i_memcpy8] = in_memcpy8[i_memcpy8]; } + +#define MEMCPY4(dst,src,cnt) { \ + int i_memcpy4; \ + uint32_t *in_memcpy4 = (uint32_t *)(src); \ + uint32_t *out_memcpy4 = (uint32_t *)(dst); \ + for( i_memcpy4 = 0; i_memcpy4 < cnt; i_memcpy4++ ) \ + out_memcpy4[i_memcpy4] = in_memcpy4[i_memcpy4]; } + +#define XOR_BLOCKS_DST(x,y,z) { \ + ((uint64_t *)z)[0] = ((uint64_t *)(x))[0] ^ ((uint64_t *)(y))[0]; \ + ((uint64_t *)z)[1] = ((uint64_t *)(x))[1] ^ ((uint64_t *)(y))[1]; } + +union hash_state { + uint8_t b[200]; + uint64_t w[25]; +}; + +union cn_slow_hash_state { + union hash_state hs; + struct { + uint8_t k[64]; + uint8_t init[INIT_SIZE_BYTE]; + }; +}; + +static inline void exit_if_cudaerror(int thr_id, const char *src, int line) +{ + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) { + gpulog(LOG_ERR, thr_id, "%s %s line %d", cudaGetErrorString(err), src, line); + exit(1); + } +} +void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2); + +void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn); +void cryptonight_extra_cpu_init(int thr_id, uint32_t threads); +void cryptonight_extra_cpu_free(int thr_id); +void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2); +void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *nonce, uint32_t *d_ctx_state); diff --git a/crypto/cuda_cryptonight_core.cu b/crypto/cryptonight-core.cu similarity index 92% rename from crypto/cuda_cryptonight_core.cu rename to crypto/cryptonight-core.cu index 27b2ac6..e39b9bc 100644 --- a/crypto/cuda_cryptonight_core.cu +++ b/crypto/cryptonight-core.cu @@ -8,14 +8,15 @@ #include #include "cryptonight.h" +#define LONG_SHL_IDX 19 #ifdef WIN32 // to prevent ui freeze -int cn_bfactor = 8; -int cn_bsleep = 100; +static __thread int cn_bfactor = 8; +static __thread int cn_bsleep = 100; #else -int cn_bfactor = 0; -int cn_bsleep = 0; +static __thread int cn_bfactor = 0; +static __thread int cn_bsleep = 0; #endif #include "cn_aes.cuh" @@ -94,7 +95,7 @@ void cryptonight_core_gpu_phase2(const int threads, const int bfactor, const int for(int i = start; i < end; ++i) { //j = ((uint32_t *)a)[0] & 0x1FFFF0; - j = (__shfl((int)a, 0, 4) & 0x1FFFF0) >> 2; + j = (__shfl((int)a, 0, 4) & E2I_MASK1) >> 2; //cn_aes_single_round(sharedMemory, &long_state[j], c, a); x[0] = long_state[j + sub]; @@ -111,7 +112,7 @@ void cryptonight_core_gpu_phase2(const int threads, const int bfactor, const int long_state[j + sub] = c ^ b; //MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & 0x1FFFF0]); - j = (__shfl((int)c, 0, 4) & 0x1FFFF0) >> 2; + j = (__shfl((int)c, 0, 4) & E2I_MASK1) >> 2; #pragma unroll for(int k = 0; k < 2; k++) t1[k] = __shfl((int)c, k, 4); @@ -128,7 +129,7 @@ void cryptonight_core_gpu_phase2(const int threads, const int bfactor, const int long_state[j + sub] = res; //j = ((uint32_t *)a)[0] & 0x1FFFF0; - j = (__shfl((int)a, 0, 4) & 0x1FFFF0) >> 2; + j = (__shfl((int)a, 0, 4) & E2I_MASK1) >> 2; //cn_aes_single_round(sharedMemory, &long_state[j], b, a); x[0] = long_state[j + sub]; @@ -145,7 +146,7 @@ void cryptonight_core_gpu_phase2(const int threads, const int bfactor, const int long_state[j + sub] = c ^ b; //MUL_SUM_XOR_DST(b, a, &long_state[((uint32_t *)b)[0] & 0x1FFFF0]); - j = (__shfl((int)b, 0, 4) & 0x1FFFF0) >> 2; + j = (__shfl((int)b, 0, 4) & E2I_MASK1) >> 2; #pragma unroll for(int k = 0; k < 2; k++) @@ -192,15 +193,15 @@ void cryptonight_core_gpu_phase2(const int threads, const int bfactor, const int for(int i = start; i < end; i++) // end = 262144 { uint32_t c[4]; - uint32_t j = (a[0] >> 2) & 0x7FFFC; + uint32_t j = (a[0] >> 2) & E2I_MASK2; cn_aes_single_round(sharedMemory, &long_state[j], c, a); XOR_BLOCKS_DST(c, b, &long_state[j]); - MUL_SUM_XOR_DST(c, a, &long_state[(c[0] >> 2) & 0x7FFFC]); + MUL_SUM_XOR_DST(c, a, &long_state[(c[0] >> 2) & E2I_MASK2]); - j = (a[0] >> 2) & 0x7FFFC; + j = (a[0] >> 2) & E2I_MASK2; cn_aes_single_round(sharedMemory, &long_state[j], b, a); XOR_BLOCKS_DST(b, c, &long_state[j]); - MUL_SUM_XOR_DST(b, a, &long_state[(b[0] >> 2) & 0x7FFFC]); + MUL_SUM_XOR_DST(b, a, &long_state[(b[0] >> 2) & E2I_MASK2]); } if(bfactor > 0) diff --git a/crypto/cryptonight-cpu.cpp b/crypto/cryptonight-cpu.cpp index ec02851..66b3cf4 100644 --- a/crypto/cryptonight-cpu.cpp +++ b/crypto/cryptonight-cpu.cpp @@ -81,7 +81,7 @@ static void (* const extra_hashes[4])(const void*, size_t, void *) = { do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash }; -uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi) +static uint64_t mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi) { // multiplier = ab = a * 2^32 + b // multiplicand = cd = c * 2^32 + d diff --git a/crypto/cuda_cryptonight_extra.cu b/crypto/cryptonight-extra.cu similarity index 98% rename from crypto/cuda_cryptonight_extra.cu rename to crypto/cryptonight-extra.cu index c3e661e..21aee47 100644 --- a/crypto/cuda_cryptonight_extra.cu +++ b/crypto/cryptonight-extra.cu @@ -7,13 +7,10 @@ #include #include -//#include -//#include - #include "cryptonight.h" -typedef unsigned char BitSequence; -typedef unsigned long long DataLength; +typedef uint8_t BitSequence; +typedef uint64_t DataLength; static uint32_t *d_input[MAX_GPUS] = { 0 }; static uint32_t *d_target[MAX_GPUS]; diff --git a/crypto/cryptonight.cu b/crypto/cryptonight.cu index 564d68f..db3d4ed 100644 --- a/crypto/cryptonight.cu +++ b/crypto/cryptonight.cu @@ -1,12 +1,4 @@ -#include -#include -#include -#include -#include -#include - -#include #include "cryptonight.h" extern char *device_config[MAX_GPUS]; // -l 32x16 diff --git a/crypto/cryptonight.h b/crypto/cryptonight.h index 3dd4848..e534bd9 100644 --- a/crypto/cryptonight.h +++ b/crypto/cryptonight.h @@ -21,6 +21,9 @@ struct uint3 blockDim; #define MEMORY (1 << 21) // 2 MiB / 2097152 B #define ITER (1 << 20) // 1048576 +#define E2I_MASK1 0x1FFFF0 +#define E2I_MASK2 (0x1FFFF0 >> 2) + #define AES_BLOCK_SIZE 16 #define AES_KEY_SIZE 32 #define INIT_SIZE_BLK 8 diff --git a/crypto/xmr-rpc.cpp b/crypto/xmr-rpc.cpp index 00e8d56..0935000 100644 --- a/crypto/xmr-rpc.cpp +++ b/crypto/xmr-rpc.cpp @@ -38,6 +38,9 @@ double target_to_diff_rpc2(uint32_t* target) // simplified to get 1.0 for 1000 return (double) (UINT32_MAX / target[7]) / 1000; } + else if (opt_algo == ALGO_CRYPTOLIGHT && target[7]) { + return (double) (UINT32_MAX / target[7]) / 1000; + } else if (opt_algo == ALGO_WILDKECCAK) { return target_to_diff(target) * 1000; } @@ -530,6 +533,15 @@ bool rpc2_stratum_submit(struct pool_infos *pool, struct work *work) work_set_target_ratio(work, (uint32_t*) hash); } + else if (opt_algo == ALGO_CRYPTOLIGHT) { + uint32_t nonce; + memcpy(&nonce, &data[39], 4); + noncestr = bin2hex((unsigned char*) &nonce, 4); + last_found_nonce = nonce; + cryptolight_hash(hash, data, 76); + work_set_target_ratio(work, (uint32_t*) hash); + } + else if (opt_algo == ALGO_CRYPTONIGHT) { uint32_t nonce; memcpy(&nonce, &data[39], 4); @@ -1268,15 +1280,6 @@ int rpc2_stratum_thread_stuff(struct pool_infos* pool) } } - // if getjob supported - if(0 && opt_algo == ALGO_CRYPTONIGHT) { - if(!rpc2_stratum_request_job(&stratum)) { - stratum_disconnect(&stratum); - applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); - sleep(opt_fail_pause); - } - } - /* save every 12 hours */ if ((time(NULL) - prev_save) > 12*3600) { store_scratchpad_to_file(false); diff --git a/miner.h b/miner.h index 8348d98..5a5bb08 100644 --- a/miner.h +++ b/miner.h @@ -267,6 +267,7 @@ extern int scanhash_blake256(int thr_id, struct work* work, uint32_t max_nonce, extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); @@ -316,6 +317,7 @@ extern void free_blake256(int thr_id); extern void free_blake2s(int thr_id); extern void free_bmw(int thr_id); extern void free_c11(int thr_id); +extern void free_cryptolight(int thr_id); extern void free_cryptonight(int thr_id); extern void free_decred(int thr_id); extern void free_deep(int thr_id); @@ -805,6 +807,7 @@ void blake256hash(void *output, const void *input, int8_t rounds); void blake2s_hash(void *output, const void *input); void bmw_hash(void *state, const void *input); void c11hash(void *output, const void *input); +void cryptolight_hash(void* output, const void* input, int len); void cryptonight_hash(void* output, const void* input, size_t len); void decred_hash(void *state, const void *input); void deephash(void *state, const void *input); diff --git a/pools.cpp b/pools.cpp index 9da2bab..c8fb1ba 100644 --- a/pools.cpp +++ b/pools.cpp @@ -255,6 +255,7 @@ bool pool_switch(int thr_id, int pooln) stratum = p->stratum; stratum.pooln = cur_pooln; stratum.rpc2 = (p->algo == ALGO_WILDKECCAK || p->algo == ALGO_CRYPTONIGHT); + stratum.rpc2 |= p->algo == ALGO_CRYPTOLIGHT; // unlock the stratum thread tq_push(thr_info[stratum_thr_id].q, strdup(rpc_url)); @@ -278,6 +279,7 @@ bool pool_switch(int thr_id, int pooln) } stratum.rpc2 = (p->algo == ALGO_WILDKECCAK || p->algo == ALGO_CRYPTONIGHT); + stratum.rpc2 |= p->algo == ALGO_CRYPTOLIGHT; return true; } diff --git a/util.cpp b/util.cpp index 9b535e8..89528b7 100644 --- a/util.cpp +++ b/util.cpp @@ -2141,6 +2141,9 @@ void print_hash_tests(void) c11hash(&hash[0], &buf[0]); printpfx("c11", hash); + cryptolight_hash(&hash[0], &buf[0], 76); + printpfx("cryptolight", hash); + cryptonight_hash(&hash[0], &buf[0], 76); printpfx("cryptonight", hash);