diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 72e42a8..ed7c026 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -274,6 +274,12 @@
       255
+
+
+
+
+
+
       128
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index 17f81c6..9dc7dc2 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -97,6 +97,9 @@
       {af387eac-e9e6-4e91-a5e8-637b1e7a8d93}
+
+      {0f9aec5e-5409-488f-992a-2c108590d1ac}
+
@@ -515,15 +518,33 @@
       Source Files\crypto
-
-      Source Files\crypto\xmr
-
       Source Files\crypto\bbr
       Source Files\crypto\bbr
+
+      Source Files\CUDA\xmr
+
+
+      Source Files\CUDA\xmr
+
+
+      Source Files\CUDA\xmr
+
+
+      Source Files\CUDA\xmr
+
+
+      Source Files\CUDA\xmr
+
+
+      Source Files\CUDA\xmr
+
+
+      Source Files\CUDA\xmr
+
@@ -802,17 +823,17 @@
       Source Files\sia
-
-      Source Files\crypto
-
-      Source Files\crypto
+      Source Files\CUDA\xmr
-      Source Files\crypto
+      Source Files\CUDA\xmr
-      Source Files\crypto
+      Source Files\CUDA\xmr
+
+
+      Source Files\CUDA
diff --git a/crypto/cn_jh.cuh b/crypto/cn_jh.cuh
index 4bcb64b..c2df763 100644
--- a/crypto/cn_jh.cuh
+++ b/crypto/cn_jh.cuh
@@ -181,7 +181,7 @@ void cn_jh_F8(jhHashState *state)
 }
 
 __device__
-void cn_jh_update(jhHashState * __restrict__ state, const BitSequence * __restrict__ data, DataLength databitlen)
+void cn_jh_update(jhHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen)
 {
 	DataLength index;
 
@@ -222,7 +222,7 @@ void cn_jh_update(jhHashState * __restrict__ state, const BitSequence * __restrict__ data, DataLength databitlen)
 
 /* pad the message, process the padded block(s), truncate the hash value H to obtain the message digest */
 __device__
-void cn_jh_final(jhHashState * __restrict__ state, BitSequence * __restrict__ hashval)
+void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashval)
 {
 	unsigned int i;
 	//uint32_t *bufptr = (uint32_t *)state->buffer;
@@ -268,7 +268,7 @@ void cn_jh_final(jhHashState * __restrict__ state, BitSequence * __restrict__ hashval)
 		cn_jh_F8(state);
 	}
 
-	memcpy(hashval, (unsigned char*)state->x+64+32, 32);
+	MEMCPY4(hashval, ((unsigned char*)state->x) + 64 + 32, 8);
 }
 
 __device__
@@ -277,13 +277,14 @@
 void cn_jh_init(jhHashState *state, int hashbitlen)
 {
 	state->databitlen = 0;
 	state->datasize_in_buffer = 0;
 	state->hashbitlen = hashbitlen;
-	memcpy(state->x, d_JH256_H0, 128);
+	//memcpy(state->x, d_JH256_H0, 128);
+	MEMCPY8(state->x, d_JH256_H0, 128 / 8);
 }
 
 __device__
-void cn_jh(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval)
+void cn_jh256(const uint8_t * __restrict__ data, DataLength len, uint32_t * __restrict__ hashval)
 {
-	int hashbitlen = 256;
+	const int hashbitlen = 256;
 	DataLength databitlen = len << 3;
 	jhHashState state;
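The cn_jh.cuh hunks above swap byte-wise memcpy for ccminer's word-copy macros. As a rough model of what those macros presumably expand to (their real definitions live elsewhere in the repo's CUDA headers; the versions below are assumptions for illustration only), MEMCPY4 copies n 32-bit words and MEMCPY8 copies n 64-bit words, so the element counts in the patch match the old byte totals:

    // Illustrative stand-ins for ccminer's MEMCPY4/MEMCPY8 (assumed semantics:
    // copy n words of 4 resp. 8 bytes through typed pointers, letting the
    // compiler emit 32/64-bit loads instead of a byte loop).
    #include <cstdint>
    #include <cstddef>

    #define MEMCPY4(dst, src, n) do { \
        uint32_t *d_ = (uint32_t*)(dst); const uint32_t *s_ = (const uint32_t*)(src); \
        for (size_t i_ = 0; i_ < (size_t)(n); i_++) d_[i_] = s_[i_]; \
    } while (0)

    #define MEMCPY8(dst, src, n) do { \
        uint64_t *d_ = (uint64_t*)(dst); const uint64_t *s_ = (const uint64_t*)(src); \
        for (size_t i_ = 0; i_ < (size_t)(n); i_++) d_[i_] = s_[i_]; \
    } while (0)

    // MEMCPY4(hashval, x + 96, 8)          -> 8 * 4  = 32 bytes, the JH-256 digest
    // MEMCPY8(state->x, d_JH256_H0, 128/8) -> 16 * 8 = 128 bytes, the JH-256 IV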
diff --git a/crypto/cn_skein.cuh b/crypto/cn_skein.cuh
index 5ff1cb3..2096467 100644
--- a/crypto/cn_skein.cuh
+++ b/crypto/cn_skein.cuh
@@ -109,24 +109,17 @@ typedef struct {
 	uint8_t b[SKEIN_512_BLOCK_BYTES];
 } Skein_512_Ctxt_t;
 
-typedef struct {
-	Skein_Ctxt_Hdr_t h;
-	uint64_t X[SKEIN1024_STATE_WORDS];
-	uint8_t b[SKEIN1024_BLOCK_BYTES];
-} Skein1024_Ctxt_t;
-
 typedef struct {
 	uint_t statebits;
 	union {
 		Skein_Ctxt_Hdr_t h;
 		Skein_256_Ctxt_t ctx_256;
 		Skein_512_Ctxt_t ctx_512;
-		Skein1024_Ctxt_t ctx1024;
 	} u;
 } skeinHashState;
 
 __device__
-void cn_skein_init(skeinHashState *state, size_t hashBitLen)
+void cn_skein256_init(skeinHashState *state, size_t hashBitLen)
 {
 	const uint64_t SKEIN_512_IV_256[] =
 	{
@@ -150,7 +143,7 @@ void cn_skein_init(skeinHashState *state, size_t hashBitLen)
 }
 
 __device__
-void cn_skein512_processblock(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ blkPtr, size_t blkCnt, size_t byteCntAdd)
+void cn_skein_processblock(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ blkPtr, size_t blkCnt, size_t byteCntAdd)
 {
 	enum {
 		R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37,
@@ -226,51 +219,7 @@ void cn_skein512_processblock(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ blkPtr, size_t blkCnt, size_t byteCntAdd)
 }
 
 __device__
-void cn_skein_final(skeinHashState * __restrict__ state, uint8_t * __restrict__ hashVal)
-{
-	size_t i,n,byteCnt;
-	uint64_t X[SKEIN_512_STATE_WORDS];
-	Skein_512_Ctxt_t *ctx = (Skein_512_Ctxt_t *)&state->u.ctx_512;
-	//size_t tmp;
-	//uint8_t *p8;
-	//uint64_t *p64;
-
-	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
-
-	if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) {
-
-		memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
-		//p8 = &ctx->b[ctx->h.bCnt];
-		//tmp = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;
-		//for( i = 0; i < tmp; i++ ) *(p8+i) = 0;
-	}
-
-	cn_skein512_processblock(ctx,ctx->b,1,ctx->h.bCnt);
-
-	byteCnt = (ctx->h.hashBitLen + 7) >> 3;
-
-	//uint8_t b[SKEIN_512_BLOCK_BYTES] == 64
-	memset(ctx->b,0,sizeof(ctx->b));
-	//p64 = (uint64_t *)ctx->b;
-	//for( i = 0; i < 8; i++ ) *(p64+i) = 0;
-
-	memcpy(X,ctx->X,sizeof(X));
-
-	for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++) {
-
-		((uint64_t *)ctx->b)[0]= (uint64_t)i;
-		Skein_Start_New_Type(ctx,OUT_FINAL);
-		cn_skein512_processblock(ctx,ctx->b,1,sizeof(uint64_t));
-		n = byteCnt - i*SKEIN_512_BLOCK_BYTES;
-		if (n >= SKEIN_512_BLOCK_BYTES)
-			n = SKEIN_512_BLOCK_BYTES;
-		memcpy(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n);
-		memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */
-	}
-}
-
-__device__
-void cn_skein512_update(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ msg, size_t msgByteCnt)
+void cn_skein_block(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ msg, size_t msgByteCnt)
 {
 	size_t n;
 
@@ -288,14 +237,14 @@ void cn_skein512_update(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ msg, size_t msgByteCnt)
 			ctx->h.bCnt += n;
 		}
 
-		cn_skein512_processblock(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES);
+		cn_skein_processblock(ctx, ctx->b, 1, SKEIN_512_BLOCK_BYTES);
 		ctx->h.bCnt = 0;
 	}
 
 	if (msgByteCnt > SKEIN_512_BLOCK_BYTES) {
-		n = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES;
-		cn_skein512_processblock(ctx,msg,n,SKEIN_512_BLOCK_BYTES);
+		n = (msgByteCnt - 1) / SKEIN_512_BLOCK_BYTES;
+		cn_skein_processblock(ctx, msg, n, SKEIN_512_BLOCK_BYTES);
 		msgByteCnt -= n * SKEIN_512_BLOCK_BYTES;
 		msg += n * SKEIN_512_BLOCK_BYTES;
 	}
@@ -309,11 +258,11 @@ void cn_skein512_update(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restrict__ msg, size_t msgByteCnt)
 }
 
 __device__
-void cn_skein_update(skeinHashState * __restrict__ state, const BitSequence * __restrict__ data, DataLength databitlen)
+void cn_skein256_update(skeinHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen)
 {
 	if ((databitlen & 7) == 0) {
-		cn_skein512_update(&state->u.ctx_512,data,databitlen >> 3);
+		cn_skein_block(&state->u.ctx_512, data, databitlen >> 3);
 	}
 	else {
@@ -323,15 +272,46 @@ void cn_skein_update(skeinHashState * __restrict__ state, const BitSequence * __restrict__ data, DataLength databitlen)
 		mask = (uint8_t) (1u << (7 - (databitlen & 7)));
 		b = (uint8_t) ((data[bCnt-1] & (0-mask)) | mask);
 
-		cn_skein512_update(&state->u.ctx_512, data, bCnt-1);
-		cn_skein512_update(&state->u.ctx_512, &b, 1);
+		cn_skein_block(&state->u.ctx_512, data, bCnt - 1);
+		cn_skein_block(&state->u.ctx_512, &b, 1);
 
 		Skein_Set_Bit_Pad_Flag(state->u.h);
 	}
 }
 
 __device__
-void cn_skein(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval)
+void cn_skein256_final(skeinHashState * __restrict__ state, uint32_t * __restrict__ hashVal)
+{
+	uint64_t X[SKEIN_512_STATE_WORDS];
+	Skein_512_Ctxt_t *ctx = (Skein_512_Ctxt_t *)&state->u.ctx_512;
+	const int byteCnt = (ctx->h.hashBitLen + 7) >> 3;
+
+	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
+
+	if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES)
+	{
+		memset(&ctx->b[ctx->h.bCnt], 0, SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
+	}
+
+	cn_skein_processblock(ctx, ctx->b, 1, ctx->h.bCnt);
+
+	memset(ctx->b, 0, sizeof(ctx->b));
+	memcpy(X, ctx->X, sizeof(X));
+
+	for (int i = 0; i*SKEIN_512_BLOCK_BYTES < byteCnt; i++)
+	{
+		int n = byteCnt - i*SKEIN_512_BLOCK_BYTES;
+		if (n > SKEIN_512_BLOCK_BYTES) n = SKEIN_512_BLOCK_BYTES;
+		((uint64_t *)ctx->b)[0] = (uint64_t)i;
+		Skein_Start_New_Type(ctx, OUT_FINAL);
+		cn_skein_processblock(ctx, ctx->b, 1, sizeof(uint64_t));
+		memcpy(hashVal + (i*SKEIN_512_BLOCK_BYTES/sizeof(uint32_t)), ctx->X, n);
+		memcpy(ctx->X, X, sizeof(X)); // restore the counter mode key for next time
+	}
+}
+
+__device__
+void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * __restrict__ hashval)
 {
 	int hashbitlen = 256;
 	DataLength databitlen = len << 3;
@@ -339,7 +319,7 @@ void cn_skein(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval)
 
 	state.statebits = 64*SKEIN_512_STATE_WORDS;
 
-	cn_skein_init(&state, hashbitlen);
-	cn_skein_update(&state, data, databitlen);
-	cn_skein_final(&state, hashval);
+	cn_skein256_init(&state, hashbitlen);
+	cn_skein256_update(&state, data, databitlen);
+	cn_skein256_final(&state, hashval);
 }
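cn_skein256_update keeps Skein's bit-length handling: byte-aligned input goes straight to cn_skein_block, while a trailing partial byte is padded with a single 1 bit just below the used bits. A standalone sketch of that mask arithmetic (host-side, for illustration; in cryptonight itself len is always 200 bytes, so databitlen is byte-aligned and this branch never runs):

    #include <cstdint>
    #include <cstdio>

    // Skein bit-padding as in cn_skein256_update: keep the top (databitlen & 7)
    // bits of the final byte, force the next bit to 1, clear the rest.
    static uint8_t skein_pad_last_byte(uint8_t last, uint64_t databitlen)
    {
        uint8_t mask = (uint8_t)(1u << (7 - (databitlen & 7)));
        return (uint8_t)((last & (0 - mask)) | mask);
    }

    int main()
    {
        // 13 bits total -> 5 bits used in the last byte; the pad bit lands at 0x04.
        printf("%02x\n", skein_pad_last_byte(0xFF, 13)); // prints "fc"
    }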
diff --git a/crypto/cryptonight-cpu.cpp b/crypto/cryptonight-cpu.cpp
index 31c19aa..ec02851 100644
--- a/crypto/cryptonight-cpu.cpp
+++ b/crypto/cryptonight-cpu.cpp
@@ -214,6 +214,7 @@ static void cryptonight_hash_ctx(void* output, const void* input, size_t len, struct cryptonight_ctx* ctx)
 
 	int extra_algo = ctx->state.hs.b[0] & 3;
 	extra_hashes[extra_algo](&ctx->state, 200, output);
+	if (opt_debug) applog(LOG_DEBUG, "extra algo=%d", extra_algo);
 
 	oaes_free((OAES_CTX **) &ctx->aes_ctx);
 }
diff --git a/crypto/cryptonight.cu b/crypto/cryptonight.cu
index 907899e..cd86dbf 100644
--- a/crypto/cryptonight.cu
+++ b/crypto/cryptonight.cu
@@ -86,7 +86,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 		init[thr_id] = true;
 	}
 
-	throughput = cuda_default_throughput(thr_id, cn_blocks*cn_blocks);
+	throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
 
 	do {
@@ -144,7 +144,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 
 done:
 	gpulog(LOG_DEBUG, thr_id, "nonce %08x exit", nonce);
-
+	work->valid_nonces = res;
 	*nonceptr = nonce;
 	return res;
 }
diff --git a/crypto/cryptonight.h b/crypto/cryptonight.h
index 16114c5..e96b9fb 100644
--- a/crypto/cryptonight.h
+++ b/crypto/cryptonight.h
@@ -11,6 +11,7 @@ struct uint3 {
 struct uint3 threadIdx;
 struct uint3 blockIdx;
 struct uint3 blockDim;
+#define atomicExch(p,y) (*p) = y
 #define __funnelshift_r(a,b,c) 1
 #define __syncthreads()
 #define asm(x)
@@ -143,10 +144,6 @@ static inline void exit_if_cudaerror(int thr_id, const char *src, int line)
 		exit(1);
 	}
 }
-
-void hash_permutation(union hash_state *state);
-void hash_process(union hash_state *state, const uint8_t *buf, size_t count);
-
 void cryptonight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
 
 void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn);
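The debug line added to cryptonight-cpu.cpp surfaces the selector the rest of this patch revolves around: cryptonight finishes by hashing the 200-byte Keccak state with one of four 256-bit functions, chosen by the low two bits of the state's first byte. A host-side sketch of that dispatch (the stand-in functions below are hypothetical placeholders; the real extra_hashes table points at the blake256/groestl256/jh256/skein256 implementations in this repo, in the order implied by the GPU switch cases 0..3):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    typedef void (*extra_hash_fn)(const void *data, size_t len, char *out);

    // Placeholders: each real function writes a 32-byte digest of `data`.
    static void do_blake  (const void*, size_t, char *out) { strcpy(out, "blake256");   }
    static void do_groestl(const void*, size_t, char *out) { strcpy(out, "groestl256"); }
    static void do_jh     (const void*, size_t, char *out) { strcpy(out, "jh256");      }
    static void do_skein  (const void*, size_t, char *out) { strcpy(out, "skein256");   }

    static const extra_hash_fn extra_hashes[4] = { do_blake, do_groestl, do_jh, do_skein };

    int main()
    {
        uint8_t keccak_state[200] = { 0x42 };   // 0x42 & 3 == 2 -> jh256
        char name[16];
        extra_hashes[keccak_state[0] & 3](keccak_state, 200, name);
        printf("extra algo=%d (%s)\n", keccak_state[0] & 3, name);  // extra algo=2 (jh256)
    }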
diff --git a/crypto/cuda_cryptonight_extra.cu b/crypto/cuda_cryptonight_extra.cu
index 0e37af7..c3e661e 100644
--- a/crypto/cuda_cryptonight_extra.cu
+++ b/crypto/cuda_cryptonight_extra.cu
@@ -110,7 +110,7 @@ void cryptonight_extra_gpu_prepare(const uint32_t threads, uint32_t * __restrict
 }
 
 __global__
-void cryptonight_extra_gpu_keccakf2(uint32_t threads, uint32_t * __restrict__ d_ctx_state)
+void cryptonight_extra_gpu_keccak(uint32_t threads, uint32_t * __restrict__ d_ctx_state)
 {
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if(thread < threads)
@@ -123,43 +123,59 @@ void cryptonight_extra_gpu_keccakf2(uint32_t threads, uint32_t * __restrict__ d_ctx_state)
 
 		cn_keccakf2(state);
 
-		#pragma unroll
-		for(int i = 0; i < 25; i++)
-			ctx_state[i] = state[i];
+		// to reduce the final kernel stack frame, cut algos in 2 kernels
+		// ps: these 2 final kernels are not important for the overall xmr hashrate (< 1%)
+		switch (((uint8_t*)state)[0] & 0x03)
+		{
+		case 0: {
+			uint32_t hash[8];
+			cn_blake((uint8_t*)state, 200, (uint8_t*)hash);
+			((uint32_t*)ctx_state)[0] = 0;
+			((uint32_t*)ctx_state)[6] = hash[6];
+			((uint32_t*)ctx_state)[7] = hash[7];
+			break;
+		}
+		case 1: {
+			uint32_t hash[8];
+			cn_groestl((BitSequence*)state, 200, (BitSequence*)hash);
+			((uint32_t*)ctx_state)[0] = 0;
+			((uint32_t*)ctx_state)[6] = hash[6];
+			((uint32_t*)ctx_state)[7] = hash[7];
+			break;
+		}
+		default: {
+			#pragma unroll
+			for(int i = 0; i < 25; i++)
+				ctx_state[i] = state[i];
+		}
+		}
 	}
 }
 
 __global__
-void cryptonight_extra_gpu_nonces(uint32_t threads, const uint32_t startNonce, const uint32_t * __restrict__ d_ctx_state,
+void cryptonight_extra_gpu_final(uint32_t threads, const uint32_t startNonce, uint64_t * __restrict__ d_ctx_state,
 	const uint32_t* d_target, uint32_t * resNonces)
 {
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if(thread < threads)
 	{
-		uint64_t* ctx_state = (uint64_t*) (&d_ctx_state[thread * 50]);
-		uint64_t state[25];
-		#pragma unroll
-		for(int i = 0; i < 25; i++)
-			state[i] = ctx_state[i];
+		uint64_t* const state = &d_ctx_state[thread * 25];
 
 		uint32_t hash[8];
 		switch(((uint8_t *)state)[0] & 0x03)
 		{
 		case 0: {
-			cn_blake((uint8_t*)state, 200, (uint8_t*)hash);
-			break;
-		}
-		case 1: {
-			cn_groestl((BitSequence*)state, 200, (BitSequence*)hash);
+			uint32_t* h32 = (uint32_t*)state;
+			hash[6] = h32[6];
+			hash[7] = h32[7];
 			break;
 		}
 		case 2: {
-			// to double check..
-			cn_jh((BitSequence*)state, 200, (BitSequence*)hash);
+			cn_jh256((uint8_t*)state, 200, hash);
 			break;
 		}
 		case 3: {
-			cn_skein((BitSequence*)state, 200, (BitSequence*)hash);
+			cn_skein((uint8_t*)state, 200, hash);
 			break;
 		}
 		}
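The comment in the hunk above explains the split: blake and groestl carry the largest stack frames, so they now run in the keccak kernel, which writes only the two 32-bit words the nonce test reads (hash[6], hash[7]) back into the per-thread state and zeroes word 0 so the selector byte reads as algo 0. The final kernel's case 0 then just reloads those words, leaving only jh/skein to hash there. A host-side model of that handoff (the names and the "top two words are compared against the target" reading are my inference from this hunk, not shown code):

    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    struct thread_state { uint32_t w[50]; };  // 200-byte keccak state of one thread

    // Pass 1 (cryptonight_extra_gpu_keccak): run the stack-heavy finalizers
    // and leave a tag so pass 2 can skip them.
    static void pass1(thread_state &s, const uint32_t precomputed[8])
    {
        int algo = ((uint8_t*)s.w)[0] & 3;
        if (algo == 0 || algo == 1) {      // blake256 / groestl256 done in pass 1
            s.w[0] = 0;                    // selector byte now reads as algo 0
            s.w[6] = precomputed[6];       // keep only the words the nonce
            s.w[7] = precomputed[7];       // test will read
        }
    }

    // Pass 2 (cryptonight_extra_gpu_final): cheap reload, or the lighter hashes.
    static void pass2(const thread_state &s, uint32_t hash[8])
    {
        switch (((const uint8_t*)s.w)[0] & 3) {
        case 0: hash[6] = s.w[6]; hash[7] = s.w[7]; break; // done in pass 1
        case 2: /* cn_jh256((uint8_t*)s.w, 200, hash); */  break;
        case 3: /* cn_skein((uint8_t*)s.w, 200, hash); */  break;
        }
    }

    int main()
    {
        thread_state s; memset(&s, 0, sizeof(s));
        ((uint8_t*)s.w)[0] = 1;                    // pretend keccak selected groestl
        uint32_t fake[8] = {0,0,0,0,0,0, 0xAABBCCDDu, 0x00112233u};
        uint32_t hash[8] = {0};
        pass1(s, fake);
        pass2(s, hash);
        printf("%08x %08x\n", hash[6], hash[7]);   // aabbccdd 00112233
    }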
@@ -195,7 +211,7 @@ void cryptonight_extra_cpu_init(int thr_id, uint32_t threads)
 __host__
 void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
 {
-	int threadsperblock = 128;
+	uint32_t threadsperblock = 128;
 
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
@@ -207,16 +223,16 @@ void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
 __host__
 void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resnonce, uint32_t *d_ctx_state)
 {
-	int threadsperblock = 128;
+	uint32_t threadsperblock = 128;
 
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 
 	cudaMemset(d_result[thr_id], 0xFF, 2*sizeof(uint32_t));
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
-	cryptonight_extra_gpu_keccakf2 <<<grid, block>>> (threads, d_ctx_state);
+	cryptonight_extra_gpu_keccak <<<grid, block>>> (threads, d_ctx_state);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
-	cryptonight_extra_gpu_nonces <<<grid, block>>> (threads, startNonce, d_ctx_state, d_target[thr_id], d_result[thr_id]);
+	cryptonight_extra_gpu_final <<<grid, block>>> (threads, startNonce, (uint64_t*)d_ctx_state, d_target[thr_id], d_result[thr_id]);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
 	cudaMemcpy(resnonce, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
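Both host wrappers size their grids with the usual ceiling division, so every nonce gets a thread even when `threads` is not a multiple of the 128-thread block; the `if (thread < threads)` guard at the top of each kernel masks off the overshoot. For reference:

    #include <cstdint>
    #include <cstdio>

    // ceil(threads / threadsperblock), as in dim3 grid((threads + tpb - 1) / tpb)
    static uint32_t blocks_for(uint32_t threads, uint32_t threadsperblock)
    {
        return (threads + threadsperblock - 1) / threadsperblock;
    }

    int main()
    {
        uint32_t tpb = 128;
        printf("%u\n", blocks_for(1000, tpb)); // 8 blocks = 1024 threads, 24 idle past the guard
        printf("%u\n", blocks_for(1024, tpb)); // exactly 8 blocks, no waste
    }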