From 1e71dc578248ecdb627b9ebd6c7bcb24d64d22a6 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Wed, 16 Aug 2017 13:42:21 +0200 Subject: [PATCH] streebog: apply skunk improvements to veltor --- configure.ac | 2 +- skunk/cuda_skunk_streebog.cu | 2 +- skunk/skunk.cu | 12 ++++++------ x11/cuda_streebog.cu | 5 +++-- x11/veltor.cu | 29 ++++++++++++++++++++++------- 5 files changed, 33 insertions(+), 17 deletions(-) diff --git a/configure.ac b/configure.ac index c369201..2bd9d2e 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2.2], [], [ccminer], [http://github.com/tpruvot/ccminer]) +AC_INIT([ccminer], [2.2.1], [], [ccminer], [http://github.com/tpruvot/ccminer]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/skunk/cuda_skunk_streebog.cu b/skunk/cuda_skunk_streebog.cu index c38de11..d44d77b 100644 --- a/skunk/cuda_skunk_streebog.cu +++ b/skunk/cuda_skunk_streebog.cu @@ -204,7 +204,7 @@ static void GOST_E12(const uint2 shared[8][256],uint2 *const __restrict__ K, uin __constant__ uint64_t target64[4]; __host__ -void skunk_set_target(uint32_t* ptarget) +void skunk_streebog_set_target(uint32_t* ptarget) { cudaMemcpyToSymbol(target64, ptarget, 4*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); } diff --git a/skunk/skunk.cu b/skunk/skunk.cu index f89c5fb..c1add50 100644 --- a/skunk/skunk.cu +++ b/skunk/skunk.cu @@ -23,12 +23,12 @@ extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t s extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x13_fugue512_cpu_free(int thr_id); -extern void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce); -extern void streebog_set_target(const uint32_t* ptarget); +extern void streebog_sm3_set_target(uint32_t* ptarget); +extern void streebog_sm3_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce); // krnlx merged kernel (for high-end cards only) extern void skunk_cpu_init(int thr_id, uint32_t threads); -extern void skunk_set_target(uint32_t* ptarget); +extern void skunk_streebog_set_target(uint32_t* ptarget); extern void skunk_setBlock_80(int thr_id, void *pdata); extern void skunk_cuda_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void skunk_cuda_streebog(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce); @@ -117,10 +117,10 @@ extern "C" int scanhash_skunk(int thr_id, struct work* work, uint32_t max_nonce, cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)); if (use_compat_kernels[thr_id]) { skein512_cpu_setBlock_80(endiandata); - streebog_set_target(ptarget); + streebog_sm3_set_target(ptarget); } else { skunk_setBlock_80(thr_id, endiandata); - skunk_set_target(ptarget); + skunk_streebog_set_target(ptarget); } do { @@ -129,7 +129,7 @@ extern "C" int scanhash_skunk(int thr_id, struct work* work, uint32_t max_nonce, skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - streebog_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); + streebog_sm3_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); } else { skunk_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); skunk_cuda_streebog(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); diff --git a/x11/cuda_streebog.cu b/x11/cuda_streebog.cu index 228c691..d6e3685 100644 --- a/x11/cuda_streebog.cu +++ b/x11/cuda_streebog.cu @@ -806,10 +806,11 @@ void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash) #define T6(x) shared[6][x] #define T7(x) shared[7][x] +// Streebog final for Veltor and skunk on SM 3.x __constant__ uint64_t target64[4]; __host__ -void streebog_set_target(const uint32_t* ptarget) +void streebog_sm3_set_target(uint32_t* ptarget) { cudaMemcpyToSymbol(target64,ptarget,4*sizeof(uint64_t),0,cudaMemcpyHostToDevice); } @@ -995,7 +996,7 @@ void streebog_gpu_hash_64_final(uint64_t *g_hash, uint32_t* resNonce) } __host__ -void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash,uint32_t* d_resNonce) +void streebog_sm3_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash,uint32_t* d_resNonce) { dim3 grid((threads + TPB-1) / TPB); dim3 block(TPB); diff --git a/x11/veltor.cu b/x11/veltor.cu index be05f5a..7bc1e18 100644 --- a/x11/veltor.cu +++ b/x11/veltor.cu @@ -10,11 +10,17 @@ extern "C" { #include "cuda_x11.h" extern void skein512_cpu_setBlock_80(void *pdata); -extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap); extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce); -extern void streebog_set_target(const uint32_t* ptarget); + +// for SM3.x +extern void streebog_sm3_set_target(uint32_t* ptarget); +extern void streebog_sm3_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce); + +// for latest cards only +extern void skunk_cpu_init(int thr_id, uint32_t threads); +extern void skunk_streebog_set_target(uint32_t* ptarget); +extern void skunk_cuda_streebog(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce); #include #include @@ -23,7 +29,7 @@ extern void streebog_set_target(const uint32_t* ptarget); static uint32_t *d_hash[MAX_GPUS]; static uint32_t *d_resNonce[MAX_GPUS]; -// veltorcoin CPU Hash +// veltor CPU Hash extern "C" void veltorhash(void *output, const void *input) { unsigned char _ALIGN(128) hash[128] = { 0 }; @@ -53,6 +59,7 @@ extern "C" void veltorhash(void *output, const void *input) } static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { @@ -80,7 +87,9 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce } gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); - quark_skein512_cpu_init(thr_id, throughput); + skunk_cpu_init(thr_id, throughput); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + x11_shavite512_cpu_init(thr_id, throughput); CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); @@ -97,14 +106,20 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce skein512_cpu_setBlock_80(endiandata); cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)); - streebog_set_target(ptarget); + if(use_compat_kernels[thr_id]) + streebog_sm3_set_target(ptarget); + else + skunk_streebog_set_target(ptarget); do { int order = 0; skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++; x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - streebog_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); + if(use_compat_kernels[thr_id]) + streebog_sm3_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); + else + skunk_cuda_streebog(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); cudaMemcpy(h_resNonce, d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost);