From 4944e1a0984ccefe58174012e92a28213e84f002 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Thu, 18 Feb 2016 09:19:25 +0100 Subject: [PATCH] mrM4D vnl, with some changes --- Algo256/blake256.cu | 4 + Algo256/vanilla.cu | 421 ++++++++++++++++++++++++++++++++++++++++ Makefile.am | 4 +- bench.cpp | 2 +- ccminer.cpp | 19 +- ccminer.vcxproj | 1 + ccminer.vcxproj.filters | 7 +- miner.h | 4 +- util.cpp | 4 +- 9 files changed, 448 insertions(+), 18 deletions(-) create mode 100644 Algo256/vanilla.cu diff --git a/Algo256/blake256.cu b/Algo256/blake256.cu index 174a30c..8cd9035 100644 --- a/Algo256/blake256.cu +++ b/Algo256/blake256.cu @@ -33,6 +33,10 @@ extern "C" void blake256hash(void *output, const void *input, int8_t rounds = 14 #include "cuda_helper.h" +#ifdef __INTELLISENSE__ +#define __byte_perm(x, y, b) x +#endif + __constant__ uint32_t _ALIGN(32) d_data[12]; /* 8 adapters max */ diff --git a/Algo256/vanilla.cu b/Algo256/vanilla.cu new file mode 100644 index 0000000..6e84be2 --- /dev/null +++ b/Algo256/vanilla.cu @@ -0,0 +1,421 @@ +/** + * Optimized Blake-256 8-rounds Cuda Kernel (Tested on SM >3.0) + * Based upon Blake-256 implementation of Tanguy Pruvot - Nov. 2014 + * + * midstate computation inherited from + * https://github.com/wfr/clblake + * + * Provos Alexis - Jan. 2016 + * Reviewed by tpruvot - Feb 2016 + */ + +#include +#include +#include + +#include "miner.h" + +extern "C" { +#include "sph/sph_blake.h" +} + +#include "cuda_helper.h" + +#ifdef __INTELLISENSE__ +#define __byte_perm(x, y, b) x +#endif + +/* threads per block and "magic" */ +#define TPB 768 +#define NPT 224 +#define NBN 2 + +__constant__ uint32_t d_data[16]; + +/* 8 adapters max */ +static uint32_t *d_resNonce[MAX_GPUS]; +static uint32_t *h_resNonce[MAX_GPUS]; + +/* hash by cpu with blake 256 */ +extern "C" void vanillahash(void *output, const void *input, int8_t blakerounds) +{ + uchar hash[64]; + sph_blake256_context ctx; + + sph_blake256_set_rounds(blakerounds); + + sph_blake256_init(&ctx); + sph_blake256(&ctx, input, 80); + sph_blake256_close(&ctx, hash); + + memcpy(output, hash, 32); +} + +__global__ __launch_bounds__(TPB,1) +void vanilla_gpu_hash_16_8(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce,const uint32_t highTarget) +{ + uint32_t v[16]; + uint32_t tmp[13]; + + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + const uint32_t step = gridDim.x * blockDim.x; + const uint32_t maxNonce = startNonce + threads; + + const uint32_t c_u256[16] = { + 0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344, 0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89, + 0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C, 0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917 + }; + + const uint32_t h0 = d_data[0]; const uint32_t h1 = d_data[1]; + const uint32_t h2 = d_data[2]; const uint32_t h3 = d_data[3]; + const uint32_t h4 = d_data[4]; //const uint32_t h5 = d_data[5]; no need + const uint32_t h6 = d_data[5]; const uint32_t h7 = d_data[6]; + const uint32_t m0 = d_data[7]; const uint32_t m1 = d_data[8]; + const uint32_t m2 = d_data[9]; //le' nonce + const uint32_t m4 = 0x80000000UL; const uint32_t m5 = 0; + const uint32_t m6 = 0; const uint32_t m7 = 0; + const uint32_t m8 = 0; const uint32_t m9 = 0; + const uint32_t m10 = 0; const uint32_t m11 = 0; + const uint32_t m12 = 0; const uint32_t m13 = 1; + const uint32_t m14 = 0; const uint32_t m15 = 640; + + //---MORE PRECOMPUTATIONS + tmp[ 0] = d_data[10]; tmp[ 1] = d_data[11]; + tmp[ 2] = d_data[12]; tmp[ 3] = c_u256[1] + tmp[2]; + tmp[ 4] = d_data[13]; tmp[ 5] = d_data[14]; + tmp[ 6] = c_u256[2] + tmp[5]; tmp[ 7] = d_data[15]; + + tmp[ 5] = __byte_perm(tmp[5] ^ h2,0, 0x0321); tmp[ 6] += tmp[5]; + tmp[ 7] = ROTR32(tmp[7] ^ tmp[6],7); tmp[ 8] = __byte_perm(c_u256[7] ^ h3,0, 0x1032); + tmp[ 9] = c_u256[3] + tmp[8]; tmp[10] = ROTR32(h7 ^ tmp[9], 12); + tmp[11] = h3 + c_u256[6] + tmp[10]; + + tmp[ 8] = __byte_perm(tmp[8] ^ tmp[11],0, 0x0321); tmp[ 9] += tmp[8]; + tmp[10] = ROTR32(tmp[10] ^ tmp[9],7); + //---END OF MORE PRECOMPUTATIONS + + for(uint64_t m3 = startNonce + thread ; m3data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const uint32_t targetHigh = ptarget[6]; + int dev_id = device_map[thr_id]; + int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 30 : 24; + if (device_sm[dev_id] < 350) intensity = 22; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + int rc = 0; + + if (!init[thr_id]) { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage (linux) + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); + CUDA_LOG_ERROR(); + } + CUDA_CALL_OR_RET_X(cudaHostAlloc((void**)&h_resNonce[thr_id], NBN*sizeof(uint32_t), cudaHostAllocMapped),0); + CUDA_CALL_OR_RET_X(cudaHostGetDevicePointer((void**)&d_resNonce[thr_id],(void*)h_resNonce[thr_id], 0),0); + init[thr_id] = true; + } + + uint32_t endiandata[20]; + + for (int k = 0; k < 16; k++) + be32enc(&endiandata[k], pdata[k]); + + vanilla_cpu_setBlock_16(endiandata,&pdata[16]); + + cudaMemset(d_resNonce[thr_id], 0xff, sizeof(uint32_t)); + const dim3 grid((throughput + (NPT*TPB)-1)/(NPT*TPB)); + const dim3 block(TPB); + do { + vanilla_gpu_hash_16_8<<>>(throughput, pdata[19], d_resNonce[thr_id], targetHigh); + cudaThreadSynchronize(); + + if (h_resNonce[thr_id][0] != UINT32_MAX){ + uint32_t vhashcpu[8]; + uint32_t Htarg = (uint32_t)targetHigh; + + for (int k=0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + be32enc(&endiandata[19], h_resNonce[thr_id][0]); + vanillahash(vhashcpu, endiandata, blakerounds); + + if (vhashcpu[6] <= Htarg && fulltest(vhashcpu, ptarget)){ + rc = 1; + work_set_target_ratio(work, vhashcpu); + *hashes_done = pdata[19] - first_nonce + throughput; + pdata[19] = h_resNonce[thr_id][0]; +#if NBN > 1 + if (h_resNonce[thr_id][1] != UINT32_MAX) { + pdata[21] = h_resNonce[thr_id][1]; + applog(LOG_BLUE, "1:%x 2:%x", h_resNonce[thr_id][0], h_resNonce[thr_id][1]); + rc = 2; + } +#endif + return rc; + } + else { + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", h_resNonce[thr_id][0]); + } + } + + pdata[19] += throughput; + } while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput))); + + *hashes_done = pdata[19] - first_nonce; + MyStreamSynchronize(NULL, 0, dev_id); + return rc; +} + +// cleanup +extern "C" void free_vanilla(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFreeHost(h_resNonce[thr_id]); + cudaFree(d_resNonce[thr_id]); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/Makefile.am b/Makefile.am index b2a0290..768ddd4 100644 --- a/Makefile.am +++ b/Makefile.am @@ -36,7 +36,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ lyra2/lyra2REv2.cu lyra2/cuda_lyra2v2.cu \ Algo256/cuda_bmw256.cu Algo256/cuda_cubehash256.cu \ Algo256/cuda_blake256.cu Algo256/cuda_groestl256.cu Algo256/cuda_keccak256.cu Algo256/cuda_skein256.cu \ - Algo256/blake256.cu Algo256/decred.cu Algo256/keccak256.cu \ + Algo256/blake256.cu Algo256/decred.cu Algo256/vanilla.cu Algo256/keccak256.cu \ Algo256/bmw.cu Algo256/cuda_bmw.cu \ JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \ JHA/cuda_jha_compactionTest.cu cuda_checkhash.cu \ @@ -55,7 +55,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ x11/cuda_x11_luffa512_Cubehash.cu \ x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \ x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \ - x15/whirlpool.cu x15/whirlpoolx.cu x15/cuda_whirlpoolx.cu \ + x15/whirlpool.cu x15/cuda_whirlpoolx.cu \ x17/x17.cu x17/cuda_x17_haval512.cu x17/cuda_x17_sha512.cu \ x11/c11.cu x11/s3.cu x11/sib.cu x11/cuda_streebog.cu diff --git a/bench.cpp b/bench.cpp index a9998ea..c203f27 100644 --- a/bench.cpp +++ b/bench.cpp @@ -68,8 +68,8 @@ void algo_free_all(int thr_id) free_skein2(thr_id); free_sib(thr_id); free_s3(thr_id); + free_vanilla(thr_id); free_whirl(thr_id); - free_whirlx(thr_id); free_x11(thr_id); free_x13(thr_id); free_x14(thr_id); diff --git a/ccminer.cpp b/ccminer.cpp index 5e42f6b..5a8fa0a 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -240,10 +240,9 @@ Options:\n\ x14 X14\n\ x15 X15\n\ x17 X17\n\ - vanilla Blake256 (VNL)\n\ + vanilla Blake256-8 (VNL)\n\ whirlcoin Old Whirlcoin (Whirlpool algo)\n\ whirlpool Whirlpool algo\n\ - whirlpoolx WhirlpoolX (VNL)\n\ zr5 ZR5 (ZiftrCoin)\n\ -d, --devices Comma separated list of CUDA devices to use.\n\ Device IDs start counting from 0! Alternatively takes\n\ @@ -1824,7 +1823,7 @@ static void *miner_thread(void *userdata) case ALGO_BLAKE: case ALGO_BMW: case ALGO_DECRED: - case ALGO_WHIRLPOOLX: + //case ALGO_WHIRLPOOLX: minmax = 0x40000000U; break; case ALGO_KECCAK: @@ -1900,7 +1899,6 @@ static void *miner_thread(void *userdata) switch (opt_algo) { case ALGO_BLAKECOIN: - case ALGO_VANILLA: rc = scanhash_blake256(thr_id, &work, max_nonce, &hashes_done, 8); break; case ALGO_BLAKE: @@ -1991,13 +1989,16 @@ static void *miner_thread(void *userdata) case ALGO_S3: rc = scanhash_s3(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_VANILLA: + rc = scanhash_vanilla(thr_id, &work, max_nonce, &hashes_done, 8); + break; case ALGO_WHIRLCOIN: case ALGO_WHIRLPOOL: rc = scanhash_whirl(thr_id, &work, max_nonce, &hashes_done); break; - case ALGO_WHIRLPOOLX: - rc = scanhash_whirlx(thr_id, &work, max_nonce, &hashes_done); - break; + //case ALGO_WHIRLPOOLX: + // rc = scanhash_whirlx(thr_id, &work, max_nonce, &hashes_done); + // break; case ALGO_X11: rc = scanhash_x11(thr_id, &work, max_nonce, &hashes_done); break; @@ -3225,8 +3226,8 @@ int main(int argc, char *argv[]) parse_cmdline(argc, argv); // extra credits.. - if (opt_algo == ALGO_WHIRLPOOLX) { - printf(" Whirlpoolx support by Alexis Provos.\n"); + if (opt_algo == ALGO_VANILLA) { + printf(" Vanilla blake optimized by Alexis Provos.\n"); printf("VNL donation address: Vr5oCen8NrY6ekBWFaaWjCUFBH4dyiS57W\n\n"); } diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 708f966..71dc31f 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -409,6 +409,7 @@ true + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index cc67ed0..102ec40 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -620,10 +620,13 @@ Source Files\CUDA\x11 - Source Files\CUDA + Source Files\CUDA\Algo256 - Source Files\CUDA + Source Files\CUDA\Algo256 + + + Source Files\CUDA\Algo256 Source Files\CUDA diff --git a/miner.h b/miner.h index 1ca1d8b..34df6df 100644 --- a/miner.h +++ b/miner.h @@ -285,8 +285,8 @@ extern int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsig extern int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_vanilla(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int8_t blake_rounds); extern int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); -extern int scanhash_whirlx(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x14(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); @@ -327,8 +327,8 @@ extern void free_sib(int thr_id); extern void free_skeincoin(int thr_id); extern void free_skein2(int thr_id); extern void free_s3(int thr_id); +extern void free_vanilla(int thr_id); extern void free_whirl(int thr_id); -extern void free_whirlx(int thr_id); extern void free_x11(int thr_id); extern void free_x13(int thr_id); extern void free_x14(int thr_id); diff --git a/util.cpp b/util.cpp index e48a8d5..31988f5 100644 --- a/util.cpp +++ b/util.cpp @@ -1997,8 +1997,8 @@ void print_hash_tests(void) wcoinhash(&hash[0], &buf[0]); printpfx("whirlpool", hash); - whirlxHash(&hash[0], &buf[0]); - printpfx("whirlpoolx", hash); + //whirlxHash(&hash[0], &buf[0]); + //printpfx("whirlpoolx", hash); x11hash(&hash[0], &buf[0]); printpfx("X11", hash);