From 3ede61b0ccddbf73322cb6a2ebe787232b2044f1 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Tue, 7 Mar 2017 20:10:26 +0100 Subject: [PATCH] bastion algo Signed-off-by: Tanguy Pruvot --- Makefile.am | 1 + algos.h | 2 + bench.cpp | 1 + ccminer.cpp | 4 + ccminer.vcxproj | 2 + ccminer.vcxproj.filters | 6 + heavy/bastion.cu | 331 ++++++++++++++++++++++++++++++++++++++++ heavy/cuda_bastion.cu | 103 +++++++++++++ heavy/cuda_hefty1.cu | 28 ++++ miner.h | 3 + util.cpp | 3 + 11 files changed, 484 insertions(+) create mode 100644 heavy/bastion.cu create mode 100644 heavy/cuda_bastion.cu diff --git a/Makefile.am b/Makefile.am index e4186b0..e45429f 100644 --- a/Makefile.am +++ b/Makefile.am @@ -28,6 +28,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ heavy/cuda_hefty1.cu heavy/cuda_hefty1.h \ heavy/cuda_keccak512.cu heavy/cuda_keccak512.h \ heavy/cuda_sha256.cu heavy/cuda_sha256.h \ + heavy/bastion.cu heavy/cuda_bastion.cu \ fuguecoin.cpp Algo256/cuda_fugue256.cu sph/fugue.c uint256.h \ groestlcoin.cpp cuda_groestlcoin.cu cuda_groestlcoin.h \ myriadgroestl.cpp cuda_myriadgroestl.cu \ diff --git a/algos.h b/algos.h index a93219c..32c74f1 100644 --- a/algos.h +++ b/algos.h @@ -9,6 +9,7 @@ enum sha_algos { ALGO_BLAKE, ALGO_BLAKE2S, ALGO_BMW, + ALGO_BASTION, ALGO_C11, ALGO_CRYPTOLIGHT, ALGO_CRYPTONIGHT, @@ -66,6 +67,7 @@ static const char *algo_names[] = { "blake", "blake2s", "bmw", + "bastion", "c11", "cryptolight", "cryptonight", diff --git a/bench.cpp b/bench.cpp index 46db206..24fd315 100644 --- a/bench.cpp +++ b/bench.cpp @@ -44,6 +44,7 @@ void bench_free() void algo_free_all(int thr_id) { // only initialized algos will be freed + free_bastion(thr_id); free_blake256(thr_id); free_blake2s(thr_id); free_bmw(thr_id); diff --git a/ccminer.cpp b/ccminer.cpp index a1fae8d..943f7f4 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -223,6 +223,7 @@ static char const usage[] = "\ Usage: " PROGRAM_NAME " [OPTIONS]\n\ Options:\n\ -a, --algo=ALGO specify the hash algorithm to use\n\ + bastion Hefty bastion\n\ blake Blake 256 (SFR)\n\ blake2s Blake2-S 256 (NEVA)\n\ blakecoin Fast Blake 256 (8 rounds)\n\ @@ -2196,6 +2197,9 @@ static void *miner_thread(void *userdata) /* scan nonces for a proof-of-work hash */ switch (opt_algo) { + case ALGO_BASTION: + rc = scanhash_bastion(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_BLAKECOIN: rc = scanhash_blake256(thr_id, &work, max_nonce, &hashes_done, 8); break; diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 9e926a0..ccbffa0 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -435,6 +435,8 @@ + + -Xptxas "-abi=yes" %(AdditionalOptions) -Xptxas "-abi=yes" %(AdditionalOptions) diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 0febc2c..f622a08 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -619,6 +619,12 @@ Source Files\CUDA\heavy + + Source Files\CUDA\heavy + + + Source Files\CUDA\heavy + Source Files\CUDA\quark diff --git a/heavy/bastion.cu b/heavy/bastion.cu new file mode 100644 index 0000000..ec9ba8d --- /dev/null +++ b/heavy/bastion.cu @@ -0,0 +1,331 @@ +/** + * bastion cuda implemention tpruvot@github 2017 + */ + +#include +#include +//#include +#include +#include +#include + +static uint32_t *d_hash[MAX_GPUS]; +static uint32_t* d_hash_br1[MAX_GPUS]; +static uint32_t* d_hash_br2[MAX_GPUS]; + +extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); +extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x11_luffa512_cpu_init(int thr_id, uint32_t threads); +extern void x11_luffa512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); +extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_free(int thr_id); + +extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads); +extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int mode); +extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x15_whirlpool_cpu_free(int thr_id); + +extern void x11_echo512_cpu_init(int thr_id, uint32_t threads); +extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void bastion_init(const int thr_id, const uint32_t threads); +extern void bastion_free(const int thr_id); + +extern uint32_t bastion_filter2(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_hash1, uint32_t* d_hash2); +extern void bastion_merge2(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_hash1, uint32_t* d_hash2); + +extern void hefty_cpu_hash(int thr_id, uint32_t threads, int startNounce); +extern void hefty_cpu_setBlock(int thr_id, uint32_t threads, void *data, int len); +extern void hefty_cpu_init(int thr_id, uint32_t threads); +extern void hefty_cpu_free(int thr_id); +extern void hefty_copy_hashes(int thr_id, uint32_t threads, uint32_t* d_outputhash); + +#define TRACE(algo) {} + +static bool init[MAX_GPUS] = { 0 }; + +int scanhash_bastion(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + // CUDA will process thousands of threads. + uint32_t throughput = cuda_default_throughput(thr_id, 1U << 20); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0x00ff; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); + CUDA_SAFE_CALL(cudaMalloc(&d_hash_br1[thr_id], (size_t) 64 * throughput)); + CUDA_SAFE_CALL(cudaMalloc(&d_hash_br2[thr_id], (size_t) 64 * throughput)); + + bastion_init(thr_id, throughput); + hefty_cpu_init(thr_id, throughput); + x11_luffa512_cpu_init(thr_id, throughput); + + quark_skein512_cpu_init(thr_id, throughput); + x13_hamsi512_cpu_init(thr_id, throughput); + x13_fugue512_cpu_init(thr_id, throughput); + x14_shabal512_cpu_init(thr_id, throughput); + x15_whirlpool_cpu_init(thr_id, throughput, 0); + x11_echo512_cpu_init(thr_id, throughput); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + hefty_cpu_setBlock(thr_id, throughput, endiandata, 80); + + cuda_check_cpu_setTarget(ptarget); + + do { + uint32_t branchNonces; + int order = 0; + + // hefty + hefty_cpu_hash(thr_id, throughput, pdata[19]); + hefty_copy_hashes(thr_id, throughput, d_hash[thr_id]); + TRACE("hefty :"); + + x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("luffa :"); + + // fugue or skein + branchNonces = bastion_filter2(thr_id, throughput, d_hash[thr_id], d_hash_br1[thr_id], d_hash_br2[thr_id]); + x13_fugue512_cpu_hash_64(thr_id, branchNonces, pdata[19], NULL, d_hash_br1[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput-branchNonces, pdata[19], NULL, d_hash_br2[thr_id], order++); + bastion_merge2(thr_id, throughput, d_hash[thr_id], d_hash_br1[thr_id], d_hash_br2[thr_id]); + TRACE("perm1 :"); + + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("whirl :"); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + // echo or luffa + branchNonces = bastion_filter2(thr_id, throughput, d_hash[thr_id], d_hash_br1[thr_id], d_hash_br2[thr_id]); + x11_echo512_cpu_hash_64(thr_id, branchNonces, pdata[19], NULL, d_hash_br1[thr_id], order++); + x11_luffa512_cpu_hash_64(thr_id, throughput-branchNonces, pdata[19], NULL, d_hash_br2[thr_id], order++); + bastion_merge2(thr_id, throughput, d_hash[thr_id], d_hash_br1[thr_id], d_hash_br2[thr_id]); + TRACE("perm2 :"); + + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + // shabal or whirlpool + branchNonces = bastion_filter2(thr_id, throughput, d_hash[thr_id], d_hash_br1[thr_id], d_hash_br2[thr_id]); + x14_shabal512_cpu_hash_64(thr_id, branchNonces, pdata[19], NULL, d_hash_br1[thr_id], order++); + x15_whirlpool_cpu_hash_64(thr_id, throughput-branchNonces, pdata[19], NULL, d_hash_br2[thr_id], order++); + bastion_merge2(thr_id, throughput, d_hash[thr_id], d_hash_br1[thr_id], d_hash_br2[thr_id]); + TRACE("perm3 :"); + + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + // hamsi or luffa + branchNonces = bastion_filter2(thr_id, throughput, d_hash[thr_id], d_hash_br1[thr_id], d_hash_br2[thr_id]); + x13_hamsi512_cpu_hash_64(thr_id, branchNonces, pdata[19], NULL, d_hash_br1[thr_id], order++); + x11_luffa512_cpu_hash_64(thr_id, throughput-branchNonces, pdata[19], NULL, d_hash_br2[thr_id], order++); + bastion_merge2(thr_id, throughput, d_hash[thr_id], d_hash_br1[thr_id], d_hash_br2[thr_id]); + TRACE("perm4 :"); + + *hashes_done = pdata[19] - first_nonce + throughput; + + CUDA_LOG_ERROR(); + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash[8]; + const uint32_t Htarg = ptarget[7]; + endiandata[19] = work->nonces[0]; + bastionhash(vhash, (uchar*) endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + work->nonces[0] = swab32(work->nonces[0]); + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + if (work->nonces[1] != 0) { + endiandata[19] = work->nonces[1]; + bastionhash(vhash, (uchar*) endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + work->nonces[1] = swab32(work->nonces[1]); + pdata[19] = max(work->nonces[0], work->nonces[1])+1; + } else { + pdata[19] = work->nonces[0]+1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) gpulog(LOG_WARNING, thr_id, + "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = swab32(work->nonces[0]) + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + + CUDA_LOG_ERROR(); + + return 0; +} + +// cleanup +extern "C" void free_bastion(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + cudaFree(d_hash_br1[thr_id]); + cudaFree(d_hash_br2[thr_id]); + + hefty_cpu_free(thr_id); + x13_fugue512_cpu_free(thr_id); + x15_whirlpool_cpu_free(thr_id); + + bastion_free(thr_id); + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} + +#undef SPH_C32 +#undef SPH_T32 +#undef SPH_C64 +#undef SPH_T64 +extern "C" { +#include "hefty1.h" +#include "sph/sph_luffa.h" +#include "sph/sph_fugue.h" +#include "sph/sph_skein.h" +#include "sph/sph_whirlpool.h" +#include "sph/sph_shabal.h" +#include "sph/sph_echo.h" +#include "sph/sph_hamsi.h" +} + +__host__ +void bastionhash(void* output, const uchar* input) +{ + unsigned char _ALIGN(128) hash[64] = { 0 }; + + sph_echo512_context ctx_echo; + sph_luffa512_context ctx_luffa; + sph_fugue512_context ctx_fugue; + sph_whirlpool_context ctx_whirlpool; + sph_shabal512_context ctx_shabal; + sph_skein512_context ctx_skein; + sph_hamsi512_context ctx_hamsi; + + HEFTY1(input, 80, hash); + + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, hash, 64); + sph_luffa512_close(&ctx_luffa, hash); + + if (hash[0] & 0x8) + { + sph_fugue512_init(&ctx_fugue); + sph_fugue512(&ctx_fugue, hash, 64); + sph_fugue512_close(&ctx_fugue, hash); + } else { + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, hash, 64); + sph_skein512_close(&ctx_skein, hash); + } + + sph_whirlpool_init(&ctx_whirlpool); + sph_whirlpool(&ctx_whirlpool, hash, 64); + sph_whirlpool_close(&ctx_whirlpool, hash); + + sph_fugue512_init(&ctx_fugue); + sph_fugue512(&ctx_fugue, hash, 64); + sph_fugue512_close(&ctx_fugue, hash); + + if (hash[0] & 0x8) + { + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, hash, 64); + sph_echo512_close(&ctx_echo, hash); + } else { + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, hash, 64); + sph_luffa512_close(&ctx_luffa, hash); + } + + sph_shabal512_init(&ctx_shabal); + sph_shabal512(&ctx_shabal, hash, 64); + sph_shabal512_close(&ctx_shabal, hash); + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, hash, 64); + sph_skein512_close(&ctx_skein, hash); + + if (hash[0] & 0x8) + { + sph_shabal512_init(&ctx_shabal); + sph_shabal512(&ctx_shabal, hash, 64); + sph_shabal512_close(&ctx_shabal, hash); + } else { + sph_whirlpool_init(&ctx_whirlpool); + sph_whirlpool(&ctx_whirlpool, hash, 64); + sph_whirlpool_close(&ctx_whirlpool, hash); + } + + sph_shabal512_init(&ctx_shabal); + sph_shabal512(&ctx_shabal, hash, 64); + sph_shabal512_close(&ctx_shabal, hash); + + if (hash[0] & 0x8) + { + sph_hamsi512_init(&ctx_hamsi); + sph_hamsi512(&ctx_hamsi, hash, 64); + sph_hamsi512_close(&ctx_hamsi, hash); + } else { + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, hash, 64); + sph_luffa512_close(&ctx_luffa, hash); + } + + memcpy(output, hash, 32); +} diff --git a/heavy/cuda_bastion.cu b/heavy/cuda_bastion.cu new file mode 100644 index 0000000..44e301a --- /dev/null +++ b/heavy/cuda_bastion.cu @@ -0,0 +1,103 @@ +#include + +#include "cuda_helper.h" + +static uint32_t *d_offsets1[MAX_GPUS] = { 0 }; +static uint32_t *d_offsets2[MAX_GPUS] = { 0 }; + +static uint32_t *d_brcount1[MAX_GPUS] = { 0 }; +static uint32_t *d_brcount2[MAX_GPUS] = { 0 }; + +__global__ __launch_bounds__(128, 6) +void bastion_filter2_gpu(const uint32_t threads, const uint32_t* d_hash, uint32_t* d_hash1, uint32_t* d_hash2, uint32_t* d_br_ofts1, uint32_t* d_count1, uint32_t* d_br_ofts2, uint32_t* d_count2) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t offset = thread * 16U; // 64U / sizeof(uint32_t); + uint4 *psrc = (uint4*) (&d_hash[offset]); + uint4 *pdst; + d_br_ofts1[thread] = 0; + d_br_ofts2[thread] = 0; + if (((uint8_t*)psrc)[0] & 0x8) { + // uint4 = 4x uint32_t = 16 bytes + uint32_t oft = atomicAdd(d_count1, 1U) * 16U; + d_br_ofts1[thread] = oft + 16U; + pdst = (uint4*) (&d_hash1[oft]); + } else { + uint32_t oft = atomicAdd(d_count2, 1U) * 16U; + d_br_ofts2[thread] = oft + 16U; + pdst = (uint4*) (&d_hash2[oft]); + } + pdst[0] = psrc[0]; + pdst[1] = psrc[1]; + pdst[2] = psrc[2]; + pdst[3] = psrc[3]; + } +} + +__global__ __launch_bounds__(128, 6) +void bastion_merge2_gpu(const uint32_t threads, uint32_t* d_hash, uint32_t* d_hash1, uint32_t* d_hash2, uint32_t* d_br_ofts1, uint32_t* d_br_ofts2) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t offset = thread * 16U; + uint4 *pdst = (uint4*) (&d_hash[offset]); + uint4 *psrc; + if (d_br_ofts1[thread]) { + const uint32_t oft = d_br_ofts1[thread] - 16U; + psrc = (uint4*) (&d_hash1[oft]); + } else { + const uint32_t oft = d_br_ofts2[thread] - 16U; + psrc = (uint4*) (&d_hash2[oft]); + } + pdst[0] = psrc[0]; + pdst[1] = psrc[1]; + pdst[2] = psrc[2]; + pdst[3] = psrc[3]; + } +} + + +__host__ +void bastion_init(const int thr_id, const uint32_t threads) +{ + CUDA_SAFE_CALL(cudaMalloc(&d_offsets1[thr_id], sizeof(uint32_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&d_offsets2[thr_id], sizeof(uint32_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&d_brcount1[thr_id], sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMalloc(&d_brcount2[thr_id], sizeof(uint32_t))); +} + +__host__ +void bastion_free(const int thr_id) +{ + cudaFree(d_offsets1[thr_id]); + cudaFree(d_offsets2[thr_id]); + cudaFree(d_brcount1[thr_id]); + cudaFree(d_brcount2[thr_id]); +} + +__host__ +uint32_t bastion_filter2(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_hash1, uint32_t* d_hash2) +{ + uint32_t num = 0; + cudaMemset(d_brcount1[thr_id], 0, 4); + cudaMemset(d_brcount2[thr_id], 0, 4); + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + bastion_filter2_gpu <<>> (threads, inpHashes, d_hash1, d_hash2, d_offsets1[thr_id], d_brcount1[thr_id], d_offsets2[thr_id], d_brcount2[thr_id]); + cudaMemcpy(&num, d_brcount1[thr_id], 4, cudaMemcpyDeviceToHost); + return num; +} + +__host__ +void bastion_merge2(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_hash1, uint32_t* d_hash2) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + // put back branch hashes to the common buffer d_hash + bastion_merge2_gpu <<>> (threads, outpHashes, d_hash1, d_hash2, d_offsets1[thr_id], d_offsets2[thr_id]); +} diff --git a/heavy/cuda_hefty1.cu b/heavy/cuda_hefty1.cu index a221190..7700acc 100644 --- a/heavy/cuda_hefty1.cu +++ b/heavy/cuda_hefty1.cu @@ -417,3 +417,31 @@ void hefty_cpu_hash(int thr_id, uint32_t threads, int startNounce) // Strategisches Sleep Kommando zur Senkung der CPU Last MyStreamSynchronize(NULL, 0, thr_id); } + +__global__ +__launch_bounds__(128, 8) +void hefty_gpu_copy(const uint32_t threads, uint32_t* d_heftyhash, uint64_t* d_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t offset = thread * 8U; // 32 / sizeof(uint32_t); + uint4 *psrc = (uint4*) (&d_heftyhash[offset]); + uint4 *pdst = (uint4*) (&d_hash[offset]); + pdst[0] = psrc[0]; + pdst[1] = psrc[1]; + pdst[2] = make_uint4(0,0,0,0); + pdst[3] = make_uint4(0,0,0,0); + } +} + +__host__ +void hefty_copy_hashes(int thr_id, uint32_t threads, uint32_t* d_outputhash) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + hefty_gpu_copy <<< grid, block >>> (threads, heavy_heftyHashes[thr_id], (uint64_t*) d_outputhash); + cudaStreamSynchronize(NULL); +} + diff --git a/miner.h b/miner.h index 53d2f70..7b40856 100644 --- a/miner.h +++ b/miner.h @@ -273,6 +273,7 @@ void sha256d(unsigned char *hash, const unsigned char *data, int len); struct work; +extern int scanhash_bastion(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_blake256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int8_t blakerounds); extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); @@ -326,6 +327,7 @@ extern int scanhash_scrypt_jane(int thr_id, struct work *work, uint32_t max_nonc /* free device allocated memory per algo */ void algo_free_all(int thr_id); +extern void free_bastion(int thr_id); extern void free_blake256(int thr_id); extern void free_blake2s(int thr_id); extern void free_bmw(int thr_id); @@ -846,6 +848,7 @@ void applog_hash64(void *hash); void applog_compare_hash(void *hash, void *hash_ref); void print_hash_tests(void); +void bastionhash(void* output, const unsigned char* input); void blake256hash(void *output, const void *input, int8_t rounds); void blake2s_hash(void *output, const void *input); void bmw_hash(void *state, const void *input); diff --git a/util.cpp b/util.cpp index cb26559..144f69d 100644 --- a/util.cpp +++ b/util.cpp @@ -2139,6 +2139,9 @@ void print_hash_tests(void) printf(CL_WHT "CPU HASH ON EMPTY BUFFER RESULTS:" CL_N "\n"); + bastionhash(&hash[0], &buf[0]); + printpfx("bastion", hash); + blake256hash(&hash[0], &buf[0], 8); printpfx("blakecoin", hash);