ccminer-gostd-lite/x11/x11.cu

extern "C" {
#include "sph/sph_blake.h"
#include "sph/sph_bmw.h"
#include "sph/sph_groestl.h"
#include "sph/sph_skein.h"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
#include "sph/sph_luffa.h"
#include "sph/sph_cubehash.h"
#include "sph/sph_shavite.h"
#include "sph/sph_simd.h"
#include "sph/sph_echo.h"
}

#include "miner.h"
#include "cuda_helper.h"
#include "cuda_x11.h"

#include <stdio.h>
#include <memory.h>

// Per-thread device buffer that every X11 GPU stage reads and writes in place.
// scanhash_x11() allocates it once with cudaMalloc (64 bytes per unit of
// `throughput`) and free_x11() releases it.
static uint32_t *d_hash[MAX_GPUS];

// X11 CPU Hash
extern "C" void x11hash(void *output, const void *input)
{
	unsigned char _ALIGN(128) hash[128] = { 0 };

	// blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11

	sph_blake512_context ctx_blake;
	sph_bmw512_context ctx_bmw;
	sph_groestl512_context ctx_groestl;
	sph_jh512_context ctx_jh;
	sph_keccak512_context ctx_keccak;
	sph_skein512_context ctx_skein;
	sph_luffa512_context ctx_luffa;
	sph_cubehash512_context ctx_cubehash;
	sph_shavite512_context ctx_shavite;
	sph_simd512_context ctx_simd;
	sph_echo512_context ctx_echo;

	sph_blake512_init(&ctx_blake);
	sph_blake512 (&ctx_blake, input, 80);
	sph_blake512_close(&ctx_blake, (void*) hash);

	sph_bmw512_init(&ctx_bmw);
	sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
	sph_bmw512_close(&ctx_bmw, (void*) hash);

	sph_groestl512_init(&ctx_groestl);
	sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
	sph_groestl512_close(&ctx_groestl, (void*) hash);

	sph_skein512_init(&ctx_skein);
	sph_skein512 (&ctx_skein, (const void*) hash, 64);
	sph_skein512_close(&ctx_skein, (void*) hash);

	sph_jh512_init(&ctx_jh);
	sph_jh512 (&ctx_jh, (const void*) hash, 64);
	sph_jh512_close(&ctx_jh, (void*) hash);

	sph_keccak512_init(&ctx_keccak);
	sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
	sph_keccak512_close(&ctx_keccak, (void*) hash);

	sph_luffa512_init(&ctx_luffa);
	sph_luffa512 (&ctx_luffa, (const void*) hash, 64);
	sph_luffa512_close (&ctx_luffa, (void*) hash);

	sph_cubehash512_init(&ctx_cubehash);
	sph_cubehash512 (&ctx_cubehash, (const void*) hash, 64);
	sph_cubehash512_close(&ctx_cubehash, (void*) hash);

	sph_shavite512_init(&ctx_shavite);
	sph_shavite512 (&ctx_shavite, (const void*) hash, 64);
	sph_shavite512_close(&ctx_shavite, (void*) hash);

	sph_simd512_init(&ctx_simd);
	sph_simd512 (&ctx_simd, (const void*) hash, 64);
	sph_simd512_close(&ctx_simd, (void*) hash);

	sph_echo512_init(&ctx_echo);
	sph_echo512 (&ctx_echo, (const void*) hash, 64);
	sph_echo512_close(&ctx_echo, (void*) hash);

	memcpy(output, hash, 32);
}

//#define _DEBUG
#define _DEBUG_PREFIX "x11"
#include "cuda_debug.cuh"

// Per-thread flag: set by scanhash_x11() once its one-time GPU setup has
// completed, cleared again by free_x11().
static bool init[MAX_GPUS] = { 0 };

/**
 * Scan a nonce range for X11 candidates on the GPU mapped to thr_id.
 *
 * Starting at work->data[19], the nonce space is processed in batches of
 * `throughput` nonces: each batch runs the chain of GPU hash stages over
 * d_hash[thr_id], then cuda_check_hash() looks for a candidate, which is
 * re-hashed on the CPU with x11hash() and checked against the target.
 *
 * @param thr_id       index used for device_map[], init[], d_hash[] and work_restart[]
 * @param work         work item; data[19] is read and updated as the current nonce,
 *                     data[21] receives a second nonce when one is found
 * @param max_nonce    upper bound of the scan; data[19] is set to it when the
 *                     next batch would reach or pass it
 * @param hashes_done  out: number of nonces covered by this call (not written
 *                     when the function returns early from the init section)
 * @return 0 when no valid nonce was found (or x11_simd512_cpu_init failed),
 *         1 when one nonce validated, 2 when a second nonce was also reported.
 *
 * NOTE(review): TRACE() comes from cuda_debug.cuh and CUDA_CALL_OR_RET_X from a
 * project header; they may reference locals of this function by name — confirm
 * before renaming anything here.
 */
extern "C" int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
	uint32_t *pdata = work->data;
	uint32_t *ptarget = work->target;
	const uint32_t first_nonce = pdata[19];
	// Default intensity: 20 on devices with device_sm >= 500 when not on Windows, 19 otherwise.
	int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 20 : 19;
	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8;
	//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);

	// Benchmark mode overwrites the top target word with a fixed value.
	if (opt_benchmark)
		ptarget[7] = 0x5;

	// One-time per-thread setup; skipped on later calls until free_x11() clears the flag.
	if (!init[thr_id])
	{
		cudaSetDevice(device_map[thr_id]);
		if (opt_cudaschedule == -1 && gpu_threads == 1) {
			cudaDeviceReset();
			// reduce cpu usage
			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
			CUDA_LOG_ERROR();
		}
		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);

		quark_blake512_cpu_init(thr_id, throughput);
		quark_bmw512_cpu_init(thr_id, throughput);
		quark_groestl512_cpu_init(thr_id, throughput);
		quark_skein512_cpu_init(thr_id, throughput);
		quark_keccak512_cpu_init(thr_id, throughput);
		quark_jh512_cpu_init(thr_id, throughput);
		x11_luffaCubehash512_cpu_init(thr_id, throughput);
		x11_shavite512_cpu_init(thr_id, throughput);
		x11_echo512_cpu_init(thr_id, throughput);
		// simd512 is the only stage whose init result is checked here; on failure
		// the call returns 0 with init[thr_id] still false and *hashes_done untouched.
		if (x11_simd512_cpu_init(thr_id, throughput) != 0) {
			return 0;
		}
		// 64 bytes of device memory per unit of throughput.
		// NOTE(review): presumably the macro returns 0 from this function when cudaMalloc fails — confirm in cuda_helper.h.
		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);

		cuda_check_cpu_init(thr_id, throughput);

		init[thr_id] = true;
	}

	// Copy of the 20 header words passed through be32enc(); also reused below
	// for the CPU re-hash after patching word 19 with the candidate nonce.
	uint32_t endiandata[20];
	for (int k=0; k < 20; k++)
		be32enc(&endiandata[k], pdata[k]);

	quark_blake512_cpu_setBlock_80(thr_id, endiandata);
	cuda_check_cpu_setTarget(ptarget);

	do {
		// Incremented once per stage and passed to the stage launchers.
		// NOTE(review): its meaning inside the launchers is not visible here.
		int order = 0;
		uint32_t foundNonce;

		// Hash with CUDA
		// Stage sequence mirrors x11hash(): blake, bmw, groestl, skein, jh, keccak,
		// luffa+cubehash (single call), shavite, simd, echo — all over d_hash[thr_id].
		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
		TRACE("blake  :");
		quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
		TRACE("bmw    :");
		quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
		TRACE("groestl:");
		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
		TRACE("skein  :");
		quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
		TRACE("jh512  :");
		quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
		TRACE("keccak :");
		x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
		TRACE("luffa+c:");
		x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
		TRACE("shavite:");
		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
		TRACE("simd   :");
		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
		TRACE("echo => ");

		// UINT32_MAX is treated as "no candidate in this batch".
		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
		if (foundNonce != UINT32_MAX)
		{
			const uint32_t Htarg = ptarget[7];
			uint32_t vhash64[8];
			// Re-hash the candidate on the CPU before trusting the GPU result.
			be32enc(&endiandata[19], foundNonce);
			x11hash(vhash64, endiandata);

			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
				int res = 1;
				// check if there was some other ones...
				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
				work_set_target_ratio(work, vhash64);
				// Count the whole current batch as done (pdata[19] still holds the batch start).
				*hashes_done = pdata[19] - first_nonce + throughput;
				// A zero result is treated as "no second nonce".
				if (secNonce != 0) {
					be32enc(&endiandata[19], secNonce);
					x11hash(vhash64, endiandata);
					// Keep the larger of the two share ratios on the work item.
					if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio)
						work_set_target_ratio(work, vhash64);
					pdata[21] = secNonce;
					res++;
				}
				pdata[19] = foundNonce;
				return res;
			} else {
				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
				// Resume right after the rejected nonce. `continue` in a do-while jumps to
				// the loop condition, so the max_nonce bound check below is skipped for
				// this iteration while the restart flag is still tested.
				pdata[19] = foundNonce + 1;
				continue;
			}
		}

		// 64-bit sum so that the comparison cannot wrap around in 32 bits.
		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
			pdata[19] = max_nonce;
			break;
		}
		pdata[19] += throughput;

	} while (!work_restart[thr_id].restart);

	*hashes_done = pdata[19] - first_nonce;
	return 0;
}

// cleanup
//
// Release what scanhash_x11() set up for this thread: the d_hash device
// buffer, the blake512 / groestl512 / simd512 stage resources and the
// check_cpu state. Does nothing when init[thr_id] is not set, so it is safe
// to call without a prior successful scanhash_x11().
extern "C" void free_x11(int thr_id)
{
	if (!init[thr_id])
		return;

	// Let pending GPU work finish before its buffers are released.
	// cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize().
	cudaDeviceSynchronize();

	cudaFree(d_hash[thr_id]);
	// Do not keep a dangling device pointer around after the free.
	d_hash[thr_id] = NULL;

	quark_blake512_cpu_free(thr_id);
	quark_groestl512_cpu_free(thr_id);
	x11_simd512_cpu_free(thr_id);

	cuda_check_cpu_free(thr_id);
	// Clearing the flag makes the next scanhash_x11() call redo its setup.
	init[thr_id] = false;

	cudaDeviceSynchronize();
}
cuda: header for common kernel functions (quark/x11) Was thinking about doing that since months ;) lets go 9 years ago			`extern "C" {`
v1.0 - Yo, I heard y'all like X11 11 years ago			`#include "sph/sph_blake.h"`
			`#include "sph/sph_bmw.h"`
			`#include "sph/sph_groestl.h"`
			`#include "sph/sph_skein.h"`
			`#include "sph/sph_jh.h"`
			`#include "sph/sph_keccak.h"`
			`#include "sph/sph_luffa.h"`
			`#include "sph/sph_cubehash.h"`
			`#include "sph/sph_shavite.h"`
			`#include "sph/sph_simd.h"`
			`#include "sph/sph_echo.h"`
Move common check_cpu functions to root 10 years ago			`}`
v1.0 - Yo, I heard y'all like X11 11 years ago
			`#include "miner.h"`
Remove duplicated defines present in cuda_helper.h also add cudaDeviceReset() on Ctrl+C for nvprof 10 years ago			`#include "cuda_helper.h"`
cuda: header for common kernel functions (quark/x11) Was thinking about doing that since months ;) lets go 9 years ago			`#include "cuda_x11.h"`
v1.0 - Yo, I heard y'all like X11 11 years ago
Remove duplicated defines present in cuda_helper.h also add cudaDeviceReset() on Ctrl+C for nvprof 10 years ago			`#include <stdio.h>`
			`#include <memory.h>`
v1.0 - Yo, I heard y'all like X11 11 years ago
Handle a maximum of 16 gpus (vs 8 before) Some cards have 2 gpus on board... 10 years ago			`static uint32_t *d_hash[MAX_GPUS];`
v1.0 - Yo, I heard y'all like X11 11 years ago
simd512: restore SM3/3.5 perfs Simple change which affect all algos based on SIMD512 fresh, qubit, s3, x11 to x17... 10 years ago			`// X11 CPU Hash`
Implement x14 (cuda + cpu functions) Project was updated for VS2013 and CUDA SDK 6.5 add also a --cputest function to dump cpu hash results TODO: x15 is not fully functional, but first loop seems ok Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`extern "C" void x11hash(void output, const void input)`
v1.0 - Yo, I heard y'all like X11 11 years ago			`{`
import sp skein512 unrolled 64-bytes kernel (+0,6% x11) Quark and S3 are now a bit faster (+1 %) x11 get +0.6 % (+20kH/s on a 750ti, +30kH on a 960) 80 bytes implementation to do/test ... (skein/skein2) but keep my previous version for older devices... 9 years ago			`unsigned char _ALIGN(128) hash[128] = { 0 };`

Implement x14 (cuda + cpu functions) Project was updated for VS2013 and CUDA SDK 6.5 add also a --cputest function to dump cpu hash results TODO: x15 is not fully functional, but first loop seems ok Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`// blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11`

			`sph_blake512_context ctx_blake;`
			`sph_bmw512_context ctx_bmw;`
			`sph_groestl512_context ctx_groestl;`
			`sph_jh512_context ctx_jh;`
			`sph_keccak512_context ctx_keccak;`
			`sph_skein512_context ctx_skein;`
			`sph_luffa512_context ctx_luffa;`
			`sph_cubehash512_context ctx_cubehash;`
			`sph_shavite512_context ctx_shavite;`
			`sph_simd512_context ctx_simd;`
			`sph_echo512_context ctx_echo;`

			`sph_blake512_init(&ctx_blake);`
			`sph_blake512 (&ctx_blake, input, 80);`
			`sph_blake512_close(&ctx_blake, (void*) hash);`

			`sph_bmw512_init(&ctx_bmw);`
			`sph_bmw512 (&ctx_bmw, (const void*) hash, 64);`
			`sph_bmw512_close(&ctx_bmw, (void*) hash);`

			`sph_groestl512_init(&ctx_groestl);`
			`sph_groestl512 (&ctx_groestl, (const void*) hash, 64);`
			`sph_groestl512_close(&ctx_groestl, (void*) hash);`

			`sph_skein512_init(&ctx_skein);`
			`sph_skein512 (&ctx_skein, (const void*) hash, 64);`
			`sph_skein512_close(&ctx_skein, (void*) hash);`

			`sph_jh512_init(&ctx_jh);`
			`sph_jh512 (&ctx_jh, (const void*) hash, 64);`
			`sph_jh512_close(&ctx_jh, (void*) hash);`

			`sph_keccak512_init(&ctx_keccak);`
			`sph_keccak512 (&ctx_keccak, (const void*) hash, 64);`
			`sph_keccak512_close(&ctx_keccak, (void*) hash);`

			`sph_luffa512_init(&ctx_luffa);`
			`sph_luffa512 (&ctx_luffa, (const void*) hash, 64);`
			`sph_luffa512_close (&ctx_luffa, (void*) hash);`

			`sph_cubehash512_init(&ctx_cubehash);`
			`sph_cubehash512 (&ctx_cubehash, (const void*) hash, 64);`
			`sph_cubehash512_close(&ctx_cubehash, (void*) hash);`

			`sph_shavite512_init(&ctx_shavite);`
			`sph_shavite512 (&ctx_shavite, (const void*) hash, 64);`
			`sph_shavite512_close(&ctx_shavite, (void*) hash);`

			`sph_simd512_init(&ctx_simd);`
			`sph_simd512 (&ctx_simd, (const void*) hash, 64);`
			`sph_simd512_close(&ctx_simd, (void*) hash);`

			`sph_echo512_init(&ctx_echo);`
			`sph_echo512 (&ctx_echo, (const void*) hash, 64);`
			`sph_echo512_close(&ctx_echo, (void*) hash);`

			`memcpy(output, hash, 32);`
v1.0 - Yo, I heard y'all like X11 11 years ago			`}`

whirlpool midstate and debug/trace defines + new cuda_debug.cuh include to trace gpu data Happy new year! Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 9 years ago			`//#define _DEBUG`
			`#define _DEBUG_PREFIX "x11"`
			`#include "cuda_debug.cuh"`
debug: x11 algo traces for cuda 7 problem 10 years ago
Handle a maximum of 16 gpus (vs 8 before) Some cards have 2 gpus on board... 10 years ago			`static bool init[MAX_GPUS] = { 0 };`
various small changes heavy: reduce by 256 threads default intensity to all -i 20 cuda: put static thread init bools outside the code (made once) api: fix nvml header to build without 10 years ago
start v1.7, apply new prototypes to all algos 9 years ago			`extern "C" int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)`
v1.0 - Yo, I heard y'all like X11 11 years ago			`{`
start v1.7, apply new prototypes to all algos 9 years ago			`uint32_t *pdata = work->data;`
			`uint32_t *ptarget = work->target;`
v1.0 - Yo, I heard y'all like X11 11 years ago			`const uint32_t first_nonce = pdata[19];`
x11: restore default intensity to 19 on windows 10 years ago			`int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 20 : 19;`
intensity: do not reduce throughput before init Else the memory allocated could be less than required later btw, use the new "cuda" function to apply intensity/throughput 9 years ago			`uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=2562568;`
attempt to reduce shared mem errors 8 years ago			`//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);`
v1.0 - Yo, I heard y'all like X11 11 years ago
bump to revision V1.1 with Killer Groestl 11 years ago			`if (opt_benchmark)`
simd: add support for SM 2.1 devices Add support for x11..x17, s3, fresh and qubit Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 9 years ago			`ptarget[7] = 0x5;`
v1.0 - Yo, I heard y'all like X11 11 years ago
			`if (!init[thr_id])`
			`{`
Prepare trap of hardware/mem failures 10 years ago			`cudaSetDevice(device_map[thr_id]);`
simd: add support for SM 2.1 devices Add support for x11..x17, s3, fresh and qubit Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 9 years ago			`if (opt_cudaschedule == -1 && gpu_threads == 1) {`
			`cudaDeviceReset();`
			`// reduce cpu usage`
			`cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);`
			`CUDA_LOG_ERROR();`
			`}`
Show intensity on init for all algos 8 years ago			`gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);`
cuda: check for errors on cuda mem alloc 10 years ago
v1.0 - Yo, I heard y'all like X11 11 years ago			`quark_blake512_cpu_init(thr_id, throughput);`
bmw512: indent and restore SM 3.0 compat could be also the source of the problem seen with CUDA 7 restored the code before sp/klaus changes for SM 3.0 devices... 10 years ago			`quark_bmw512_cpu_init(thr_id, throughput);`
v1.0 - Yo, I heard y'all like X11 11 years ago			`quark_groestl512_cpu_init(thr_id, throughput);`
			`quark_skein512_cpu_init(thr_id, throughput);`
			`quark_keccak512_cpu_init(thr_id, throughput);`
			`quark_jh512_cpu_init(thr_id, throughput);`
Luffa and simd merged to one kernal. Small echo rewrite. +10KHASH on the 650(compute 3.0) tpruvot: add Linux Makefile - Force to 80 registers (else -30KH/s) Note : the hashrate seems more constant with this change 10 years ago			`x11_luffaCubehash512_cpu_init(thr_id, throughput);`
v1.0 - Yo, I heard y'all like X11 11 years ago			`x11_shavite512_cpu_init(thr_id, throughput);`
			`x11_echo512_cpu_init(thr_id, throughput);`
Prepare trap of hardware/mem failures 10 years ago			`if (x11_simd512_cpu_init(thr_id, throughput) != 0) {`
			`return 0;`
			`}`
never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);`
Prepare trap of hardware/mem failures 10 years ago
Remove duplicated defines present in cuda_helper.h also add cudaDeviceReset() on Ctrl+C for nvprof 10 years ago			`cuda_check_cpu_init(thr_id, throughput);`
cuda: check for errors on cuda mem alloc 10 years ago
v1.0 - Yo, I heard y'all like X11 11 years ago			`init[thr_id] = true;`
			`}`

			`uint32_t endiandata[20];`
			`for (int k=0; k < 20; k++)`
remove uint32_t cast 10 years ago			`be32enc(&endiandata[k], pdata[k]);`
v1.0 - Yo, I heard y'all like X11 11 years ago
blake80: some changes and launch bounds, no perf changes 10 years ago			`quark_blake512_cpu_setBlock_80(thr_id, endiandata);`
Remove duplicated defines present in cuda_helper.h also add cudaDeviceReset() on Ctrl+C for nvprof 10 years ago			`cuda_check_cpu_setTarget(ptarget);`
v1.0 - Yo, I heard y'all like X11 11 years ago
			`do {`
			`int order = 0;`
x11: adapt some blake 256 opts to 512 one blake512: for the moment 6.2ms vs 7.12 before (+10%) 10 years ago			`uint32_t foundNonce;`
v1.0 - Yo, I heard y'all like X11 11 years ago
cuda: check for errors on cuda mem alloc 10 years ago			`// Hash with CUDA`
blake80: some changes and launch bounds, no perf changes 10 years ago			`quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;`
debug: x11 algo traces for cuda 7 problem 10 years ago			`TRACE("blake :");`
v1.0 - Yo, I heard y'all like X11 11 years ago			`quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);`
debug: x11 algo traces for cuda 7 problem 10 years ago			`TRACE("bmw :");`
v1.0 - Yo, I heard y'all like X11 11 years ago			`quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);`
debug: x11 algo traces for cuda 7 problem 10 years ago			`TRACE("groestl:");`
v1.0 - Yo, I heard y'all like X11 11 years ago			`quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);`
debug: x11 algo traces for cuda 7 problem 10 years ago			`TRACE("skein :");`
v1.0 - Yo, I heard y'all like X11 11 years ago			`quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);`
debug: x11 algo traces for cuda 7 problem 10 years ago			`TRACE("jh512 :");`
v1.0 - Yo, I heard y'all like X11 11 years ago			`quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);`
debug: x11 algo traces for cuda 7 problem 10 years ago			`TRACE("keccak :");`
x11: update sp luffa/cube to get closer x11 speeds.. i had to clean it... lot of unused defines... 10 years ago			`x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);`
debug: x11 algo traces for cuda 7 problem 10 years ago			`TRACE("luffa+c:");`
v1.0 - Yo, I heard y'all like X11 11 years ago			`x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);`
debug: x11 algo traces for cuda 7 problem 10 years ago			`TRACE("shavite:");`
v1.0 - Yo, I heard y'all like X11 11 years ago			`x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);`
debug: x11 algo traces for cuda 7 problem 10 years ago			`TRACE("simd :");`
v1.0 - Yo, I heard y'all like X11 11 years ago			`x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);`
update README, small changes, prepare release 1.6.1 still need a SM 3.0 fix for skein... 10 years ago			`TRACE("echo => ");`
v1.0 - Yo, I heard y'all like X11 11 years ago
checkhash: simplify the common function use klaus trivial function, the old code has always been a bit weird.. split cuda_check_cpu_hash_64 in two functions, keep old for branched stuff 10 years ago			`foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);`
Check and submit multiple nonces in one loop Added to most algos, checkhash function scans a big range and can find multiple nonces at once if the difficulty is low. Stop ignoring them, submit second one if found... Clean the draft code for rc=2 implemented for blake and pentablake btw... fix the reduced displayed hashrate when a nonce is found... Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`if (foundNonce != UINT32_MAX)`
v1.0 - Yo, I heard y'all like X11 11 years ago			`{`
Check and submit multiple nonces in one loop Added to most algos, checkhash function scans a big range and can find multiple nonces at once if the difficulty is low. Stop ignoring them, submit second one if found... Clean the draft code for rc=2 implemented for blake and pentablake btw... fix the reduced displayed hashrate when a nonce is found... Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`const uint32_t Htarg = ptarget[7];`
v1.0 - Yo, I heard y'all like X11 11 years ago			`uint32_t vhash64[8];`
			`be32enc(&endiandata[19], foundNonce);`
			`x11hash(vhash64, endiandata);`

Prepare multiple nonces support in one loop (if found) Tested on x11 which find sometimes 3 nonces in one call, actually they are ignored because only the biggest was kept... This commit doesnt fix that, but will allow to enhance shares rate later... 10 years ago			`if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {`
Check and submit multiple nonces in one loop Added to most algos, checkhash function scans a big range and can find multiple nonces at once if the difficulty is low. Stop ignoring them, submit second one if found... Clean the draft code for rc=2 implemented for blake and pentablake btw... fix the reduced displayed hashrate when a nonce is found... Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`int res = 1;`
			`// check if there was some other ones...`
			`uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);`
diff: use the new function in all algos 9 years ago			`work_set_target_ratio(work, vhash64);`
Prepare multiple nonces support in one loop (if found) Tested on x11 which find sometimes 3 nonces in one call, actually they are ignored because only the biggest was kept... This commit doesnt fix that, but will allow to enhance shares rate later... 10 years ago			`*hashes_done = pdata[19] - first_nonce + throughput;`
Check and submit multiple nonces in one loop Added to most algos, checkhash function scans a big range and can find multiple nonces at once if the difficulty is low. Stop ignoring them, submit second one if found... Clean the draft code for rc=2 implemented for blake and pentablake btw... fix the reduced displayed hashrate when a nonce is found... Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`if (secNonce != 0) {`
start v1.7, apply new prototypes to all algos 9 years ago			`be32enc(&endiandata[19], secNonce);`
			`x11hash(vhash64, endiandata);`
			`if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio)`
diff: use the new function in all algos 9 years ago			`work_set_target_ratio(work, vhash64);`
Check and submit multiple nonces in one loop Added to most algos, checkhash function scans a big range and can find multiple nonces at once if the difficulty is low. Stop ignoring them, submit second one if found... Clean the draft code for rc=2 implemented for blake and pentablake btw... fix the reduced displayed hashrate when a nonce is found... Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`pdata[21] = secNonce;`
			`res++;`
			`}`
v1.0 - Yo, I heard y'all like X11 11 years ago			`pdata[19] = foundNonce;`
Check and submit multiple nonces in one loop Added to most algos, checkhash function scans a big range and can find multiple nonces at once if the difficulty is low. Stop ignoring them, submit second one if found... Clean the draft code for rc=2 implemented for blake and pentablake btw... fix the reduced displayed hashrate when a nonce is found... Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`return res;`
warnings: use the right device id (device_map[thr_id]) 10 years ago			`} else {`
benchmark: enhance the mem leak detection reduce "false" warnings, and ignore unrelated/small ones <= 1 MB On windows the gpu memory can be allocated by other processes + some cleanup in algos... (free/gpulog) 9 years ago			`gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);`
nvml: separated vendor id to string function for the day nvidia will fix their nvmlDeviceGetPciInfo api.. 10 years ago			`pdata[19] = foundNonce + 1;`
never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`continue;`
v1.0 - Yo, I heard y'all like X11 11 years ago			`}`
			`}`

never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`if ((uint64_t) throughput + pdata[19] >= max_nonce) {`
			`pdata[19] = max_nonce;`
			`break;`
			`}`
v1.0 - Yo, I heard y'all like X11 11 years ago			`pdata[19] += throughput;`

never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`} while (!work_restart[thr_id].restart);`
v1.0 - Yo, I heard y'all like X11 11 years ago
never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`*hashes_done = pdata[19] - first_nonce;`
v1.0 - Yo, I heard y'all like X11 11 years ago			`return 0;`
			`}`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago
			`// cleanup`
			`extern "C" void free_x11(int thr_id)`
			`{`
			`if (!init[thr_id])`
			`return;`

benchmark: enhance the mem leak detection reduce "false" warnings, and ignore unrelated/small ones <= 1 MB On windows the gpu memory can be allocated by other processes + some cleanup in algos... (free/gpulog) 9 years ago			`cudaThreadSynchronize();`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago
			`cudaFree(d_hash[thr_id]);`

use blake512 sp kernels on SM 5+ (80+64) import and keep my code for older archs, like skein 64 reduce the gap between our versions... +150kH x11 GTX 960 / +30kH 750Ti +900kH quark GTX 960 / +230kH 750Ti 9 years ago			`quark_blake512_cpu_free(thr_id);`
algos: free allocated mem for algo switch All can be freed propertly now, except script (reset) and lyra2 (leak) 9 years ago			`quark_groestl512_cpu_free(thr_id);`
			`x11_simd512_cpu_free(thr_id);`

algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago			`cuda_check_cpu_free(thr_id);`
			`init[thr_id] = false;`

			`cudaDeviceSynchronize();`
benchmark: enhance the mem leak detection reduce "false" warnings, and ignore unrelated/small ones <= 1 MB On windows the gpu memory can be allocated by other processes + some cleanup in algos... (free/gpulog) 9 years ago			`}`