ccminer-gostd-lite/lyra2/lyra2REv2.cu

extern "C" {
#include "sph/sph_blake.h"
#include "sph/sph_bmw.h"
#include "sph/sph_skein.h"
#include "sph/sph_keccak.h"
#include "sph/sph_cubehash.h"
#include "lyra2/Lyra2.h"
}

#include <miner.h>
#include <cuda_helper.h>

// Per-thread device buffers, indexed by thr_id. Both are allocated on the first
// scanhash_lyra2v2() call for a thread and released in free_lyra2v2().
// d_hash:   32 bytes per scanned nonce, passed to every hashing stage.
// d_matrix: scratch area handed to lyra2v2_cpu_init(); its per-nonce size depends on the SM version.
static uint64_t *d_hash[MAX_GPUS];
static uint64_t* d_matrix[MAX_GPUS];

// Hashing stages implemented outside this file (presumably the per-algorithm
// CUDA sources; confirm against the build). They are called in sequence from
// scanhash_lyra2v2(), all operating on the same d_hash buffer.
extern void blake256_cpu_init(int thr_id, uint32_t threads);
extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
extern void blake256_cpu_setBlock_80(uint32_t *pdata);
extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void keccak256_cpu_init(int thr_id, uint32_t threads);
extern void keccak256_cpu_free(int thr_id);
extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void skein256_cpu_init(int thr_id, uint32_t threads);
extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order);

// Lyra2 stage; lyra2v2_cpu_init() receives the d_matrix buffer allocated by the caller.
extern void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void lyra2v2_cpu_init(int thr_id, uint32_t threads, uint64_t* d_matrix);

// Final bmw256 stage: takes the target via bmw256_setTarget() and, unlike the
// other stages, is given a result buffer (resultnonces) instead of an order index.
extern void bmw256_setTarget(const void *ptarget);
extern void bmw256_cpu_init(int thr_id, uint32_t threads);
extern void bmw256_cpu_free(int thr_id);
extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces);

/**
 * CPU implementation of the lyra2v2 hash chain, used to re-check nonces
 * reported by the GPU.
 *
 * Stages, in order: blake256 (14 rounds) -> keccak256 -> cubehash256 ->
 * LYRA2 -> skein256 -> cubehash256 -> bmw256. Every stage after the first
 * consumes the 32-byte output of the previous one.
 *
 * @param state  destination; receives the final 32-byte digest
 * @param input  source data; exactly 80 bytes are read from it
 */
void lyra2v2_hash(void *state, const void *input)
{
	// Two 32-byte scratch digests used alternately as stage input and output.
	uint32_t ping[8];
	uint32_t pong[8];

	// Stage 1: blake256 over the 80-byte input, with the round count forced to 14.
	{
		sph_blake256_context blake;
		sph_blake256_set_rounds(14);
		sph_blake256_init(&blake);
		sph_blake256(&blake, input, 80);
		sph_blake256_close(&blake, ping);
	}

	// Stage 2: keccak256 (ping -> pong).
	{
		sph_keccak256_context keccak;
		sph_keccak256_init(&keccak);
		sph_keccak256(&keccak, ping, 32);
		sph_keccak256_close(&keccak, pong);
	}

	// Stage 3: first cubehash256 pass (pong -> ping).
	{
		sph_cubehash256_context cube;
		sph_cubehash256_init(&cube);
		sph_cubehash256(&cube, pong, 32);
		sph_cubehash256_close(&cube, ping);
	}

	// Stage 4: LYRA2 writes into pong; ping is passed for both 32-byte input arguments.
	LYRA2(pong, 32, ping, 32, ping, 32, 1, 4, 4);

	// Stage 5: skein256 (pong -> ping).
	{
		sph_skein256_context skein;
		sph_skein256_init(&skein);
		sph_skein256(&skein, pong, 32);
		sph_skein256_close(&skein, ping);
	}

	// Stage 6: second cubehash256 pass (ping -> pong).
	{
		sph_cubehash256_context cube;
		sph_cubehash256_init(&cube);
		sph_cubehash256(&cube, ping, 32);
		sph_cubehash256_close(&cube, pong);
	}

	// Stage 7: bmw256 (pong -> ping) produces the final digest.
	{
		sph_bmw256_context bmw;
		sph_bmw256_init(&bmw);
		sph_bmw256(&bmw, pong, 32);
		sph_bmw256_close(&bmw, ping);
	}

	memcpy(state, ping, 32);
}

// Per-thread flag, indexed by thr_id: set to true at the end of the first-call
// setup in scanhash_lyra2v2() and cleared again by free_lyra2v2().
static bool init[MAX_GPUS] = { 0 };

/**
 * Scan a range of nonces for the lyra2v2 algorithm on the GPU mapped to thr_id.
 *
 * On the first call for a thread (init[thr_id] false) the device is selected,
 * the blake/keccak/skein/bmw stages are initialised and d_matrix / d_hash are
 * allocated. Each loop iteration then runs
 *   blake256 -> keccak256 -> cubehash256 -> lyra2v2 -> skein256 -> cubehash256 -> bmw256
 * over `throughput` nonces starting at pdata[19]. A non-zero work->nonces[0]
 * after the bmw256 stage is treated as a candidate and re-hashed on the CPU
 * with lyra2v2_hash() before being accepted.
 *
 * @param thr_id       index into the per-thread tables (device_map, init, d_hash, d_matrix, work_restart)
 * @param work         work item; work->data[19] is the nonce cursor, work->target the target,
 *                     work->nonces / work->valid_nonces receive the results
 * @param max_nonce    upper bound of the range to scan
 * @param hashes_done  out: number of nonces covered, relative to the nonce at entry
 * @return work->valid_nonces (1 or 2) when the first candidate validates on the CPU,
 *         0 when the loop ends without an accepted nonce
 */
extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
	uint32_t *pdata = work->data;
	uint32_t *ptarget = work->target;
	const uint32_t first_nonce = pdata[19];
	int dev_id = device_map[thr_id];
	// Default intensity: 18 when device_sm < 500, otherwise 19 on Windows and 20 elsewhere.
	int intensity = (device_sm[dev_id] < 500) ? 18 : is_windows() ? 19 : 20;
	// Devices whose name contains "GTX 10" always default to 20.
	if (strstr(device_name[dev_id], "GTX 10")) intensity = 20;
	uint32_t throughput = cuda_default_throughput(dev_id, 1UL << intensity);
	// Clamp to the remaining range only after init, so the first-call
	// allocations below are sized with the unclamped throughput.
	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);

	// Benchmark mode overrides the high target word.
	if (opt_benchmark)
		ptarget[7] = 0x000f;

	if (!init[thr_id])
	{
		// Per-nonce matrix size in bytes (default case; enlarged below for older SM versions).
		size_t matrix_sz = 16 * sizeof(uint64_t) * 4 * 3;
		cudaSetDevice(dev_id);
		if (opt_cudaschedule == -1 && gpu_threads == 1) {
			cudaDeviceReset();
			// reduce cpu usage
			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
			CUDA_LOG_ERROR();
		}
		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);

		blake256_cpu_init(thr_id, throughput);
		keccak256_cpu_init(thr_id,throughput);
		skein256_cpu_init(thr_id, throughput);
		bmw256_cpu_init(thr_id, throughput);

		// SM 3 implementation requires a bit more memory
		// (4 * 4 instead of 4 * 3 in the per-nonce size, when either device_sm or cuda_arch is below 500)
		if (device_sm[dev_id] < 500 || cuda_arch[dev_id] < 500)
			matrix_sz = 16 * sizeof(uint64_t) * 4 * 4;
			
		CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput));
		lyra2v2_cpu_init(thr_id, throughput, d_matrix[thr_id]);

		// 32 bytes of hash state per nonce, shared by all stages.
		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));

		api_set_throughput(thr_id, throughput);
		init[thr_id] = true;
	}

	// Big-endian copy of the 20 header words; only used for the CPU re-hash below.
	uint32_t endiandata[20];
	for (int k=0; k < 20; k++)
		be32enc(&endiandata[k], pdata[k]);

	// Note: the GPU side is given the raw pdata, not endiandata.
	blake256_cpu_setBlock_80(pdata);
	bmw256_setTarget(ptarget);

	do {
		int order = 0;

		// Each stage processes `throughput` nonces starting at pdata[19], in place on d_hash.
		blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
		keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
		cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
		lyra2v2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
		skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
		cubehash256_cpu_hash_32(thr_id, throughput,pdata[19], d_hash[thr_id], order++);

		// Clear the result slots first: a zero entry means "no candidate" in the checks below.
		memset(work->nonces, 0, sizeof(work->nonces));
		bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], work->nonces);

		// Progress so far, counting the batch that was just hashed.
		*hashes_done = pdata[19] - first_nonce + throughput;

		if (work->nonces[0] != 0)
		{
			const uint32_t Htarg = ptarget[7];
			uint32_t _ALIGN(64) vhash[8];
			// Re-hash the candidate on the CPU before trusting the GPU result.
			be32enc(&endiandata[19], work->nonces[0]);
			lyra2v2_hash(vhash, endiandata);

			// Cheap comparison on the high word first, then the full target test.
			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
				work->valid_nonces = 1;
				work_set_target_ratio(work, vhash);
				if (work->nonces[1] != 0) {
					// NOTE(review): the second nonce is hashed for its target ratio but is
					// not compared against the target here before being counted as valid.
					be32enc(&endiandata[19], work->nonces[1]);
					lyra2v2_hash(vhash, endiandata);
					bn_set_target_ratio(work, vhash, 1);
					work->valid_nonces++;
					// Resume after the higher of the two reported nonces.
					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
				} else {
					pdata[19] = work->nonces[0] + 1; // cursor
				}
				return work->valid_nonces;
			}
			else if (vhash[7] > Htarg) {
				// CPU hash is above the target's high word: count a reject and rescan
				// from just past the bad nonce. `continue` in a do-while jumps to the
				// loop condition, so the max_nonce check below is skipped on this path.
				gpu_increment_reject(thr_id);
				// NOTE(review): only the gpulog() call is guarded by opt_quiet; the
				// indentation below is misleading.
				if (!opt_quiet)
				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
				pdata[19] = work->nonces[0] + 1;
				continue;
			}
			// Remaining case (high word passes but fulltest() fails): fall through
			// and advance by a full batch like a scan without a candidate.
		}

		// The sum is computed in 64 bits so it cannot wrap; stop at the end of the range.
		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
			pdata[19] = max_nonce;
			break;
		}
		pdata[19] += throughput;

	} while (!work_restart[thr_id].restart && !abort_flag);

	// No accepted nonce: report the distance the cursor moved.
	*hashes_done = pdata[19] - first_nonce;
	return 0;
}

/**
 * Cleanup: release the GPU resources held for one mining thread.
 *
 * Does nothing when the thread was never initialised (init[thr_id] false).
 * Otherwise waits for outstanding device work, frees d_hash / d_matrix,
 * lets the bmw256 and keccak256 stages free their own resources, and
 * clears init[thr_id] so the next scanhash_lyra2v2() call re-initialises.
 *
 * @param thr_id  index into the per-thread tables (init, d_hash, d_matrix)
 */
extern "C" void free_lyra2v2(int thr_id)
{
	if (!init[thr_id])
		return;

	// cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its
	// documented replacement.
	cudaDeviceSynchronize();

	cudaFree(d_hash[thr_id]);
	cudaFree(d_matrix[thr_id]);
	// Do not leave dangling device pointers behind in the static tables.
	d_hash[thr_id] = NULL;
	d_matrix[thr_id] = NULL;

	bmw256_cpu_free(thr_id);
	keccak256_cpu_free(thr_id);

	init[thr_id] = false;

	cudaDeviceSynchronize();
}
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago			`extern "C" {`
			`#include "sph/sph_blake.h"`
			`#include "sph/sph_bmw.h"`
			`#include "sph/sph_skein.h"`
			`#include "sph/sph_keccak.h"`
			`#include "sph/sph_cubehash.h"`
			`#include "lyra2/Lyra2.h"`
			`}`

lyra2: remove old debug traces 8 years ago			`#include <miner.h>`
			`#include <cuda_helper.h>`
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago
lyra2v2: set a better TPB for intensity 20 (sm52) use sp forced unroll in skein and do some cleanup... 9 years ago			`static uint64_t *d_hash[MAX_GPUS];`
lyra2v2: increase default intensity to be able to say, like sp, that its faster :p 9 years ago			`static uint64_t* d_matrix[MAX_GPUS];`
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago
			`extern void blake256_cpu_init(int thr_id, uint32_t threads);`
			`extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);`
			`extern void blake256_cpu_setBlock_80(uint32_t *pdata);`
			`extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);`
			`extern void keccak256_cpu_init(int thr_id, uint32_t threads);`
algos: free allocated mem for algo switch All can be freed propertly now, except script (reset) and lyra2 (leak) 9 years ago			`extern void keccak256_cpu_free(int thr_id);`
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago			`extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);`
			`extern void skein256_cpu_init(int thr_id, uint32_t threads);`
lyra2v2, bmw256 and cubehash256 cleanup + diff fix 9 years ago			`extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order);`
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago
			`extern void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);`
lyra2v2: increase default intensity to be able to say, like sp, that its faster :p 9 years ago			`extern void lyra2v2_cpu_init(int thr_id, uint32_t threads, uint64_t* d_matrix);`
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago
			`extern void bmw256_setTarget(const void *ptarget);`
			`extern void bmw256_cpu_init(int thr_id, uint32_t threads);`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago			`extern void bmw256_cpu_free(int thr_id);`
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago			`extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces);`

			`void lyra2v2_hash(void *state, const void *input)`
			`{`
			`uint32_t hashA[8], hashB[8];`

			`sph_blake256_context ctx_blake;`
			`sph_keccak256_context ctx_keccak;`
			`sph_skein256_context ctx_skein;`
			`sph_bmw256_context ctx_bmw;`
			`sph_cubehash256_context ctx_cube;`

blake: change dynamic round system blakecoin was conflicting with lyra2, set the rounds more properly 9 years ago			`sph_blake256_set_rounds(14);`

Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago			`sph_blake256_init(&ctx_blake);`
			`sph_blake256(&ctx_blake, input, 80);`
			`sph_blake256_close(&ctx_blake, hashA);`

			`sph_keccak256_init(&ctx_keccak);`
			`sph_keccak256(&ctx_keccak, hashA, 32);`
			`sph_keccak256_close(&ctx_keccak, hashB);`

			`sph_cubehash256_init(&ctx_cube);`
			`sph_cubehash256(&ctx_cube, hashB, 32);`
			`sph_cubehash256_close(&ctx_cube, hashA);`

			`LYRA2(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4);`

			`sph_skein256_init(&ctx_skein);`
			`sph_skein256(&ctx_skein, hashB, 32);`
			`sph_skein256_close(&ctx_skein, hashA);`

			`sph_cubehash256_init(&ctx_cube);`
			`sph_cubehash256(&ctx_cube, hashA, 32);`
			`sph_cubehash256_close(&ctx_cube, hashB);`

			`sph_bmw256_init(&ctx_bmw);`
			`sph_bmw256(&ctx_bmw, hashB, 32);`
			`sph_bmw256_close(&ctx_bmw, hashA);`

			`memcpy(state, hashA, 32);`
			`}`

			`static bool init[MAX_GPUS] = { 0 };`

Add --show-diff parameter and fix pool net diff which display submitted block and net difficulty and is able to detect shares above net diff (solved blocs) Note: only made on lyra2v2 and zr5 algos TODO: compute the found diff on all algos... require changes in all scan hash "kernel" function parameters to be continued... 9 years ago			`extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)`
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago			`{`
Add --show-diff parameter and fix pool net diff which display submitted block and net difficulty and is able to detect shares above net diff (solved blocs) Note: only made on lyra2v2 and zr5 algos TODO: compute the found diff on all algos... require changes in all scan hash "kernel" function parameters to be continued... 9 years ago			`uint32_t *pdata = work->data;`
			`uint32_t *ptarget = work->target;`
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago			`const uint32_t first_nonce = pdata[19];`
lyra2v2: increase default intensity to be able to say, like sp, that its faster :p 9 years ago			`int dev_id = device_map[thr_id];`
lyra2v2: update credits, increase SM 5.0 default int to 19 + small klausT cleanup.. 9 years ago			`int intensity = (device_sm[dev_id] < 500) ? 18 : is_windows() ? 19 : 20;`
windows: some default intensity adjustments 8 years ago			`if (strstr(device_name[dev_id], "GTX 10")) intensity = 20;`
windows: add support for SM 2.1, drop SM 3.5 (x86) Mostly to do compatibilty tests, SM 2.1 support is very limited SM 3.0 code should run on SM 3.5 (only a few cards use this arch) As i can't test SM 3.5, its best to let users do their own tests... 9 years ago			`uint32_t throughput = cuda_default_throughput(dev_id, 1UL << intensity);`
intensity: do not reduce throughput before init Else the memory allocated could be less than required later btw, use the new "cuda" function to apply intensity/throughput 9 years ago			`if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);`
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago
			`if (opt_benchmark)`
lyra2v2: set a better TPB for intensity 20 (sm52) use sp forced unroll in skein and do some cleanup... 9 years ago			`ptarget[7] = 0x000f;`
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago
			`if (!init[thr_id])`
			`{`
lyra2v2: fix SM 3.5 support May work also on SM 3.0 (to check) 9 years ago			`size_t matrix_sz = 16 * sizeof(uint64_t) * 4 * 3;`
benchmark: store all algos results + cuda fixes Note: lyra2, lyra2v2 and script seems to have problems to coexist with other algos... to run after some of them... moved lyra2 first and skip scrypt/jane for the moment... Only stored in memory for now.. to display a table after the bench ccminer -a auto --benchmark Results may be exported later to a json file... 9 years ago			`cudaSetDevice(dev_id);`
lyra2v2: fix SM 3.5 support May work also on SM 3.0 (to check) 9 years ago			`if (opt_cudaschedule == -1 && gpu_threads == 1) {`
			`cudaDeviceReset();`
			`// reduce cpu usage`
			`cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);`
never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`CUDA_LOG_ERROR();`
lyra2v2: fix SM 3.5 support May work also on SM 3.0 (to check) 9 years ago			`}`
Show intensity on init for all algos 8 years ago			`gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);`
benchmark: store all algos results + cuda fixes Note: lyra2, lyra2v2 and script seems to have problems to coexist with other algos... to run after some of them... moved lyra2 first and skip scrypt/jane for the moment... Only stored in memory for now.. to display a table after the bench ccminer -a auto --benchmark Results may be exported later to a json file... 9 years ago
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago			`blake256_cpu_init(thr_id, throughput);`
			`keccak256_cpu_init(thr_id,throughput);`
			`skein256_cpu_init(thr_id, throughput);`
			`bmw256_cpu_init(thr_id, throughput);`

lyra2v2: fix SM 3.5 support May work also on SM 3.0 (to check) 9 years ago			`// SM 3 implentation requires a bit more memory`
			`if (device_sm[dev_id] < 500 || cuda_arch[dev_id] < 500)`
			`matrix_sz = 16 * sizeof(uint64_t) * 4 * 4;`

			`CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput));`
lyra2v2: increase default intensity to be able to say, like sp, that its faster :p 9 years ago			`lyra2v2_cpu_init(thr_id, throughput, d_matrix[thr_id]);`
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago
lyra2v2: increase default intensity to be able to say, like sp, that its faster :p 9 years ago			`CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));`
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago
various changes, cleanup for the release small fixes to handle better the multi thread per gpu explicitly report than quark is not compatible with SM 2.1 (compact shuffle) 9 years ago			`api_set_throughput(thr_id, throughput);`
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago			`init[thr_id] = true;`
			`}`

			`uint32_t endiandata[20];`
			`for (int k=0; k < 20; k++)`
windows: add support for SM 2.1, drop SM 3.5 (x86) Mostly to do compatibilty tests, SM 2.1 support is very limited SM 3.0 code should run on SM 3.5 (only a few cards use this arch) As i can't test SM 3.5, its best to let users do their own tests... 9 years ago			`be32enc(&endiandata[k], pdata[k]);`
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago
			`blake256_cpu_setBlock_80(pdata);`
			`bmw256_setTarget(ptarget);`

			`do {`
			`int order = 0;`

			`blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);`
			`keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);`
lyra2v2, bmw256 and cubehash256 cleanup + diff fix 9 years ago			`cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);`
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago			`lyra2v2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);`
			`skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);`
lyra2v2, bmw256 and cubehash256 cleanup + diff fix 9 years ago			`cubehash256_cpu_hash_32(thr_id, throughput,pdata[19], d_hash[thr_id], order++);`
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago
migrate 2nd nonce storage of most algos This allow to keep pdata[19] as cursor between scans, and later, to sort them.. remains... heavy, scrypt, sia... 8 years ago			`memset(work->nonces, 0, sizeof(work->nonces));`
			`bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], work->nonces);`
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago
lyra2v2: increase default intensity to be able to say, like sp, that its faster :p 9 years ago			`*hashes_done = pdata[19] - first_nonce + throughput;`

migrate 2nd nonce storage of most algos This allow to keep pdata[19] as cursor between scans, and later, to sort them.. remains... heavy, scrypt, sia... 8 years ago			`if (work->nonces[0] != 0)`
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago			`{`
migrate 2nd nonce storage of most algos This allow to keep pdata[19] as cursor between scans, and later, to sort them.. remains... heavy, scrypt, sia... 8 years ago			`const uint32_t Htarg = ptarget[7];`
			`uint32_t _ALIGN(64) vhash[8];`
			`be32enc(&endiandata[19], work->nonces[0]);`
			`lyra2v2_hash(vhash, endiandata);`

			`if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {`
			`work->valid_nonces = 1;`
			`work_set_target_ratio(work, vhash);`
			`if (work->nonces[1] != 0) {`
			`be32enc(&endiandata[19], work->nonces[1]);`
			`lyra2v2_hash(vhash, endiandata);`
			`bn_set_target_ratio(work, vhash, 1);`
			`work->valid_nonces++;`
			`pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;`
			`} else {`
			`pdata[19] = work->nonces[0] + 1; // cursor`
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago			`}`
migrate 2nd nonce storage of most algos This allow to keep pdata[19] as cursor between scans, and later, to sort them.. remains... heavy, scrypt, sia... 8 years ago			`return work->valid_nonces;`
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago			`}`
migrate 2nd nonce storage of most algos This allow to keep pdata[19] as cursor between scans, and later, to sort them.. remains... heavy, scrypt, sia... 8 years ago			`else if (vhash[7] > Htarg) {`
api: report per thread cpu hash checks (ACC/REJ) + update all algos for that... 8 years ago			`gpu_increment_reject(thr_id);`
			`if (!opt_quiet)`
migrate 2nd nonce storage of most algos This allow to keep pdata[19] as cursor between scans, and later, to sort them.. remains... heavy, scrypt, sia... 8 years ago			`gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);`
			`pdata[19] = work->nonces[0] + 1;`
			`continue;`
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago			`}`
			`}`

never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`if ((uint64_t)throughput + pdata[19] >= max_nonce) {`
			`pdata[19] = max_nonce;`
			`break;`
			`}`
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago			`pdata[19] += throughput;`

lyra2v2: add support for SM 2.1 devices and improve a bit SM 3 perf 9 years ago			`} while (!work_restart[thr_id].restart && !abort_flag);`
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago
never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`*hashes_done = pdata[19] - first_nonce;`
Import and adapt lyra2v2 not tested on windows and with SM <= 5 9 years ago			`return 0;`
			`}`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago
			`// cleanup`
			`extern "C" void free_lyra2v2(int thr_id)`
			`{`
			`if (!init[thr_id])`
			`return;`

lyra2: improve cuda implementation (part 1, SM5+) based on the new djm34 method, 2x faster than first version cleaned and tuned for the GTX 750/960 (linux / cuda 6.5) 9 years ago			`cudaThreadSynchronize();`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago
			`cudaFree(d_hash[thr_id]);`
lyra2v2: increase default intensity to be able to say, like sp, that its faster :p 9 years ago			`cudaFree(d_matrix[thr_id]);`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago
			`bmw256_cpu_free(thr_id);`
algos: free allocated mem for algo switch All can be freed propertly now, except script (reset) and lyra2 (leak) 9 years ago			`keccak256_cpu_free(thr_id);`

algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago			`init[thr_id] = false;`

			`cudaDeviceSynchronize();`
lyra2v2: increase default intensity to be able to say, like sp, that its faster :p 9 years ago			`}`