ccminer-gostd-lite/neoscrypt/neoscrypt.cpp

#include <cuda_runtime.h>
#include <string.h>
#include <miner.h>

#include "neoscrypt.h"

// Host entry points implemented outside this file (presumably in the CUDA
// part of the neoscrypt algo - TODO confirm location).

// Called once per scan, before the hashing loop, with the 20-word block
// header and the share target.
extern void neoscrypt_setBlockTarget(uint32_t* const data, uint32_t* const ptarget);

// Per-thread setup / teardown; neoscrypt_init receives the thread count
// (throughput) that is later passed to every neoscrypt_hash_k4 call.
extern void neoscrypt_init(int thr_id, uint32_t threads);
extern void neoscrypt_free(int thr_id);
// Hashes `threads` nonces starting at startNounce; resNonces is pre-filled
// with 0xff bytes by the caller and its first entry is read back as the
// candidate nonce (UINT32_MAX means none found).
extern void neoscrypt_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum);

// Per-thread "already initialised" flags, indexed by thr_id: set after
// neoscrypt_init() in scanhash_neoscrypt, cleared by free_neoscrypt.
static bool init[MAX_GPUS] = { 0 };

int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
	uint32_t _ALIGN(64) endiandata[20];
	uint32_t *pdata = work->data;
	uint32_t *ptarget = work->target;
	const uint32_t first_nonce = pdata[19];

	int dev_id = device_map[thr_id];
	int intensity = is_windows() ? 18 : 19;
	if (strstr(device_name[dev_id], "GTX 10")) intensity = 21; // >= 20 need more than 2GB

	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
	throughput = throughput / 32; /* set for max intensity ~= 20 */
	api_set_throughput(thr_id, throughput);

	if (opt_benchmark)
		ptarget[7] = 0x00ff;

	if (!init[thr_id])
	{
		cudaDeviceSynchronize();
		cudaSetDevice(dev_id);
		if (opt_cudaschedule == -1 && gpu_threads == 1) {
			cudaDeviceReset();
			// reduce cpu usage
			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
			cudaGetLastError(); // reset errors if device is not "reset"
		}
		if (device_sm[dev_id] <= 300) {
			gpulog(LOG_ERR, thr_id, "Sorry neoscrypt is not supported on SM 3.0 devices");
			proper_exit(EXIT_CODE_CUDA_ERROR);
		}
		gpulog(LOG_INFO, thr_id, "Intensity set to %g (+5), %u cuda threads", throughput2intensity(throughput), throughput);

		neoscrypt_init(thr_id, throughput);

		init[thr_id] = true;
	}

	if (have_stratum) {
		for (int k = 0; k < 20; k++)
			be32enc(&endiandata[k], pdata[k]);
	} else {
		for (int k = 0; k < 20; k++)
			endiandata[k] = pdata[k];
	}

	neoscrypt_setBlockTarget(endiandata,ptarget);

	do {
		memset(work->nonces, 0xff, sizeof(work->nonces));
		neoscrypt_hash_k4(thr_id, throughput, pdata[19], work->nonces, have_stratum);

		*hashes_done = pdata[19] - first_nonce + throughput;

		if (work->nonces[0] != UINT32_MAX)
		{
			const uint32_t Htarg = ptarget[7];
			uint32_t _ALIGN(64) vhash[8];

			if (have_stratum) {
				be32enc(&endiandata[19], work->nonces[0]);
			} else {
				endiandata[19] = work->nonces[0];
			}
			neoscrypt((uchar*)vhash, (uchar*) endiandata, 0x80000620U);

			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
				work->valid_nonces = 1;
				work_set_target_ratio(work, vhash);
				pdata[19] = work->nonces[0] + 1; // cursor
				return work->valid_nonces;
			}
			else if (vhash[7] > Htarg) {
				gpulog(LOG_WARNING, thr_id, "nonce %08x does not validate on CPU!", work->nonces[0]);
			}
		}

		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
			pdata[19] = max_nonce;
			break;
		}

		pdata[19] += throughput;

	} while (!work_restart[thr_id].restart);

	*hashes_done = pdata[19] - first_nonce;
	return 0;
}

// cleanup: release the neoscrypt resources of a thread.
// Does nothing when the thread was never initialised by scanhash_neoscrypt
// (or was already freed), so it is safe to call more than once.
void free_neoscrypt(int thr_id)
{
	if (!init[thr_id])
		return;

	// wait for pending GPU work before freeing; cudaDeviceSynchronize()
	// replaces the deprecated cudaThreadSynchronize()
	cudaDeviceSynchronize();

	neoscrypt_free(thr_id);
	init[thr_id] = false;

	cudaDeviceSynchronize();
}
neoscrypt: fixes for windows 9 years ago			`#include <cuda_runtime.h>`
neoscrypt: reduce spill load and increase pascal def intensity 1 MH/s reached on the 1070 ... 8 years ago			`#include <string.h>`
			`#include <miner.h>`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 9 years ago
neoscrypt: reduce spill load and increase pascal def intensity 1 MH/s reached on the 1070 ... 8 years ago			`#include "neoscrypt.h"`

			`extern void neoscrypt_setBlockTarget(uint32_t* const data, uint32_t* const ptarget);`
neoscrypt: apply last VTC improvements rewrote almost properly ;) 8 years ago
neoscrypt: remove warnings and rename host funcs also reduce the few errors on coins using shared mem 8 years ago			`extern void neoscrypt_init(int thr_id, uint32_t threads);`
			`extern void neoscrypt_free(int thr_id);`
			`extern void neoscrypt_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 9 years ago
neoscrypt: fixes for windows 9 years ago			`static bool init[MAX_GPUS] = { 0 };`

start v1.7, apply new prototypes to all algos 9 years ago			`int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 9 years ago			`{`
start v1.7, apply new prototypes to all algos 9 years ago			`uint32_t _ALIGN(64) endiandata[20];`
			`uint32_t *pdata = work->data;`
			`uint32_t *ptarget = work->target;`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 9 years ago			`const uint32_t first_nonce = pdata[19];`

benchmark: store all algos results + cuda fixes Note: lyra2, lyra2v2 and script seems to have problems to coexist with other algos... to run after some of them... moved lyra2 first and skip scrypt/jane for the moment... Only stored in memory for now.. to display a table after the bench ccminer -a auto --benchmark Results may be exported later to a json file... 9 years ago			`int dev_id = device_map[thr_id];`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 9 years ago			`int intensity = is_windows() ? 18 : 19;`
neoscrypt: remove warnings and rename host funcs also reduce the few errors on coins using shared mem 8 years ago			`if (strstr(device_name[dev_id], "GTX 10")) intensity = 21; // >= 20 need more than 2GB`
neoscrypt: reduce spill load and increase pascal def intensity 1 MH/s reached on the 1070 ... 8 years ago
intensity: do not reduce throughput before init Else the memory allocated could be less than required later btw, use the new "cuda" function to apply intensity/throughput 9 years ago			`uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 9 years ago			`throughput = throughput / 32; /* set for max intensity ~= 20 */`
warn on cuda errors + various small changes The full benchmark can now be launched with "ccminer --benchmark" add a new helper function which log a warning with last cuda error (not shown with the quiet option) : CUDA_LOG_ERROR(); it can be used where miner.h is included (.c/.cpp/.cu) fix x14 (in ccminer.cpp), a break was missing in switch..case 9 years ago			`api_set_throughput(thr_id, throughput);`

neoscrypt: cleanup... My SM 3.0 functions are ok but djm34 implementation uses too much registers for this arch... 9 years ago			`if (opt_benchmark)`
benchmark: store all algos results + cuda fixes Note: lyra2, lyra2v2 and script seems to have problems to coexist with other algos... to run after some of them... moved lyra2 first and skip scrypt/jane for the moment... Only stored in memory for now.. to display a table after the bench ccminer -a auto --benchmark Results may be exported later to a json file... 9 years ago			`ptarget[7] = 0x00ff;`
neoscrypt: cleanup... My SM 3.0 functions are ok but djm34 implementation uses too much registers for this arch... 9 years ago
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 9 years ago			`if (!init[thr_id])`
			`{`
benchmark: store all algos results + cuda fixes Note: lyra2, lyra2v2 and script seems to have problems to coexist with other algos... to run after some of them... moved lyra2 first and skip scrypt/jane for the moment... Only stored in memory for now.. to display a table after the bench ccminer -a auto --benchmark Results may be exported later to a json file... 9 years ago			`cudaDeviceSynchronize();`
neoscrypt: strip cuda_vectors.h useless functions fix build break for SM 3.0 (but neoscrypt is not yet compatible) 9 years ago			`cudaSetDevice(dev_id);`
neoscrypt: reduce a bit the cpu usage 8 years ago			`if (opt_cudaschedule == -1 && gpu_threads == 1) {`
			`cudaDeviceReset();`
			`// reduce cpu usage`
			`cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);`
			`cudaGetLastError(); // reset errors if device is not "reset"`
			`}`
neoscrypt: strip cuda_vectors.h useless functions fix build break for SM 3.0 (but neoscrypt is not yet compatible) 9 years ago			`if (device_sm[dev_id] <= 300) {`
neoscrypt: apply last VTC improvements rewrote almost properly ;) 8 years ago			`gpulog(LOG_ERR, thr_id, "Sorry neoscrypt is not supported on SM 3.0 devices");`
neoscrypt: strip cuda_vectors.h useless functions fix build break for SM 3.0 (but neoscrypt is not yet compatible) 9 years ago			`proper_exit(EXIT_CODE_CUDA_ERROR);`
			`}`
bench: skip the disabled whirlpoolx + veltor free + some missed/extra log things... 8 years ago			`gpulog(LOG_INFO, thr_id, "Intensity set to %g (+5), %u cuda threads", throughput2intensity(throughput), throughput);`
neoscrypt: strip cuda_vectors.h useless functions fix build break for SM 3.0 (but neoscrypt is not yet compatible) 9 years ago
neoscrypt: remove warnings and rename host funcs also reduce the few errors on coins using shared mem 8 years ago			`neoscrypt_init(thr_id, throughput);`
neoscrypt: cleanup... My SM 3.0 functions are ok but djm34 implementation uses too much registers for this arch... 9 years ago
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 9 years ago			`init[thr_id] = true;`
			`}`

neoscrypt: adapt for visual studio compat todo: fix SM 3.0 builds 9 years ago			`if (have_stratum) {`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 9 years ago			`for (int k = 0; k < 20; k++)`
neoscrypt: cleanup... My SM 3.0 functions are ok but djm34 implementation uses too much registers for this arch... 9 years ago			`be32enc(&endiandata[k], pdata[k]);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 9 years ago			`} else {`
			`for (int k = 0; k < 20; k++)`
			`endiandata[k] = pdata[k];`
			`}`

			`neoscrypt_setBlockTarget(endiandata,ptarget);`

			`do {`
migrate 2nd nonce storage of most algos This allow to keep pdata[19] as cursor between scans, and later, to sort them.. remains... heavy, scrypt, sia... 8 years ago			`memset(work->nonces, 0xff, sizeof(work->nonces));`
			`neoscrypt_hash_k4(thr_id, throughput, pdata[19], work->nonces, have_stratum);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 9 years ago
neoscrypt: apply last VTC improvements rewrote almost properly ;) 8 years ago			`*hashes_done = pdata[19] - first_nonce + throughput;`

migrate 2nd nonce storage of most algos This allow to keep pdata[19] as cursor between scans, and later, to sort them.. remains... heavy, scrypt, sia... 8 years ago			`if (work->nonces[0] != UINT32_MAX)`
neoscrypt: apply last VTC improvements rewrote almost properly ;) 8 years ago			`{`
migrate 2nd nonce storage of most algos This allow to keep pdata[19] as cursor between scans, and later, to sort them.. remains... heavy, scrypt, sia... 8 years ago			`const uint32_t Htarg = ptarget[7];`
neoscrypt: apply last VTC improvements rewrote almost properly ;) 8 years ago			`uint32_t _ALIGN(64) vhash[8];`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 9 years ago
neoscrypt: adapt for visual studio compat todo: fix SM 3.0 builds 9 years ago			`if (have_stratum) {`
migrate 2nd nonce storage of most algos This allow to keep pdata[19] as cursor between scans, and later, to sort them.. remains... heavy, scrypt, sia... 8 years ago			`be32enc(&endiandata[19], work->nonces[0]);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 9 years ago			`} else {`
migrate 2nd nonce storage of most algos This allow to keep pdata[19] as cursor between scans, and later, to sort them.. remains... heavy, scrypt, sia... 8 years ago			`endiandata[19] = work->nonces[0];`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 9 years ago			`}`
neoscrypt: apply last VTC improvements rewrote almost properly ;) 8 years ago			`neoscrypt((uchar*)vhash, (uchar*) endiandata, 0x80000620U);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 9 years ago
migrate 2nd nonce storage of most algos This allow to keep pdata[19] as cursor between scans, and later, to sort them.. remains... heavy, scrypt, sia... 8 years ago			`if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {`
			`work->valid_nonces = 1;`
neoscrypt: apply last VTC improvements rewrote almost properly ;) 8 years ago			`work_set_target_ratio(work, vhash);`
migrate 2nd nonce storage of most algos This allow to keep pdata[19] as cursor between scans, and later, to sort them.. remains... heavy, scrypt, sia... 8 years ago			`pdata[19] = work->nonces[0] + 1; // cursor`
			`return work->valid_nonces;`
			`}`
			`else if (vhash[7] > Htarg) {`
			`gpulog(LOG_WARNING, thr_id, "nonce %08x does not validate on CPU!", work->nonces[0]);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 9 years ago			`}`
			`}`

never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`if ((uint64_t)throughput + pdata[19] >= max_nonce) {`
			`pdata[19] = max_nonce;`
			`break;`
			`}`

add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 9 years ago			`pdata[19] += throughput;`

never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`} while (!work_restart[thr_id].restart);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 9 years ago
never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`*hashes_done = pdata[19] - first_nonce;`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 9 years ago			`return 0;`
			`}`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago
			`// cleanup`
			`void free_neoscrypt(int thr_id)`
			`{`
			`if (!init[thr_id])`
			`return;`

benchmark: enhance the mem leak detection reduce "false" warnings, and ignore unrelated/small ones <= 1 MB On windows the gpu memory can be allocated by other processes + some cleanup in algos... (free/gpulog) 9 years ago			`cudaThreadSynchronize();`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago
neoscrypt: remove warnings and rename host funcs also reduce the few errors on coins using shared mem 8 years ago			`neoscrypt_free(thr_id);`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago			`init[thr_id] = false;`

			`cudaDeviceSynchronize();`
benchmark: store all algos results + cuda fixes Note: lyra2, lyra2v2 and script seems to have problems to coexist with other algos... to run after some of them... moved lyra2 first and skip scrypt/jane for the moment... Only stored in memory for now.. to display a table after the bench ccminer -a auto --benchmark Results may be exported later to a json file... 9 years ago			`}`