ccminer/neoscrypt/neoscrypt.cpp

#include <cuda_runtime.h>
#include <string.h>
#include <miner.h>

#include "neoscrypt.h"

// Defined outside this file (presumably in the neoscrypt CUDA source - confirm).
// scanhash_neoscrypt() passes it the 20-word header copy and the work target once per call.
extern void neoscrypt_setBlockTarget(uint32_t* const data, uint32_t* const ptarget);

// Per-thread setup, teardown and one GPU scan pass; their definitions are not visible here.
extern void neoscrypt_init(int thr_id, uint32_t threads);
extern void neoscrypt_free(int thr_id);
extern void neoscrypt_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum);

// One flag per miner thread: set after neoscrypt_init() has run, cleared by free_neoscrypt().
static bool init[MAX_GPUS] = { 0 };

/**
 * Scan a nonce range for a neoscrypt solution on the device mapped to thr_id.
 *
 * Calls neoscrypt_hash_k4() on successive batches of `throughput` nonces starting at
 * work->data[19]. Any candidate reported in foundNonces[0] is re-hashed on the CPU with
 * neoscrypt() and checked against the target. The loop ends when a candidate validates,
 * when the next batch would reach max_nonce, or when work_restart[thr_id].restart is set.
 *
 * @param thr_id      miner thread index; selects the device through device_map[] and the
 *                    init[] slot used for one-time setup
 * @param work        work item: data[0..19] is the header (data[19] is the nonce and is
 *                    updated in place), target[] is the target (target[7] is overwritten
 *                    when opt_benchmark is set)
 * @param max_nonce   upper bound of the nonce range to scan
 * @param hashes_done out: number of nonces covered by this call
 * @return 1 when a CPU-validated nonce has been stored in work->data[19], otherwise 0
 */
int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
	uint32_t _ALIGN(64) endiandata[20];
	uint32_t *pdata = work->data;
	uint32_t *ptarget = work->target;
	const uint32_t first_nonce = pdata[19];

	int dev_id = device_map[thr_id];
	int intensity = is_windows() ? 18 : 19;
	if (strstr(device_name[dev_id], "GTX 10")) intensity = 21; // >= 20 need more than 2GB

	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
	throughput = throughput / 32; /* set for max intensity ~= 20 */
	api_set_throughput(thr_id, throughput);

	if (opt_benchmark)
		ptarget[7] = 0x00ff;

	// One-time setup for this miner thread
	if (!init[thr_id])
	{
		cudaDeviceSynchronize();
		cudaSetDevice(dev_id);
		if (opt_cudaschedule == -1 && gpu_threads == 1) {
			cudaDeviceReset();
			// reduce cpu usage
			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
			cudaGetLastError(); // reset errors if device is not "reset"
		}
		// "(+5)" presumably reflects the division by 32 (2^5) applied to throughput above
		gpulog(LOG_INFO, thr_id, "Intensity set to %g (+5), %u cuda threads", throughput2intensity(throughput), throughput);

		if (device_sm[dev_id] <= 300) {
			gpulog(LOG_ERR, thr_id, "Sorry neoscrypt is not supported on SM 3.0 devices");
			proper_exit(EXIT_CODE_CUDA_ERROR);
		}

		// throughput is a uint32_t, so it is printed with %u (was %d)
		gpulog(LOG_INFO, thr_id, "Using %u cuda threads", throughput);
		neoscrypt_init(thr_id, throughput);

		init[thr_id] = true;
	}

	// Copy the 20 header words; with stratum each word goes through be32enc(),
	// otherwise the words are copied unchanged
	if (have_stratum) {
		for (int k = 0; k < 20; k++)
			be32enc(&endiandata[k], pdata[k]);
	} else {
		for (int k = 0; k < 20; k++)
			endiandata[k] = pdata[k];
	}

	neoscrypt_setBlockTarget(endiandata,ptarget);

	do {
		// UINT32_MAX marks "no candidate"; only foundNonces[0] is examined below
		uint32_t foundNonces[2] = { UINT32_MAX, UINT32_MAX };
		neoscrypt_hash_k4(thr_id, throughput, pdata[19], foundNonces, have_stratum);

		// Count the batch just scanned, in case we return from inside the loop
		*hashes_done = pdata[19] - first_nonce + throughput;

		if (foundNonces[0] != UINT32_MAX)
		{
			uint32_t _ALIGN(64) vhash[8];

			// Put the candidate nonce into the header copy using the same encoding as above
			if (have_stratum) {
				be32enc(&endiandata[19], foundNonces[0]);
			} else {
				endiandata[19] = foundNonces[0];
			}
			// Recompute the hash on the CPU before accepting the GPU result
			neoscrypt((uchar*)vhash, (uchar*) endiandata, 0x80000620U);

			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
				work_set_target_ratio(work, vhash);
				pdata[19] = foundNonces[0];
				return 1;
			} else {
				gpulog(LOG_WARNING, thr_id, "nonce %08x does not validate on CPU!", foundNonces[0]);
			}
		}

		// 64-bit sum so the range check cannot wrap around UINT32_MAX
		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
			pdata[19] = max_nonce;
			break;
		}

		pdata[19] += throughput;

	} while (!work_restart[thr_id].restart);

	*hashes_done = pdata[19] - first_nonce;
	return 0;
}

/**
 * Cleanup: release the neoscrypt resources of one miner thread.
 *
 * Does nothing when init[thr_id] is not set. Otherwise it synchronizes, calls
 * neoscrypt_free(), clears the init flag so that scanhash_neoscrypt() runs its
 * one-time setup again on the next call, and synchronizes once more.
 *
 * @param thr_id miner thread index whose resources are released
 */
void free_neoscrypt(int thr_id)
{
	if (!init[thr_id])
		return;

	// cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its documented replacement
	cudaDeviceSynchronize();

	neoscrypt_free(thr_id);
	init[thr_id] = false;

	cudaDeviceSynchronize();
}
neoscrypt: fixes for windows 10 years ago			`#include <cuda_runtime.h>`
neoscrypt: reduce spill load and increase pascal def intensity 1 MH/s reached on the 1070 ... 9 years ago			`#include <string.h>`
			`#include <miner.h>`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago
neoscrypt: reduce spill load and increase pascal def intensity 1 MH/s reached on the 1070 ... 9 years ago			`#include "neoscrypt.h"`

			`extern void neoscrypt_setBlockTarget(uint32_t* const data, uint32_t* const ptarget);`
neoscrypt: apply last VTC improvements rewrote almost properly ;) 9 years ago
neoscrypt: remove warnings and rename host funcs also reduce the few errors on coins using shared mem 9 years ago			`extern void neoscrypt_init(int thr_id, uint32_t threads);`
			`extern void neoscrypt_free(int thr_id);`
			`extern void neoscrypt_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago
neoscrypt: fixes for windows 10 years ago			`static bool init[MAX_GPUS] = { 0 };`

start v1.7, apply new prototypes to all algos 9 years ago			`int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`{`
start v1.7, apply new prototypes to all algos 9 years ago			`uint32_t _ALIGN(64) endiandata[20];`
			`uint32_t *pdata = work->data;`
			`uint32_t *ptarget = work->target;`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`const uint32_t first_nonce = pdata[19];`

benchmark: store all algos results + cuda fixes Note: lyra2, lyra2v2 and script seems to have problems to coexist with other algos... to run after some of them... moved lyra2 first and skip scrypt/jane for the moment... Only stored in memory for now.. to display a table after the bench ccminer -a auto --benchmark Results may be exported later to a json file... 9 years ago			`int dev_id = device_map[thr_id];`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`int intensity = is_windows() ? 18 : 19;`
neoscrypt: remove warnings and rename host funcs also reduce the few errors on coins using shared mem 9 years ago			`if (strstr(device_name[dev_id], "GTX 10")) intensity = 21; // >= 20 need more than 2GB`
neoscrypt: reduce spill load and increase pascal def intensity 1 MH/s reached on the 1070 ... 9 years ago
intensity: do not reduce throughput before init Else the memory allocated could be less than required later btw, use the new "cuda" function to apply intensity/throughput 9 years ago			`uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`throughput = throughput / 32; /* set for max intensity ~= 20 */`
warn on cuda errors + various small changes The full benchmark can now be launched with "ccminer --benchmark" add a new helper function which log a warning with last cuda error (not shown with the quiet option) : CUDA_LOG_ERROR(); it can be used where miner.h is included (.c/.cpp/.cu) fix x14 (in ccminer.cpp), a break was missing in switch..case 9 years ago			`api_set_throughput(thr_id, throughput);`

neoscrypt: cleanup... My SM 3.0 functions are ok but djm34 implementation uses too much registers for this arch... 10 years ago			`if (opt_benchmark)`
benchmark: store all algos results + cuda fixes Note: lyra2, lyra2v2 and script seems to have problems to coexist with other algos... to run after some of them... moved lyra2 first and skip scrypt/jane for the moment... Only stored in memory for now.. to display a table after the bench ccminer -a auto --benchmark Results may be exported later to a json file... 9 years ago			`ptarget[7] = 0x00ff;`
neoscrypt: cleanup... My SM 3.0 functions are ok but djm34 implementation uses too much registers for this arch... 10 years ago
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`if (!init[thr_id])`
			`{`
benchmark: store all algos results + cuda fixes Note: lyra2, lyra2v2 and script seems to have problems to coexist with other algos... to run after some of them... moved lyra2 first and skip scrypt/jane for the moment... Only stored in memory for now.. to display a table after the bench ccminer -a auto --benchmark Results may be exported later to a json file... 9 years ago			`cudaDeviceSynchronize();`
neoscrypt: strip cuda_vectors.h useless functions fix build break for SM 3.0 (but neoscrypt is not yet compatible) 10 years ago			`cudaSetDevice(dev_id);`
neoscrypt: reduce a bit the cpu usage 9 years ago			`if (opt_cudaschedule == -1 && gpu_threads == 1) {`
			`cudaDeviceReset();`
			`// reduce cpu usage`
			`cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);`
			`cudaGetLastError(); // reset errors if device is not "reset"`
			`}`
Show intensity on init for all algos 8 years ago			`gpulog(LOG_INFO, thr_id, "Intensity set to %g (+5), %u cuda threads", throughput2intensity(throughput), throughput);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago
neoscrypt: strip cuda_vectors.h useless functions fix build break for SM 3.0 (but neoscrypt is not yet compatible) 10 years ago			`if (device_sm[dev_id] <= 300) {`
neoscrypt: apply last VTC improvements rewrote almost properly ;) 9 years ago			`gpulog(LOG_ERR, thr_id, "Sorry neoscrypt is not supported on SM 3.0 devices");`
neoscrypt: strip cuda_vectors.h useless functions fix build break for SM 3.0 (but neoscrypt is not yet compatible) 10 years ago			`proper_exit(EXIT_CODE_CUDA_ERROR);`
			`}`

neoscrypt: apply last VTC improvements rewrote almost properly ;) 9 years ago			`gpulog(LOG_INFO, thr_id, "Using %d cuda threads", throughput);`
neoscrypt: remove warnings and rename host funcs also reduce the few errors on coins using shared mem 9 years ago			`neoscrypt_init(thr_id, throughput);`
neoscrypt: cleanup... My SM 3.0 functions are ok but djm34 implementation uses too much registers for this arch... 10 years ago
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`init[thr_id] = true;`
			`}`

neoscrypt: adapt for visual studio compat todo: fix SM 3.0 builds 10 years ago			`if (have_stratum) {`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`for (int k = 0; k < 20; k++)`
neoscrypt: cleanup... My SM 3.0 functions are ok but djm34 implementation uses too much registers for this arch... 10 years ago			`be32enc(&endiandata[k], pdata[k]);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`} else {`
			`for (int k = 0; k < 20; k++)`
			`endiandata[k] = pdata[k];`
			`}`

			`neoscrypt_setBlockTarget(endiandata,ptarget);`

			`do {`
neoscrypt: apply last VTC improvements rewrote almost properly ;) 9 years ago			`uint32_t foundNonces[2] = { UINT32_MAX, UINT32_MAX };`
neoscrypt: remove warnings and rename host funcs also reduce the few errors on coins using shared mem 9 years ago			`neoscrypt_hash_k4(thr_id, throughput, pdata[19], foundNonces, have_stratum);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago
neoscrypt: apply last VTC improvements rewrote almost properly ;) 9 years ago			`*hashes_done = pdata[19] - first_nonce + throughput;`

			`if (foundNonces[0] != UINT32_MAX)`
			`{`
			`uint32_t _ALIGN(64) vhash[8];`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago
neoscrypt: adapt for visual studio compat todo: fix SM 3.0 builds 10 years ago			`if (have_stratum) {`
neoscrypt: apply last VTC improvements rewrote almost properly ;) 9 years ago			`be32enc(&endiandata[19], foundNonces[0]);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`} else {`
neoscrypt: apply last VTC improvements rewrote almost properly ;) 9 years ago			`endiandata[19] = foundNonces[0];`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`}`
neoscrypt: apply last VTC improvements rewrote almost properly ;) 9 years ago			`neoscrypt((uchar*)vhash, (uchar*) endiandata, 0x80000620U);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago
neoscrypt: apply last VTC improvements rewrote almost properly ;) 9 years ago			`if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {`
			`work_set_target_ratio(work, vhash);`
			`pdata[19] = foundNonces[0];`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`return 1;`
			`} else {`
neoscrypt: apply last VTC improvements rewrote almost properly ;) 9 years ago			`gpulog(LOG_WARNING, thr_id, "nonce %08x does not validate on CPU!", foundNonces[0]);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`}`
			`}`

never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`if ((uint64_t)throughput + pdata[19] >= max_nonce) {`
			`pdata[19] = max_nonce;`
			`break;`
			`}`

add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`pdata[19] += throughput;`

never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`} while (!work_restart[thr_id].restart);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago
never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`*hashes_done = pdata[19] - first_nonce;`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`return 0;`
			`}`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago
			`// cleanup`
			`void free_neoscrypt(int thr_id)`
			`{`
			`if (!init[thr_id])`
			`return;`

benchmark: enhance the mem leak detection reduce "false" warnings, and ignore unrelated/small ones <= 1 MB On windows the gpu memory can be allocated by other processes + some cleanup in algos... (free/gpulog) 9 years ago			`cudaThreadSynchronize();`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago
neoscrypt: remove warnings and rename host funcs also reduce the few errors on coins using shared mem 9 years ago			`neoscrypt_free(thr_id);`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago			`init[thr_id] = false;`

			`cudaDeviceSynchronize();`
benchmark: store all algos results + cuda fixes Note: lyra2, lyra2v2 and script seems to have problems to coexist with other algos... to run after some of them... moved lyra2 first and skip scrypt/jane for the moment... Only stored in memory for now.. to display a table after the bench ccminer -a auto --benchmark Results may be exported later to a json file... 9 years ago			`}`