// ccminer/neoscrypt/neoscrypt.cpp

#include <cuda_runtime.h>
#include <string.h>
#include <miner.h>

#include "neoscrypt.h"

// The functions below are implemented outside this file.

// Called once per scan, before the hashing loop, with the 20-word block
// header (already byte-swapped when stratum is in use) and the share target.
extern void neoscrypt_setBlockTarget(uint32_t* const data, uint32_t* const ptarget);

// Per-thread setup, teardown and batch hashing.
// `threads` is the number of nonces processed per call; resNonces receives
// the result nonces (this file passes a 2-entry array pre-filled with
// UINT32_MAX and treats UINT32_MAX as "nothing found").
extern void neoscrypt_init_2stream(int thr_id, uint32_t threads);
extern void neoscrypt_free_2stream(int thr_id);
extern void neoscrypt_hash_k4_2stream(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum);

// init[thr_id] is set after neoscrypt_init_2stream() has been called for
// that thread and cleared again by free_neoscrypt().
static bool init[MAX_GPUS] = { 0 };

/**
 * Scan a nonce range for a neoscrypt share on the device mapped to thr_id.
 *
 * Starting at work->data[19], batches of `throughput` nonces are handed to
 * neoscrypt_hash_k4_2stream() until a candidate validates on the CPU,
 * max_nonce is reached, or work_restart[thr_id].restart is set.
 * The first call for a given thr_id also performs the device setup.
 *
 * @param thr_id      miner thread index (indexes device_map, init, work_restart)
 * @param work        work item; data[19] is the nonce cursor and is updated in place
 * @param max_nonce   upper bound of the nonce range to scan
 * @param hashes_done out: number of nonces covered by this call
 * @return 1 when a nonce passed the CPU check (work->data[19] then holds it),
 *         0 when the range was exhausted or a restart was requested
 */
int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
	uint32_t _ALIGN(64) endiandata[20];
	uint32_t *pdata = work->data;
	uint32_t *ptarget = work->target;
	const uint32_t first_nonce = pdata[19];

	int dev_id = device_map[thr_id];
	int intensity = is_windows() ? 18 : 19;
	if (strstr(device_name[dev_id], "GTX 10")) intensity = 20; // also need more than 2GB

	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
	throughput = throughput / 32; /* set for max intensity ~= 20 */
	api_set_throughput(thr_id, throughput);

	// Clamp the batch to the remaining range only after init: the first call
	// sizes the device buffers from the unclamped throughput.
	if (init[thr_id]) {
		// The +1 wraps to 0 in 32-bit arithmetic (e.g. when the range spans
		// all 2^32 nonces). A zero batch would never advance pdata[19], so
		// the loop below would spin until a work restart; skip the clamp then.
		const uint32_t remaining = max_nonce - first_nonce + 1;
		if (remaining != 0)
			throughput = min(throughput, remaining);
	}

	// Benchmark mode overrides the top target word so candidates show up often.
	if (opt_benchmark)
		ptarget[7] = 0x00ff;

	if (!init[thr_id])
	{
		cudaDeviceSynchronize();
		cudaSetDevice(dev_id);
		if (opt_cudaschedule == -1 && gpu_threads == 1) {
			cudaDeviceReset();
			// reduce cpu usage
			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
			cudaGetLastError(); // reset errors if device is not "reset"
		}

		if (device_sm[dev_id] <= 300) {
			gpulog(LOG_ERR, thr_id, "Sorry neoscrypt is not supported on SM 3.0 devices");
			proper_exit(EXIT_CODE_CUDA_ERROR);
		}

		// throughput is unsigned: print it with %u rather than %d
		gpulog(LOG_INFO, thr_id, "Using %u cuda threads", throughput);
		neoscrypt_init_2stream(thr_id, throughput);

		init[thr_id] = true;
	}

	// With stratum the 20 header words are byte-swapped; otherwise copied as-is.
	if (have_stratum) {
		for (int k = 0; k < 20; k++)
			be32enc(&endiandata[k], pdata[k]);
	} else {
		for (int k = 0; k < 20; k++)
			endiandata[k] = pdata[k];
	}

	neoscrypt_setBlockTarget(endiandata,ptarget);

	do {
		// UINT32_MAX marks "no nonce found" in each result slot.
		uint32_t foundNonces[2] = { UINT32_MAX, UINT32_MAX };
		neoscrypt_hash_k4_2stream(thr_id, throughput, pdata[19], foundNonces, have_stratum);

		*hashes_done = pdata[19] - first_nonce + throughput;

		// NOTE(review): only the first result slot is examined here;
		// foundNonces[1] is never read by this function.
		if (foundNonces[0] != UINT32_MAX)
		{
			uint32_t _ALIGN(64) vhash[8];

			// Re-hash the candidate on the CPU, using the same nonce encoding
			// as the rest of the header, before accepting it.
			if (have_stratum) {
				be32enc(&endiandata[19], foundNonces[0]);
			} else {
				endiandata[19] = foundNonces[0];
			}
			neoscrypt((uchar*)vhash, (uchar*) endiandata, 0x80000620U);

			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
				work_set_target_ratio(work, vhash);
				pdata[19] = foundNonces[0];
				return 1;
			} else {
				gpulog(LOG_WARNING, thr_id, "nonce %08x does not validate on CPU!", foundNonces[0]);
			}
		}

		// 64-bit sum so that pdata[19] + throughput cannot wrap past max_nonce.
		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
			pdata[19] = max_nonce;
			break;
		}

		pdata[19] += throughput;

	} while (!work_restart[thr_id].restart);

	*hashes_done = pdata[19] - first_nonce;
	return 0;
}

/**
 * Release the neoscrypt resources held for a miner thread.
 *
 * Does nothing when init[thr_id] is not set, i.e. the thread was never
 * initialised by scanhash_neoscrypt() or has already been freed.
 *
 * @param thr_id miner thread index (indexes init)
 */
void free_neoscrypt(int thr_id)
{
	if (!init[thr_id])
		return;

	// Wait for outstanding device work before tearing down.
	// (cudaThreadSynchronize() is deprecated in favour of cudaDeviceSynchronize().)
	cudaDeviceSynchronize();

	neoscrypt_free_2stream(thr_id);
	init[thr_id] = false;

	cudaDeviceSynchronize();
}
neoscrypt: fixes for windows 10 years ago			`#include <cuda_runtime.h>`
neoscrypt: reduce spill load and increase pascal def intensity 1 MH/s reached on the 1070 ... 8 years ago			`#include <string.h>`
			`#include <miner.h>`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago
neoscrypt: reduce spill load and increase pascal def intensity 1 MH/s reached on the 1070 ... 8 years ago			`#include "neoscrypt.h"`

			`extern void neoscrypt_setBlockTarget(uint32_t* const data, uint32_t* const ptarget);`
neoscrypt: apply last VTC improvements rewrote almost properly ;) 8 years ago
			`extern void neoscrypt_init_2stream(int thr_id, uint32_t threads);`
			`extern void neoscrypt_free_2stream(int thr_id);`
neoscrypt: reduce spill load and increase pascal def intensity 1 MH/s reached on the 1070 ... 8 years ago			`extern void neoscrypt_hash_k4_2stream(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago
neoscrypt: fixes for windows 10 years ago			`static bool init[MAX_GPUS] = { 0 };`

start v1.7, apply new prototypes to all algos 9 years ago			`int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`{`
start v1.7, apply new prototypes to all algos 9 years ago			`uint32_t _ALIGN(64) endiandata[20];`
			`uint32_t *pdata = work->data;`
			`uint32_t *ptarget = work->target;`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`const uint32_t first_nonce = pdata[19];`

benchmark: store all algos results + cuda fixes Note: lyra2, lyra2v2 and script seems to have problems to coexist with other algos... to run after some of them... moved lyra2 first and skip scrypt/jane for the moment... Only stored in memory for now.. to display a table after the bench ccminer -a auto --benchmark Results may be exported later to a json file... 9 years ago			`int dev_id = device_map[thr_id];`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`int intensity = is_windows() ? 18 : 19;`
neoscrypt: reduce spill load and increase pascal def intensity 1 MH/s reached on the 1070 ... 8 years ago			`if (strstr(device_name[dev_id], "GTX 10")) intensity = 20; // also need more than 2GB`

intensity: do not reduce throughput before init Else the memory allocated could be less than required later btw, use the new "cuda" function to apply intensity/throughput 9 years ago			`uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`throughput = throughput / 32; /* set for max intensity ~= 20 */`
warn on cuda errors + various small changes The full benchmark can now be launched with "ccminer --benchmark" add a new helper function which log a warning with last cuda error (not shown with the quiet option) : CUDA_LOG_ERROR(); it can be used where miner.h is included (.c/.cpp/.cu) fix x14 (in ccminer.cpp), a break was missing in switch..case 9 years ago			`api_set_throughput(thr_id, throughput);`

intensity: do not reduce throughput before init Else the memory allocated could be less than required later btw, use the new "cuda" function to apply intensity/throughput 9 years ago			`if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce + 1);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago
neoscrypt: cleanup... My SM 3.0 functions are ok but djm34 implementation uses too much registers for this arch... 10 years ago			`if (opt_benchmark)`
benchmark: store all algos results + cuda fixes Note: lyra2, lyra2v2 and script seems to have problems to coexist with other algos... to run after some of them... moved lyra2 first and skip scrypt/jane for the moment... Only stored in memory for now.. to display a table after the bench ccminer -a auto --benchmark Results may be exported later to a json file... 9 years ago			`ptarget[7] = 0x00ff;`
neoscrypt: cleanup... My SM 3.0 functions are ok but djm34 implementation uses too much registers for this arch... 10 years ago
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`if (!init[thr_id])`
			`{`
benchmark: store all algos results + cuda fixes Note: lyra2, lyra2v2 and script seems to have problems to coexist with other algos... to run after some of them... moved lyra2 first and skip scrypt/jane for the moment... Only stored in memory for now.. to display a table after the bench ccminer -a auto --benchmark Results may be exported later to a json file... 9 years ago			`cudaDeviceSynchronize();`
neoscrypt: strip cuda_vectors.h useless functions fix build break for SM 3.0 (but neoscrypt is not yet compatible) 10 years ago			`cudaSetDevice(dev_id);`
neoscrypt: reduce a bit the cpu usage 8 years ago			`if (opt_cudaschedule == -1 && gpu_threads == 1) {`
			`cudaDeviceReset();`
			`// reduce cpu usage`
			`cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);`
			`cudaGetLastError(); // reset errors if device is not "reset"`
			`}`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago
neoscrypt: strip cuda_vectors.h useless functions fix build break for SM 3.0 (but neoscrypt is not yet compatible) 10 years ago			`if (device_sm[dev_id] <= 300) {`
neoscrypt: apply last VTC improvements rewrote almost properly ;) 8 years ago			`gpulog(LOG_ERR, thr_id, "Sorry neoscrypt is not supported on SM 3.0 devices");`
neoscrypt: strip cuda_vectors.h useless functions fix build break for SM 3.0 (but neoscrypt is not yet compatible) 10 years ago			`proper_exit(EXIT_CODE_CUDA_ERROR);`
			`}`

neoscrypt: apply last VTC improvements rewrote almost properly ;) 8 years ago			`gpulog(LOG_INFO, thr_id, "Using %d cuda threads", throughput);`
			`neoscrypt_init_2stream(thr_id, throughput);`
neoscrypt: cleanup... My SM 3.0 functions are ok but djm34 implementation uses too much registers for this arch... 10 years ago
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`init[thr_id] = true;`
			`}`

neoscrypt: adapt for visual studio compat todo: fix SM 3.0 builds 10 years ago			`if (have_stratum) {`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`for (int k = 0; k < 20; k++)`
neoscrypt: cleanup... My SM 3.0 functions are ok but djm34 implementation uses too much registers for this arch... 10 years ago			`be32enc(&endiandata[k], pdata[k]);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`} else {`
			`for (int k = 0; k < 20; k++)`
			`endiandata[k] = pdata[k];`
			`}`

			`neoscrypt_setBlockTarget(endiandata,ptarget);`

			`do {`
neoscrypt: apply last VTC improvements rewrote almost properly ;) 8 years ago			`uint32_t foundNonces[2] = { UINT32_MAX, UINT32_MAX };`
			`neoscrypt_hash_k4_2stream(thr_id, throughput, pdata[19], foundNonces, have_stratum);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago
neoscrypt: apply last VTC improvements rewrote almost properly ;) 8 years ago			`*hashes_done = pdata[19] - first_nonce + throughput;`

			`if (foundNonces[0] != UINT32_MAX)`
			`{`
			`uint32_t _ALIGN(64) vhash[8];`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago
neoscrypt: adapt for visual studio compat todo: fix SM 3.0 builds 10 years ago			`if (have_stratum) {`
neoscrypt: apply last VTC improvements rewrote almost properly ;) 8 years ago			`be32enc(&endiandata[19], foundNonces[0]);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`} else {`
neoscrypt: apply last VTC improvements rewrote almost properly ;) 8 years ago			`endiandata[19] = foundNonces[0];`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`}`
neoscrypt: apply last VTC improvements rewrote almost properly ;) 8 years ago			`neoscrypt((uchar*)vhash, (uchar*) endiandata, 0x80000620U);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago
neoscrypt: apply last VTC improvements rewrote almost properly ;) 8 years ago			`if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {`
			`work_set_target_ratio(work, vhash);`
			`pdata[19] = foundNonces[0];`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`return 1;`
			`} else {`
neoscrypt: apply last VTC improvements rewrote almost properly ;) 8 years ago			`gpulog(LOG_WARNING, thr_id, "nonce %08x does not validate on CPU!", foundNonces[0]);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`}`
			`}`

never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`if ((uint64_t)throughput + pdata[19] >= max_nonce) {`
			`pdata[19] = max_nonce;`
			`break;`
			`}`

add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`pdata[19] += throughput;`

never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`} while (!work_restart[thr_id].restart);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago
never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`*hashes_done = pdata[19] - first_nonce;`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`return 0;`
			`}`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago
			`// cleanup`
			`void free_neoscrypt(int thr_id)`
			`{`
			`if (!init[thr_id])`
			`return;`

benchmark: enhance the mem leak detection reduce "false" warnings, and ignore unrelated/small ones <= 1 MB On windows the gpu memory can be allocated by other processes + some cleanup in algos... (free/gpulog) 9 years ago			`cudaThreadSynchronize();`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago
neoscrypt: apply last VTC improvements rewrote almost properly ;) 8 years ago			`neoscrypt_free_2stream(thr_id);`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago			`init[thr_id] = false;`

			`cudaDeviceSynchronize();`
benchmark: store all algos results + cuda fixes Note: lyra2, lyra2v2 and script seems to have problems to coexist with other algos... to run after some of them... moved lyra2 first and skip scrypt/jane for the moment... Only stored in memory for now.. to display a table after the bench ccminer -a auto --benchmark Results may be exported later to a json file... 9 years ago			`}`