// ccminer/neoscrypt/neoscrypt.cpp

#include <cuda_runtime.h>
#include "miner.h"
#include "neoscrypt/neoscrypt.h"

// Entry points defined outside this file; only their prototypes are visible here.

// Receives the 20 header words and the share target prepared by scanhash_neoscrypt()
// before the scan loop starts (presumably uploads them to the device -- TODO confirm).
extern void neoscrypt_setBlockTarget(uint32_t * data, const void *ptarget);
// One-time per-thread setup, called with the batch size on the first scan of a thread.
extern void neoscrypt_cpu_init(int thr_id, uint32_t threads);
// Counterpart of neoscrypt_cpu_init(), called from free_neoscrypt().
extern void neoscrypt_cpu_free(int thr_id);
// Hashes one batch of `threads` nonces starting at startNounce; scanhash_neoscrypt()
// treats a return value of UINT32_MAX as "no candidate nonce in this batch".
extern uint32_t neoscrypt_cpu_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, int have_stratum, int order);

// Per-thread flag: set once scanhash_neoscrypt() has run the one-time setup for
// that thr_id, cleared again by free_neoscrypt().
static bool init[MAX_GPUS] = { 0 };

int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
	uint32_t _ALIGN(64) endiandata[20];
	uint32_t *pdata = work->data;
	uint32_t *ptarget = work->target;
	const uint32_t first_nonce = pdata[19];

	int dev_id = device_map[thr_id];
	int intensity = is_windows() ? 18 : 19;
	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
	throughput = throughput / 32; /* set for max intensity ~= 20 */
	api_set_throughput(thr_id, throughput);

	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce + 1);

	if (opt_benchmark)
		ptarget[7] = 0x00ff;

	if (!init[thr_id])
	{
		cudaDeviceSynchronize();
		cudaSetDevice(dev_id);
		cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
		cudaGetLastError(); // reset errors if device is not "reset"

		if (device_sm[dev_id] <= 300) {
			applog(LOG_ERR, "Sorry neoscrypt is not supported on SM 3.0 devices");
			proper_exit(EXIT_CODE_CUDA_ERROR);
		}

		applog(LOG_INFO, "GPU #%d: Using %d cuda threads", dev_id, throughput);
		neoscrypt_cpu_init(thr_id, throughput);

		init[thr_id] = true;
	}

	if (have_stratum) {
		for (int k = 0; k < 20; k++)
			be32enc(&endiandata[k], pdata[k]);
	} else {
		for (int k = 0; k < 20; k++)
			endiandata[k] = pdata[k];
	}

	neoscrypt_setBlockTarget(endiandata,ptarget);

	do {
		uint32_t foundNonce = neoscrypt_cpu_hash_k4(thr_id, throughput, pdata[19], have_stratum, 0);
		if (foundNonce != UINT32_MAX)
		{
			uint32_t _ALIGN(64) vhash64[8];

			*hashes_done = pdata[19] - first_nonce + 1;

			if (have_stratum) {
				be32enc(&endiandata[19], foundNonce);
			} else {
				endiandata[19] = foundNonce;
			}
			neoscrypt((uchar*)vhash64, (uchar*) endiandata, 0x80000620U);

			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
				work_set_target_ratio(work, vhash64);
				pdata[19] = foundNonce;
				return 1;
			} else {
				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
			}
		}

		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
			pdata[19] = max_nonce;
			break;
		}

		pdata[19] += throughput;

	} while (!work_restart[thr_id].restart);

	*hashes_done = pdata[19] - first_nonce;
	return 0;
}

// cleanup
/**
 * Release the neoscrypt resources of a miner thread.
 *
 * Does nothing unless scanhash_neoscrypt() has initialised thr_id. Otherwise
 * it synchronises, calls neoscrypt_cpu_free(), clears the init flag so the
 * next scan re-runs the one-time setup, and synchronises again.
 *
 * @param thr_id miner thread index (indexes init)
 */
void free_neoscrypt(int thr_id)
{
	if (!init[thr_id])
		return;

	// cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is the
	// documented replacement.
	cudaDeviceSynchronize();

	neoscrypt_cpu_free(thr_id);
	init[thr_id] = false;

	cudaDeviceSynchronize();
}
neoscrypt: fixes for windows 10 years ago			`#include <cuda_runtime.h>`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`#include "miner.h"`
neoscrypt: fixes for windows 10 years ago			`#include "neoscrypt/neoscrypt.h"`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago
			`extern void neoscrypt_setBlockTarget(uint32_t * data, const void *ptarget);`
neoscrypt: cleanup... My SM 3.0 functions are ok but djm34 implementation uses too much registers for this arch... 10 years ago			`extern void neoscrypt_cpu_init(int thr_id, uint32_t threads);`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago			`extern void neoscrypt_cpu_free(int thr_id);`
neoscrypt: reduce gpu reg count with sp precalc also prevent bool in cuda prototype, linkage mismatch in vstudio... sigh 10 years ago			`extern uint32_t neoscrypt_cpu_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, int have_stratum, int order);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago
neoscrypt: fixes for windows 10 years ago			`static bool init[MAX_GPUS] = { 0 };`

start v1.7, apply new prototypes to all algos 9 years ago			`int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`{`
start v1.7, apply new prototypes to all algos 9 years ago			`uint32_t _ALIGN(64) endiandata[20];`
			`uint32_t *pdata = work->data;`
			`uint32_t *ptarget = work->target;`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`const uint32_t first_nonce = pdata[19];`

benchmark: store all algos results + cuda fixes Note: lyra2, lyra2v2 and script seems to have problems to coexist with other algos... to run after some of them... moved lyra2 first and skip scrypt/jane for the moment... Only stored in memory for now.. to display a table after the bench ccminer -a auto --benchmark Results may be exported later to a json file... 9 years ago			`int dev_id = device_map[thr_id];`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`int intensity = is_windows() ? 18 : 19;`
intensity: do not reduce throughput before init Else the memory allocated could be less than required later btw, use the new "cuda" function to apply intensity/throughput 9 years ago			`uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`throughput = throughput / 32; /* set for max intensity ~= 20 */`
warn on cuda errors + various small changes The full benchmark can now be launched with "ccminer --benchmark" add a new helper function which log a warning with last cuda error (not shown with the quiet option) : CUDA_LOG_ERROR(); it can be used where miner.h is included (.c/.cpp/.cu) fix x14 (in ccminer.cpp), a break was missing in switch..case 9 years ago			`api_set_throughput(thr_id, throughput);`

intensity: do not reduce throughput before init Else the memory allocated could be less than required later btw, use the new "cuda" function to apply intensity/throughput 9 years ago			`if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce + 1);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago
neoscrypt: cleanup... My SM 3.0 functions are ok but djm34 implementation uses too much registers for this arch... 10 years ago			`if (opt_benchmark)`
benchmark: store all algos results + cuda fixes Note: lyra2, lyra2v2 and script seems to have problems to coexist with other algos... to run after some of them... moved lyra2 first and skip scrypt/jane for the moment... Only stored in memory for now.. to display a table after the bench ccminer -a auto --benchmark Results may be exported later to a json file... 9 years ago			`ptarget[7] = 0x00ff;`
neoscrypt: cleanup... My SM 3.0 functions are ok but djm34 implementation uses too much registers for this arch... 10 years ago
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`if (!init[thr_id])`
			`{`
benchmark: store all algos results + cuda fixes Note: lyra2, lyra2v2 and script seems to have problems to coexist with other algos... to run after some of them... moved lyra2 first and skip scrypt/jane for the moment... Only stored in memory for now.. to display a table after the bench ccminer -a auto --benchmark Results may be exported later to a json file... 9 years ago			`cudaDeviceSynchronize();`
neoscrypt: strip cuda_vectors.h useless functions fix build break for SM 3.0 (but neoscrypt is not yet compatible) 10 years ago			`cudaSetDevice(dev_id);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);`
benchmark: store all algos results + cuda fixes Note: lyra2, lyra2v2 and script seems to have problems to coexist with other algos... to run after some of them... moved lyra2 first and skip scrypt/jane for the moment... Only stored in memory for now.. to display a table after the bench ccminer -a auto --benchmark Results may be exported later to a json file... 9 years ago			`cudaGetLastError(); // reset errors if device is not "reset"`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago
neoscrypt: strip cuda_vectors.h useless functions fix build break for SM 3.0 (but neoscrypt is not yet compatible) 10 years ago			`if (device_sm[dev_id] <= 300) {`
			`applog(LOG_ERR, "Sorry neoscrypt is not supported on SM 3.0 devices");`
			`proper_exit(EXIT_CODE_CUDA_ERROR);`
			`}`

benchmark: store all algos results + cuda fixes Note: lyra2, lyra2v2 and script seems to have problems to coexist with other algos... to run after some of them... moved lyra2 first and skip scrypt/jane for the moment... Only stored in memory for now.. to display a table after the bench ccminer -a auto --benchmark Results may be exported later to a json file... 9 years ago			`applog(LOG_INFO, "GPU #%d: Using %d cuda threads", dev_id, throughput);`
neoscrypt: cleanup... My SM 3.0 functions are ok but djm34 implementation uses too much registers for this arch... 10 years ago			`neoscrypt_cpu_init(thr_id, throughput);`

add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`init[thr_id] = true;`
			`}`

neoscrypt: adapt for visual studio compat todo: fix SM 3.0 builds 10 years ago			`if (have_stratum) {`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`for (int k = 0; k < 20; k++)`
neoscrypt: cleanup... My SM 3.0 functions are ok but djm34 implementation uses too much registers for this arch... 10 years ago			`be32enc(&endiandata[k], pdata[k]);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`} else {`
			`for (int k = 0; k < 20; k++)`
			`endiandata[k] = pdata[k];`
			`}`

			`neoscrypt_setBlockTarget(endiandata,ptarget);`

			`do {`
neoscrypt: cleanup... My SM 3.0 functions are ok but djm34 implementation uses too much registers for this arch... 10 years ago			`uint32_t foundNonce = neoscrypt_cpu_hash_k4(thr_id, throughput, pdata[19], have_stratum, 0);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`if (foundNonce != UINT32_MAX)`
			`{`
			`uint32_t _ALIGN(64) vhash64[8];`

			`*hashes_done = pdata[19] - first_nonce + 1;`

neoscrypt: adapt for visual studio compat todo: fix SM 3.0 builds 10 years ago			`if (have_stratum) {`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`be32enc(&endiandata[19], foundNonce);`
			`} else {`
			`endiandata[19] = foundNonce;`
			`}`
neoscrypt: adapt for visual studio compat todo: fix SM 3.0 builds 10 years ago			`neoscrypt((uchar)vhash64, (uchar) endiandata, 0x80000620U);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago
			`if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {`
diff: use the new function in all algos 9 years ago			`work_set_target_ratio(work, vhash64);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`pdata[19] = foundNonce;`
			`return 1;`
			`} else {`
benchmark: enhance the mem leak detection reduce "false" warnings, and ignore unrelated/small ones <= 1 MB On windows the gpu memory can be allocated by other processes + some cleanup in algos... (free/gpulog) 9 years ago			`gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`}`
			`}`

never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`if ((uint64_t)throughput + pdata[19] >= max_nonce) {`
			`pdata[19] = max_nonce;`
			`break;`
			`}`

add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`pdata[19] += throughput;`

never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`} while (!work_restart[thr_id].restart);`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago
never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`*hashes_done = pdata[19] - first_nonce;`
add neoscrypt based on djm34 work indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued... 10 years ago			`return 0;`
			`}`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago
			`// cleanup`
			`void free_neoscrypt(int thr_id)`
			`{`
			`if (!init[thr_id])`
			`return;`

benchmark: enhance the mem leak detection reduce "false" warnings, and ignore unrelated/small ones <= 1 MB On windows the gpu memory can be allocated by other processes + some cleanup in algos... (free/gpulog) 9 years ago			`cudaThreadSynchronize();`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago
			`neoscrypt_cpu_free(thr_id);`
			`init[thr_id] = false;`

			`cudaDeviceSynchronize();`
benchmark: store all algos results + cuda fixes Note: lyra2, lyra2v2 and script seems to have problems to coexist with other algos... to run after some of them... moved lyra2 first and skip scrypt/jane for the moment... Only stored in memory for now.. to display a table after the bench ccminer -a auto --benchmark Results may be exported later to a json file... 9 years ago			`}`