ccminer/x11/s3.cu

/**
 * S3 Hash (Also called Triple S - Used by 1Coin)
 */

extern "C" {
#include "sph/sph_skein.h"
#include "sph/sph_shavite.h"
#include "sph/sph_simd.h"
}

#include "miner.h"
#include "cuda_helper.h"
#include "cuda_x11.h"

extern void x11_shavite512_setBlock_80(void *pdata);
extern void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);

#include <stdint.h>

// Device hash buffers, indexed by thr_id. Each entry is allocated in
// scanhash_s3() on first use (64 bytes per cuda thread) and released in free_s3().
static uint32_t *d_hash[MAX_GPUS];

/**
 * CPU version of the S3 hash: shavite512 -> simd512 -> skein512.
 *
 * @param output  receives the first 32 bytes of the final 64-byte digest
 * @param input   block header, exactly 80 bytes are read
 */
extern "C" void s3hash(void *output, const void *input)
{
	// single 64-byte buffer, each stage overwrites it with its own digest
	unsigned char digest[64];

	{
		// stage 1: shavite512 over the 80-byte input
		sph_shavite512_context shavite;
		sph_shavite512_init(&shavite);
		sph_shavite512(&shavite, input, 80);
		sph_shavite512_close(&shavite, digest);
	}

	{
		// stage 2: simd512 over the previous digest
		sph_simd512_context simd;
		sph_simd512_init(&simd);
		sph_simd512(&simd, digest, 64);
		sph_simd512_close(&simd, digest);
	}

	{
		// stage 3: skein512 over the previous digest
		sph_skein512_context skein;
		sph_skein512_init(&skein);
		sph_skein512(&skein, digest, 64);
		sph_skein512_close(&skein, digest);
	}

	// only the first half of the digest is returned
	memcpy(output, digest, 32);
}

/*
 * TRACE(algo): debug helper printing the start of the device hash buffer.
 *
 * With _DEBUG defined, and only when max_nonce == 1 and pdata[19] <= 1, it
 * copies the first 32 bytes of d_hash[thr_id] into a temporary host buffer
 * (cudaMallocHost / cudaFreeHost) and prints words 0..3 and 7, each passed
 * through swab32(), prefixed by "S3" and the given label.
 * Without _DEBUG it expands to an empty block.
 *
 * The macro uses max_nonce, pdata, thr_id and d_hash from the scope where it
 * is expanded, so those names must exist there. The CUDA return codes are
 * not checked.
 */
#ifdef _DEBUG
#define TRACE(algo) { \
	if (max_nonce == 1 && pdata[19] <= 1) { \
		uint32_t* debugbuf = NULL; \
		cudaMallocHost(&debugbuf, 32); \
		cudaMemcpy(debugbuf, d_hash[thr_id], 32, cudaMemcpyDeviceToHost); \
		printf("S3 %s %08x %08x %08x %08x...%08x\n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \
			swab32(debugbuf[2]), swab32(debugbuf[3]), swab32(debugbuf[7])); \
		cudaFreeHost(debugbuf); \
	} \
}
#else
#define TRACE(algo) {}
#endif

// Per-thread "already initialised" flags, indexed by thr_id: set once
// scanhash_s3() has done its first-time setup, cleared again by free_s3().
static bool init[MAX_GPUS] = { 0 };

/**
 * Main S3 entry point: scans nonces on the GPU, starting at work->data[19].
 *
 * Each pass hashes `throughput` nonces (shavite512 -> simd512 -> skein512),
 * asks the check kernel for a candidate and re-hashes that candidate on the
 * CPU before accepting it.
 *
 * @param thr_id       miner thread index (selects the device and the buffers)
 * @param work         work item; data[19] is the current nonce, data[21]
 *                     receives a second nonce when one is found
 * @param max_nonce    upper bound of the range to scan
 * @param hashes_done  out: number of nonces covered by this call
 * @return 0 when the range is exhausted or a restart was requested, otherwise
 *         the number of nonces reported (1 or 2)
 */
extern "C" int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
	// pdata, max_nonce and thr_id must keep these names: TRACE() expands to code using them
	uint32_t *pdata = work->data;
	uint32_t *ptarget = work->target;
	const uint32_t nonce_origin = pdata[19];

	// default intensity is 20 (256*256*8*2), reduced by one on windows
#ifdef WIN32
	const int default_intensity = 20 - 1;
#else
	const int default_intensity = 20;
#endif
	uint32_t throughput = cuda_default_throughput(thr_id, 1 << default_intensity);
	// note: throughput is not clamped to the remaining nonce range

	if (opt_benchmark) {
		ptarget[7] = 0xF;
	}

	// one-time device setup and allocations for this thread
	if (init[thr_id] == false)
	{
		cudaSetDevice(device_map[thr_id]);
		if (opt_cudaschedule == -1 && gpu_threads == 1) {
			cudaDeviceReset();
			// blocking sync lowers the cpu usage
			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
			CUDA_LOG_ERROR();
		}
		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);

		// 64 bytes of hash state per cuda thread
		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));

		x11_shavite512_cpu_init(thr_id, throughput);
		x11_simd512_cpu_init(thr_id, throughput);
		quark_skein512_cpu_init(thr_id, throughput);

		cuda_check_cpu_init(thr_id, throughput);

		init[thr_id] = true;
	}

	// big-endian copy of the 20 header words, shared by the GPU and the CPU check
	uint32_t endiandata[20];
	for (int word = 0; word < 20; ++word) {
		be32enc(&endiandata[word], pdata[word]);
	}

	x11_shavite512_setBlock_80((void*)endiandata);
	cuda_check_cpu_setTarget(ptarget);

	for (;;) {
		const uint32_t Htarg = ptarget[7];

		// the three stages run in place on d_hash, "order" goes 0, 1, 2
		x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 0);
		TRACE("shavite:");
		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], 1);
		TRACE("simd   :");
		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], 2);
		TRACE("skein  :");

		*hashes_done = pdata[19] - nonce_origin + throughput;

		const uint32_t candidate = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);

		if (candidate != UINT32_MAX)
		{
			// recompute the candidate on the CPU before trusting the GPU result
			uint32_t cpu_hash[8];
			be32enc(&endiandata[19], candidate);
			s3hash(cpu_hash, endiandata);

			const bool validated = (cpu_hash[7] <= Htarg) && fulltest(cpu_hash, ptarget);
			if (!validated) {
				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", candidate);
			} else {
				int found = 1;
				// look for a second candidate in the same batch
				const uint32_t extra = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
				work_set_target_ratio(work, cpu_hash);
				if (extra != 0) {
					be32enc(&endiandata[19], extra);
					s3hash(cpu_hash, endiandata);
					// keep the better of the two share ratios
					if (bn_hash_target_ratio(cpu_hash, ptarget) > work->shareratio[0]) {
						work_set_target_ratio(work, cpu_hash);
					}
					pdata[21] = extra;
					found = 2;
				}
				pdata[19] = candidate;
				return found;
			}
		}

		// stop once the next batch would reach or pass max_nonce (64-bit sum avoids wrap)
		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
			pdata[19] = max_nonce;
			break;
		}

		pdata[19] += throughput;

		if (work_restart[thr_id].restart) {
			break;
		}
	}

	*hashes_done = pdata[19] - nonce_origin;
	return 0;
}

/**
 * Cleanup: releases what scanhash_s3() allocated for this thread.
 *
 * Does nothing when the thread is not marked as initialised, so calling it
 * twice in a row is harmless.
 *
 * @param thr_id  miner thread index, used to index the per-thread state
 */
extern "C" void free_s3(int thr_id)
{
	if (!init[thr_id])
		return;

	// Wait for pending GPU work before releasing the buffers.
	// cudaThreadSynchronize() is deprecated, cudaDeviceSynchronize() replaces it.
	cudaDeviceSynchronize();

	cudaFree(d_hash[thr_id]);
	// do not keep a dangling device pointer around until the next init
	d_hash[thr_id] = NULL;
	x11_simd512_cpu_free(thr_id);

	cuda_check_cpu_free(thr_id);
	init[thr_id] = false;

	cudaDeviceSynchronize();
}
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago			`/**`
cuda: header for common kernel functions (quark/x11) Was thinking about doing that since months ;) lets go 9 years ago			`* S3 Hash (Also called Triple S - Used by 1Coin)`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago			`*/`

			`extern "C" {`
			`#include "sph/sph_skein.h"`
			`#include "sph/sph_shavite.h"`
			`#include "sph/sph_simd.h"`
			`}`

			`#include "miner.h"`
			`#include "cuda_helper.h"`
cuda: header for common kernel functions (quark/x11) Was thinking about doing that since months ;) lets go 9 years ago			`#include "cuda_x11.h"`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago
			`extern void x11_shavite512_setBlock_80(void *pdata);`
cuda: header for common kernel functions (quark/x11) Was thinking about doing that since months ;) lets go 9 years ago			`extern void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago
cuda: header for common kernel functions (quark/x11) Was thinking about doing that since months ;) lets go 9 years ago			`#include <stdint.h>`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago
cuda: header for common kernel functions (quark/x11) Was thinking about doing that since months ;) lets go 9 years ago			`static uint32_t *d_hash[MAX_GPUS];`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago
			`/* CPU HASH */`
			`extern "C" void s3hash(void *output, const void *input)`
			`{`
			`sph_shavite512_context ctx_shavite;`
			`sph_simd512_context ctx_simd;`
			`sph_skein512_context ctx_skein;`

			`unsigned char hash[64];`

			`sph_shavite512_init(&ctx_shavite);`
			`sph_shavite512(&ctx_shavite, input, 80);`
			`sph_shavite512_close(&ctx_shavite, (void*) hash);`

			`sph_simd512_init(&ctx_simd);`
			`sph_simd512(&ctx_simd, (const void*) hash, 64);`
			`sph_simd512_close(&ctx_simd, (void*) hash);`

			`sph_skein512_init(&ctx_skein);`
			`sph_skein512(&ctx_skein, (const void*) hash, 64);`
			`sph_skein512_close(&ctx_skein, (void*) hash);`

			`memcpy(output, hash, 32);`
			`}`

simd: add support for SM 2.1 devices Add support for x11..x17, s3, fresh and qubit Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 9 years ago			`#ifdef _DEBUG`
			`#define TRACE(algo) { \`
			`if (max_nonce == 1 && pdata[19] <= 1) { \`
			`uint32_t* debugbuf = NULL; \`
			`cudaMallocHost(&debugbuf, 32); \`
			`cudaMemcpy(debugbuf, d_hash[thr_id], 32, cudaMemcpyDeviceToHost); \`
			`printf("S3 %s %08x %08x %08x %08x...%08x\n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \`
			`swab32(debugbuf[2]), swab32(debugbuf[3]), swab32(debugbuf[7])); \`
			`cudaFreeHost(debugbuf); \`
			`} \`
			`}`
			`#else`
			`#define TRACE(algo) {}`
			`#endif`

Handle a maximum of 16 gpus (vs 8 before) Some cards have 2 gpus on board... 10 years ago			`static bool init[MAX_GPUS] = { 0 };`
various small changes heavy: reduce by 256 threads default intensity to all -i 20 cuda: put static thread init bools outside the code (made once) api: fix nvml header to build without 10 years ago
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago			`/* Main S3 entry point */`
start v1.7, apply new prototypes to all algos 9 years ago			`extern "C" int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago			`{`
start v1.7, apply new prototypes to all algos 9 years ago			`uint32_t *pdata = work->data;`
			`uint32_t *ptarget = work->target;`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago			`const uint32_t first_nonce = pdata[19];`
intensity: sign warnings fixes min(i,u) 10 years ago			`int intensity = 20; // 256*256*8*2;`
Prepare trap of hardware/mem failures 10 years ago			`#ifdef WIN32`
			`// reduce by one the intensity on windows`
			`intensity--;`
s3: reduce a bit the intensity on windows 10 years ago			`#endif`
intensity: do not reduce throughput before init Else the memory allocated could be less than required later btw, use the new "cuda" function to apply intensity/throughput 9 years ago			`uint32_t throughput = cuda_default_throughput(thr_id, 1 << intensity);`
attempt to reduce shared mem errors 8 years ago			`//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago
			`if (opt_benchmark)`
simd: add support for SM 2.1 devices Add support for x11..x17, s3, fresh and qubit Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 9 years ago			`ptarget[7] = 0xF;`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago
			`if (!init[thr_id])`
			`{`
Prepare trap of hardware/mem failures 10 years ago			`cudaSetDevice(device_map[thr_id]);`
simd: add support for SM 2.1 devices Add support for x11..x17, s3, fresh and qubit Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 9 years ago			`if (opt_cudaschedule == -1 && gpu_threads == 1) {`
			`cudaDeviceReset();`
			`// reduce cpu usage`
			`cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);`
			`CUDA_LOG_ERROR();`
			`}`
Show intensity on init for all algos 8 years ago			`gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);`
simd: add support for SM 2.1 devices Add support for x11..x17, s3, fresh and qubit Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 9 years ago
			`CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago
			`x11_shavite512_cpu_init(thr_id, throughput);`
			`x11_simd512_cpu_init(thr_id, throughput);`
			`quark_skein512_cpu_init(thr_id, throughput);`

			`cuda_check_cpu_init(thr_id, throughput);`

			`init[thr_id] = true;`
			`}`

			`uint32_t endiandata[20];`
			`for (int k=0; k < 20; k++)`
remove uint32_t cast 10 years ago			`be32enc(&endiandata[k], pdata[k]);`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago
			`x11_shavite512_setBlock_80((void*)endiandata);`
			`cuda_check_cpu_setTarget(ptarget);`

			`do {`
			`const uint32_t Htarg = ptarget[7];`
			`uint32_t foundNonce;`
			`int order = 0;`

			`x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);`
simd: add support for SM 2.1 devices Add support for x11..x17, s3, fresh and qubit Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 9 years ago			`TRACE("shavite:");`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago			`x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);`
simd: add support for SM 2.1 devices Add support for x11..x17, s3, fresh and qubit Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 9 years ago			`TRACE("simd :");`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago			`quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);`
simd: add support for SM 2.1 devices Add support for x11..x17, s3, fresh and qubit Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 9 years ago			`TRACE("skein :");`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago
start v1.7, apply new prototypes to all algos 9 years ago			`*hashes_done = pdata[19] - first_nonce + throughput;`

checkhash: simplify the common function use klaus trivial function, the old code has always been a bit weird.. split cuda_check_cpu_hash_64 in two functions, keep old for branched stuff 10 years ago			`foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago
Check and submit multiple nonces in one loop Added to most algos, checkhash function scans a big range and can find multiple nonces at once if the difficulty is low. Stop ignoring them, submit second one if found... Clean the draft code for rc=2 implemented for blake and pentablake btw... fix the reduced displayed hashrate when a nonce is found... Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`if (foundNonce != UINT32_MAX)`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago			`{`
			`uint32_t vhash64[8];`
			`be32enc(&endiandata[19], foundNonce);`
			`s3hash(vhash64, endiandata);`

			`if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {`
Check and submit multiple nonces in one loop Added to most algos, checkhash function scans a big range and can find multiple nonces at once if the difficulty is low. Stop ignoring them, submit second one if found... Clean the draft code for rc=2 implemented for blake and pentablake btw... fix the reduced displayed hashrate when a nonce is found... Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`int res = 1;`
			`uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);`
diff: use the new function in all algos 9 years ago			`work_set_target_ratio(work, vhash64);`
Check and submit multiple nonces in one loop Added to most algos, checkhash function scans a big range and can find multiple nonces at once if the difficulty is low. Stop ignoring them, submit second one if found... Clean the draft code for rc=2 implemented for blake and pentablake btw... fix the reduced displayed hashrate when a nonce is found... Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`if (secNonce != 0) {`
start v1.7, apply new prototypes to all algos 9 years ago			`be32enc(&endiandata[19], secNonce);`
			`s3hash(vhash64, endiandata);`
diff: show by default, rework shares diff storage This will allow later more gpu candidates. Note: This is an unfinished work, we keep the previous behavior for now To finish this, all algos solutions should be migrated and submitted nonces attributes stored. Its required to handle the different share diff per nonce and fix the possible solved count error (if 1/2 nonces is solved). 8 years ago			`if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio[0])`
diff: use the new function in all algos 9 years ago			`work_set_target_ratio(work, vhash64);`
Check and submit multiple nonces in one loop Added to most algos, checkhash function scans a big range and can find multiple nonces at once if the difficulty is low. Stop ignoring them, submit second one if found... Clean the draft code for rc=2 implemented for blake and pentablake btw... fix the reduced displayed hashrate when a nonce is found... Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`pdata[21] = secNonce;`
			`res++;`
			`}`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago			`pdata[19] = foundNonce;`
Check and submit multiple nonces in one loop Added to most algos, checkhash function scans a big range and can find multiple nonces at once if the difficulty is low. Stop ignoring them, submit second one if found... Clean the draft code for rc=2 implemented for blake and pentablake btw... fix the reduced displayed hashrate when a nonce is found... Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`return res;`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago
			`} else {`
benchmark: enhance the mem leak detection reduce "false" warnings, and ignore unrelated/small ones <= 1 MB On windows the gpu memory can be allocated by other processes + some cleanup in algos... (free/gpulog) 9 years ago			`gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago			`}`
			`}`

never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`if ((uint64_t) throughput + pdata[19] >= max_nonce) {`
			`pdata[19] = max_nonce;`
			`break;`
			`}`

Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago			`pdata[19] += throughput;`

never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`} while (!work_restart[thr_id].restart);`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago
never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`*hashes_done = pdata[19] - first_nonce;`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago			`return 0;`
			`}`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago
			`// cleanup`
			`extern "C" void free_s3(int thr_id)`
			`{`
			`if (!init[thr_id])`
			`return;`

benchmark: enhance the mem leak detection reduce "false" warnings, and ignore unrelated/small ones <= 1 MB On windows the gpu memory can be allocated by other processes + some cleanup in algos... (free/gpulog) 9 years ago			`cudaThreadSynchronize();`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago
			`cudaFree(d_hash[thr_id]);`
algos: free allocated mem for algo switch All can be freed propertly now, except script (reset) and lyra2 (leak) 9 years ago			`x11_simd512_cpu_free(thr_id);`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago
			`cuda_check_cpu_free(thr_id);`
			`init[thr_id] = false;`

			`cudaDeviceSynchronize();`
benchmark: enhance the mem leak detection reduce "false" warnings, and ignore unrelated/small ones <= 1 MB On windows the gpu memory can be allocated by other processes + some cleanup in algos... (free/gpulog) 9 years ago			`}`