ccminer-gostd-lite/x11/s3.cu

/**
 * S3 Hash (Also called 3S - Used by 1Coin)
 */

extern "C" {
#include "sph/sph_skein.h"
#include "sph/sph_shavite.h"
#include "sph/sph_simd.h"
}

#include "miner.h"
#include "cuda_helper.h"

#include <stdint.h>

// Per-thread device hash buffers, indexed by thr_id. scanhash_s3 allocates
// 16 uint32_t words per scanned nonce here and hands the same buffer to every
// GPU stage in turn; free_s3 releases it.
static uint32_t *d_hash[MAX_GPUS];

// SHAvite-512 GPU stage. scanhash_s3 runs it first: setBlock_80 receives the
// 20-word big-endian encoded header, hash_80 fills d_hash for `threads` nonces
// starting at startNounce.
extern void x11_shavite512_cpu_init(int thr_id, uint32_t threads);
extern void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void x11_shavite512_setBlock_80(void *pdata);

// SIMD-512 GPU stage, run second on d_hash. scanhash_s3 passes NULL for
// d_nonceVector and ignores the int returned by the init function.
extern int  x11_simd512_cpu_init(int thr_id, uint32_t threads);
extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

// Skein-512 GPU stage, run last on d_hash (also called with a NULL d_nonceVector).
extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
/**
 * CPU reference implementation of the S3 hash.
 *
 * Chains three 512-bit hashes: SHAvite-512 over the first 80 bytes of
 * `input`, then SIMD-512 and Skein-512, each over the previous 64-byte
 * digest. The first 32 bytes of the final digest are copied to `output`.
 *
 * @param output destination buffer, receives 32 bytes
 * @param input  source buffer, 80 bytes are read
 */
extern "C" void s3hash(void *output, const void *input)
{
	// Single 64-byte scratch buffer reused as both input and output of the chained stages
	unsigned char digest[64];

	{
		// Stage 1: SHAvite-512 over the 80-byte input
		sph_shavite512_context shavite;
		sph_shavite512_init(&shavite);
		sph_shavite512(&shavite, input, 80);
		sph_shavite512_close(&shavite, digest);
	}

	{
		// Stage 2: SIMD-512 over the previous digest
		sph_simd512_context simd;
		sph_simd512_init(&simd);
		sph_simd512(&simd, digest, sizeof(digest));
		sph_simd512_close(&simd, digest);
	}

	{
		// Stage 3: Skein-512 over the previous digest
		sph_skein512_context skein;
		sph_skein512_init(&skein);
		sph_skein512(&skein, digest, sizeof(digest));
		sph_skein512_close(&skein, digest);
	}

	// Only the first 256 bits are handed back to the caller
	memcpy(output, digest, 32);
}

// Per-thr_id flag: set by scanhash_s3 once its GPU state has been set up, cleared by free_s3.
static bool init[MAX_GPUS] = { 0 };

/**
 * Main S3 entry point: scans a nonce range on the GPU and validates
 * candidates on the CPU with s3hash().
 *
 * The starting nonce is read from work->data[19]. On the first call for a
 * given thr_id the GPU stages and the device hash buffer are set up; they are
 * sized with the full device throughput (not the throughput clamped to the
 * current nonce range), so that later calls with a wider range never launch
 * more threads than the buffers can hold.
 *
 * @param thr_id      mining thread index (also indexes device_map / d_hash / init)
 * @param work        work item; data[19] is advanced as nonces are scanned,
 *                    data[19] / data[21] receive the found nonce(s)
 * @param max_nonce   upper bound of the nonce range to scan
 * @param hashes_done out: number of nonces processed, relative to the first nonce
 * @return 0 when no valid nonce was found (the CUDA_CALL_OR_RET_X allocation
 *         check is also passed 0 as its return value), 1 when one nonce was
 *         found and validated, 2 when a second nonce was found as well.
 */
extern "C" int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
	uint32_t *pdata = work->data;
	uint32_t *ptarget = work->target;
	const uint32_t first_nonce = pdata[19];
	int intensity = 20; // 256*256*8*2;
#ifdef WIN32
	// reduce by one the intensity on windows
	intensity--;
#endif
	// Unclamped per-launch thread count: used to size every device allocation
	const uint32_t max_throughput = device_intensity(thr_id, __func__, 1 << intensity);
	// Per-launch thread count for this call, limited to the remaining nonce range
	uint32_t throughput = min(max_throughput, max_nonce - first_nonce);

	if (opt_benchmark)
		((uint32_t*)ptarget)[7] = 0xF;

	if (!init[thr_id])
	{
		cudaSetDevice(device_map[thr_id]);

		// Size everything with max_throughput: a short first range must not
		// leave the buffers too small for the following, larger ranges.
		x11_shavite512_cpu_init(thr_id, max_throughput);
		x11_simd512_cpu_init(thr_id, max_throughput);
		quark_skein512_cpu_init(thr_id, max_throughput);

		// 16 x 32-bit words (one 512-bit hash) per thread
		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * max_throughput), 0);

		cuda_check_cpu_init(thr_id, max_throughput);

		init[thr_id] = true;
	}

	// Big-endian encoded copy of the 20-word header, shared by GPU setup and CPU validation
	uint32_t endiandata[20];
	for (int k=0; k < 20; k++)
		be32enc(&endiandata[k], pdata[k]);

	x11_shavite512_setBlock_80((void*)endiandata);
	cuda_check_cpu_setTarget(ptarget);

	// The target does not change while scanning
	const uint32_t Htarg = ptarget[7];

	do {
		uint32_t foundNonce;
		int order = 0;

		x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

		*hashes_done = pdata[19] - first_nonce + throughput;

		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);

		if (foundNonce != UINT32_MAX)
		{
			// Recompute the candidate on the CPU before trusting the GPU result
			uint32_t vhash64[8];
			be32enc(&endiandata[19], foundNonce);
			s3hash(vhash64, endiandata);

			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
				int res = 1;
				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
				bn_store_hash_target_ratio(vhash64, ptarget, work);
				if (secNonce != 0) {
					be32enc(&endiandata[19], secNonce);
					s3hash(vhash64, endiandata);
					// Keep the better of the two share ratios
					if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio)
						bn_store_hash_target_ratio(vhash64, ptarget, work);
					pdata[21] = secNonce;
					res++;
				}
				pdata[19] = foundNonce;
				return res;

			} else {
				applog(LOG_WARNING, "GPU #%d: result for nonce $%08X does not validate on CPU!", device_map[thr_id], foundNonce);
			}
		}

		// Test the end of the range in 64 bits: a plain 32-bit
		// "pdata[19] += throughput" can wrap past UINT32_MAX, which would
		// keep the loop running and rescan from low nonces.
		if ((uint64_t) pdata[19] + throughput >= max_nonce) {
			pdata[19] = max_nonce;
			break;
		}

		pdata[19] += throughput;

	} while (!work_restart[thr_id].restart);

	*hashes_done = pdata[19] - first_nonce + 1;
	return 0;
}

/**
 * Cleanup counterpart of scanhash_s3 for one thread.
 *
 * Does nothing unless init[thr_id] is set. Otherwise selects the thread's
 * device, frees its d_hash buffer, calls cuda_check_cpu_free(), clears the
 * init flag and finally synchronizes the device.
 */
extern "C" void free_s3(int thr_id)
{
	if (init[thr_id])
	{
		cudaSetDevice(device_map[thr_id]);

		cudaFree(d_hash[thr_id]);
		cuda_check_cpu_free(thr_id);

		// Mark the thread as uninitialised so a later scanhash_s3 sets it up again
		init[thr_id] = false;

		cudaDeviceSynchronize();
	}
}
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago			`/**`
			`* S3 Hash (Also called 3S - Used by 1Coin)`
			`*/`

			`extern "C" {`
			`#include "sph/sph_skein.h"`
			`#include "sph/sph_shavite.h"`
			`#include "sph/sph_simd.h"`
			`}`

			`#include "miner.h"`
			`#include "cuda_helper.h"`

			`#include <stdint.h>`

Handle a maximum of 16 gpus (vs 8 before) Some cards have 2 gpus on board... 10 years ago			`static uint32_t *d_hash[MAX_GPUS];`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago
cleanup: use unsigned throughput parameters Yes, its a big commit, was waiting 1.6 to do that... Sorry for your possible merge issues ;) 10 years ago			`extern void x11_shavite512_cpu_init(int thr_id, uint32_t threads);`
			`extern void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago			`extern void x11_shavite512_setBlock_80(void *pdata);`

cleanup: use unsigned throughput parameters Yes, its a big commit, was waiting 1.6 to do that... Sorry for your possible merge issues ;) 10 years ago			`extern int x11_simd512_cpu_init(int thr_id, uint32_t threads);`
			`extern void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago
cleanup: use unsigned throughput parameters Yes, its a big commit, was waiting 1.6 to do that... Sorry for your possible merge issues ;) 10 years ago			`extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);`
			`extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago
			`/* CPU HASH */`
			`extern "C" void s3hash(void *output, const void *input)`
			`{`
			`sph_shavite512_context ctx_shavite;`
			`sph_simd512_context ctx_simd;`
			`sph_skein512_context ctx_skein;`

			`unsigned char hash[64];`

			`sph_shavite512_init(&ctx_shavite);`
			`sph_shavite512(&ctx_shavite, input, 80);`
			`sph_shavite512_close(&ctx_shavite, (void*) hash);`

			`sph_simd512_init(&ctx_simd);`
			`sph_simd512(&ctx_simd, (const void*) hash, 64);`
			`sph_simd512_close(&ctx_simd, (void*) hash);`

			`sph_skein512_init(&ctx_skein);`
			`sph_skein512(&ctx_skein, (const void*) hash, 64);`
			`sph_skein512_close(&ctx_skein, (void*) hash);`

			`memcpy(output, hash, 32);`
			`}`

Handle a maximum of 16 gpus (vs 8 before) Some cards have 2 gpus on board... 10 years ago			`static bool init[MAX_GPUS] = { 0 };`
various small changes heavy: reduce by 256 threads default intensity to all -i 20 cuda: put static thread init bools outside the code (made once) api: fix nvml header to build without 10 years ago
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago			`/* Main S3 entry point */`
start v1.7, apply new prototypes to all algos 9 years ago			`extern "C" int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago			`{`
start v1.7, apply new prototypes to all algos 9 years ago			`uint32_t *pdata = work->data;`
			`uint32_t *ptarget = work->target;`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago			`const uint32_t first_nonce = pdata[19];`
intensity: sign warnings fixes min(i,u) 10 years ago			`int intensity = 20; // 256*256*8*2;`
Prepare trap of hardware/mem failures 10 years ago			`#ifdef WIN32`
			`// reduce by one the intensity on windows`
			`intensity--;`
s3: reduce a bit the intensity on windows 10 years ago			`#endif`
cleanup: use unsigned throughput parameters Yes, its a big commit, was waiting 1.6 to do that... Sorry for your possible merge issues ;) 10 years ago			`uint32_t throughput = device_intensity(thr_id, __func__, 1 << intensity);`
			`throughput = min(throughput, max_nonce - first_nonce);`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago
			`if (opt_benchmark)`
			`((uint32_t*)ptarget)[7] = 0xF;`

			`if (!init[thr_id])`
			`{`
Prepare trap of hardware/mem failures 10 years ago			`cudaSetDevice(device_map[thr_id]);`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago
			`x11_shavite512_cpu_init(thr_id, throughput);`
			`x11_simd512_cpu_init(thr_id, throughput);`
			`quark_skein512_cpu_init(thr_id, throughput);`

Prepare trap of hardware/mem failures 10 years ago			`CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0);`

Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago			`cuda_check_cpu_init(thr_id, throughput);`

			`init[thr_id] = true;`
			`}`

			`uint32_t endiandata[20];`
			`for (int k=0; k < 20; k++)`
remove uint32_t cast 10 years ago			`be32enc(&endiandata[k], pdata[k]);`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago
			`x11_shavite512_setBlock_80((void*)endiandata);`
			`cuda_check_cpu_setTarget(ptarget);`

			`do {`
			`const uint32_t Htarg = ptarget[7];`
			`uint32_t foundNonce;`
			`int order = 0;`

			`x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);`
			`x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);`
			`quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);`

start v1.7, apply new prototypes to all algos 9 years ago			`*hashes_done = pdata[19] - first_nonce + throughput;`

checkhash: simplify the common function use klaus trivial function, the old code has always been a bit weird.. split cuda_check_cpu_hash_64 in two functions, keep old for branched stuff 10 years ago			`foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago
Check and submit multiple nonces in one loop Added to most algos, checkhash function scans a big range and can find multiple nonces at once if the difficulty is low. Stop ignoring them, submit second one if found... Clean the draft code for rc=2 implemented for blake and pentablake btw... fix the reduced displayed hashrate when a nonce is found... Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`if (foundNonce != UINT32_MAX)`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago			`{`
			`uint32_t vhash64[8];`
			`be32enc(&endiandata[19], foundNonce);`
			`s3hash(vhash64, endiandata);`

			`if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {`
Check and submit multiple nonces in one loop Added to most algos, checkhash function scans a big range and can find multiple nonces at once if the difficulty is low. Stop ignoring them, submit second one if found... Clean the draft code for rc=2 implemented for blake and pentablake btw... fix the reduced displayed hashrate when a nonce is found... Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`int res = 1;`
			`uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);`
start v1.7, apply new prototypes to all algos 9 years ago			`bn_store_hash_target_ratio(vhash64, ptarget, work);`
Check and submit multiple nonces in one loop Added to most algos, checkhash function scans a big range and can find multiple nonces at once if the difficulty is low. Stop ignoring them, submit second one if found... Clean the draft code for rc=2 implemented for blake and pentablake btw... fix the reduced displayed hashrate when a nonce is found... Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`if (secNonce != 0) {`
start v1.7, apply new prototypes to all algos 9 years ago			`be32enc(&endiandata[19], secNonce);`
			`s3hash(vhash64, endiandata);`
			`if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio)`
			`bn_store_hash_target_ratio(vhash64, ptarget, work);`
Check and submit multiple nonces in one loop Added to most algos, checkhash function scans a big range and can find multiple nonces at once if the difficulty is low. Stop ignoring them, submit second one if found... Clean the draft code for rc=2 implemented for blake and pentablake btw... fix the reduced displayed hashrate when a nonce is found... Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`pdata[21] = secNonce;`
			`res++;`
			`}`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago			`pdata[19] = foundNonce;`
Check and submit multiple nonces in one loop Added to most algos, checkhash function scans a big range and can find multiple nonces at once if the difficulty is low. Stop ignoring them, submit second one if found... Clean the draft code for rc=2 implemented for blake and pentablake btw... fix the reduced displayed hashrate when a nonce is found... Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`return res;`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago
			`} else {`
warnings: use the right device id (device_map[thr_id]) 10 years ago			`applog(LOG_WARNING, "GPU #%d: result for nonce $%08X does not validate on CPU!", device_map[thr_id], foundNonce);`
Add S3 Algo (1Coin) Simple addition of the algo using existing X11 code 10 years ago			`}`
			`}`

			`pdata[19] += throughput;`

			`} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);`

			`*hashes_done = pdata[19] - first_nonce + 1;`
			`return 0;`
			`}`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago
			`// cleanup`
			`extern "C" void free_s3(int thr_id)`
			`{`
			`if (!init[thr_id])`
			`return;`

			`cudaSetDevice(device_map[thr_id]);`

			`cudaFree(d_hash[thr_id]);`

			`cuda_check_cpu_free(thr_id);`
			`init[thr_id] = false;`

			`cudaDeviceSynchronize();`
			`}`