ccminer-gostd-lite/skein2.cpp

/**
 * SKEIN512 80 + SKEIN512 64 (Woodcoin)
 * by tpruvot@github - 2015
 */
#include <string.h>

#include "sph/sph_skein.h"

#include "miner.h"
#include "cuda_helper.h"

// Device hash buffers, one slot per miner thread (indexed by thr_id).
// Allocated with cudaMalloc in scanhash_skein2() (64 bytes per cuda thread)
// and released in free_skein2().
static uint32_t *d_hash[MAX_GPUS];

// First pass (80-byte input); implemented outside this file.
// setBlock is given the encoded block header, hash_80 is launched per nonce batch.
extern void skein512_cpu_setBlock_80(void *pdata);
extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);

// Second pass (64-byte input, applied to the d_hash buffer); implemented outside this file.
extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

void skein2hash(void *output, const void *input)
{
	uint32_t _ALIGN(64) hash[16];
	sph_skein512_context ctx_skein;

	sph_skein512_init(&ctx_skein);
	sph_skein512(&ctx_skein, input, 80);
	sph_skein512_close(&ctx_skein, hash);

	sph_skein512_init(&ctx_skein);
	sph_skein512(&ctx_skein, hash, 64);
	sph_skein512_close(&ctx_skein, hash);

	memcpy(output, (void*) hash, 32);
}

// Per-thread "already initialized" flags (indexed by thr_id): set at the end of
// the first-call setup in scanhash_skein2(), cleared again by free_skein2().
static bool init[MAX_GPUS] = { 0 };

// Conditionally byte-swap a 32-bit value: returns swab32(val) when
// `iftrue` is set, otherwise returns `val` untouched.
static __inline uint32_t swab32_if(uint32_t val, bool iftrue) {
	if (iftrue)
		return swab32(val);
	return val;
}

/**
 * Scan a nonce range for the skein2 algo (SKEIN512 80 + SKEIN512 64) on the GPU.
 *
 * On the first call for a given thr_id the device is selected and the hash
 * buffer d_hash[thr_id] is allocated. The function then hashes batches of
 * `throughput` nonces starting at work->data[19] until a candidate validates
 * on the CPU, max_nonce is reached, or a work restart is requested.
 *
 * @param thr_id      miner thread index (indexes device_map, d_hash, init, work_restart)
 * @param work        work item; data[19] holds the start nonce and receives the
 *                    found nonce, data[21] receives a second nonce if one is found
 * @param max_nonce   upper bound of the nonce range to scan
 * @param hashes_done receives the number of nonces scanned by this call
 * @return 1 or 2 when that many validated nonces were stored in work->data,
 *         0 when the range was exhausted or the work was restarted
 */
int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
	int dev_id = device_map[thr_id];
	uint32_t *pdata = work->data;
	uint32_t *ptarget = work->target;
	const uint32_t first_nonce = pdata[19];
	const int swap = 1; // to toggle nonce endian

	uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 256*256*8
	// Only clamp after init: the first call sizes the device buffer, so it
	// must use the full throughput.
	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);

	if (opt_benchmark)
		ptarget[7] = 0;

	if (!init[thr_id])
	{
		cudaSetDevice(dev_id);
		if (opt_cudaschedule == -1 && gpu_threads == 1) {
			cudaDeviceReset();
			// reduce cpu usage
			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
			CUDA_LOG_ERROR();
		}
		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);

		// 64 bytes of hash output per cuda thread. The result is checked: an
		// unnoticed allocation failure would leave d_hash[thr_id] unusable
		// while init[thr_id] is still set to true below.
		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));

		quark_skein512_cpu_init(thr_id, throughput);
		cuda_check_cpu_init(thr_id, throughput);

		CUDA_SAFE_CALL(cudaDeviceSynchronize());

		init[thr_id] = true;
	}

	// Only words 0..18 are encoded from the work data; word 19 (the nonce) is
	// zero-initialized so the buffer handed to skein512_cpu_setBlock_80() is
	// fully defined. It is filled in before each CPU validation below.
	uint32_t endiandata[20] = { 0 };
	for (int k=0; k < 19; k++)
		be32enc(&endiandata[k], pdata[k]);

	skein512_cpu_setBlock_80((void*)endiandata);
	cuda_check_cpu_setTarget(ptarget);

	do {
		int order = 0;

		// Hash with CUDA
		skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], swap);
		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

		// counts the batch just hashed, in case we return from inside the loop
		*hashes_done = pdata[19] - first_nonce + throughput;

		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
		if (foundNonce != UINT32_MAX)
		{
			uint32_t _ALIGN(64) vhash64[8];

			// re-hash the candidate on the CPU before reporting it
			endiandata[19] = swab32_if(foundNonce, swap);
			skein2hash(vhash64, endiandata);

			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
				int res = 1;
				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
				work_set_target_ratio(work, vhash64);
				if (secNonce != 0) {
					if (!opt_quiet)
						applog(LOG_BLUE, "GPU #%d: found second nonce %08x !", dev_id, swab32(secNonce));

					endiandata[19] = swab32_if(secNonce, swap);
					skein2hash(vhash64, endiandata);
					// keep the better of the two share ratios
					if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio)
						work_set_target_ratio(work, vhash64);
					pdata[21] = swab32_if(secNonce, !swap);
					res++;
				}
				pdata[19] = swab32_if(foundNonce, !swap);
				return res;
			} else {
				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
			}
		}

		// 64-bit sum so that the range check cannot wrap around
		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
			pdata[19] = max_nonce;
			break;
		}

		pdata[19] += throughput;

	} while (!work_restart[thr_id].restart);

	*hashes_done = pdata[19] - first_nonce;

	return 0;
}

// cleanup
/**
 * Release the resources set up by scanhash_skein2() for one miner thread.
 *
 * Does nothing when the thread was never initialized. Otherwise frees the
 * device hash buffer, calls cuda_check_cpu_free() and clears the init flag so
 * that the next scanhash_skein2() call allocates again.
 *
 * @param thr_id miner thread index
 */
void free_skein2(int thr_id)
{
	if (!init[thr_id])
		return;

	// cudaThreadSynchronize() is deprecated in the CUDA runtime API;
	// cudaDeviceSynchronize() is its documented replacement.
	cudaDeviceSynchronize();

	cudaFree(d_hash[thr_id]);
	// do not keep a dangling device pointer around
	d_hash[thr_id] = NULL;

	cuda_check_cpu_free(thr_id);
	init[thr_id] = false;

	cudaDeviceSynchronize();
}
update README, small changes, prepare release 1.6.1 still need a SM 3.0 fix for skein... 10 years ago			`/**`
			`* SKEIN512 80 + SKEIN512 64 (Woodcoin)`
			`* by tpruvot@github - 2015`
			`*/`
rename skein2 to c++, no cuda kernel code and some other changes... 10 years ago			`#include <string.h>`
update README, small changes, prepare release 1.6.1 still need a SM 3.0 fix for skein... 10 years ago
skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`#include "sph/sph_skein.h"`

			`#include "miner.h"`
			`#include "cuda_helper.h"`

			`static uint32_t *d_hash[MAX_GPUS];`

			`extern void skein512_cpu_setBlock_80(void *pdata);`
update README, small changes, prepare release 1.6.1 still need a SM 3.0 fix for skein... 10 years ago			`extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);`
skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago
algos: free allocated mem for algo switch All can be freed propertly now, except script (reset) and lyra2 (leak) 9 years ago			`extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);`
skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);`

rename skein2 to c++, no cuda kernel code and some other changes... 10 years ago			`void skein2hash(void *output, const void *input)`
skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`{`
update README, small changes, prepare release 1.6.1 still need a SM 3.0 fix for skein... 10 years ago			`uint32_t _ALIGN(64) hash[16];`
skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`sph_skein512_context ctx_skein;`

			`sph_skein512_init(&ctx_skein);`
			`sph_skein512(&ctx_skein, input, 80);`
			`sph_skein512_close(&ctx_skein, hash);`
update README, small changes, prepare release 1.6.1 still need a SM 3.0 fix for skein... 10 years ago
skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`sph_skein512_init(&ctx_skein);`
			`sph_skein512(&ctx_skein, hash, 64);`
			`sph_skein512_close(&ctx_skein, hash);`

rename skein2 to c++, no cuda kernel code and some other changes... 10 years ago			`memcpy(output, (void*) hash, 32);`
skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`}`

			`static bool init[MAX_GPUS] = { 0 };`

fix duplicates on skein2 and blake2s (nonce endian) 9 years ago			`static __inline uint32_t swab32_if(uint32_t val, bool iftrue) {`
			`return iftrue ? swab32(val) : val;`
			`}`

start v1.7, apply new prototypes to all algos 9 years ago			`int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)`
skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`{`
algos: free allocated mem for algo switch All can be freed propertly now, except script (reset) and lyra2 (leak) 9 years ago			`int dev_id = device_map[thr_id];`
start v1.7, apply new prototypes to all algos 9 years ago			`uint32_t *pdata = work->data;`
			`uint32_t *ptarget = work->target;`
skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`const uint32_t first_nonce = pdata[19];`
fix duplicates on skein2 and blake2s (nonce endian) 9 years ago			`const int swap = 1; // to toggle nonce endian`
skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago
intensity: do not reduce throughput before init Else the memory allocated could be less than required later btw, use the new "cuda" function to apply intensity/throughput 9 years ago			`uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 256*256*8`
			`if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);`
skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago
			`if (opt_benchmark)`
update README, small changes, prepare release 1.6.1 still need a SM 3.0 fix for skein... 10 years ago			`((uint32_t*)ptarget)[7] = 0;`
skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago
			`if (!init[thr_id])`
			`{`
algos: free allocated mem for algo switch All can be freed propertly now, except script (reset) and lyra2 (leak) 9 years ago			`cudaSetDevice(dev_id);`
1.7.1 release set schedule flags to reduce linux cpu usage without MyStreamSynchronize() 9 years ago			`if (opt_cudaschedule == -1 && gpu_threads == 1) {`
			`cudaDeviceReset();`
			`// reduce cpu usage`
			`cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);`
			`CUDA_LOG_ERROR();`
			`}`
Show intensity on init for all algos 8 years ago			`gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);`
skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago
algos: free allocated mem for algo switch All can be freed propertly now, except script (reset) and lyra2 (leak) 9 years ago			`cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput);`
Skein/Skein2 SM 3.0 devices support + code cleanup Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago
reset: take care of multi-threaded gpus (-d 0,0) to be tested... could create problems when reset in a chain like x11... 10 years ago			`quark_skein512_cpu_init(thr_id, throughput);`
			`cuda_check_cpu_init(thr_id, throughput);`
Skein/Skein2 SM 3.0 devices support + code cleanup Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago
reset: take care of multi-threaded gpus (-d 0,0) to be tested... could create problems when reset in a chain like x11... 10 years ago			`CUDA_SAFE_CALL(cudaDeviceSynchronize());`
skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago
			`init[thr_id] = true;`
			`}`

			`uint32_t endiandata[20];`
update README, small changes, prepare release 1.6.1 still need a SM 3.0 fix for skein... 10 years ago			`for (int k=0; k < 19; k++)`
skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`be32enc(&endiandata[k], pdata[k]);`

			`skein512_cpu_setBlock_80((void*)endiandata);`
			`cuda_check_cpu_setTarget(ptarget);`

			`do {`
			`int order = 0;`

			`// Hash with CUDA`
fix duplicates on skein2 and blake2s (nonce endian) 9 years ago			`skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], swap);`
skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);`
update README, small changes, prepare release 1.6.1 still need a SM 3.0 fix for skein... 10 years ago
			`*hashes_done = pdata[19] - first_nonce + throughput;`
skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago
			`uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);`
			`if (foundNonce != UINT32_MAX)`
			`{`
update README, small changes, prepare release 1.6.1 still need a SM 3.0 fix for skein... 10 years ago			`uint32_t _ALIGN(64) vhash64[8];`
skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago
fix duplicates on skein2 and blake2s (nonce endian) 9 years ago			`endiandata[19] = swab32_if(foundNonce, swap);`
skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`skein2hash(vhash64, endiandata);`

update README, small changes, prepare release 1.6.1 still need a SM 3.0 fix for skein... 10 years ago			`if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {`
skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`int res = 1;`
			`uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);`
diff: use the new function in all algos 9 years ago			`work_set_target_ratio(work, vhash64);`
skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`if (secNonce != 0) {`
			`if (!opt_quiet)`
algos: free allocated mem for algo switch All can be freed propertly now, except script (reset) and lyra2 (leak) 9 years ago			`applog(LOG_BLUE, "GPU #%d: found second nonce %08x !", dev_id, swab32(secNonce));`
start v1.7, apply new prototypes to all algos 9 years ago
fix duplicates on skein2 and blake2s (nonce endian) 9 years ago			`endiandata[19] = swab32_if(secNonce, swap);`
start v1.7, apply new prototypes to all algos 9 years ago			`skein2hash(vhash64, endiandata);`
			`if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio)`
diff: use the new function in all algos 9 years ago			`work_set_target_ratio(work, vhash64);`
fix duplicates on skein2 and blake2s (nonce endian) 9 years ago			`pdata[21] = swab32_if(secNonce, !swap);`
skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`res++;`
			`}`
fix duplicates on skein2 and blake2s (nonce endian) 9 years ago			`pdata[19] = swab32_if(foundNonce, !swap);`
skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`return res;`
warnings: use the right device id (device_map[thr_id]) 10 years ago			`} else {`
benchmark: enhance the mem leak detection reduce "false" warnings, and ignore unrelated/small ones <= 1 MB On windows the gpu memory can be allocated by other processes + some cleanup in algos... (free/gpulog) 9 years ago			`gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);`
skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`}`
			`}`

never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`if ((uint64_t) throughput + pdata[19] >= max_nonce) {`
rename skein2 to c++, no cuda kernel code and some other changes... 10 years ago			`pdata[19] = max_nonce;`
update README, small changes, prepare release 1.6.1 still need a SM 3.0 fix for skein... 10 years ago			`break;`
			`}`

skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`pdata[19] += throughput;`

update README, small changes, prepare release 1.6.1 still need a SM 3.0 fix for skein... 10 years ago			`} while (!work_restart[thr_id].restart);`
skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago
never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`*hashes_done = pdata[19] - first_nonce;`

skein2 algo for woodcoin Also known as Double Skein Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`return 0;`
			`}`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago
			`// cleanup`
			`void free_skein2(int thr_id)`
			`{`
			`if (!init[thr_id])`
			`return;`

benchmark: enhance the mem leak detection reduce "false" warnings, and ignore unrelated/small ones <= 1 MB On windows the gpu memory can be allocated by other processes + some cleanup in algos... (free/gpulog) 9 years ago			`cudaThreadSynchronize();`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago
			`cudaFree(d_hash[thr_id]);`

			`cuda_check_cpu_free(thr_id);`
			`init[thr_id] = false;`

			`cudaDeviceSynchronize();`
algos: free allocated mem for algo switch All can be freed propertly now, except script (reset) and lyra2 (leak) 9 years ago			`}`