ccminer-gostd-lite/qubit/luffa.cu

/*
 * luffa 80 algo (Introduced by Doomcoin)
 */
extern "C" {
#include "sph/sph_luffa.h"
}

#include "miner.h"

#include "cuda_helper.h"

static uint32_t *d_hash[MAX_GPUS];

extern void qubit_luffa512_cpu_init(int thr_id, uint32_t threads);
extern void qubit_luffa512_cpu_setBlock_80(void *pdata);
extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);

/*
 * CPU reference hash: runs Luffa-512 over the first 80 bytes of `input`
 * and copies the first 32 bytes of the 64-byte digest into `state`.
 */
extern "C" void luffa_hash(void *state, const void *input)
{
	sph_luffa512_context luffa;
	uint8_t _ALIGN(64) digest[64];

	sph_luffa512_init(&luffa);
	sph_luffa512(&luffa, input, 80);
	sph_luffa512_close(&luffa, (void*) digest);

	// only the leading half of the digest is handed back to the caller
	memcpy(state, digest, 32);
}

static bool init[MAX_GPUS] = { 0 };

/*
 * Scan nonces for the luffa-80 algo, starting at work->data[19].
 *
 * thr_id      index used for the per-thread state (init flag, d_hash buffer,
 *             device_map entry, work_restart flag)
 * work        work item; data[19] is used as the nonce cursor and is advanced,
 *             nonces[] / valid_nonces receive the results
 * max_nonce   upper bound of the range; the cursor is set to it when reached
 * hashes_done out: number of hashes counted since the cursor value at entry
 *
 * Returns work->valid_nonces when a CPU-validated nonce is found, otherwise 0
 * once max_nonce is reached or a restart is flagged for this thread.
 */
extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
	uint32_t _ALIGN(64) endiandata[20];
	uint32_t *pdata = work->data;
	uint32_t *ptarget = work->target;
	const uint32_t first_nonce = pdata[19];

	uint32_t throughput = cuda_default_throughput(thr_id, 1U << 21);
	// The clamp is skipped on the first call, where throughput sizes the device allocation.
	if (init[thr_id])
		throughput = min(throughput, max_nonce - first_nonce);

	if (opt_benchmark)
		ptarget[7] = 0x0000f;

	// one-time per-thread device setup
	if (!init[thr_id])
	{
		cudaSetDevice(device_map[thr_id]);
		if (opt_cudaschedule == -1 && gpu_threads == 1) {
			cudaDeviceReset();
			// reduce cpu usage
			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
			CUDA_LOG_ERROR();
		}
		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);

		// 64 bytes of output per cuda thread
		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));

		qubit_luffa512_cpu_init(thr_id, throughput);
		cuda_check_cpu_init(thr_id, throughput);

		init[thr_id] = true;
	}

	// words 0..18 of the header; word 19 (nonce) is only filled in when a candidate is re-checked
	for (int word = 0; word < 19; ++word)
		be32enc(endiandata + word, pdata[word]);

	qubit_luffa512_cpu_setBlock_80((void*)endiandata);
	cuda_check_cpu_setTarget(ptarget);

	do {
		qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 0);

		*hashes_done = pdata[19] - first_nonce + throughput;

		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
		if (work->nonces[0] != UINT32_MAX)
		{
			// a candidate was reported: recompute its hash on the CPU
			const uint32_t target_hi = ptarget[7];
			uint32_t _ALIGN(64) cpu_hash[8];
			be32enc(&endiandata[19], work->nonces[0]);
			luffa_hash(cpu_hash, endiandata);

			if (cpu_hash[7] <= target_hi && fulltest(cpu_hash, ptarget)) {
				work->valid_nonces = 1;
				work_set_target_ratio(work, cpu_hash);
				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
				if (work->nonces[1] == 0) {
					// no second candidate: resume right after the first one
					pdata[19] = work->nonces[0] + 1; // cursor
					return work->valid_nonces;
				}
				be32enc(&endiandata[19], work->nonces[1]);
				luffa_hash(cpu_hash, endiandata);
				bn_set_target_ratio(work, cpu_hash, 1);
				work->valid_nonces++;
				pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
				return work->valid_nonces;
			}

			if (cpu_hash[7] > target_hi) {
				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
				pdata[19] = work->nonces[0] + 1;
				// jumps to the loop condition, skipping the range check below
				continue;
			}
		}

		// 64-bit sum so the comparison cannot wrap
		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
			pdata[19] = max_nonce;
			break;
		}
		pdata[19] += throughput;

	} while (!work_restart[thr_id].restart);

	*hashes_done = pdata[19] - first_nonce;
	return 0;
}

// cleanup
/*
 * Releases the per-thread resources set up by scanhash_luffa: the d_hash
 * device buffer and whatever cuda_check_cpu_free() owns for this thread.
 * Does nothing when the thread was never initialised (init flag unset).
 */
extern "C" void free_luffa(int thr_id)
{
	if (!init[thr_id])
		return;

	// cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is the
	// replacement and is already what this function uses on exit
	cudaDeviceSynchronize();

	cudaFree(d_hash[thr_id]);
	// do not keep a dangling device pointer around after the free
	d_hash[thr_id] = NULL;

	cuda_check_cpu_free(thr_id);

	init[thr_id] = false;
	cudaDeviceSynchronize();
}
Import djm34 qubit, deep and doom algos Indent, and put commonly used functions proto. in cuda_helper.h And add them to --cputest function Also change the color option to --nocolor, -C is no more needed Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> (Which is tired to remove these german copy/pasted comments) 10 years ago			`/*`
remove double reference to luffa algo doomcoin is dead but the luffa algo is still used (Joincoin).. keep doom as alias for compat... rename functions... 10 years ago			`* luffa 80 algo (Introduced by Doomcoin)`
Import djm34 qubit, deep and doom algos Indent, and put commonly used functions proto. in cuda_helper.h And add them to --cputest function Also change the color option to --nocolor, -C is no more needed Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> (Which is tired to remove these german copy/pasted comments) 10 years ago			`*/`
			`extern "C" {`
			`#include "sph/sph_luffa.h"`
			`}`

			`#include "miner.h"`

			`#include "cuda_helper.h"`

Handle a maximum of 16 gpus (vs 8 before) Some cards have 2 gpus on board... 10 years ago			`static uint32_t *d_hash[MAX_GPUS];`
Import djm34 qubit, deep and doom algos Indent, and put commonly used functions proto. in cuda_helper.h And add them to --cputest function Also change the color option to --nocolor, -C is no more needed Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> (Which is tired to remove these german copy/pasted comments) 10 years ago
cleanup: use unsigned throughput parameters Yes, its a big commit, was waiting 1.6 to do that... Sorry for your possible merge issues ;) 10 years ago			`extern void qubit_luffa512_cpu_init(int thr_id, uint32_t threads);`
Import djm34 qubit, deep and doom algos Indent, and put commonly used functions proto. in cuda_helper.h And add them to --cputest function Also change the color option to --nocolor, -C is no more needed Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> (Which is tired to remove these german copy/pasted comments) 10 years ago			`extern void qubit_luffa512_cpu_setBlock_80(void *pdata);`
cleanup: use unsigned throughput parameters Yes, its a big commit, was waiting 1.6 to do that... Sorry for your possible merge issues ;) 10 years ago			`extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);`
Import djm34 qubit, deep and doom algos Indent, and put commonly used functions proto. in cuda_helper.h And add them to --cputest function Also change the color option to --nocolor, -C is no more needed Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> (Which is tired to remove these german copy/pasted comments) 10 years ago
remove double reference to luffa algo doomcoin is dead but the luffa algo is still used (Joincoin).. keep doom as alias for compat... rename functions... 10 years ago			`extern "C" void luffa_hash(void *state, const void *input)`
Import djm34 qubit, deep and doom algos Indent, and put commonly used functions proto. in cuda_helper.h And add them to --cputest function Also change the color option to --nocolor, -C is no more needed Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> (Which is tired to remove these german copy/pasted comments) 10 years ago			`{`
qubit: implement cpu precalc (klaust) improve qubit (+5%) deep and doom (+10%) hashrate based on klausT code, simplified... 10 years ago			`uint8_t _ALIGN(64) hash[64];`
Import djm34 qubit, deep and doom algos Indent, and put commonly used functions proto. in cuda_helper.h And add them to --cputest function Also change the color option to --nocolor, -C is no more needed Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> (Which is tired to remove these german copy/pasted comments) 10 years ago
qubit: implement cpu precalc (klaust) improve qubit (+5%) deep and doom (+10%) hashrate based on klausT code, simplified... 10 years ago			`sph_luffa512_context ctx_luffa;`
Import djm34 qubit, deep and doom algos Indent, and put commonly used functions proto. in cuda_helper.h And add them to --cputest function Also change the color option to --nocolor, -C is no more needed Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> (Which is tired to remove these german copy/pasted comments) 10 years ago
			`sph_luffa512_init(&ctx_luffa);`
			`sph_luffa512 (&ctx_luffa, input, 80);`
			`sph_luffa512_close(&ctx_luffa, (void*) hash);`

			`memcpy(state, hash, 32);`
			`}`

Handle a maximum of 16 gpus (vs 8 before) Some cards have 2 gpus on board... 10 years ago			`static bool init[MAX_GPUS] = { 0 };`
Import djm34 qubit, deep and doom algos Indent, and put commonly used functions proto. in cuda_helper.h And add them to --cputest function Also change the color option to --nocolor, -C is no more needed Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> (Which is tired to remove these german copy/pasted comments) 10 years ago
start v1.7, apply new prototypes to all algos 9 years ago			`extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)`
Import djm34 qubit, deep and doom algos Indent, and put commonly used functions proto. in cuda_helper.h And add them to --cputest function Also change the color option to --nocolor, -C is no more needed Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> (Which is tired to remove these german copy/pasted comments) 10 years ago			`{`
qubit: implement cpu precalc (klaust) improve qubit (+5%) deep and doom (+10%) hashrate based on klausT code, simplified... 10 years ago			`uint32_t _ALIGN(64) endiandata[20];`
start v1.7, apply new prototypes to all algos 9 years ago			`uint32_t *pdata = work->data;`
			`uint32_t *ptarget = work->target;`
Import djm34 qubit, deep and doom algos Indent, and put commonly used functions proto. in cuda_helper.h And add them to --cputest function Also change the color option to --nocolor, -C is no more needed Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> (Which is tired to remove these german copy/pasted comments) 10 years ago			`const uint32_t first_nonce = pdata[19];`
never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`uint32_t throughput = cuda_default_throughput(thr_id, 1U << 21);`
intensity: do not reduce throughput before init Else the memory allocated could be less than required later btw, use the new "cuda" function to apply intensity/throughput 9 years ago			`if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);`
Import djm34 qubit, deep and doom algos Indent, and put commonly used functions proto. in cuda_helper.h And add them to --cputest function Also change the color option to --nocolor, -C is no more needed Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> (Which is tired to remove these german copy/pasted comments) 10 years ago
			`if (opt_benchmark)`
never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`ptarget[7] = 0x0000f;`
Import djm34 qubit, deep and doom algos Indent, and put commonly used functions proto. in cuda_helper.h And add them to --cputest function Also change the color option to --nocolor, -C is no more needed Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> (Which is tired to remove these german copy/pasted comments) 10 years ago
			`if (!init[thr_id])`
			`{`
			`cudaSetDevice(device_map[thr_id]);`
never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`if (opt_cudaschedule == -1 && gpu_threads == 1) {`
			`cudaDeviceReset();`
			`// reduce cpu usage`
			`cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);`
			`CUDA_LOG_ERROR();`
			`}`
Show intensity on init for all algos 8 years ago			`gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);`
Import djm34 qubit, deep and doom algos Indent, and put commonly used functions proto. in cuda_helper.h And add them to --cputest function Also change the color option to --nocolor, -C is no more needed Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> (Which is tired to remove these german copy/pasted comments) 10 years ago
warn on cuda errors + various small changes The full benchmark can now be launched with "ccminer --benchmark" add a new helper function which log a warning with last cuda error (not shown with the quiet option) : CUDA_LOG_ERROR(); it can be used where miner.h is included (.c/.cpp/.cu) fix x14 (in ccminer.cpp), a break was missing in switch..case 9 years ago			`CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));`
Import djm34 qubit, deep and doom algos Indent, and put commonly used functions proto. in cuda_helper.h And add them to --cputest function Also change the color option to --nocolor, -C is no more needed Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> (Which is tired to remove these german copy/pasted comments) 10 years ago
qubit: implement cpu precalc (klaust) improve qubit (+5%) deep and doom (+10%) hashrate based on klausT code, simplified... 10 years ago			`qubit_luffa512_cpu_init(thr_id, throughput);`
			`cuda_check_cpu_init(thr_id, throughput);`
Import djm34 qubit, deep and doom algos Indent, and put commonly used functions proto. in cuda_helper.h And add them to --cputest function Also change the color option to --nocolor, -C is no more needed Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> (Which is tired to remove these german copy/pasted comments) 10 years ago
			`init[thr_id] = true;`
			`}`

qubit: implement cpu precalc (klaust) improve qubit (+5%) deep and doom (+10%) hashrate based on klausT code, simplified... 10 years ago			`for (int k=0; k < 19; k++)`
remove uint32_t cast 10 years ago			`be32enc(&endiandata[k], pdata[k]);`
Import djm34 qubit, deep and doom algos Indent, and put commonly used functions proto. in cuda_helper.h And add them to --cputest function Also change the color option to --nocolor, -C is no more needed Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> (Which is tired to remove these german copy/pasted comments) 10 years ago
qubit: implement cpu precalc (klaust) improve qubit (+5%) deep and doom (+10%) hashrate based on klausT code, simplified... 10 years ago			`qubit_luffa512_cpu_setBlock_80((void*)endiandata);`
			`cuda_check_cpu_setTarget(ptarget);`
Import djm34 qubit, deep and doom algos Indent, and put commonly used functions proto. in cuda_helper.h And add them to --cputest function Also change the color option to --nocolor, -C is no more needed Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> (Which is tired to remove these german copy/pasted comments) 10 years ago
			`do {`
never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`qubit_luffa512_cpu_hash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], 0);`
qubit: implement cpu precalc (klaust) improve qubit (+5%) deep and doom (+10%) hashrate based on klausT code, simplified... 10 years ago
never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`*hashes_done = pdata[19] - first_nonce + throughput;`
Import djm34 qubit, deep and doom algos Indent, and put commonly used functions proto. in cuda_helper.h And add them to --cputest function Also change the color option to --nocolor, -C is no more needed Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> (Which is tired to remove these german copy/pasted comments) 10 years ago
migrate 2nd nonce storage of most algos This allow to keep pdata[19] as cursor between scans, and later, to sort them.. remains... heavy, scrypt, sia... 8 years ago			`work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);`
			`if (work->nonces[0] != UINT32_MAX)`
Import djm34 qubit, deep and doom algos Indent, and put commonly used functions proto. in cuda_helper.h And add them to --cputest function Also change the color option to --nocolor, -C is no more needed Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> (Which is tired to remove these german copy/pasted comments) 10 years ago			`{`
migrate 2nd nonce storage of most algos This allow to keep pdata[19] as cursor between scans, and later, to sort them.. remains... heavy, scrypt, sia... 8 years ago			`const uint32_t Htarg = ptarget[7];`
			`uint32_t _ALIGN(64) vhash[8];`
			`be32enc(&endiandata[19], work->nonces[0]);`
			`luffa_hash(vhash, endiandata);`

			`if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {`
			`work->valid_nonces = 1;`
			`work_set_target_ratio(work, vhash);`
			`work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);`
			`if (work->nonces[1] != 0) {`
			`be32enc(&endiandata[19], work->nonces[1]);`
			`luffa_hash(vhash, endiandata);`
			`bn_set_target_ratio(work, vhash, 1);`
			`work->valid_nonces++;`
			`pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;`
			`} else {`
			`pdata[19] = work->nonces[0] + 1; // cursor`
			`}`
			`return work->valid_nonces;`
			`}`
			`else if (vhash[7] > Htarg) {`
			`gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);`
			`pdata[19] = work->nonces[0] + 1;`
			`continue;`
Import djm34 qubit, deep and doom algos Indent, and put commonly used functions proto. in cuda_helper.h And add them to --cputest function Also change the color option to --nocolor, -C is no more needed Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> (Which is tired to remove these german copy/pasted comments) 10 years ago			`}`
			`}`

migrate 2nd nonce storage of most algos This allow to keep pdata[19] as cursor between scans, and later, to sort them.. remains... heavy, scrypt, sia... 8 years ago			`if ((uint64_t)throughput + pdata[19] >= max_nonce) {`
never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`pdata[19] = max_nonce;`
Add cuda error checks on qubit algos And rename doom to luffa, like djm34 10 years ago			`break;`
			`}`
Enhance stale work detection + throughput fixes seems to resolve solo mining lock on share. export also computed solo work diff in api (not perfect) In high rate algos, throughput should be unsigned... This fixes keccak, blake and doom problems And change terminal color of debug lines, to be selectable in putty, color code is not supported in windows but selection is ok there. 10 years ago			`pdata[19] += throughput;`

Add cuda error checks on qubit algos And rename doom to luffa, like djm34 10 years ago			`} while (!work_restart[thr_id].restart);`
Import djm34 qubit, deep and doom algos Indent, and put commonly used functions proto. in cuda_helper.h And add them to --cputest function Also change the color option to --nocolor, -C is no more needed Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> (Which is tired to remove these german copy/pasted comments) 10 years ago
never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`*hashes_done = pdata[19] - first_nonce;`
Import djm34 qubit, deep and doom algos Indent, and put commonly used functions proto. in cuda_helper.h And add them to --cputest function Also change the color option to --nocolor, -C is no more needed Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> (Which is tired to remove these german copy/pasted comments) 10 years ago			`return 0;`
			`}`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago
			`// cleanup`
			`extern "C" void free_luffa(int thr_id)`
			`{`
			`if (!init[thr_id])`
			`return;`

various fixes for SM 2.1 and the benchmark X11+ algos and quark are not compatible for the moment but these ones are : Benchmark results for Gigabyte GTX 460 (SM 2.1 / 1 GB): blakecoin : 159090.5 kH/s, 1 MB, 1048576 thr. blake : 70208.9 kH/s, 1 MB, 1048576 thr. bmw : 122802.6 kH/s, 65 MB, 2097152 thr. deep : 3533.6 kH/s, 33 MB, 524288 thr. fugue256 : 43177.9 kH/s, 17 MB, 524288 thr. heavy : 4118.2 kH/s, 147 MB, 524032 thr. keccak : 18673.1 kH/s, 129 MB, 2097152 thr. luffa : 28816.0 kH/s, 257 MB, 4194304 thr. lyra2 : 213.7 kH/s, 570 MB, 65536 thr. mjollnir : 3895.6 kH/s, 147 MB, 524032 thr. nist5 : 1101.4 kH/s, 67 MB, 1048576 thr. penta : 501.6 kH/s, 21 MB, 327680 thr. skein : 5432.4 kH/s, 65 MB, 1048576 thr. skein2 : 6788.9 kH/s, 33 MB, 524288 thr. whirlpool : 688.5 kH/s, 33 MB, 524288 thr. zr5 : 122.5 kH/s, 86 MB, 262144 thr. 9 years ago			`cudaThreadSynchronize();`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago
			`cudaFree(d_hash[thr_id]);`

			`cuda_check_cpu_free(thr_id);`

warn on cuda errors + various small changes The full benchmark can now be launched with "ccminer --benchmark" add a new helper function which log a warning with last cuda error (not shown with the quiet option) : CUDA_LOG_ERROR(); it can be used where miner.h is included (.c/.cpp/.cu) fix x14 (in ccminer.cpp), a break was missing in switch..case 9 years ago			`init[thr_id] = false;`
benchmark: enhance the mem leak detection reduce "false" warnings, and ignore unrelated/small ones <= 1 MB On windows the gpu memory can be allocated by other processes + some cleanup in algos... (free/gpulog) 9 years ago			`cudaDeviceSynchronize();`
warn on cuda errors + various small changes The full benchmark can now be launched with "ccminer --benchmark" add a new helper function which log a warning with last cuda error (not shown with the quiet option) : CUDA_LOG_ERROR(); it can be used where miner.h is included (.c/.cpp/.cu) fix x14 (in ccminer.cpp), a break was missing in switch..case 9 years ago			`}`