ccminer-gostd-lite/Algo256/bmw.cu

/**
 * bmw-256 MDT
 * tpruvot - 2015
 */
extern "C" {
#include "sph/sph_bmw.h"
}

#include <miner.h>
#include <cuda_helper.h>

// Per-thread hash buffer: scanhash_bmw() allocates it with cudaMalloc
// (32 bytes per CUDA thread) and free_bmw() releases it with cudaFree.
static uint32_t *d_hash[MAX_GPUS];

// Helpers defined outside this file; scanhash_bmw() and free_bmw() call them.
extern void bmw256_midstate_init(int thr_id, uint32_t threads);
extern void bmw256_midstate_free(int thr_id);
extern void bmw256_setBlock_80(int thr_id, void *pdata);
extern void bmw256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int swap);

// scanhash_bmw() treats a return value of UINT32_MAX as "no candidate found".
extern uint32_t cuda_check_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash);

// CPU hash: runs sph_bmw256 over the first 80 bytes of `input` and copies
// the first 32 bytes of the result into `state`.
extern "C" void bmw_hash(void *state, const void *input)
{
	sph_bmw256_context bmw;
	uint32_t _ALIGN(64) digest[16];

	sph_bmw256_init(&bmw);
	sph_bmw256(&bmw, input, 80);
	sph_bmw256_close(&bmw, (void*) digest);

	memcpy(state, &digest[0], 32);
}

// Per-thread flag: set by scanhash_bmw() once its one-time setup is done,
// cleared again by free_bmw().
static bool init[MAX_GPUS] = { 0 };

// Returns swab32(val) when `iftrue` is set, otherwise `val` unchanged.
static __inline uint32_t swab32_if(uint32_t val, bool iftrue) {
	if (!iftrue)
		return val;
	return swab32(val);
}

/**
 * Scan nonces for the bmw algo, starting at work->data[19].
 *
 * The first call for a given thr_id selects the device, runs the init
 * helpers, allocates d_hash and logs the intensity. Every loop iteration
 * hashes `throughput` nonces on the GPU and re-checks any candidate
 * reported by cuda_check_hash_32() with bmw_hash() on the CPU.
 *
 * Returns 1 when a candidate passes the CPU check (work->data[19] is then
 * set from that nonce), 0 when max_nonce is reached or a restart is
 * flagged. *hashes_done is written on both paths.
 */
extern "C" int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
	uint32_t _ALIGN(64) endiandata[20];
	uint32_t *const header = work->data;
	uint32_t *const target = work->target;
	const uint32_t first_nonce = header[19];
	const bool swapnonce = true;

	uint32_t throughput = cuda_default_throughput(thr_id, 1U << 21);
	if (init[thr_id]) {
		// clamp only after the one-time setup, which sizes its buffers
		// from the unclamped value
		const uint32_t remaining = max_nonce - first_nonce;
		if (remaining < throughput)
			throughput = remaining;
	}

	if (opt_benchmark)
		target[7] = 0x0005;

	if (!init[thr_id]) {
		cudaSetDevice(device_map[thr_id]);
		if (opt_cudaschedule == -1 && gpu_threads == 1) {
			cudaDeviceReset();
			// reduce cpu usage
			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
		}
		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);

		cuda_check_cpu_init(thr_id, throughput);
		bmw256_midstate_init(thr_id, throughput);

		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));

		init[thr_id] = true;
	}

	// big-endian copy of the 20 header words
	for (int i = 0; i < 20; ++i)
		be32enc(endiandata + i, header[i]);

	// reads and clears any pending CUDA error before the new work is set
	cudaGetLastError();
	bmw256_setBlock_80(thr_id, (void*)endiandata);

	cuda_check_cpu_setTarget(target);

	for (;;) {
		const uint32_t start_nonce = header[19];

		bmw256_cpu_hash_80(thr_id, (int) throughput, start_nonce, d_hash[thr_id], (int) swapnonce);
		const uint32_t foundNonce = cuda_check_hash_32(thr_id, throughput, start_nonce, d_hash[thr_id]);

		if (foundNonce != UINT32_MAX)
		{
			// re-hash the candidate on the CPU before reporting it
			uint32_t _ALIGN(64) vhash64[8];
			endiandata[19] = swab32_if(foundNonce, swapnonce);
			bmw_hash(vhash64, endiandata);

			if (vhash64[7] > target[7] || !fulltest(vhash64, target)) {
				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
			}
			else {
				*hashes_done = foundNonce - first_nonce + 1;
				header[19] = swab32_if(foundNonce,!swapnonce);
				work_set_target_ratio(work, vhash64);
				return 1;
			}
		}

		// 64-bit sum so the end-of-range test cannot wrap around
		if ((uint64_t) throughput + header[19] >= max_nonce) {
			header[19] = max_nonce;
			break;
		}

		header[19] += throughput;

		if (work_restart[thr_id].restart)
			break;
	}

	*hashes_done = header[19] - first_nonce;
	return 0;
}

/**
 * Cleanup: release what scanhash_bmw() set up for this thread.
 * Does nothing when init[thr_id] is not set, so repeated calls are harmless.
 */
extern "C" void free_bmw(int thr_id)
{
	if (!init[thr_id])
		return;

	// cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is the
	// documented replacement.
	cudaDeviceSynchronize();

	cudaFree(d_hash[thr_id]);
	d_hash[thr_id] = NULL; // do not keep a dangling device pointer around
	bmw256_midstate_free(thr_id);
	cuda_check_cpu_free(thr_id);

	cudaDeviceSynchronize();
	init[thr_id] = false;
}
bmw algo for MDT, with midstate which could be extracted from json too replace a satcoin by another one ;) Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 9 years ago			`/**`
			`* bmw-256 MDT`
			`* tpruvot - 2015`
			`*/`
			`extern "C" {`
			`#include "sph/sph_bmw.h"`
			`}`

win32: implement a nvapi.dll wrapper like nvml Allow to get/set missing infos like the power limit on x86 squashed for a better min/max and device mapping Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 9 years ago			`#include <miner.h>`
			`#include <cuda_helper.h>`
bmw algo for MDT, with midstate which could be extracted from json too replace a satcoin by another one ;) Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 9 years ago
			`static uint32_t *d_hash[MAX_GPUS];`

			`extern void bmw256_midstate_init(int thr_id, uint32_t threads);`
algos: free allocated mem for algo switch All can be freed propertly now, except script (reset) and lyra2 (leak) 9 years ago			`extern void bmw256_midstate_free(int thr_id);`
bmw algo for MDT, with midstate which could be extracted from json too replace a satcoin by another one ;) Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 9 years ago			`extern void bmw256_setBlock_80(int thr_id, void *pdata);`
			`extern void bmw256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int swap);`

			`extern uint32_t cuda_check_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash);`

			`// CPU Hash`
			`extern "C" void bmw_hash(void *state, const void *input)`
			`{`
			`uint32_t _ALIGN(64) hash[16];`
			`sph_bmw256_context ctx;`

			`sph_bmw256_init(&ctx);`
			`sph_bmw256(&ctx, input, 80);`
			`sph_bmw256_close(&ctx, (void*) hash);`

			`memcpy(state, hash, 32);`
			`}`

			`static bool init[MAX_GPUS] = { 0 };`

			`static __inline uint32_t swab32_if(uint32_t val, bool iftrue) {`
			`return iftrue ? swab32(val) : val;`
			`}`

start v1.7, apply new prototypes to all algos 9 years ago			`extern "C" int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)`
bmw algo for MDT, with midstate which could be extracted from json too replace a satcoin by another one ;) Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 9 years ago			`{`
start v1.7, apply new prototypes to all algos 9 years ago			`uint32_t _ALIGN(64) endiandata[20];`
			`uint32_t *pdata = work->data;`
			`uint32_t *ptarget = work->target;`
bmw algo for MDT, with midstate which could be extracted from json too replace a satcoin by another one ;) Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 9 years ago			`const uint32_t first_nonce = pdata[19];`
			`bool swapnonce = true;`
intensity: do not reduce throughput before init Else the memory allocated could be less than required later btw, use the new "cuda" function to apply intensity/throughput 9 years ago			`uint32_t throughput = cuda_default_throughput(thr_id, 1U << 21);`
			`if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);`
bmw algo for MDT, with midstate which could be extracted from json too replace a satcoin by another one ;) Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 9 years ago
			`if (opt_benchmark)`
benchmark: store all algos results + cuda fixes Note: lyra2, lyra2v2 and script seems to have problems to coexist with other algos... to run after some of them... moved lyra2 first and skip scrypt/jane for the moment... Only stored in memory for now.. to display a table after the bench ccminer -a auto --benchmark Results may be exported later to a json file... 9 years ago			`ptarget[7] = 0x0005;`
bmw algo for MDT, with midstate which could be extracted from json too replace a satcoin by another one ;) Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 9 years ago
			`if (!init[thr_id]) {`
			`cudaSetDevice(device_map[thr_id]);`
Show intensity on init for all algos 8 years ago			`if (opt_cudaschedule == -1 && gpu_threads == 1) {`
			`cudaDeviceReset();`
			`// reduce cpu usage`
			`cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);`
			`}`
			`gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);`
bmw algo for MDT, with midstate which could be extracted from json too replace a satcoin by another one ;) Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 9 years ago
			`cuda_check_cpu_init(thr_id, throughput);`
			`bmw256_midstate_init(thr_id, throughput);`

			`CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));`

			`init[thr_id] = true;`
			`}`

			`for (int k=0; k < 20; k++) {`
			`be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);`
			`}`

benchmark: store all algos results + cuda fixes Note: lyra2, lyra2v2 and script seems to have problems to coexist with other algos... to run after some of them... moved lyra2 first and skip scrypt/jane for the moment... Only stored in memory for now.. to display a table after the bench ccminer -a auto --benchmark Results may be exported later to a json file... 9 years ago			`cudaGetLastError();`
bmw algo for MDT, with midstate which could be extracted from json too replace a satcoin by another one ;) Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 9 years ago			`bmw256_setBlock_80(thr_id, (void*)endiandata);`

			`cuda_check_cpu_setTarget(ptarget);`

			`do {`
			`bmw256_cpu_hash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], (int) swapnonce);`
			`uint32_t foundNonce = cuda_check_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]);`
			`if (foundNonce != UINT32_MAX)`
			`{`
			`uint32_t _ALIGN(64) vhash64[8];`
			`endiandata[19] = swab32_if(foundNonce, swapnonce);`
			`bmw_hash(vhash64, endiandata);`

			`if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {`
			`*hashes_done = foundNonce - first_nonce + 1;`
			`pdata[19] = swab32_if(foundNonce,!swapnonce);`
diff: use the new function in all algos 9 years ago			`work_set_target_ratio(work, vhash64);`
bmw algo for MDT, with midstate which could be extracted from json too replace a satcoin by another one ;) Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 9 years ago			`return 1;`
			`}`
			`else {`
benchmark: enhance the mem leak detection reduce "false" warnings, and ignore unrelated/small ones <= 1 MB On windows the gpu memory can be allocated by other processes + some cleanup in algos... (free/gpulog) 9 years ago			`gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);`
bmw algo for MDT, with midstate which could be extracted from json too replace a satcoin by another one ;) Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 9 years ago			`}`
			`}`

never interrupt global benchmark with found nonces fix some algo weird hashrates (like blake) and reset device between algos, for better accuracy but this reset doesnt seems enough to bench all algos correctly... to test on linux, could be a driver issue... heavy: fix first alloc and indent with tabs... 9 years ago			`if ((uint64_t) throughput + pdata[19] >= max_nonce) {`
bmw algo for MDT, with midstate which could be extracted from json too replace a satcoin by another one ;) Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 9 years ago			`pdata[19] = max_nonce;`
			`break;`
			`}`

			`pdata[19] += throughput;`

			`} while (!work_restart[thr_id].restart);`

			`*hashes_done = pdata[19] - first_nonce;`
			`return 0;`
			`}`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago
			`// cleanup`
			`extern "C" void free_bmw(int thr_id)`
			`{`
			`if (!init[thr_id])`
			`return;`

various fixes for SM 2.1 and the benchmark X11+ algos and quark are not compatible for the moment but these ones are : Benchmark results for Gigabyte GTX 460 (SM 2.1 / 1 GB): blakecoin : 159090.5 kH/s, 1 MB, 1048576 thr. blake : 70208.9 kH/s, 1 MB, 1048576 thr. bmw : 122802.6 kH/s, 65 MB, 2097152 thr. deep : 3533.6 kH/s, 33 MB, 524288 thr. fugue256 : 43177.9 kH/s, 17 MB, 524288 thr. heavy : 4118.2 kH/s, 147 MB, 524032 thr. keccak : 18673.1 kH/s, 129 MB, 2097152 thr. luffa : 28816.0 kH/s, 257 MB, 4194304 thr. lyra2 : 213.7 kH/s, 570 MB, 65536 thr. mjollnir : 3895.6 kH/s, 147 MB, 524032 thr. nist5 : 1101.4 kH/s, 67 MB, 1048576 thr. penta : 501.6 kH/s, 21 MB, 327680 thr. skein : 5432.4 kH/s, 65 MB, 1048576 thr. skein2 : 6788.9 kH/s, 33 MB, 524288 thr. whirlpool : 688.5 kH/s, 33 MB, 524288 thr. zr5 : 122.5 kH/s, 86 MB, 262144 thr. 9 years ago			`cudaThreadSynchronize();`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago
			`cudaFree(d_hash[thr_id]);`
algos: free allocated mem for algo switch All can be freed propertly now, except script (reset) and lyra2 (leak) 9 years ago			`bmw256_midstate_free(thr_id);`
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago			`cuda_check_cpu_free(thr_id);`
algos: free allocated mem for algo switch All can be freed propertly now, except script (reset) and lyra2 (leak) 9 years ago
algos: add functions to free allocated resources Will be used later for algo switching not really tested yet... 9 years ago			`cudaDeviceSynchronize();`
various fixes for SM 2.1 and the benchmark X11+ algos and quark are not compatible for the moment but these ones are : Benchmark results for Gigabyte GTX 460 (SM 2.1 / 1 GB): blakecoin : 159090.5 kH/s, 1 MB, 1048576 thr. blake : 70208.9 kH/s, 1 MB, 1048576 thr. bmw : 122802.6 kH/s, 65 MB, 2097152 thr. deep : 3533.6 kH/s, 33 MB, 524288 thr. fugue256 : 43177.9 kH/s, 17 MB, 524288 thr. heavy : 4118.2 kH/s, 147 MB, 524032 thr. keccak : 18673.1 kH/s, 129 MB, 2097152 thr. luffa : 28816.0 kH/s, 257 MB, 4194304 thr. lyra2 : 213.7 kH/s, 570 MB, 65536 thr. mjollnir : 3895.6 kH/s, 147 MB, 524032 thr. nist5 : 1101.4 kH/s, 67 MB, 1048576 thr. penta : 501.6 kH/s, 21 MB, 327680 thr. skein : 5432.4 kH/s, 65 MB, 1048576 thr. skein2 : 6788.9 kH/s, 33 MB, 524288 thr. whirlpool : 688.5 kH/s, 33 MB, 524288 thr. zr5 : 122.5 kH/s, 86 MB, 262144 thr. 9 years ago			`init[thr_id] = false;`
			`}`