ccminer-gostd-lite/cuda_checkhash.cu

/**
 * This code compares final hash against target
 */
#include <stdio.h>
#include <memory.h>

#include "miner.h"

#include "cuda_helper.h"

__constant__ uint32_t pTarget[8]; // 32 bytes

// store MAX_GPUS device arrays of 8 nonces
static uint32_t* h_resNonces[MAX_GPUS];
static uint32_t* d_resNonces[MAX_GPUS];

/**
 * Allocate the per-thread result buffers used by the hash checkers.
 *
 * Both buffers hold 8 uint32_t words: one host buffer obtained with
 * cudaMallocHost and one device buffer obtained with cudaMalloc.
 *
 * @param thr_id   index into h_resNonces / d_resNonces
 * @param threads  not used by this function
 *
 * NOTE(review): CUDA_CALL_OR_RET is defined outside this file; presumably it
 * returns early when the wrapped call fails - confirm in cuda_helper.h.
 */
__host__
void cuda_check_cpu_init(int thr_id, uint32_t threads)
{
    // same size on both sides so the whole array can be copied back at once
    const size_t resBytes = 8 * sizeof(uint32_t);

    CUDA_CALL_OR_RET(cudaMallocHost(&h_resNonces[thr_id], resBytes));
    CUDA_CALL_OR_RET(cudaMalloc(&d_resNonces[thr_id], resBytes));
}

/**
 * Upload the target difficulty to the device.
 *
 * Copies 8 uint32_t words (32 bytes) from host memory into the __constant__
 * array pTarget, which the check kernels compare hashes against.
 *
 * @param ptarget  host pointer to at least 32 readable bytes
 */
__host__
void cuda_check_cpu_setTarget(const void *ptarget)
{
	const size_t targetBytes = 8 * sizeof(uint32_t);

	CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, ptarget, targetBytes, 0, cudaMemcpyHostToDevice));
}

/* --------------------------------------------------------------------------------------------- */

/**
 * Compare a hash against a target, both given as 8 x 32-bit words.
 *
 * Words are compared from index 7 down to index 0; the first word that
 * differs decides the result.
 *
 * @param hash    8 readable words
 * @param target  8 readable words
 * @return true when hash <= target (equality counts as a match)
 */
__device__ __forceinline__
static bool hashbelowtarget(const uint32_t *const __restrict__ hash, const uint32_t *const __restrict__ target)
{
	#pragma unroll
	for (int word = 7; word >= 0; --word) {
		const uint32_t h = hash[word];
		const uint32_t t = target[word];
		// first differing word settles the comparison
		if (h != t)
			return h < t;
	}

	// all eight words are equal
	return true;
}

/**
 * Kernel: report one nonce whose hash is at or below pTarget.
 *
 * Each thread below `threads` inspects its own 16-word (64-byte) hash record
 * and compares its first 8 words with pTarget. resNonces[0] is only written
 * while it still reads UINT32_MAX; the check and the store are not atomic, so
 * when several threads match, any one of their nonces may end up stored.
 *
 * @param threads      number of hash records to inspect
 * @param startNounce  nonce corresponding to thread 0
 * @param hash         device buffer, 16 words per thread
 * @param resNonces    device result buffer; word 0 receives the nonce
 */
__global__ __launch_bounds__(512, 4)
void cuda_checkhash_64(uint32_t threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNonces)
{
	const uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;
	if (tid >= threads)
		return;

	// a result has already been stored: nothing left to do
	if (resNonces[0] != UINT32_MAX)
		return;

	// shift by 4 = 16 words (64 bytes) per record
	// todo: use only 32 bytes * threads if possible
	uint32_t *candidate = hash + (tid << 4);

	if (hashbelowtarget(candidate, pTarget))
		resNonces[0] = startNounce + tid;
}

/**
 * Run cuda_checkhash_64 over `threads` hashes and fetch its result.
 *
 * The device result word is filled with 0xff bytes before the launch, so the
 * value returned is 0xffffffff unless the kernel stored a nonce.
 *
 * @param thr_id       index of the result buffers to use
 * @param threads      number of hashes in d_inputHash
 * @param startNounce  nonce corresponding to the first hash
 * @param d_inputHash  device buffer of hashes, 16 words each
 * @return word 0 of the result buffer after the kernel has run
 */
__host__
uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash)
{
	const uint32_t tpb = 512;
	uint32_t *const d_found = d_resNonces[thr_id];
	uint32_t *const h_found = h_resNonces[thr_id];

	// byte fill: word 0 becomes 0xffffffff
	cudaMemset(d_found, 0xff, sizeof(uint32_t));

	dim3 grid((threads + tpb - 1) / tpb);
	dim3 block(tpb);

	cuda_checkhash_64 <<<grid, block>>> (threads, startNounce, d_inputHash, d_found);
	cudaThreadSynchronize();

	cudaMemcpy(h_found, d_found, sizeof(uint32_t), cudaMemcpyDeviceToHost);
	return h_found[0];
}

/* --------------------------------------------------------------------------------------------- */

/**
 * Kernel: collect up to 7 nonces whose hash is at or below pTarget.
 *
 * resNonces[0] is a counter of matching threads; the n-th match (n = 1..7)
 * stores its nonce in resNonces[n]. Matches beyond the 7th are counted but
 * not stored.
 *
 * The counter is bumped with atomicAdd so that concurrent matches each get a
 * distinct slot (a plain ++ could lose increments or hand two threads the
 * same slot). The slot index is unsigned so a very large counter can never
 * turn into a negative index that passes the "< 8" test.
 *
 * NOTE(review): there is no `threads` argument, so every launched thread
 * reads hash[thread*16 .. +7]; the hash buffer must therefore cover the whole
 * grid, including the padding of the last block - confirm against callers.
 *
 * @param startNounce  nonce corresponding to thread 0
 * @param hash         device buffer, 16 words per thread
 * @param resNonces    device buffer of 8 words (counter + 7 nonce slots)
 */
__global__ __launch_bounds__(512, 4)
void cuda_checkhash_64_suppl(uint32_t startNounce, uint32_t *hash, uint32_t *resNonces)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);

	// 16 words (64 bytes) per record; widen before shifting to avoid wrap-around
	uint32_t *inpHash = &hash[(size_t)thread << 4];

	if (hashbelowtarget(inpHash, pTarget)) {
		// atomicAdd returns the previous count: +1 gives this thread's slot
		const uint32_t resNum = atomicAdd(&resNonces[0], 1u) + 1u;
		if (resNum < 8)
			resNonces[resNum] = (startNounce + thread);
	}
}

/**
 * Run cuda_checkhash_64_suppl and return one of the extra nonces it found.
 *
 * The kernel leaves the number of matching hashes in word 0 of the result
 * buffer and the matching nonces in words 1..7. This function returns the
 * nonce stored in word numNonce+1 when more than numNonce matches were
 * counted, and 0 otherwise.
 *
 * @param thr_id       index of the result buffers to use
 * @param threads      number of hashes in d_inputHash
 * @param startNounce  nonce corresponding to the first hash
 * @param d_inputHash  device buffer of hashes, 16 words each
 * @param numNonce     how many found nonces to skip (0 = first stored one)
 * @return the selected nonce, or 0 when it is not available, lies outside
 *         the 7 stored slots, or the results could not be read back
 */
__host__
uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce)
{
	uint32_t rescnt, result = 0;

	const uint32_t threadsperblock = 512;
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);

	// first element stores the count of found nonces
	cudaMemset(d_resNonces[thr_id], 0, sizeof(uint32_t));

	cuda_checkhash_64_suppl <<<grid, block>>> (startNounce, d_inputHash, d_resNonces[thr_id]);
	cudaDeviceSynchronize();

	// without a successful copy the host buffer still holds a previous result
	if (cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost) != cudaSuccess)
		return 0;

	rescnt = h_resNonces[thr_id][0];
	if (rescnt > numNonce) {
		// the buffer has 8 words: the counter can exceed 7 but only
		// slots 1..7 exist, so never index past the end
		if (numNonce + 1u < 8u) {
			result = h_resNonces[thr_id][numNonce+1];
		}
		if (opt_debug)
			applog(LOG_WARNING, "Found %d nonces: %x + %x", rescnt, h_resNonces[thr_id][1], result);
	}

	return result;
}

/* --------------------------------------------------------------------------------------------- */

/**
 * Kernel: find the smallest nonce in g_nonceVector whose hash is at or
 * below pTarget.
 *
 * Thread i takes nonce g_nonceVector[i], locates its 16-word hash record at
 * offset (nonce - startNounce) * 16 in g_hash and compares the first 8 words
 * with pTarget, from word 7 down to word 0. Matching nonces are merged into
 * resNounce[0] with atomicMin: a separate compare-then-store would let a
 * larger nonce overwrite a smaller one written in between.
 *
 * @param threads        number of entries in g_nonceVector
 * @param startNounce    nonce corresponding to hash record 0
 * @param g_nonceVector  device array of nonces to check
 * @param g_hash         device buffer of hashes, 16 words each
 * @param resNounce      device result; word 0 must start at 0xffffffff
 */
__global__
void cuda_check_hash_branch_64(uint32_t threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce)
{
	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		uint32_t nounce = g_nonceVector[thread];
		uint32_t hashPosition = (nounce - startNounce) << 4;
		uint32_t *inpHash = &g_hash[hashPosition];

		for (int i = 7; i >= 0; i--) {
			if (inpHash[i] > pTarget[i]) {
				// above target: this nonce is not a solution
				return;
			}
			if (inpHash[i] < pTarget[i]) {
				// strictly below target: no need to look at lower words
				break;
			}
		}

		// keep the lowest matching nonce, race-free
		atomicMin(&resNounce[0], nounce);
	}
}

/**
 * Run cuda_check_hash_branch_64 over a vector of nonces and fetch its result.
 *
 * The device result word is filled with 0xff bytes before the launch, so the
 * value returned is 0xffffffff unless the kernel stored a nonce.
 *
 * @param thr_id         index of the result buffers to use
 * @param threads        number of entries in d_nonceVector
 * @param startNounce    nonce corresponding to hash record 0
 * @param d_nonceVector  device array of nonces to check
 * @param d_inputHash    device buffer of hashes, 16 words each
 * @param order          forwarded to MyStreamSynchronize
 * @return word 0 of the result buffer after the kernel has run
 */
__host__
uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
{
	const uint32_t tpb = 256;
	uint32_t *const d_found = d_resNonces[thr_id];
	uint32_t *const h_found = h_resNonces[thr_id];

	// byte fill: word 0 becomes 0xffffffff
	cudaMemset(d_found, 0xff, sizeof(uint32_t));

	dim3 grid((threads + tpb - 1) / tpb);
	dim3 block(tpb);

	cuda_check_hash_branch_64 <<<grid, block>>> (threads, startNounce, d_nonceVector, d_inputHash, d_found);

	MyStreamSynchronize(NULL, order, thr_id);

	cudaMemcpy(h_found, d_found, sizeof(uint32_t), cudaMemcpyDeviceToHost);

	cudaThreadSynchronize();

	return h_found[0];
}

/* Compiled Shader Model version, one entry per device (0 until it is set) */
int cuda_arch[MAX_GPUS] = { 0 };

/**
 * Kernel: store the value of __CUDA_ARCH__ this device code was compiled
 * with into *d_version. Nothing is written when the macro is not defined.
 */
__global__
void nvcc_get_arch(int *d_version)
{
#if defined(__CUDA_ARCH__)
	d_version[0] = __CUDA_ARCH__;
#endif
}

/**
 * Return the __CUDA_ARCH__ value the device code runs with, caching it in
 * cuda_arch[] after the first successful query.
 *
 * The value is obtained by launching nvcc_get_arch on a one-int device
 * buffer. The buffer is zeroed first and every step is checked, so that a
 * failed allocation, launch or copy yields 0 (and a retry on the next call)
 * instead of caching uninitialised device memory.
 *
 * @param thr_id  index into device_map (presumably the mining thread id)
 * @return the cached or freshly read arch value, or 0 when it could not be
 *         determined
 */
__host__
int cuda_get_arch(int thr_id)
{
	int dev_id = device_map[thr_id];
	if (cuda_arch[dev_id] == 0) {
		// only do it once...
		int *d_version = NULL;
		int version = 0;

		if (cudaMalloc(&d_version, sizeof(int)) != cudaSuccess)
			return 0;

		// start from a known value: the kernel may not write anything
		if (cudaMemset(d_version, 0, sizeof(int)) == cudaSuccess) {
			nvcc_get_arch <<< 1, 1 >>> (d_version);
			if (cudaMemcpy(&version, d_version, sizeof(int), cudaMemcpyDeviceToHost) != cudaSuccess)
				version = 0;
		}
		cudaFree(d_version);

		cuda_arch[dev_id] = version;
	}
	return cuda_arch[dev_id];
}
checkhash: simplify the common function use klaus trivial function, the old code has always been a bit weird.. split cuda_check_cpu_hash_64 in two functions, keep old for branched stuff 10 years ago			`/**`
			`* This code compares final hash against target`
			`*/`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`#include <stdio.h>`
			`#include <memory.h>`

Prepare multiple nonces support in one loop (if found) Tested on x11 which find sometimes 3 nonces in one call, actually they are ignored because only the biggest was kept... This commit doesnt fix that, but will allow to enhance shares rate later... 10 years ago			`#include "miner.h"`

Remove duplicated defines present in cuda_helper.h also add cudaDeviceReset() on Ctrl+C for nvprof 10 years ago			`#include "cuda_helper.h"`

Prepare multiple nonces support in one loop (if found) Tested on x11 which find sometimes 3 nonces in one call, actually they are ignored because only the biggest was kept... This commit doesnt fix that, but will allow to enhance shares rate later... 10 years ago			`__constant__ uint32_t pTarget[8]; // 32 bytes`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago
Handle a maximum of 16 gpus (vs 8 before) Some cards have 2 gpus on board... 10 years ago			`// store MAX_GPUS device arrays of 8 nonces`
			`static uint32_t* h_resNonces[MAX_GPUS];`
			`static uint32_t* d_resNonces[MAX_GPUS];`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago
Move common check_cpu functions to root 10 years ago			`__host__`
cleanup: use unsigned throughput parameters Yes, its a big commit, was waiting 1.6 to do that... Sorry for your possible merge issues ;) 10 years ago			`void cuda_check_cpu_init(int thr_id, uint32_t threads)`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`{`
Prepare multiple nonces support in one loop (if found) Tested on x11 which find sometimes 3 nonces in one call, actually they are ignored because only the biggest was kept... This commit doesnt fix that, but will allow to enhance shares rate later... 10 years ago			`CUDA_CALL_OR_RET(cudaMallocHost(&h_resNonces[thr_id], 8*sizeof(uint32_t)));`
			`CUDA_CALL_OR_RET(cudaMalloc(&d_resNonces[thr_id], 8*sizeof(uint32_t)));`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`}`

Prepare trap of hardware/mem failures 10 years ago			`// Target Difficulty`
Move common check_cpu functions to root 10 years ago			`__host__`
			`void cuda_check_cpu_setTarget(const void *ptarget)`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`{`
checkhash: some work on a faster variant (wip) This should not be used for all algos... not enabled yet todo: multiple nounces or blake32 style checkup 10 years ago			`CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`}`

checkhash: simplify the common function use klaus trivial function, the old code has always been a bit weird.. split cuda_check_cpu_hash_64 in two functions, keep old for branched stuff 10 years ago			`/* --------------------------------------------------------------------------------------------- */`

			`__device__ __forceinline__`
			`static bool hashbelowtarget(const uint32_t const __restrict__ hash, const uint32_t const __restrict__ target)`
			`{`
			`if (hash[7] > target[7])`
			`return false;`
			`if (hash[7] < target[7])`
			`return true;`
			`if (hash[6] > target[6])`
			`return false;`
			`if (hash[6] < target[6])`
			`return true;`

			`if (hash[5] > target[5])`
			`return false;`
			`if (hash[5] < target[5])`
			`return true;`
			`if (hash[4] > target[4])`
			`return false;`
			`if (hash[4] < target[4])`
			`return true;`

			`if (hash[3] > target[3])`
			`return false;`
			`if (hash[3] < target[3])`
			`return true;`
			`if (hash[2] > target[2])`
			`return false;`
			`if (hash[2] < target[2])`
			`return true;`

			`if (hash[1] > target[1])`
			`return false;`
			`if (hash[1] < target[1])`
			`return true;`
			`if (hash[0] > target[0])`
			`return false;`

			`return true;`
			`}`

			`__global__ __launch_bounds__(512, 4)`
cleanup: use unsigned throughput parameters Yes, its a big commit, was waiting 1.6 to do that... Sorry for your possible merge issues ;) 10 years ago			`void cuda_checkhash_64(uint32_t threads, uint32_t startNounce, uint32_t hash, uint32_t resNonces)`
checkhash: simplify the common function use klaus trivial function, the old code has always been a bit weird.. split cuda_check_cpu_hash_64 in two functions, keep old for branched stuff 10 years ago			`{`
cleanup: use unsigned throughput parameters Yes, its a big commit, was waiting 1.6 to do that... Sorry for your possible merge issues ;) 10 years ago			`uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);`
checkhash: simplify the common function use klaus trivial function, the old code has always been a bit weird.. split cuda_check_cpu_hash_64 in two functions, keep old for branched stuff 10 years ago			`if (thread < threads)`
			`{`
			`// shl 4 = *16 x 4 (uint32) = 64 bytes`
Prepare multiple nonces support in one loop (if found) Tested on x11 which find sometimes 3 nonces in one call, actually they are ignored because only the biggest was kept... This commit doesnt fix that, but will allow to enhance shares rate later... 10 years ago			`// todo: use only 32 bytes * threads if possible`
checkhash: simplify the common function use klaus trivial function, the old code has always been a bit weird.. split cuda_check_cpu_hash_64 in two functions, keep old for branched stuff 10 years ago			`uint32_t *inpHash = &hash[thread << 4];`

Prepare multiple nonces support in one loop (if found) Tested on x11 which find sometimes 3 nonces in one call, actually they are ignored because only the biggest was kept... This commit doesnt fix that, but will allow to enhance shares rate later... 10 years ago			`if (resNonces[0] == UINT32_MAX) {`
			`if (hashbelowtarget(inpHash, pTarget))`
			`resNonces[0] = (startNounce + thread);`
checkhash: simplify the common function use klaus trivial function, the old code has always been a bit weird.. split cuda_check_cpu_hash_64 in two functions, keep old for branched stuff 10 years ago			`}`
			`}`
			`}`

Move common check_cpu functions to root 10 years ago			`__host__`
cleanup: use unsigned throughput parameters Yes, its a big commit, was waiting 1.6 to do that... Sorry for your possible merge issues ;) 10 years ago			`uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash)`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`{`
Prepare multiple nonces support in one loop (if found) Tested on x11 which find sometimes 3 nonces in one call, actually they are ignored because only the biggest was kept... This commit doesnt fix that, but will allow to enhance shares rate later... 10 years ago			`cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t));`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago
cleanup: use unsigned throughput parameters Yes, its a big commit, was waiting 1.6 to do that... Sorry for your possible merge issues ;) 10 years ago			`const uint32_t threadsperblock = 512;`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago
checkhash: simplify the common function use klaus trivial function, the old code has always been a bit weird.. split cuda_check_cpu_hash_64 in two functions, keep old for branched stuff 10 years ago			`dim3 grid((threads + threadsperblock - 1) / threadsperblock);`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago			`dim3 block(threadsperblock);`

Prepare multiple nonces support in one loop (if found) Tested on x11 which find sometimes 3 nonces in one call, actually they are ignored because only the biggest was kept... This commit doesnt fix that, but will allow to enhance shares rate later... 10 years ago			`cuda_checkhash_64 <<<grid, block>>> (threads, startNounce, d_inputHash, d_resNonces[thr_id]);`
			`cudaThreadSynchronize();`

			`cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);`
			`return h_resNonces[thr_id][0];`
			`}`

			`/* --------------------------------------------------------------------------------------------- */`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago
Prepare multiple nonces support in one loop (if found) Tested on x11 which find sometimes 3 nonces in one call, actually they are ignored because only the biggest was kept... This commit doesnt fix that, but will allow to enhance shares rate later... 10 years ago			`__global__ __launch_bounds__(512, 4)`
			`void cuda_checkhash_64_suppl(uint32_t startNounce, uint32_t hash, uint32_t resNonces)`
			`{`
cleanup: use unsigned throughput parameters Yes, its a big commit, was waiting 1.6 to do that... Sorry for your possible merge issues ;) 10 years ago			`uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);`
Prepare multiple nonces support in one loop (if found) Tested on x11 which find sometimes 3 nonces in one call, actually they are ignored because only the biggest was kept... This commit doesnt fix that, but will allow to enhance shares rate later... 10 years ago
			`uint32_t *inpHash = &hash[thread << 4];`

			`if (hashbelowtarget(inpHash, pTarget)) {`
			`int resNum = ++resNonces[0];`
			`__threadfence();`
			`if (resNum < 8)`
			`resNonces[resNum] = (startNounce + thread);`
			`}`
			`}`

			`__host__`
cleanup: use unsigned throughput parameters Yes, its a big commit, was waiting 1.6 to do that... Sorry for your possible merge issues ;) 10 years ago			`uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce)`
Prepare multiple nonces support in one loop (if found) Tested on x11 which find sometimes 3 nonces in one call, actually they are ignored because only the biggest was kept... This commit doesnt fix that, but will allow to enhance shares rate later... 10 years ago			`{`
			`uint32_t rescnt, result = 0;`

cleanup: use unsigned throughput parameters Yes, its a big commit, was waiting 1.6 to do that... Sorry for your possible merge issues ;) 10 years ago			`const uint32_t threadsperblock = 512;`
Prepare multiple nonces support in one loop (if found) Tested on x11 which find sometimes 3 nonces in one call, actually they are ignored because only the biggest was kept... This commit doesnt fix that, but will allow to enhance shares rate later... 10 years ago			`dim3 grid((threads + threadsperblock - 1) / threadsperblock);`
			`dim3 block(threadsperblock);`

			`// first element stores the count of found nonces`
			`cudaMemset(d_resNonces[thr_id], 0, sizeof(uint32_t));`

			`cuda_checkhash_64_suppl <<<grid, block>>> (startNounce, d_inputHash, d_resNonces[thr_id]);`
checkhash: simplify the common function use klaus trivial function, the old code has always been a bit weird.. split cuda_check_cpu_hash_64 in two functions, keep old for branched stuff 10 years ago			`cudaThreadSynchronize();`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago
Prepare multiple nonces support in one loop (if found) Tested on x11 which find sometimes 3 nonces in one call, actually they are ignored because only the biggest was kept... This commit doesnt fix that, but will allow to enhance shares rate later... 10 years ago			`cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost);`
			`rescnt = h_resNonces[thr_id][0];`
			`if (rescnt > numNonce) {`
			`if (numNonce <= rescnt) {`
			`result = h_resNonces[thr_id][numNonce+1];`
			`}`
			`if (opt_debug)`
			`applog(LOG_WARNING, "Found %d nonces: %x + %x", rescnt, h_resNonces[thr_id][1], result);`
			`}`
Revision 0.6 with myriad-groestl and jackpot coin 11 years ago
			`return result;`
			`}`
checkhash: some work on a faster variant (wip) This should not be used for all algos... not enabled yet todo: multiple nounces or blake32 style checkup 10 years ago
checkhash: simplify the common function use klaus trivial function, the old code has always been a bit weird.. split cuda_check_cpu_hash_64 in two functions, keep old for branched stuff 10 years ago			`/* --------------------------------------------------------------------------------------------- */`

checkhash: some work on a faster variant (wip) This should not be used for all algos... not enabled yet todo: multiple nounces or blake32 style checkup 10 years ago			`__global__`
cleanup: use unsigned throughput parameters Yes, its a big commit, was waiting 1.6 to do that... Sorry for your possible merge issues ;) 10 years ago			`void cuda_check_hash_branch_64(uint32_t threads, uint32_t startNounce, uint32_t g_nonceVector, uint32_t g_hash, uint32_t *resNounce)`
checkhash: some work on a faster variant (wip) This should not be used for all algos... not enabled yet todo: multiple nounces or blake32 style checkup 10 years ago			`{`
cleanup: use unsigned throughput parameters Yes, its a big commit, was waiting 1.6 to do that... Sorry for your possible merge issues ;) 10 years ago			`uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);`
checkhash: some work on a faster variant (wip) This should not be used for all algos... not enabled yet todo: multiple nounces or blake32 style checkup 10 years ago			`if (thread < threads)`
			`{`
checkhash: simplify the common function use klaus trivial function, the old code has always been a bit weird.. split cuda_check_cpu_hash_64 in two functions, keep old for branched stuff 10 years ago			`uint32_t nounce = g_nonceVector[thread];`
			`uint32_t hashPosition = (nounce - startNounce) << 4;`
			`uint32_t *inpHash = &g_hash[hashPosition];`

			`for (int i = 7; i >= 0; i--) {`
			`if (inpHash[i] > pTarget[i]) {`
			`return;`
			`}`
			`if (inpHash[i] < pTarget[i]) {`
			`break;`
			`}`
checkhash: some work on a faster variant (wip) This should not be used for all algos... not enabled yet todo: multiple nounces or blake32 style checkup 10 years ago			`}`
checkhash: simplify the common function use klaus trivial function, the old code has always been a bit weird.. split cuda_check_cpu_hash_64 in two functions, keep old for branched stuff 10 years ago			`if (resNounce[0] > nounce)`
			`resNounce[0] = nounce;`
checkhash: some work on a faster variant (wip) This should not be used for all algos... not enabled yet todo: multiple nounces or blake32 style checkup 10 years ago			`}`
			`}`

			`__host__`
cleanup: use unsigned throughput parameters Yes, its a big commit, was waiting 1.6 to do that... Sorry for your possible merge issues ;) 10 years ago			`uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t d_nonceVector, uint32_t d_inputHash, int order)`
checkhash: some work on a faster variant (wip) This should not be used for all algos... not enabled yet todo: multiple nounces or blake32 style checkup 10 years ago			`{`
			`uint32_t result = 0xffffffff;`
Prepare multiple nonces support in one loop (if found) Tested on x11 which find sometimes 3 nonces in one call, actually they are ignored because only the biggest was kept... This commit doesnt fix that, but will allow to enhance shares rate later... 10 years ago			`cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t));`
checkhash: some work on a faster variant (wip) This should not be used for all algos... not enabled yet todo: multiple nounces or blake32 style checkup 10 years ago
cleanup: use unsigned throughput parameters Yes, its a big commit, was waiting 1.6 to do that... Sorry for your possible merge issues ;) 10 years ago			`const uint32_t threadsperblock = 256;`
checkhash: some work on a faster variant (wip) This should not be used for all algos... not enabled yet todo: multiple nounces or blake32 style checkup 10 years ago
checkhash: simplify the common function use klaus trivial function, the old code has always been a bit weird.. split cuda_check_cpu_hash_64 in two functions, keep old for branched stuff 10 years ago			`dim3 grid((threads + threadsperblock-1)/threadsperblock);`
checkhash: some work on a faster variant (wip) This should not be used for all algos... not enabled yet todo: multiple nounces or blake32 style checkup 10 years ago			`dim3 block(threadsperblock);`

Prepare multiple nonces support in one loop (if found) Tested on x11 which find sometimes 3 nonces in one call, actually they are ignored because only the biggest was kept... This commit doesnt fix that, but will allow to enhance shares rate later... 10 years ago			`cuda_check_hash_branch_64 <<<grid, block>>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNonces[thr_id]);`
checkhash: some work on a faster variant (wip) This should not be used for all algos... not enabled yet todo: multiple nounces or blake32 style checkup 10 years ago
checkhash: simplify the common function use klaus trivial function, the old code has always been a bit weird.. split cuda_check_cpu_hash_64 in two functions, keep old for branched stuff 10 years ago			`MyStreamSynchronize(NULL, order, thr_id);`
checkhash: some work on a faster variant (wip) This should not be used for all algos... not enabled yet todo: multiple nounces or blake32 style checkup 10 years ago
Prepare multiple nonces support in one loop (if found) Tested on x11 which find sometimes 3 nonces in one call, actually they are ignored because only the biggest was kept... This commit doesnt fix that, but will allow to enhance shares rate later... 10 years ago			`cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);`
checkhash: some work on a faster variant (wip) This should not be used for all algos... not enabled yet todo: multiple nounces or blake32 style checkup 10 years ago
checkhash: simplify the common function use klaus trivial function, the old code has always been a bit weird.. split cuda_check_cpu_hash_64 in two functions, keep old for branched stuff 10 years ago			`cudaThreadSynchronize();`
Prepare multiple nonces support in one loop (if found) Tested on x11 which find sometimes 3 nonces in one call, actually they are ignored because only the biggest was kept... This commit doesnt fix that, but will allow to enhance shares rate later... 10 years ago			`result = *h_resNonces[thr_id];`
checkhash: some work on a faster variant (wip) This should not be used for all algos... not enabled yet todo: multiple nounces or blake32 style checkup 10 years ago
			`return result;`
Allow test of SM 2.1/3.0 binaries on newer cards Implementation based on klausT work.. a bit different This code must be placed in a common .cu file, cuda.cpp is not compiled with nvcc and doesnt allow cuda code... 10 years ago			`}`

			`/* Function to get the compiled Shader Model version */`
			`int cuda_arch[MAX_GPUS] = { 0 };`
			`__global__`
			`void nvcc_get_arch(int *d_version)`
			`{`
			`#ifdef __CUDA_ARCH__`
			`*d_version = __CUDA_ARCH__;`
			`#endif`
			`}`

			`__host__`
			`int cuda_get_arch(int thr_id)`
			`{`
			`int *d_version;`
			`int dev_id = device_map[thr_id];`
			`if (cuda_arch[dev_id] == 0) {`
			`// only do it once...`
			`cudaMalloc(&d_version, sizeof(int));`
			`nvcc_get_arch <<< 1, 1 >>> (d_version);`
			`cudaMemcpy(&cuda_arch[dev_id], d_version, sizeof(int), cudaMemcpyDeviceToHost);`
			`cudaFree(d_version);`
			`}`
			`return cuda_arch[dev_id];`
			`}`