// ccminer-gostd-lite/cuda_checkhash.cu

#include <stdio.h>
#include <memory.h>

#include "cuda_helper.h"

// Hash target that candidate hashes are tested against: eight 32-bit words,
// uploaded by cuda_check_cpu_setTarget() and read by cuda_check_gpu_hash_64().
__constant__ uint32_t pTarget[8];

// Result buffers indexed by thr_id, each holding a single uint32_t nonce.
// d_resNounce[i] is allocated with cudaMalloc and h_resNounce[i] with
// cudaMallocHost in cuda_check_cpu_init(). The fixed array size means only
// thr_id values 0..7 have a slot.
static uint32_t *d_resNounce[8];
static uint32_t *h_resNounce[8];

/**
 * Compares one hash per thread against the target in pTarget and records the
 * smallest nonce whose hash is less than or equal to the target.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per candidate. Threads whose
 * flat index is >= `threads` do nothing. No shared memory is used.
 *
 * @param threads        number of candidates to check
 * @param startNounce    nonce of the first candidate; hash slots are indexed
 *                       relative to it (slot = nonce - startNounce)
 * @param g_nonceVector  optional per-thread nonce list; when NULL the nonce is
 *                       startNounce + thread index
 * @param g_hash         hash storage, 16 uint32_t words per slot; only the
 *                       first 8 words of a slot are compared
 * @param resNounce      resNounce[0] is lowered to the nonce of every passing
 *                       candidate, so it ends up holding the minimum
 */
__global__
void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce)
{
	const int thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread >= threads)
		return;

	// Determine the nonce this thread is responsible for.
	const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);

	// Locate the hash slot. The offset is computed in size_t so that the
	// multiplication by 16 cannot overflow a signed int for large slot indices.
	const uint32_t hashPosition = nounce - startNounce;
	const uint32_t *inpHash = &g_hash[(size_t)hashPosition << 4];

	uint32_t hash[8];

	#pragma unroll 8
	for (int i = 0; i < 8; i++)
		hash[i] = inpHash[i];

	// Compare from word 7 down to word 0: the first differing word decides.
	// If all eight words are equal the hash is accepted as well.
	for (int i = 7; i >= 0; i--) {
		if (hash[i] > pTarget[i])
			return;   // above the target: reject
		if (hash[i] < pTarget[i])
			break;    // below the target: accept
	}

	// Several threads may pass at the same time. A plain read-compare-write
	// on resNounce[0] races between them, so the minimum is taken atomically.
	atomicMin(resNounce, nounce);
}

// Setup functions

/**
 * Allocates the result buffers for one thr_id: a single uint32_t in
 * page-locked host memory (h_resNounce[thr_id]) and a single uint32_t in
 * device memory (d_resNounce[thr_id]).
 *
 * Failures are reported on stderr and the affected pointer is set to NULL.
 *
 * @param thr_id   index into the result buffer arrays; must be a valid slot
 * @param threads  unused here; kept for interface compatibility
 */
__host__
void cuda_check_cpu_init(int thr_id, int threads)
{
	(void)threads;

	// The buffer arrays have a fixed number of slots; refuse to write past them.
	const int slotCount = (int)(sizeof(d_resNounce) / sizeof(d_resNounce[0]));
	if (thr_id < 0 || thr_id >= slotCount) {
		fprintf(stderr, "cuda_check_cpu_init: thr_id %d out of range (0..%d)\n", thr_id, slotCount - 1);
		return;
	}

	cudaError_t err = cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t));
	if (err != cudaSuccess) {
		fprintf(stderr, "cuda_check_cpu_init: cudaMallocHost failed for thr_id %d: %s\n", thr_id, cudaGetErrorString(err));
		h_resNounce[thr_id] = NULL;
	}

	err = cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t));
	if (err != cudaSuccess) {
		fprintf(stderr, "cuda_check_cpu_init: cudaMalloc failed for thr_id %d: %s\n", thr_id, cudaGetErrorString(err));
		d_resNounce[thr_id] = NULL;
	}
}

/**
 * Sets the target difficulty: copies eight 32-bit words from host memory
 * into the device constant pTarget.
 *
 * A failed copy is reported on stderr; the previous target then stays in place.
 *
 * @param ptarget  host pointer to at least 8 * sizeof(uint32_t) readable bytes
 */
__host__
void cuda_check_cpu_setTarget(const void *ptarget)
{
	const size_t targetBytes = 8*sizeof(uint32_t);

	// Upload the target words to constant memory on the GPU.
	const cudaError_t err = cudaMemcpyToSymbol(pTarget, ptarget, targetBytes, 0, cudaMemcpyHostToDevice);
	if (err != cudaSuccess)
		fprintf(stderr, "cuda_check_cpu_setTarget: cudaMemcpyToSymbol failed: %s\n", cudaGetErrorString(err));
}

/**
 * Runs cuda_check_gpu_hash_64 over `threads` candidates and returns the
 * smallest nonce whose hash meets the target.
 *
 * @param thr_id         selects the result buffers allocated by cuda_check_cpu_init
 * @param threads        number of candidates to check
 * @param startNounce    nonce of the first candidate
 * @param d_nonceVector  device pointer to an optional nonce list, or NULL
 * @param d_inputHash    device pointer to the hashes (16 uint32_t words per slot)
 * @param order          forwarded to MyStreamSynchronize
 * @return the smallest matching nonce, or 0xffffffff when no candidate matched,
 *         when there was nothing to check, or when the result could not be
 *         fetched from the device
 */
__host__
uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
{
	const uint32_t notFound = 0xffffffff;

	// Nothing to check; a zero-block launch would only produce a launch error.
	if (threads <= 0)
		return notFound;

	// Preset the device result to the "not found" sentinel (all bytes 0xff).
	cudaError_t err = cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
	if (err != cudaSuccess) {
		fprintf(stderr, "cuda_check_cpu_hash_64: cudaMemset failed: %s\n", cudaGetErrorString(err));
		return notFound;
	}

	const int threadsperblock = 256;

	// Work out how many thread blocks are needed (ceiling division).
	dim3 grid((threads + threadsperblock-1)/threadsperblock);
	dim3 block(threadsperblock);

	// The kernel uses no dynamic shared memory.
	cuda_check_gpu_hash_64 <<<grid, block>>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);

	// Report launch problems, but keep going: the reported error may have been
	// left behind by an earlier call, and the copy below is checked on its own.
	err = cudaGetLastError();
	if (err != cudaSuccess)
		fprintf(stderr, "cuda_check_cpu_hash_64: kernel launch reported: %s\n", cudaGetErrorString(err));

	// Strategic sleep/synchronize call intended to lower CPU load.
	MyStreamSynchronize(NULL, order, thr_id);

	// Copy the result to the host (into page-locked memory).
	// cudaMemcpy() blocks until a device-to-host copy has completed, so a
	// failure here must not fall through to reading a stale host value.
	err = cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
	if (err != cudaSuccess) {
		fprintf(stderr, "cuda_check_cpu_hash_64: cudaMemcpy failed: %s\n", cudaGetErrorString(err));
		return notFound;
	}

	// Device-wide barrier kept from the original code; cudaDeviceSynchronize()
	// is the non-deprecated replacement for cudaThreadSynchronize().
	cudaDeviceSynchronize();

	return *h_resNounce[thr_id];
}
Revision 0.6 with myriad-groestl and jackpot coin 10 years ago			`#include <stdio.h>`
			`#include <memory.h>`

Remove duplicated defines present in cuda_helper.h also add cudaDeviceReset() on Ctrl+C for nvprof 10 years ago			`#include "cuda_helper.h"`

Implement x14 (cuda + cpu functions) Project was updated for VS2013 and CUDA SDK 6.5 add also a --cputest function to dump cpu hash results TODO: x15 is not fully functional, but first loop seems ok Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`// Hash Target gegen das wir testen sollen`
Revision 0.6 with myriad-groestl and jackpot coin 10 years ago			`__constant__ uint32_t pTarget[8];`

add "blake" 256, 14 rounds (for NEOS blake, not BlakeCoin) also remove "missing" file, its old and not compatible with ubuntu 14.04 10 years ago			`static uint32_t *d_resNounce[8];`
			`static uint32_t *h_resNounce[8];`
Revision 0.6 with myriad-groestl and jackpot coin 10 years ago
Move common check_cpu functions to root 10 years ago			`__global__`
			`void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32_t g_nonceVector, uint32_t g_hash, uint32_t *resNounce)`
Revision 0.6 with myriad-groestl and jackpot coin 10 years ago			`{`
			`int thread = (blockDim.x * blockIdx.x + threadIdx.x);`
			`if (thread < threads)`
			`{`
Move common check_cpu functions to root 10 years ago			`// bestimme den aktuellen Zähler`
Revision 0.6 with myriad-groestl and jackpot coin 10 years ago			`uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);`

			`int hashPosition = nounce - startNounce;`
Move common check_cpu functions to root 10 years ago			`uint32_t *inpHash = &g_hash[hashPosition<<4];`
Revision 0.6 with myriad-groestl and jackpot coin 10 years ago
			`uint32_t hash[8];`
Move common check_cpu functions to root 10 years ago
			`#pragma unroll 8`
Revision 0.6 with myriad-groestl and jackpot coin 10 years ago			`for (int i=0; i < 8; i++)`
			`hash[i] = inpHash[i];`

Move common check_cpu functions to root 10 years ago			`for (int i = 7; i >= 0; i--) {`
Revision 0.6 with myriad-groestl and jackpot coin 10 years ago			`if (hash[i] > pTarget[i]) {`
Move common check_cpu functions to root 10 years ago			`return;`
			`}`
			`if (hash[i] < pTarget[i]) {`
			`break;`
			`}`
Revision 0.6 with myriad-groestl and jackpot coin 10 years ago			`}`

Move common check_cpu functions to root 10 years ago			`if(resNounce[0] > nounce)`
			`resNounce[0] = nounce;`
Revision 0.6 with myriad-groestl and jackpot coin 10 years ago			`}`
			`}`

			`// Setup-Funktionen`
Move common check_cpu functions to root 10 years ago			`__host__`
			`void cuda_check_cpu_init(int thr_id, int threads)`
Revision 0.6 with myriad-groestl and jackpot coin 10 years ago			`{`
			`cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t));`
			`cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t));`
			`}`

			`// Target Difficulty setzen`
Move common check_cpu functions to root 10 years ago			`__host__`
			`void cuda_check_cpu_setTarget(const void *ptarget)`
Revision 0.6 with myriad-groestl and jackpot coin 10 years ago			`{`
			`// die Message zur Berechnung auf der GPU`
Move common check_cpu functions to root 10 years ago			`cudaMemcpyToSymbol(pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);`
Revision 0.6 with myriad-groestl and jackpot coin 10 years ago			`}`

Move common check_cpu functions to root 10 years ago			`__host__`
			`uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t d_nonceVector, uint32_t d_inputHash, int order)`
Revision 0.6 with myriad-groestl and jackpot coin 10 years ago			`{`
			`uint32_t result = 0xffffffff;`
			`cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));`

			`const int threadsperblock = 256;`

			`// berechne wie viele Thread Blocks wir brauchen`
			`dim3 grid((threads + threadsperblock-1)/threadsperblock);`
			`dim3 block(threadsperblock);`

Move common check_cpu functions to root 10 years ago			`// Größe des dynamischen Shared Memory Bereichs`
Revision 0.6 with myriad-groestl and jackpot coin 10 years ago			`size_t shared_size = 0;`

Implement x14 (cuda + cpu functions) Project was updated for VS2013 and CUDA SDK 6.5 add also a --cputest function to dump cpu hash results TODO: x15 is not fully functional, but first loop seems ok Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com> 10 years ago			`cuda_check_gpu_hash_64 <<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);`
Revision 0.6 with myriad-groestl and jackpot coin 10 years ago
			`// Strategisches Sleep Kommando zur Senkung der CPU Last`
			`MyStreamSynchronize(NULL, order, thr_id);`

			`// Ergebnis zum Host kopieren (in page locked memory, damits schneller geht)`
			`cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);`

			`// cudaMemcpy() ist asynchron!`
			`cudaThreadSynchronize();`
			`result = *h_resNounce[thr_id];`

			`return result;`
			`}`