ccminer/quark/cuda_checkhash.cu

#include <stdio.h>
#include <memory.h>

#include "cuda_helper.h"

// Hash target that candidate hashes are tested against
__constant__ uint32_t pTarget[8];

static uint32_t *d_resNounce[8];
static uint32_t *h_resNounce[8];

// from heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

/**
 * Compares one hash per thread against the target in pTarget and records the
 * smallest nonce whose hash is less than or equal to the target.
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per candidate; threads
 * with a global index >= `threads` do nothing. No shared memory is used.
 *
 * @param threads        number of candidates to check
 * @param startNounce    first nonce of the batch; also the base used to
 *                       locate a nonce's hash inside g_hash
 * @param g_nonceVector  optional per-thread nonce list; when NULL, thread t
 *                       checks nonce startNounce + t
 * @param g_hash         hash buffer; the hash for a nonce starts at word
 *                       16 * (nonce - startNounce), and only its first 8
 *                       words are compared
 * @param resNounce      resNounce[0] receives the minimum matching nonce; it
 *                       is only ever lowered, so the caller must preset it to
 *                       0xffffffff
 */
__global__ void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce)
{
	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		// determine the nonce this thread is responsible for
		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);

		// index in size_t so 16 * offset cannot overflow a signed int
		const uint32_t hashPosition = nounce - startNounce;
		const uint32_t *inpHash = &g_hash[(size_t)16 * hashPosition];

		uint32_t hash[8];
#pragma unroll 8
		for (int i = 0; i < 8; i++)
			hash[i] = inpHash[i];

		// Multi-word compare: word 7 is checked first, and the first word
		// that differs from the target decides. A hash equal to the target
		// in all 8 words counts as a match.
		bool rc = true;
#pragma unroll 8
		for (int i = 7; i >= 0; i--) {
			if (hash[i] != pTarget[i]) {
				rc = (hash[i] < pTarget[i]);
				break;
			}
		}

		// Many threads may match at once; atomicMin makes the
		// "keep the smallest nonce" update race-free.
		if (rc)
			atomicMin(resNounce, nounce);
	}
}

// Setup functions
/**
 * Allocates the result buffers for one slot: a single uint32_t in page-locked
 * host memory (h_resNounce[thr_id]) and a single uint32_t in device memory
 * (d_resNounce[thr_id]).
 *
 * @param thr_id   slot index into h_resNounce / d_resNounce
 * @param threads  not used by this function; kept so existing callers compile
 *
 * An out-of-range thr_id or a failed allocation is reported on stderr; a slot
 * whose allocation failed is left NULL.
 */
__host__ void cuda_check_cpu_init(int thr_id, int threads)
{
    (void)threads;

    // derive the slot count from the array itself so the check cannot drift
    const int maxSlots = (int)(sizeof(d_resNounce) / sizeof(d_resNounce[0]));
    if (thr_id < 0 || thr_id >= maxSlots) {
        fprintf(stderr, "cuda_check_cpu_init: thr_id %d out of range (0..%d)\n", thr_id, maxSlots - 1);
        return;
    }

    cudaError_t err = cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t));
    if (err != cudaSuccess) {
        fprintf(stderr, "cuda_check_cpu_init: cudaMallocHost failed: %s\n", cudaGetErrorString(err));
        h_resNounce[thr_id] = NULL;
    }

    err = cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t));
    if (err != cudaSuccess) {
        fprintf(stderr, "cuda_check_cpu_init: cudaMalloc failed: %s\n", cudaGetErrorString(err));
        d_resNounce[thr_id] = NULL;
    }
}

// Set the target difficulty
/**
 * Uploads the comparison target into the device constant pTarget.
 *
 * @param ptarget  host pointer to 8 consecutive uint32_t words
 *                 (8 * sizeof(uint32_t) bytes are read from it)
 *
 * A NULL pointer or a failed copy is reported on stderr; in either case the
 * previous contents of pTarget are not guaranteed to have been replaced.
 */
__host__ void cuda_check_cpu_setTarget(const void *ptarget)
{
	if (ptarget == NULL) {
		fprintf(stderr, "cuda_check_cpu_setTarget: NULL target pointer\n");
		return;
	}

	// copy the 8-word target into constant memory for the check kernel
	cudaError_t err = cudaMemcpyToSymbol( pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
	if (err != cudaSuccess)
		fprintf(stderr, "cuda_check_cpu_setTarget: cudaMemcpyToSymbol failed: %s\n", cudaGetErrorString(err));
}

/**
 * Runs cuda_check_gpu_hash_64 over a batch of candidates and returns the
 * smallest nonce whose hash meets the target, or 0xffffffff when none does.
 *
 * @param thr_id         slot index selecting d_resNounce / h_resNounce
 * @param threads        number of candidates; one kernel thread is launched
 *                       per candidate, in blocks of 256
 * @param startNounce    first nonce of the batch
 * @param d_nonceVector  optional device array of nonces to check, or NULL to
 *                       check startNounce + thread index
 * @param d_inputHash    device buffer holding the hashes to test
 * @param order          forwarded unchanged to MyStreamSynchronize()
 * @return the minimum matching nonce, or 0xffffffff if nothing matched, the
 *         arguments were unusable, or a CUDA call failed (failures are also
 *         reported on stderr)
 */
__host__ uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
{
	const uint32_t notFound = 0xffffffff;

	const int maxSlots = (int)(sizeof(d_resNounce) / sizeof(d_resNounce[0]));
	if (thr_id < 0 || thr_id >= maxSlots) {
		fprintf(stderr, "cuda_check_cpu_hash_64: thr_id %d out of range (0..%d)\n", thr_id, maxSlots - 1);
		return notFound;
	}
	if (d_resNounce[thr_id] == NULL || h_resNounce[thr_id] == NULL) {
		fprintf(stderr, "cuda_check_cpu_hash_64: result buffers for thr_id %d are not allocated\n", thr_id);
		return notFound;
	}
	// nothing to check; a zero-sized grid would be an invalid launch
	if (threads <= 0)
		return notFound;

	// preset the device result to 0xffffffff ("no nonce found")
	cudaError_t err = cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
	if (err != cudaSuccess) {
		fprintf(stderr, "cuda_check_cpu_hash_64: cudaMemset failed: %s\n", cudaGetErrorString(err));
		return notFound;
	}

	const int threadsperblock = 256;

	// number of thread blocks needed to cover all candidates (ceil division)
	dim3 grid((threads + threadsperblock-1)/threadsperblock);
	dim3 block(threadsperblock);

	// size of the dynamic shared memory region (none is used)
	size_t shared_size = 0;

	cuda_check_gpu_hash_64 <<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
	err = cudaGetLastError();
	if (err != cudaSuccess) {
		fprintf(stderr, "cuda_check_cpu_hash_64: kernel launch failed: %s\n", cudaGetErrorString(err));
		return notFound;
	}

	// strategic sleep/synchronize helper intended to lower CPU load
	// (implemented outside this file)
	MyStreamSynchronize(NULL, order, thr_id);

	// Copy the result into page-locked host memory. A device-to-host
	// cudaMemcpy returns only after the copy has completed, so no further
	// synchronization is needed; errors from the kernel's execution also
	// surface through this call.
	err = cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
	if (err != cudaSuccess) {
		fprintf(stderr, "cuda_check_cpu_hash_64: cudaMemcpy failed: %s\n", cudaGetErrorString(err));
		return notFound;
	}

	return *h_resNounce[thr_id];
}
-												Revision 0.6 with myriad-groestl and jackpot coin

											
										
										
											11 years ago
+								#include <stdio.h>
 								#include <memory.h>
-												Remove duplicated defines present in cuda_helper.h

also add cudaDeviceReset() on Ctrl+C for nvprof

											
										
										
											10 years ago
+								#include "cuda_helper.h"
-												Implement x14 (cuda + cpu functions)

Project was updated for VS2013 and CUDA SDK 6.5

add also a --cputest function to dump cpu hash results

TODO: x15 is not fully functional, but first loop seems ok

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+								// Hash Target gegen das wir testen sollen
-												Revision 0.6 with myriad-groestl and jackpot coin

											
										
										
											11 years ago
+								__constant__ uint32_t pTarget[8];
-												Release v1.4 with blake (NEOS)

Blake256: squashed commit...

Squashed commit of the following:

commit c370208bc92ef16557f66e5391faf2b1ad47726f
Author: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date:   Wed Sep 3 13:53:01 2014 +0200

    hashlog: prepare store of scanned range

commit e2cf49a5e956f03deafd266d1a0dd087a2041c99
Author: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date:   Wed Sep 3 12:54:13 2014 +0200

    stratum: store server time offset in context

commit 1a4391d7ff21397a128abf031f92733a8ac47437
Author: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date:   Tue Sep 2 12:40:52 2014 +0200

    hashlog: prevent double computing on jobs already done

commit 049e57730116685755bd3ff214f0793cce7c773b
Author: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date:   Wed Sep 3 09:49:14 2014 +0200

    tmp blake log

commit 43d3e93e1a97e569ead2437f759c6b8423d30c0a
Author: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date:   Wed Sep 3 09:29:51 2014 +0200

    blake: set a max throughput

commit 7e595a36ea69027c8a28023399540a761e7686c3
Author: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date:   Tue Sep 2 21:13:37 2014 +0200

    blake: cleanup, remove d_hash buf, not in a chain

    host: only bencode if gpu hash was found

commit de80c7e9d1448f15541d08c5dbbf372d5bfeba48
Author: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date:   Tue Sep 2 12:40:44 2014 +0200

    blake: remove unused parameter and fix index in d_hash

    that reduce the speed to 92MH/s but the next commit
    give us 30 more

    so, todo: merge the whole checkhash proc in gpu_hash
              and remove this d_hash buffer...

commit 2d42ae6de586a6ae8cbfd01806a273fd5cc4b262
Author: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date:   Tue Sep 2 05:09:31 2014 +0200

    stratum: handle a small cache of submitted jobs

    Prevent to send duplicated shares on some pools like hashharder..

    This cache keeps submitted job/nounces of the last 15 minutes

    so, remove exit on repeated duplicate shares,
        the submitted cache now handles this problem.

    Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

commit 1b8c3c12fa5bb83afbb02f9d5f60586939f36d86
Author: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date:   Tue Sep 2 03:38:57 2014 +0200

    debug: a new boolean to log or not json rpc data

commit 1f99aae0ff621f4f85f119d811a3f1a8d2204f60
Author: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date:   Mon Sep 1 18:49:23 2014 +0200

    exit on repeated duplicate shares (to enhance)

    create a new function proper_exit() to do common stuff on exit...

commit 530732458add6c4c3836606d028930f3581c0a5f
Author: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date:   Mon Sep 1 12:22:51 2014 +0200

    blake: use a constant for threads, reduce mallocated d_hash size

    and clean a bit more...

commit 0aeac878ef60840f3123354037cd56a89d2e94e6
Author: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date:   Mon Sep 1 06:12:55 2014 +0200

    blake: tune up and cleanup, ~100 MH/s on a normal 750Ti

    tested on linux and windows (x86 binary)...

    but there is a high number of duplicated shares... weird

commit 4a52d0553b0076b984be480725fa67689c544647
Author: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date:   Mon Sep 1 10:22:32 2014 +0200

    debug: show json methods, hide hash/target if ok

commit 1fb9becc1f2b6a15d8ccea4d8314df9ddf0af4ed
Author: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date:   Mon Sep 1 08:44:19 2014 +0200

    cpu-miner: sort algos by name, show reject reason

commit bfe96c49b0bf321ed0776cb1cf31c4fe8a0a8b8d
Author: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date:   Mon Aug 25 11:21:06 2014 +0200

    release 1.4, update README...

commit c17d11e37758c37762a7664a731fda6e9a5454b1
Author: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date:   Sun Aug 31 08:57:48 2014 +0200

    add "blake" 256, 14 rounds (for NEOS blake, not BlakeCoin)

    also remove "missing" file, its old and not compatible with ubuntu 14.04

to test on windows

blake: clean and optimize

Release v1.4 with blake (NEOS)

											
										
										
											10 years ago
+								static uint32_t *d_resNounce[8];
 								static uint32_t *h_resNounce[8];
-												Revision 0.6 with myriad-groestl and jackpot coin

											
										
										
											11 years ago
 								// aus heavy.cu
 								extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-												Implement x14 (cuda + cpu functions)

Project was updated for VS2013 and CUDA SDK 6.5

add also a --cputest function to dump cpu hash results

TODO: x15 is not fully functional, but first loop seems ok

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+								__global__ void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce)
-												Revision 0.6 with myriad-groestl and jackpot coin

											
										
										
											11 years ago
+								{
 									int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 									if (thread < threads)
 									{
 										// bestimme den aktuellen Z<EFBFBD>hler
 										uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
 										int hashPosition = nounce - startNounce;
 										uint32_t *inpHash = &g_hash[16 * hashPosition];
 										uint32_t hash[8];
 								#pragma unroll 8
 										for (int i=0; i < 8; i++)
 											hash[i] = inpHash[i];
 										// kopiere Ergebnis
 										int i, position = -1;
 										bool rc = true;
 								#pragma unroll 8
 										for (i = 7; i >= 0; i--) {
 											if (hash[i] > pTarget[i]) {
 												if(position < i) {
 													position = i;
 													rc = false;
 												}
 									 		}
 									 		if (hash[i] < pTarget[i]) {
 												if(position < i) {
 													position = i;
 													rc = true;
 												}
 									 		}
 										}
 										if(rc == true)
 											if(resNounce[0] > nounce)
 												resNounce[0] = nounce;
 									}
 								}
 								// Setup-Funktionen
-												Remove duplicated defines present in cuda_helper.h

also add cudaDeviceReset() on Ctrl+C for nvprof

											
										
										
											10 years ago
+								__host__ void cuda_check_cpu_init(int thr_id, int threads)
-												Revision 0.6 with myriad-groestl and jackpot coin

											
										
										
											11 years ago
+								{
 								    cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t));
 								    cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t));
 								}
 								// Target Difficulty setzen
-												Remove duplicated defines present in cuda_helper.h

also add cudaDeviceReset() on Ctrl+C for nvprof

											
										
										
											10 years ago
+								__host__ void cuda_check_cpu_setTarget(const void *ptarget)
-												Revision 0.6 with myriad-groestl and jackpot coin

											
										
										
											11 years ago
+								{
 									// die Message zur Berechnung auf der GPU
 									cudaMemcpyToSymbol( pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
 								}
-												Remove duplicated defines present in cuda_helper.h

also add cudaDeviceReset() on Ctrl+C for nvprof

											
										
										
											10 years ago
+								__host__ uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
-												Revision 0.6 with myriad-groestl and jackpot coin

											
										
										
											11 years ago
+								{
 									uint32_t result = 0xffffffff;
 									cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
 									const int threadsperblock = 256;
 									// berechne wie viele Thread Blocks wir brauchen
 									dim3 grid((threads + threadsperblock-1)/threadsperblock);
 									dim3 block(threadsperblock);
 									// Gr<EFBFBD><EFBFBD>e des dynamischen Shared Memory Bereichs
 									size_t shared_size = 0;
-												Implement x14 (cuda + cpu functions)

Project was updated for VS2013 and CUDA SDK 6.5

add also a --cputest function to dump cpu hash results

TODO: x15 is not fully functional, but first loop seems ok

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>

											
										
										
											10 years ago
+									cuda_check_gpu_hash_64 <<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
-												Revision 0.6 with myriad-groestl and jackpot coin

											
										
										
											11 years ago
 									// Strategisches Sleep Kommando zur Senkung der CPU Last
 									MyStreamSynchronize(NULL, order, thr_id);
 									// Ergebnis zum Host kopieren (in page locked memory, damits schneller geht)
 									cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 									// cudaMemcpy() ist asynchron!
 									cudaThreadSynchronize();
 									result = *h_resNounce[thr_id];
 									return result;
 								}