diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu
index 4e726af..d66751a 100644
--- a/JHA/jackpotcoin.cu
+++ b/JHA/jackpotcoin.cu
@@ -34,6 +34,8 @@ extern void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t st
 											uint32_t *d_nonces2, size_t *nrm2,
 											int order);
 
+extern uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+
 // Speicher zur Generierung der Noncevektoren für die bedingten Hashes
 static uint32_t *d_jackpotNonces[8];
 static uint32_t *d_branch1Nonces[8];
@@ -205,8 +207,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 			quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
 		}
 
-		// Scan nach Gewinner Hashes auf der GPU
-		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
 			unsigned int rounds;
diff --git a/cuda_checkhash.cu b/cuda_checkhash.cu
index e20f81d..aeafcd8 100644
--- a/cuda_checkhash.cu
+++ b/cuda_checkhash.cu
@@ -1,45 +1,16 @@
+/**
+ * This code compares final hash against target
+ */
 #include <stdio.h>
 #include <memory.h>
 
 #include "cuda_helper.h"
 
-// Hash Target gegen das wir testen sollen
 __constant__ uint32_t pTarget[8];
 
 static uint32_t *d_resNounce[8];
 static uint32_t *h_resNounce[8];
 
-__global__
-void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce)
-{
-	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if (thread < threads)
-	{
-		// bestimme den aktuellen Zähler
-		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
-
-		uint32_t hashPosition = (nounce - startNounce) << 4;
-		uint32_t *inpHash = &g_hash[hashPosition];
-		uint32_t hash[8];
-
-		#pragma unroll 8
-		for (int i=0; i < 8; i++)
-			hash[i] = inpHash[i];
-
-		for (int i = 7; i >= 0; i--) {
-			if (hash[i] > pTarget[i]) {
-				return;
-			}
-			if (hash[i] <= pTarget[i]) {
-				break;
-			}
-		}
-		if (resNounce[0] > nounce)
-			resNounce[0] = nounce;
-	}
-}
-
-// Setup-Funktionen
 __host__
 void cuda_check_cpu_init(int thr_id, int threads)
 {
@@ -54,71 +25,134 @@ void cuda_check_cpu_setTarget(const void *ptarget)
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
 }
 
+/* --------------------------------------------------------------------------------------------- */
+
+__device__ __forceinline__
+static bool hashbelowtarget(const uint32_t *const __restrict__ hash, const uint32_t *const __restrict__ target)
+{
+	if (hash[7] > target[7])
+		return false;
+	if (hash[7] < target[7])
+		return true;
+	if (hash[6] > target[6])
+		return false;
+	if (hash[6] < target[6])
+		return true;
+
+	if (hash[5] > target[5])
+		return false;
+	if (hash[5] < target[5])
+		return true;
+	if (hash[4] > target[4])
+		return false;
+	if (hash[4] < target[4])
+		return true;
+
+	if (hash[3] > target[3])
+		return false;
+	if (hash[3] < target[3])
+		return true;
+	if (hash[2] > target[2])
+		return false;
+	if (hash[2] < target[2])
+		return true;
+
+	if (hash[1] > target[1])
+		return false;
+	if (hash[1] < target[1])
+		return true;
+	if (hash[0] > target[0])
+		return false;
+
+	return true;
+}
+
+__global__ __launch_bounds__(512, 4)
+void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNounce)
+{
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		// shl 4 = *16 x 4 (uint32) = 64 bytes
+		uint32_t *inpHash = &hash[thread << 4];
+
+		if (hashbelowtarget(inpHash, pTarget)) {
+			uint32_t nounce = (startNounce + thread);
+			resNounce[0] = nounce;
+		}
+	}
+}
+
 __host__
-uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
+uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash)
 {
 	uint32_t result = 0xffffffff;
 	cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
 
-	const int threadsperblock = 256;
+	const int threadsperblock = 512;
 
-	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 
-	cuda_check_gpu_hash_64 <<<grid, block>>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
+	cuda_checkhash_64 <<<grid, block>>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]);
 
-	// Strategisches Sleep Kommando zur Senkung der CPU Last
-	MyStreamSynchronize(NULL, order, thr_id);
+	cudaThreadSynchronize();
 
-	// Ergebnis zum Host kopieren (in page locked memory, damits schneller geht)
 	cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
-
-	// cudaMemcpy() ist asynchron!
-	cudaThreadSynchronize();
 	result = *h_resNounce[thr_id];
 
 	return result;
 }
 
+/* --------------------------------------------------------------------------------------------- */
+
 __global__
-void cuda_check_gpu_hash_fast(int threads, uint32_t startNounce, uint32_t *hashEnd, uint32_t *resNounce)
+void cuda_check_hash_branch_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		/* only test the last 2 dwords, ok for most algos */
-		int hashPos = thread << 4;
-		uint32_t *inpHash = &hashEnd[hashPos];
+		uint32_t nounce = g_nonceVector[thread];
+		uint32_t hashPosition = (nounce - startNounce) << 4;
+		uint32_t *inpHash = &g_hash[hashPosition];
+		//uint32_t hash[8];
 
-		if (inpHash[7] <= pTarget[7] && inpHash[6] <= pTarget[6]) {
-			uint32_t nounce = (startNounce + thread);
-			if (resNounce[0] > nounce)
-				resNounce[0] = nounce;
+		//#pragma unroll 8
+		//for (int i=0; i < 8; i++)
+		//	hash[i] = inpHash[i];
+
+		for (int i = 7; i >= 0; i--) {
+			if (inpHash[i] > pTarget[i]) {
+				return;
+			}
+			if (inpHash[i] < pTarget[i]) {
+				break;
+			}
 		}
+		if (resNounce[0] > nounce)
+			resNounce[0] = nounce;
 	}
 }
 
 __host__
-uint32_t cuda_check_hash_fast(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, int order)
+uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
 {
 	uint32_t result = 0xffffffff;
 	cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
 
 	const int threadsperblock = 256;
 
-	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	cuda_check_gpu_hash_fast <<<grid, block>>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]);
+	cuda_check_hash_branch_64 <<<grid, block>>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
 
-	// MyStreamSynchronize(NULL, order, thr_id);
-	cudaThreadSynchronize();
+	MyStreamSynchronize(NULL, order, thr_id);
 
 	cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 
-	// cudaMemcpy() was asynchron ?
-	// cudaThreadSynchronize();
+	cudaThreadSynchronize();
 	result = *h_resNounce[thr_id];
 
 	return result;
-}
+}
\ No newline at end of file
diff --git a/cuda_helper.h b/cuda_helper.h
index f22100e..255ce45 100644
--- a/cuda_helper.h
+++ b/cuda_helper.h
@@ -19,8 +19,7 @@ extern "C"  long device_sm[8];
 // common functions
 extern void cuda_check_cpu_init(int thr_id, int threads);
 extern void cuda_check_cpu_setTarget(const void *ptarget);
-extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
-extern uint32_t cuda_check_hash_fast(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, int order);
+extern uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash);
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func);
 
diff --git a/cuda_nist5.cu b/cuda_nist5.cu
index d0c85ad..ad4a21f 100644
--- a/cuda_nist5.cu
+++ b/cuda_nist5.cu
@@ -11,7 +11,6 @@ extern "C"
 
 #include "cuda_helper.h"
 
-// Speicher für Input/Output der verketteten Hashfunktionen
 static uint32_t *d_hash[8];
 
 extern void quark_blake512_cpu_init(int thr_id, int threads);
@@ -113,8 +112,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
 		quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 
-		// Scan nach Gewinner Hashes auf der GPU
-		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if  (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
diff --git a/quark/animecoin.cu b/quark/animecoin.cu
index e526f4c..e3205ad 100644
--- a/quark/animecoin.cu
+++ b/quark/animecoin.cu
@@ -48,7 +48,9 @@ extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads,
 											uint32_t *d_nonces1, size_t *nrm1,
 											int order);
 
-// Original Quarkhash Funktion aus einem miner Quelltext
+extern uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+
+/* CPU Hash */
 extern "C" void animehash(void *state, const void *input)
 {
     sph_blake512_context ctx_blake;
@@ -255,7 +257,7 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
 		quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
 
 		// Scan nach Gewinner Hashes auf der GPU
-		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu
index 337d14e..f52337f 100644
--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@@ -50,6 +50,8 @@ extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads,
 											uint32_t *d_nonces1, size_t *nrm1,
 											int order);
 
+extern uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+
 // Original Quarkhash Funktion aus einem miner Quelltext
 extern "C" void quarkhash(void *state, const void *input)
 {
@@ -227,7 +229,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
 		quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
 
 		// Scan nach Gewinner Hashes auf der GPU
-		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
 			const uint32_t Htarg = ptarget[7];
diff --git a/qubit/deep.cu b/qubit/deep.cu
index e99818a..407ce25 100644
--- a/qubit/deep.cu
+++ b/qubit/deep.cu
@@ -93,7 +93,8 @@ extern "C" int scanhash_deep(int thr_id, uint32_t *pdata,
 		qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+
+		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
diff --git a/qubit/qubit.cu b/qubit/qubit.cu
index 205e6f2..92375bf 100644
--- a/qubit/qubit.cu
+++ b/qubit/qubit.cu
@@ -120,7 +120,7 @@ extern "C" int scanhash_qubit(int thr_id, uint32_t *pdata,
 		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 
-		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
diff --git a/x11/fresh.cu b/x11/fresh.cu
index a6cbb8c..40c2455 100644
--- a/x11/fresh.cu
+++ b/x11/fresh.cu
@@ -123,7 +123,7 @@ extern "C" int scanhash_fresh(int thr_id, uint32_t *pdata,
 		print_hash((unsigned char*)buf); printf("\n");
 #endif
 
-		foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
diff --git a/x11/s3.cu b/x11/s3.cu
index ce608bf..4997f2d 100644
--- a/x11/s3.cu
+++ b/x11/s3.cu
@@ -99,7 +99,7 @@ extern "C" int scanhash_s3(int thr_id, uint32_t *pdata,
 		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 
-		foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 
 		if (foundNonce != 0xffffffff)
 		{
diff --git a/x11/x11.cu b/x11/x11.cu
index 68b71e4..6c77c34 100644
--- a/x11/x11.cu
+++ b/x11/x11.cu
@@ -58,10 +58,6 @@ extern void quark_compactTest_cpu_init(int thr_id, int threads);
 extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
                                           uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order);
 
-// to check... new sp method
-//extern void x11_echo512_cpu_setTarget(const void *ptarget);
-//extern uint32_t x11_echo512_cpu_hash_64_final(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
-
 // X11 CPU Hash
 extern "C" void x11hash(void *output, const void *input)
 {
@@ -172,7 +168,6 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 
 	quark_blake512_cpu_setBlock_80((void*)endiandata);
 
-	//x11_echo512_cpu_setTarget(ptarget);
 	cuda_check_cpu_setTarget(ptarget);
 
 	do {
@@ -193,11 +188,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 
-		// todo...
-		//foundNonce = x11_echo512_cpu_hash_64_final(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		//foundNonce = cuda_check_hash_fast(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-
-		foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
diff --git a/x13/x13.cu b/x13/x13.cu
index 5a73bd1..fb204f1 100644
--- a/x13/x13.cu
+++ b/x13/x13.cu
@@ -208,8 +208,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
 		x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 
-		// Scan nach Gewinner Hashes auf der GPU
-		foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if  (foundNonce != 0xffffffff)
 		{
 			const uint32_t Htarg = ptarget[7];
diff --git a/x15/x14.cu b/x15/x14.cu
index ec50c42..39c6004 100644
--- a/x15/x14.cu
+++ b/x15/x14.cu
@@ -215,7 +215,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
 		x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 
-		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != 0xffffffff)
 		{
 			/* check now with the CPU to confirm */
diff --git a/x15/x15.cu b/x15/x15.cu
index 00ff0a5..c9bff02 100644
--- a/x15/x15.cu
+++ b/x15/x15.cu
@@ -229,7 +229,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 		x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 
 		/* Scan with GPU */
-		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 
 		if (foundNonce != 0xffffffff)
 		{
diff --git a/x17/x17.cu b/x17/x17.cu
index 17cb73e..1809a70 100644
--- a/x17/x17.cu
+++ b/x17/x17.cu
@@ -253,7 +253,7 @@ extern "C" int scanhash_x17(int thr_id, uint32_t *pdata,
 		x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 
-		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];