Remove duplicated defines present in cuda_helper.h

also add cudaDeviceReset() on Ctrl+C for nvprof
10 years ago · d9ea5f72ce
44 changed files with 1129 additions and 1619 deletions
--- a/JHA/cuda_jha_compactionTest.cu
+++ b/JHA/cuda_jha_compactionTest.cu
@ -1,11 +1,8 @@
				@@ -1,11 +1,8 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-#include "sm_30_intrinsics.h"
-
 #include <stdio.h>
 #include <memory.h>
-#include <stdint.h>
+
+#include "cuda_helper.h"
+#include <sm_30_intrinsics.h>

 // aus cpu-miner.c
 extern int device_map[8];
@ -60,7 +57,7 @@ __host__ void jackpot_compactTest_cpu_init(int thr_id, int threads)
				@@ -60,7 +57,7 @@ __host__ void jackpot_compactTest_cpu_init(int thr_id, int threads)
 	cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
 }

-#if __CUDA_ARCH__ < 300
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
 /**
 * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
 */
--- a/JHA/cuda_jha_keccak512.cu
+++ b/JHA/cuda_jha_keccak512.cu
@ -1,16 +1,7 @@
				@@ -1,16 +1,7 @@
-
-
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
+#include "cuda_helper.h"

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
@ -18,28 +9,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
				@@ -18,28 +9,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
 __constant__ uint64_t c_State[25];
 __constant__ uint32_t c_PaddedMessage[18];

-static __device__ uint32_t cuda_swab32(uint32_t x)
-{
-	return __byte_perm(x, 0, 0x0123);
-}
-
-// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
-#if __CUDA_ARCH__ >= 350
-__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
-    uint2 result;
-    if(offset >= 32) {
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-    } else {
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-    }
-    return  __double_as_longlong(__hiloint2double(result.y, result.x));
-}
-#else
-#define ROTL64(x, n)        (((x) << (n)) | ((x) >> (64 - (n))))
-#endif
-
 #define U32TO64_LE(p) \
    (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))

--- a/JHA/jackpotcoin.cu
+++ b/JHA/jackpotcoin.cu
@ -1,4 +1,3 @@
				@@ -1,4 +1,3 @@
-
 extern "C"
 {
 #include "sph/sph_keccak.h"
@ -7,10 +6,9 @@ extern "C"
				@@ -7,10 +6,9 @@ extern "C"
 #include "sph/sph_jh.h"
 #include "sph/sph_skein.h"
 #include "miner.h"
+#include "cuda_helper.h"
 }

-#include <stdint.h>
-
 // aus cpu-miner.c
 extern int device_map[8];

@ -33,9 +31,9 @@ extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounc
				@@ -33,9 +31,9 @@ extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounc
 extern void quark_skein512_cpu_init(int thr_id, int threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);

 extern void jackpot_compactTest_cpu_init(int thr_id, int threads);
 extern void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, 
@ -121,7 +119,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
				@@ -121,7 +119,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 		quark_groestl512_cpu_init(thr_id, throughput);
 		quark_jh512_cpu_init(thr_id, throughput);
 		quark_skein512_cpu_init(thr_id, throughput);
-		quark_check_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);
 		cudaMalloc(&d_jackpotNonces[thr_id], sizeof(uint32_t)*throughput*2);
 		cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput*2);
 		cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput*2);
@ -134,7 +132,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
				@@ -134,7 +132,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);

 	jackpot_keccak512_cpu_setBlock((void*)endiandata, 80);
-	quark_check_cpu_setTarget(ptarget);
+	cuda_check_cpu_setTarget(ptarget);

 	do {
 		int order = 0;
@ -214,14 +212,15 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
				@@ -214,14 +212,15 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 		}

 		// Scan nach Gewinner Hashes auf der GPU
-		uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
+			unsigned int rounds;
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);

 			// diese jackpothash Funktion gibt die Zahl der Runden zurück
-			unsigned int rounds = jackpothash(vhash64, endiandata);
+			rounds = jackpothash(vhash64, endiandata);

 			if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {

--- a/bitslice_transformations_quad.cu
+++ b/bitslice_transformations_quad.cu
@ -1,5 +1,4 @@
				@@ -1,5 +1,4 @@
-
-#if __CUDA_ARCH__ < 300
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
 /**
 * __shfl() returns the value of var held by the thread whose ID is given by srcLane.
 * If srcLane is outside the range 0..width-1, the thread's own value of var is returned.
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -175,7 +175,8 @@ copy "$(CudaToolkitBinDir)\cudart32*.dll" "$(OutDir)"</Command>
				@@ -175,7 +175,8 @@ copy "$(CudaToolkitBinDir)\cudart32*.dll" "$(OutDir)"</Command>
      <PtxAsOptionV>true</PtxAsOptionV>
      <Keep>false</Keep>
      <CodeGeneration>compute_50,sm_50</CodeGeneration>
-      <Defines></Defines>
+      <Defines>
+      </Defines>
    </CudaCompile>
    <CudaLink>
      <GPUDebugInfo>false</GPUDebugInfo>
@ -312,6 +313,9 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
				@@ -312,6 +313,9 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
    <ClInclude Include="uint256.h" />
  </ItemGroup>
  <ItemGroup>
+    <CudaCompile Include="bitslice_transformations_quad.cu">
+      <ExcludedFromBuild>true</ExcludedFromBuild>
+    </CudaCompile>
    <CudaCompile Include="cuda_fugue256.cu">
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalOptions)</AdditionalOptions>
@ -336,6 +340,9 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
				@@ -336,6 +340,9 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(AdditionalOptions)</AdditionalOptions>
    </CudaCompile>
+    <CudaCompile Include="groestl_functions_quad.cu">
+      <ExcludedFromBuild>true</ExcludedFromBuild>
+    </CudaCompile>
    <CudaCompile Include="heavy\cuda_blake512.cu">
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalOptions)</AdditionalOptions>
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -391,5 +391,11 @@
				@@ -391,5 +391,11 @@
    <CudaCompile Include="x15\cuda_x15_whirlpool.cu">
      <Filter>Source Files\CUDA\x15</Filter>
    </CudaCompile>
+    <CudaCompile Include="groestl_functions_quad.cu">
+      <Filter>Source Files\CUDA</Filter>
+    </CudaCompile>
+    <CudaCompile Include="bitslice_transformations_quad.cu">
+      <Filter>Source Files\CUDA</Filter>
+    </CudaCompile>
  </ItemGroup>
 </Project>
--- a/cuda_fugue256.cu
+++ b/cuda_fugue256.cu
@ -1,12 +1,11 @@
				@@ -1,12 +1,11 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

 #include "sph/sph_fugue.h"

+#include "cuda_helper.h"
+#include <host_defines.h>
+
 #define USE_SHARED 1

 // aus cpu-miner.c
@ -15,14 +14,6 @@ extern int device_map[8];
				@@ -15,14 +14,6 @@ extern int device_map[8];
 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
-
-// schon in sph_fugue.h definiert
-//#define SPH_C32(x)	((uint32_t)(x ## U))
-
 uint32_t *d_fugue256_hashoutput[8];
 uint32_t *d_resultNonce[8];

--- a/cuda_groestlcoin.cu
+++ b/cuda_groestlcoin.cu
@ -1,23 +1,17 @@
				@@ -1,23 +1,17 @@
 // Auf Groestlcoin spezialisierte Version von Groestl inkl. Bitslice

-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

+#include "cuda_helper.h"
+#include <host_defines.h>
+
 // aus cpu-miner.c
 extern int device_map[8];

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned char uint8_t;
-typedef unsigned short uint16_t;
-typedef unsigned int uint32_t;
-
 // diese Struktur wird in der Init Funktion angefordert
 static cudaDeviceProp props[8];

@ -31,10 +25,10 @@ __constant__ uint32_t groestlcoin_gpu_msg[32];
				@@ -31,10 +25,10 @@ __constant__ uint32_t groestlcoin_gpu_msg[32];
 #include "groestl_functions_quad.cu"
 #include "bitslice_transformations_quad.cu"

-#define SWAB32(x)        ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
+#define SWAB32(x) cuda_swab32(x)

-__global__ void __launch_bounds__(256, 4)
- groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resNounce)
+__global__ __launch_bounds__(256, 4)
+void groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resNounce)
 {
    // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
    int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4;
--- a/cuda_helper.h
+++ b/cuda_helper.h
@ -1,33 +1,78 @@
				@@ -1,33 +1,78 @@
 #ifndef CUDA_HELPER_H
 #define CUDA_HELPER_H

+#include <cuda.h>
 #include <cuda_runtime.h>

-static __device__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
-{
-#if __CUDA_ARCH__ >= 130
-    return __double_as_longlong(__hiloint2double(HI, LO));
+#if defined(_MSC_VER)
+/* reduce warnings */
+#include <device_functions.h>
+#include <device_launch_parameters.h>
+#endif
+
+#include <stdint.h>
+
+extern __device__ __device_builtin__ void __syncthreads(void);
+
+#ifndef __CUDA_ARCH__
+// define blockDim and threadIdx for host
+extern const dim3 blockDim;
+extern const uint3 threadIdx;
+#endif
+
+#ifndef SPH_C32
+#define SPH_C32(x) ((uint32_t)(x ## U))
+#endif
+
+#ifndef SPH_C64
+#define SPH_C64(x) ((uint64_t)(x ## ULL))
+#endif
+
+#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
+
+#if __CUDA_ARCH__ < 350
+// Kepler (Compute 3.0)
+#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
 #else
-	return (unsigned long long)LO | (((unsigned long long)HI) << 32);
+// Kepler (Compute 3.5, 5.0)
+#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
 #endif
-}

-// das Hi Word aus einem 64 Bit Typen extrahieren
-static __device__ uint32_t HIWORD(const uint64_t &x) {
+__device__ __forceinline__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
+{
 #if __CUDA_ARCH__ >= 130
-	return (uint32_t)__double2hiint(__longlong_as_double(x));
+	return __double_as_longlong(__hiloint2double(HI, LO));
 #else
-	return (uint32_t)(x >> 32);
+	return (unsigned long long)LO | (((unsigned long long)HI) << 32);
 #endif
 }

 // das Hi Word in einem 64 Bit Typen ersetzen
-static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
+__device__ __forceinline__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
 	return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL);
 }

+// das Lo Word in einem 64 Bit Typen ersetzen
+__device__ __forceinline__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
+	return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
+}
+
+// Endian Drehung für 32 Bit Typen
+#ifdef __CUDA_ARCH__
+__device__ __forceinline__ uint32_t cuda_swab32(uint32_t x)
+{
+	/* device */
+	return __byte_perm(x, x, 0x0123);
+}
+#else
+	/* host */
+	#define cuda_swab32(x) \
+	((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \
+		(((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
+#endif
+
 // das Lo Word aus einem 64 Bit Typen extrahieren
-static __device__ uint32_t LOWORD(const uint64_t &x) {
+__device__ __forceinline__ uint32_t _LOWORD(const uint64_t &x) {
 #if __CUDA_ARCH__ >= 130
 	return (uint32_t)__double2loint(__longlong_as_double(x));
 #else
@ -35,34 +80,51 @@ static __device__ uint32_t LOWORD(const uint64_t &x) {
				@@ -35,34 +80,51 @@ static __device__ uint32_t LOWORD(const uint64_t &x) {
 #endif
 }

-// das Lo Word in einem 64 Bit Typen ersetzen
-static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
-	return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
+// das Hi Word aus einem 64 Bit Typen extrahieren
+__device__ __forceinline__ uint32_t _HIWORD(const uint64_t &x) {
+#if __CUDA_ARCH__ >= 130
+	return (uint32_t)__double2hiint(__longlong_as_double(x));
+#else
+	return (uint32_t)(x >> 32);
+#endif
 }

-// Endian Drehung für 32 Bit Typen
-static __device__ uint32_t cuda_swab32(uint32_t x)
+#ifdef __CUDA_ARCH__
+__device__ __forceinline__ uint64_t cuda_swab64(uint64_t x)
 {
-	return __byte_perm(x, x, 0x0123);
-}
+	// Input:       77665544 33221100
+	// Output:      00112233 44556677
+	uint64_t temp[2];
+	temp[0] = __byte_perm(_HIWORD(x), 0, 0x0123);
+	temp[1] = __byte_perm(_LOWORD(x), 0, 0x0123);

-// Endian Drehung für 64 Bit Typen
-static __device__ uint64_t cuda_swab64(uint64_t x) {
-    return MAKE_ULONGLONG(cuda_swab32(HIWORD(x)), cuda_swab32(LOWORD(x)));
+	return temp[0] | (temp[1]<<32);
 }
+#else
+	/* host */
+	#define cuda_swab64(x) \
+		((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \
+			(((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \
+			(((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \
+			(((uint64_t)(x) & 0x000000ff00000000ULL) >>  8) | \
+			(((uint64_t)(x) & 0x00000000ff000000ULL) <<  8) | \
+			(((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \
+			(((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \
+			(((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
+#endif

 // diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
 #if __CUDA_ARCH__ >= 350
-__forceinline__ __device__ uint64_t ROTR64(const uint64_t value, const int offset) {
-    uint2 result;
-    if(offset < 32) {
-        asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-        asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-    } else {
-        asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-        asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-    }
-    return  __double_as_longlong(__hiloint2double(result.y, result.x));
+__device__ __forceinline__ uint64_t ROTR64(const uint64_t value, const int offset) {
+	uint2 result;
+	if(offset < 32) {
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+	} else {
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+	}
+	return __double_as_longlong(__hiloint2double(result.y, result.x));
 }
 #else
 #define ROTR64(x, n)        (((x) >> (n)) | ((x) << (64 - (n))))
@ -70,16 +132,16 @@ __forceinline__ __device__ uint64_t ROTR64(const uint64_t value, const int offse
				@@ -70,16 +132,16 @@ __forceinline__ __device__ uint64_t ROTR64(const uint64_t value, const int offse

 // diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
 #if __CUDA_ARCH__ >= 350
-__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
-    uint2 result;
-    if(offset >= 32) {
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-    } else {
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-    }
-    return  __double_as_longlong(__hiloint2double(result.y, result.x));
+__device__ __forceinline__ uint64_t ROTL64(const uint64_t value, const int offset) {
+	uint2 result;
+	if(offset >= 32) {
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+	} else {
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
+		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
+	}
+	return  __double_as_longlong(__hiloint2double(result.y, result.x));
 }
 #else
 #define ROTL64(x, n)        (((x) << (n)) | ((x) >> (64 - (n))))
--- a/cuda_myriadgroestl.cu
+++ b/cuda_myriadgroestl.cu
@ -1,23 +1,16 @@
				@@ -1,23 +1,16 @@
 // Auf Myriadcoin spezialisierte Version von Groestl inkl. Bitslice

-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

+#include "cuda_helper.h"
+
 // aus cpu-miner.c
 extern int device_map[8];

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned char uint8_t;
-typedef unsigned short uint16_t;
-typedef unsigned int uint32_t;
-
 // diese Struktur wird in der Init Funktion angefordert
 static cudaDeviceProp props[8];

--- a/cuda_nist5.cu
+++ b/cuda_nist5.cu
@ -1,4 +1,3 @@
				@@ -1,4 +1,3 @@
-
 extern "C"
 {
 #include "sph/sph_blake.h"
@ -7,10 +6,9 @@ extern "C"
				@@ -7,10 +6,9 @@ extern "C"
 #include "sph/sph_jh.h"
 #include "sph/sph_keccak.h"
 #include "miner.h"
+#include "cuda_helper.h"
 }

-#include <stdint.h>
-
 // aus cpu-miner.c
 extern int device_map[8];

@ -33,12 +31,12 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN
				@@ -33,12 +31,12 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN
 extern void quark_skein512_cpu_init(int thr_id, int threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);

 // Original nist5hash Funktion aus einem miner Quelltext
-inline void nist5hash(void *state, const void *input)
+extern "C" void nist5hash(void *state, const void *input)
 {
    sph_blake512_context ctx_blake;
    sph_groestl512_context ctx_groestl;
@ -104,7 +102,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
				@@ -104,7 +102,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
 		quark_jh512_cpu_init(thr_id, throughput);
 		quark_keccak512_cpu_init(thr_id, throughput);
 		quark_skein512_cpu_init(thr_id, throughput);
-		quark_check_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}

@ -113,28 +111,20 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
				@@ -113,28 +111,20 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);

 	quark_blake512_cpu_setBlock_80((void*)endiandata);
-	quark_check_cpu_setTarget(ptarget);
+	cuda_check_cpu_setTarget(ptarget);

 	do {
 		int order = 0;

-		// erstes Blake512 Hash mit CUDA
+		// Hash with CUDA
 		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-
-		// das ist der unbedingte Branch für Groestl512
 		quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-
-		// das ist der unbedingte Branch für JH512
 		quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-
-		// das ist der unbedingte Branch für Keccak512
 		quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-
-		// das ist der unbedingte Branch für Skein512
 		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

 		// Scan nach Gewinner Hashes auf der GPU
-		uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
--- a/groestl_functions_quad.cu
+++ b/groestl_functions_quad.cu
@ -1,3 +1,4 @@
				@@ -1,3 +1,4 @@
+#include "cuda_helper.h"

 __device__ __forceinline__ void G256_Mul2(uint32_t *regs)
 {
--- a/heavy/cuda_blake512.cu
+++ b/heavy/cuda_blake512.cu
@ -1,14 +1,7 @@
				@@ -1,14 +1,7 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
+#include "cuda_helper.h"

 // globaler Speicher für alle HeftyHashes aller Threads
 extern uint32_t *d_heftyHashes[8];
@ -20,7 +13,6 @@ uint32_t *d_hash5output[8];
				@@ -20,7 +13,6 @@ uint32_t *d_hash5output[8];
 // die Message (112 bzw. 116 Bytes) mit Padding zur Berechnung auf der GPU
 __constant__ uint64_t c_PaddedMessage[16]; // padded message (80/84+32 bytes + padding)

-#include "cuda_helper.h"

 // ---------------------------- BEGIN CUDA blake512 functions ------------------------------------

@ -46,21 +38,9 @@ const uint8_t host_sigma[16][16] =
				@@ -46,21 +38,9 @@ const uint8_t host_sigma[16][16] =
  { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
 };

-// Diese Makros besser nur für Compile Time Konstanten verwenden. Sie sind langsam.
-#define SWAP32(x) \
-    ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u)   | \
-      (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
-
-// Diese Makros besser nur für Compile Time Konstanten verwenden. Sie sind langsam.
-#define SWAP64(x) \
-    ((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \
-                (((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \
-                (((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \
-                (((uint64_t)(x) & 0x000000ff00000000ULL) >>  8) | \
-                (((uint64_t)(x) & 0x00000000ff000000ULL) <<  8) | \
-                (((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \
-                (((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \
-                (((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
+/* in cuda_helper */
+#define SWAP32(x) cuda_swab32(x)
+#define SWAP64(x) cuda_swab64(x)

 __constant__ uint64_t c_SecondRound[15];

--- a/heavy/cuda_combine.cu
+++ b/heavy/cuda_combine.cu
@ -1,9 +1,4 @@
				@@ -1,9 +1,4 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
-// Folgende Definitionen später durch header ersetzen
-typedef unsigned int uint32_t;
+#include "cuda_helper.h"

 // globaler Speicher für unsere Ergebnisse
 uint32_t *d_hashoutput[8];
--- a/heavy/cuda_groestl512.cu
+++ b/heavy/cuda_groestl512.cu
@ -1,14 +1,7 @@
				@@ -1,14 +1,7 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
+#include "cuda_helper.h"

 // globaler Speicher für alle HeftyHashes aller Threads
 extern uint32_t *d_heftyHashes[8];
@ -802,7 +795,6 @@ __host__ void groestl512_cpu_setBlock(void *data, int len)
				@@ -802,7 +795,6 @@ __host__ void groestl512_cpu_setBlock(void *data, int len)
 	cudaMemcpyToSymbol(	groestl_gpu_msg,
 						msgBlock,
 						128);
-	
 	BLOCKSIZE = len;
 }

--- a/heavy/cuda_hefty1.cu
+++ b/heavy/cuda_hefty1.cu
@ -1,10 +1,9 @@
				@@ -1,10 +1,9 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

+#include "cuda_helper.h"
+#include <device_functions.h>
+
 #define USE_SHARED 1

 // aus cpu-miner.c
@ -13,11 +12,6 @@ extern int device_map[8];
				@@ -13,11 +12,6 @@ extern int device_map[8];
 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned int uint32_t;
-typedef unsigned char uint8_t;
-typedef unsigned short uint16_t;
-
 // diese Struktur wird in der Init Funktion angefordert
 static cudaDeviceProp props[8];

--- a/heavy/cuda_keccak512.cu
+++ b/heavy/cuda_keccak512.cu
@ -1,14 +1,7 @@
				@@ -1,14 +1,7 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
+#include "cuda_helper.h"

 // globaler Speicher für alle HeftyHashes aller Threads
 extern uint32_t *d_heftyHashes[8];
@ -81,8 +74,8 @@ keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_const
				@@ -81,8 +74,8 @@ keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_const
    uint64_t t[5], u[5], v, w;

    /* absorb input */
-#pragma unroll 9
-    for (i = 0; i < 72 / 8; i++, in += 2)
+    #pragma unroll 9
+    for (i = 0; i < 9 /* 72/8 */; i++, in += 2)
        s[i] ^= U32TO64_LE(in);
    
    for (i = 0; i < 24; i++) {
--- a/heavy/cuda_sha256.cu
+++ b/heavy/cuda_sha256.cu
@ -1,12 +1,7 @@
				@@ -1,12 +1,7 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned int uint32_t;
+#include "cuda_helper.h"

 // globaler Speicher für alle HeftyHashes aller Threads
 extern uint32_t *d_heftyHashes[8];
--- a/heavy/heavy.cu
+++ b/heavy/heavy.cu
@ -1,7 +1,3 @@
				@@ -1,7 +1,3 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>
 #include <string.h>
@ -34,6 +30,8 @@
				@@ -34,6 +30,8 @@
 #include "heavy/cuda_blake512.h"
 #include "heavy/cuda_combine.h"

+#include "cuda_helper.h"
+
 extern uint32_t *d_hash2output[8];
 extern uint32_t *d_hash3output[8];
 extern uint32_t *d_hash4output[8];
--- a/miner.h
+++ b/miner.h
@ -355,6 +355,7 @@ void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
				@@ -355,6 +355,7 @@ void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
 void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
 void groestlhash(void *state, const void *input);
 void myriadhash(void *state, const void *input);
+void nist5hash(void *state, const void *input);
 void quarkhash(void *state, const void *input);
 void x11hash(void *output, const void *input);
 void x13hash(void *output, const void *input);
--- a/quark/animecoin.cu
+++ b/quark/animecoin.cu
@ -1,291 +1,292 @@
				@@ -1,291 +1,292 @@
-
-extern "C"
-{
-#include "sph/sph_blake.h"
-#include "sph/sph_bmw.h"
-#include "sph/sph_groestl.h"
-#include "sph/sph_skein.h"
-#include "sph/sph_jh.h"
-#include "sph/sph_keccak.h"
-#include "miner.h"
-}
-
-#include <stdint.h>
-
-// aus cpu-miner.c
-extern int device_map[8];
-
-// Speicher für Input/Output der verketteten Hashfunktionen
-static uint32_t *d_hash[8];
-
-// Speicher zur Generierung der Noncevektoren für die bedingten Hashes
-static uint32_t *d_animeNonces[8];
-static uint32_t *d_branch1Nonces[8];
-static uint32_t *d_branch2Nonces[8];
-static uint32_t *d_branch3Nonces[8];
-
-extern void quark_blake512_cpu_init(int thr_id, int threads);
-extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
-
-extern void quark_bmw512_cpu_init(int thr_id, int threads);
-extern void quark_bmw512_cpu_setBlock_80(void *pdata);
-extern void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order);
-extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order);
-
-extern void quark_groestl512_cpu_init(int thr_id, int threads);
-extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
-extern void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
-
-extern void quark_skein512_cpu_init(int thr_id, int threads);
-extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
-
-extern void quark_keccak512_cpu_init(int thr_id, int threads);
-extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
-
-extern void quark_jh512_cpu_init(int thr_id, int threads);
-extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
-
-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
-
-extern void quark_compactTest_cpu_init(int thr_id, int threads);
-extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
-											uint32_t *d_nonces1, size_t *nrm1,
-											uint32_t *d_nonces2, size_t *nrm2,
-											int order);
-extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
-											uint32_t *d_nonces1, size_t *nrm1,
-											int order);
-
-// Original Quarkhash Funktion aus einem miner Quelltext
-inline void animehash(void *state, const void *input)
-{
-    sph_blake512_context ctx_blake;
-    sph_bmw512_context ctx_bmw;
-    sph_groestl512_context ctx_groestl;
-    sph_jh512_context ctx_jh;
-    sph_keccak512_context ctx_keccak;
-    sph_skein512_context ctx_skein;
-    
-    unsigned char hash[64];
-
-    sph_bmw512_init(&ctx_bmw);
-    // ZBMW;
-    sph_bmw512 (&ctx_bmw, (const void*) input, 80);
-    sph_bmw512_close(&ctx_bmw, (void*) hash);
-
-    sph_blake512_init(&ctx_blake);
-    // ZBLAKE;
-    sph_blake512 (&ctx_blake, hash, 64);
-    sph_blake512_close(&ctx_blake, (void*) hash);
-    
-    if (hash[0] & 0x8)
-    {
-        sph_groestl512_init(&ctx_groestl);
-        // ZGROESTL;
-        sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
-        sph_groestl512_close(&ctx_groestl, (void*) hash);
-    }
-    else
-    {
-        sph_skein512_init(&ctx_skein);
-        // ZSKEIN;
-        sph_skein512 (&ctx_skein, (const void*) hash, 64);
-        sph_skein512_close(&ctx_skein, (void*) hash);
-    }
-    
-    sph_groestl512_init(&ctx_groestl);
-    // ZGROESTL;
-    sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
-    sph_groestl512_close(&ctx_groestl, (void*) hash);
-
-    sph_jh512_init(&ctx_jh);
-    // ZJH;
-    sph_jh512 (&ctx_jh, (const void*) hash, 64);
-    sph_jh512_close(&ctx_jh, (void*) hash);
-
-    if (hash[0] & 0x8)
-    {
-        sph_blake512_init(&ctx_blake);
-        // ZBLAKE;
-        sph_blake512 (&ctx_blake, (const void*) hash, 64);
-        sph_blake512_close(&ctx_blake, (void*) hash);
-    }
-    else
-    {
-        sph_bmw512_init(&ctx_bmw);
-        // ZBMW;
-        sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
-        sph_bmw512_close(&ctx_bmw, (void*) hash);
-    }
-
-    sph_keccak512_init(&ctx_keccak);
-    // ZKECCAK;
-    sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
-    sph_keccak512_close(&ctx_keccak, (void*) hash);
-
-    sph_skein512_init(&ctx_skein);
-    // SKEIN;
-    sph_skein512 (&ctx_skein, (const void*) hash, 64);
-    sph_skein512_close(&ctx_skein, (void*) hash);
-
-    if (hash[0] & 0x8)
-    {
-        sph_keccak512_init(&ctx_keccak);
-        // ZKECCAK;
-        sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
-        sph_keccak512_close(&ctx_keccak, (void*) hash);
-    }
-    else
-    {
-        sph_jh512_init(&ctx_jh);
-        // ZJH;
-        sph_jh512 (&ctx_jh, (const void*) hash, 64);
-        sph_jh512_close(&ctx_jh, (void*) hash);
-    }
-
-    memcpy(state, hash, 32);
-}
-
-
-struct HashPredicate
-{    
-    HashPredicate(uint32_t *hashes, uint32_t startNonce) :
-        m_hashes(hashes),
-        m_startNonce(startNonce)
-        { }
-
-    __device__
-    bool operator()(const uint32_t x)
-    {
-        uint32_t *hash = &m_hashes[(x - m_startNonce)*16];
-        return hash[0] & 0x8;
-    }
-
-    uint32_t *m_hashes;
-    uint32_t  m_startNonce;
-};
-
-extern bool opt_benchmark;
-
-extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
-    const uint32_t *ptarget, uint32_t max_nonce,
-    unsigned long *hashes_done)
-{
-	const uint32_t first_nonce = pdata[19];
-
-	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x00000f;
-
-	const uint32_t Htarg = ptarget[7];
-
-	const int throughput = 256*2048; // 100;
-
-	static bool init[8] = {0,0,0,0,0,0,0,0};
-	if (!init[thr_id])
-	{
-		cudaSetDevice(device_map[thr_id]);
-
-		// Konstanten kopieren, Speicher belegen
-		cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
-		quark_blake512_cpu_init(thr_id, throughput);
-		quark_groestl512_cpu_init(thr_id, throughput);
-		quark_skein512_cpu_init(thr_id, throughput);
-		quark_bmw512_cpu_init(thr_id, throughput);
-		quark_keccak512_cpu_init(thr_id, throughput);
-		quark_jh512_cpu_init(thr_id, throughput);
-		quark_check_cpu_init(thr_id, throughput);
-		quark_compactTest_cpu_init(thr_id, throughput);
-		cudaMalloc(&d_animeNonces[thr_id], sizeof(uint32_t)*throughput);
-		cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
-		cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
-		cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
-		init[thr_id] = true;
-	}
-
-	uint32_t endiandata[20];
-	for (int k=0; k < 20; k++)
-		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
-
-	quark_bmw512_cpu_setBlock_80((void*)endiandata);
-	quark_check_cpu_setTarget(ptarget);
-
-	do {
-		int order = 0;
-		size_t nrm1=0, nrm2=0, nrm3=0;
-
-		// erstes BMW512 Hash mit CUDA
-		quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-
-		// das ist der unbedingte Branch für Blake512
-		quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-
-		quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL,
-				d_branch3Nonces[thr_id], &nrm3,
-				order++);
-		
-		// nur den Skein Branch weiterverfolgen
-		quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
-
-		// das ist der unbedingte Branch für Groestl512
-		quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
-
-		// das ist der unbedingte Branch für JH512
-		quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
-
-		// quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8)
-		quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
-			d_branch1Nonces[thr_id], &nrm1,
-			d_branch2Nonces[thr_id], &nrm2,
-			order++);
-
-		// das ist der bedingte Branch für Blake512
-		quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
-
-		// das ist der bedingte Branch für Bmw512
-		quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
-
-		// das ist der unbedingte Branch für Keccak512
-		quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
-
-		// das ist der unbedingte Branch für Skein512
-		quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
-
-		// quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8)
-		quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
-			d_branch1Nonces[thr_id], &nrm1,
-			d_branch2Nonces[thr_id], &nrm2,
-			order++);
-
-		// das ist der bedingte Branch für Keccak512
-		quark_keccak512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
-
-		// das ist der bedingte Branch für JH512
-		quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
-
-		// Scan nach Gewinner Hashes auf der GPU
-		uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
-		if  (foundNonce != 0xffffffff)
-		{
-			uint32_t vhash64[8];
-			be32enc(&endiandata[19], foundNonce);
-			animehash(vhash64, endiandata);
-
-			if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {
-
-				pdata[19] = foundNonce;
-				*hashes_done = (foundNonce - first_nonce + 1)/2;
-				return 1;
-			} else {
-				applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce);
-			}
-		}
-
-		pdata[19] += throughput;
-
-	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
-
-	*hashes_done = (pdata[19] - first_nonce + 1)/2;
-	return 0;
-}
+extern "C"
+{
+#include "sph/sph_blake.h"
+#include "sph/sph_bmw.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_skein.h"
+#include "sph/sph_jh.h"
+#include "sph/sph_keccak.h"
+#include "miner.h"
+#include "cuda_helper.h"
+}
+
+// aus cpu-miner.c
+extern int device_map[8];
+
+// Speicher für Input/Output der verketteten Hashfunktionen
+static uint32_t *d_hash[8];
+
+// Speicher zur Generierung der Noncevektoren für die bedingten Hashes
+static uint32_t *d_animeNonces[8];
+static uint32_t *d_branch1Nonces[8];
+static uint32_t *d_branch2Nonces[8];
+static uint32_t *d_branch3Nonces[8];
+
+extern void quark_blake512_cpu_init(int thr_id, int threads);
+extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void quark_bmw512_cpu_init(int thr_id, int threads);
+extern void quark_bmw512_cpu_setBlock_80(void *pdata);
+extern void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order);
+extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order);
+
+extern void quark_groestl512_cpu_init(int thr_id, int threads);
+extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void quark_skein512_cpu_init(int thr_id, int threads);
+extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void quark_keccak512_cpu_init(int thr_id, int threads);
+extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void quark_jh512_cpu_init(int thr_id, int threads);
+extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+
+extern void quark_compactTest_cpu_init(int thr_id, int threads);
+extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
+											uint32_t *d_nonces1, size_t *nrm1,
+											uint32_t *d_nonces2, size_t *nrm2,
+											int order);
+extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
+											uint32_t *d_nonces1, size_t *nrm1,
+											int order);
+
+// Original Quarkhash Funktion aus einem miner Quelltext
+inline void animehash(void *state, const void *input)
+{
+    sph_blake512_context ctx_blake;
+    sph_bmw512_context ctx_bmw;
+    sph_groestl512_context ctx_groestl;
+    sph_jh512_context ctx_jh;
+    sph_keccak512_context ctx_keccak;
+    sph_skein512_context ctx_skein;
+    
+    unsigned char hash[64];
+
+    sph_bmw512_init(&ctx_bmw);
+    // ZBMW;
+    sph_bmw512 (&ctx_bmw, (const void*) input, 80);
+    sph_bmw512_close(&ctx_bmw, (void*) hash);
+
+    sph_blake512_init(&ctx_blake);
+    // ZBLAKE;
+    sph_blake512 (&ctx_blake, hash, 64);
+    sph_blake512_close(&ctx_blake, (void*) hash);
+    
+    if (hash[0] & 0x8)
+    {
+        sph_groestl512_init(&ctx_groestl);
+        // ZGROESTL;
+        sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
+        sph_groestl512_close(&ctx_groestl, (void*) hash);
+    }
+    else
+    {
+        sph_skein512_init(&ctx_skein);
+        // ZSKEIN;
+        sph_skein512 (&ctx_skein, (const void*) hash, 64);
+        sph_skein512_close(&ctx_skein, (void*) hash);
+    }
+    
+    sph_groestl512_init(&ctx_groestl);
+    // ZGROESTL;
+    sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
+    sph_groestl512_close(&ctx_groestl, (void*) hash);
+
+    sph_jh512_init(&ctx_jh);
+    // ZJH;
+    sph_jh512 (&ctx_jh, (const void*) hash, 64);
+    sph_jh512_close(&ctx_jh, (void*) hash);
+
+    if (hash[0] & 0x8)
+    {
+        sph_blake512_init(&ctx_blake);
+        // ZBLAKE;
+        sph_blake512 (&ctx_blake, (const void*) hash, 64);
+        sph_blake512_close(&ctx_blake, (void*) hash);
+    }
+    else
+    {
+        sph_bmw512_init(&ctx_bmw);
+        // ZBMW;
+        sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
+        sph_bmw512_close(&ctx_bmw, (void*) hash);
+    }
+
+    sph_keccak512_init(&ctx_keccak);
+    // ZKECCAK;
+    sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
+    sph_keccak512_close(&ctx_keccak, (void*) hash);
+
+    sph_skein512_init(&ctx_skein);
+    // SKEIN;
+    sph_skein512 (&ctx_skein, (const void*) hash, 64);
+    sph_skein512_close(&ctx_skein, (void*) hash);
+
+    if (hash[0] & 0x8)
+    {
+        sph_keccak512_init(&ctx_keccak);
+        // ZKECCAK;
+        sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
+        sph_keccak512_close(&ctx_keccak, (void*) hash);
+    }
+    else
+    {
+        sph_jh512_init(&ctx_jh);
+        // ZJH;
+        sph_jh512 (&ctx_jh, (const void*) hash, 64);
+        sph_jh512_close(&ctx_jh, (void*) hash);
+    }
+
+    memcpy(state, hash, 32);
+}
+
+
+struct HashPredicate
+{    
+    HashPredicate(uint32_t *hashes, uint32_t startNonce) :
+        m_hashes(hashes),
+        m_startNonce(startNonce)
+        { }
+
+    __device__
+    bool operator()(const uint32_t x)
+    {
+        uint32_t *hash = &m_hashes[(x - m_startNonce)*16];
+        return hash[0] & 0x8;
+    }
+
+    uint32_t *m_hashes;
+    uint32_t  m_startNonce;
+};
+
+extern bool opt_benchmark;
+
+extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
+    const uint32_t *ptarget, uint32_t max_nonce,
+    unsigned long *hashes_done)
+{
+	const uint32_t first_nonce = pdata[19];
+
+	if (opt_benchmark)
+		((uint32_t*)ptarget)[7] = 0x00000f;
+
+	const uint32_t Htarg = ptarget[7];
+
+	const int throughput = 256*2048; // 100;
+
+	static bool init[8] = {0,0,0,0,0,0,0,0};
+	if (!init[thr_id])
+	{
+		cudaSetDevice(device_map[thr_id]);
+
+		// Konstanten kopieren, Speicher belegen
+		cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
+
+		quark_blake512_cpu_init(thr_id, throughput);
+		quark_groestl512_cpu_init(thr_id, throughput);
+		quark_skein512_cpu_init(thr_id, throughput);
+		quark_bmw512_cpu_init(thr_id, throughput);
+		quark_keccak512_cpu_init(thr_id, throughput);
+		quark_jh512_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);
+		quark_compactTest_cpu_init(thr_id, throughput);
+
+		cudaMalloc(&d_animeNonces[thr_id], sizeof(uint32_t)*throughput);
+		cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
+		cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
+		cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
+
+		init[thr_id] = true;
+	}
+
+	uint32_t endiandata[20];
+	for (int k=0; k < 20; k++)
+		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
+
+	quark_bmw512_cpu_setBlock_80((void*)endiandata);
+	cuda_check_cpu_setTarget(ptarget);
+
+	do {
+		int order = 0;
+		size_t nrm1=0, nrm2=0, nrm3=0;
+
+		// erstes BMW512 Hash mit CUDA
+		quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+
+		// das ist der unbedingte Branch für Blake512
+		quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+
+		quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL,
+				d_branch3Nonces[thr_id], &nrm3,
+				order++);
+		
+		// nur den Skein Branch weiterverfolgen
+		quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+
+		// das ist der unbedingte Branch für Groestl512
+		quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+
+		// das ist der unbedingte Branch für JH512
+		quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+
+		// quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8)
+		quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
+			d_branch1Nonces[thr_id], &nrm1,
+			d_branch2Nonces[thr_id], &nrm2,
+			order++);
+
+		// das ist der bedingte Branch für Blake512
+		quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
+
+		// das ist der bedingte Branch für Bmw512
+		quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
+
+		// das ist der unbedingte Branch für Keccak512
+		quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+
+		// das ist der unbedingte Branch für Skein512
+		quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+
+		// quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8)
+		quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
+			d_branch1Nonces[thr_id], &nrm1,
+			d_branch2Nonces[thr_id], &nrm2,
+			order++);
+
+		// das ist der bedingte Branch für Keccak512
+		quark_keccak512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
+
+		// das ist der bedingte Branch für JH512
+		quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
+
+		// Scan nach Gewinner Hashes auf der GPU
+		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+		if  (foundNonce != 0xffffffff)
+		{
+			uint32_t vhash64[8];
+			be32enc(&endiandata[19], foundNonce);
+			animehash(vhash64, endiandata);
+
+			if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {
+
+				pdata[19] = foundNonce;
+				*hashes_done = (foundNonce - first_nonce + 1)/2;
+				return 1;
+			} else {
+				applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce);
+			}
+		}
+
+		pdata[19] += throughput;
+
+	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+
+	*hashes_done = (pdata[19] - first_nonce + 1)/2;
+	return 0;
+}
--- a/quark/cuda_bmw512.cu
+++ b/quark/cuda_bmw512.cu
@ -1,140 +1,9 @@
				@@ -1,140 +1,9 @@
 #if 1

-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-
-// Endian Drehung für 32 Bit Typen
-/*
-static __device__ uint32_t cuda_swab32(uint32_t x)
-{
-    return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u)
-          | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu));
-}
-*/
-static __device__ uint32_t cuda_swab32(uint32_t x)
-{
-	return __byte_perm(x, 0, 0x0123);
-}
-
-// das Hi Word in einem 64 Bit Typen ersetzen
-static __device__ unsigned long long REPLACE_HIWORD(const unsigned long long &x, const uint32_t &y) {
-	return (x & 0xFFFFFFFFULL) | (((unsigned long long)y) << 32ULL);
-}
-
-#if 0
-// Endian Drehung für 64 Bit Typen
-static __device__ unsigned long long cuda_swab64(unsigned long long x) {
-    uint32_t h = (x >> 32);
-    uint32_t l = (x & 0xFFFFFFFFULL);
-    return (((unsigned long long)cuda_swab32(l)) << 32) | ((unsigned long long)cuda_swab32(h));
-}
-
-// das Hi Word aus einem 64 Bit Typen extrahieren
-static __device__ uint32_t HIWORD(const unsigned long long &x) {
-#if __CUDA_ARCH__ >= 130
-	return (uint32_t)__double2hiint(__longlong_as_double(x));
-#else
-	return (uint32_t)(x >> 32);
-#endif
-}
-
-// das Lo Word aus einem 64 Bit Typen extrahieren
-static __device__ uint32_t LOWORD(const unsigned long long &x) {
-#if __CUDA_ARCH__ >= 130
-	return (uint32_t)__double2loint(__longlong_as_double(x));
-#else
-	return (uint32_t)(x & 0xFFFFFFFFULL);
-#endif
-}
-
-static __device__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
-{
-#if __CUDA_ARCH__ >= 130
-    return __double_as_longlong(__hiloint2double(HI, LO));
-#else
-	return (unsigned long long)LO | (((unsigned long long)HI) << 32ULL);
-#endif
-}
-
-// das Lo Word in einem 64 Bit Typen ersetzen
-static __device__ unsigned long long REPLACE_LOWORD(const unsigned long long &x, const uint32_t &y) {
-	return (x & 0xFFFFFFFF00000000ULL) | ((unsigned long long)y);
-}
-#endif
-
-// der Versuch, einen Wrapper für einen aus 32 Bit Registern zusammengesetzten uin64_t Typen zu entferfen...
-#if 1
-typedef unsigned long long uint64_t;
-#else
-typedef class uint64
-{
-public:
-	__device__ uint64()
-	{
-	}
-	__device__ uint64(unsigned long long init)
-	{
-		val = make_uint2( LOWORD(init), HIWORD(init) );
-	}
-	__device__ uint64(uint32_t lo, uint32_t hi)
-	{
-		val = make_uint2( lo, hi );
-	}
-	__device__ const uint64 operator^(uint64 const& rhs) const
-	{
-		return uint64(val.x ^ rhs.val.x, val.y ^ rhs.val.y);
-	}
-	__device__ const uint64 operator|(uint64 const& rhs) const
-	{
-		return uint64(val.x | rhs.val.x, val.y | rhs.val.y);
-	}
-	__device__ const uint64 operator+(unsigned long long const& rhs) const
-	{
-		return *this+uint64(rhs);
-	}
-	__device__ const uint64 operator+(uint64 const& rhs) const
-	{
-		uint64 res;
-		asm ("add.cc.u32      %0, %2, %4;\n\t"
-			 "addc.cc.u32     %1, %3, %5;\n\t"
-			 : "=r"(res.val.x), "=r"(res.val.y)
-			 : "r"(    val.x), "r"(    val.y),
-			   "r"(rhs.val.x), "r"(rhs.val.y));
-		return res;
-	}
-	__device__ const uint64 operator-(uint64 const& rhs) const
-	{
-		uint64 res;
-		asm ("sub.cc.u32      %0, %2, %4;\n\t"
-			 "subc.cc.u32     %1, %3, %5;\n\t"
-			 : "=r"(res.val.x), "=r"(res.val.y)
-			 : "r"(    val.x), "r"(    val.y),
-			   "r"(rhs.val.x), "r"(rhs.val.y));
-		return res;
-	}
-	__device__ const uint64 operator<<(int n) const
-	{
-		return uint64(unsigned long long(*this)<<n);
-	}
-	__device__ const uint64 operator>>(int n) const
-	{
-		return uint64(unsigned long long(*this)>>n);
-	}
-	__device__ operator unsigned long long() const
-	{
-		return MAKE_ULONGLONG(val.x, val.y);
-	}
-	uint2 val;
-} uint64_t;
-#endif
+#include "cuda_helper.h"

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
@ -142,27 +11,9 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
				@@ -142,27 +11,9 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
 // die Message it Padding zur Berechnung auf der GPU
 __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)

-#define SPH_C64(x)    ((uint64_t)(x ## ULL))
-
 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
-#if __CUDA_ARCH__ >= 350
-__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
-    uint2 result;
-    if(offset >= 32) {
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-    } else {
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-    }
-    return  __double_as_longlong(__hiloint2double(result.y, result.x));
-}
-#else
-#define ROTL64(x, n)        (((x) << (n)) | ((x) >> (64 - (n))))
-#endif
 #define SHL(x, n)            ((x) << (n))
 #define SHR(x, n)            ((x) >> (n))

--- a/quark/cuda_checkhash.cu
+++ b/quark/cuda_checkhash.cu
@ -1,11 +1,8 @@
				@@ -1,11 +1,8 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
-#include <stdint.h>
 #include <memory.h>

+#include "cuda_helper.h"
+
 // Hash Target gegen das wir testen sollen
 __constant__ uint32_t pTarget[8];

@ -58,20 +55,20 @@ __global__ void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32
				@@ -58,20 +55,20 @@ __global__ void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32
 }

 // Setup-Funktionen
-__host__ void quark_check_cpu_init(int thr_id, int threads)
+__host__ void cuda_check_cpu_init(int thr_id, int threads)
 {
    cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t));
    cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t));
 }

 // Target Difficulty setzen
-__host__ void quark_check_cpu_setTarget(const void *ptarget)
+__host__ void cuda_check_cpu_setTarget(const void *ptarget)
 {
 	// die Message zur Berechnung auf der GPU
 	cudaMemcpyToSymbol( pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
 }

-__host__ uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
+__host__ uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
 {
 	uint32_t result = 0xffffffff;
 	cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
--- a/quark/cuda_jh512.cu
+++ b/quark/cuda_jh512.cu
@ -1,4 +1,4 @@
				@@ -1,4 +1,4 @@
-#include <stdint.h>
+#include "cuda_helper.h"

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
--- a/quark/cuda_quark_blake512.cu
+++ b/quark/cuda_quark_blake512.cu
@ -1,16 +1,11 @@
				@@ -1,16 +1,11 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

-#define USE_SHUFFLE 0
+#include "cuda_helper.h"

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
+#define ROTR(x,n) ROTR64(x,n)
+
+#define USE_SHUFFLE 0

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
@ -42,49 +37,8 @@ const uint8_t host_sigma[16][16] =
				@@ -42,49 +37,8 @@ const uint8_t host_sigma[16][16] =
  { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
 };

-// das Hi Word aus einem 64 Bit Typen extrahieren
-static __device__ uint32_t HIWORD(const uint64_t &x) {
-#if __CUDA_ARCH__ >= 130
-	return (uint32_t)__double2hiint(__longlong_as_double(x));
-#else
-	return (uint32_t)(x >> 32);
-#endif
-}
-
-// das Hi Word in einem 64 Bit Typen ersetzen
-static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
-	return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL);
-}
-
-// das Lo Word aus einem 64 Bit Typen extrahieren
-static __device__ uint32_t LOWORD(const uint64_t &x) {
-#if __CUDA_ARCH__ >= 130
-	return (uint32_t)__double2loint(__longlong_as_double(x));
-#else
-	return (uint32_t)(x & 0xFFFFFFFFULL);
-#endif
-}
-#if 0
-// das Lo Word in einem 64 Bit Typen ersetzen
-static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
-	return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
-}
-#endif
-
-__device__ __forceinline__ uint64_t SWAP64(uint64_t x)
-{
-	// Input:	77665544 33221100
-	// Output:	00112233 44556677
-	uint64_t temp[2];
-	temp[0] = __byte_perm(HIWORD(x), 0, 0x0123);
-	temp[1] = __byte_perm(LOWORD(x), 0, 0x0123);
-
-	return temp[0] | (temp[1]<<32);
-}
-
-__constant__ uint64_t c_u512[16];
-
-const uint64_t host_u512[16] =
+__device__ __constant__
+const uint64_t c_u512[16] =
 {
  0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, 
  0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
@ -96,24 +50,6 @@ const uint64_t host_u512[16] =
				@@ -96,24 +50,6 @@ const uint64_t host_u512[16] =
  0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
 };

-
-// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
-#if __CUDA_ARCH__ >= 350
-__forceinline__ __device__ uint64_t ROTR(const uint64_t value, const int offset) {
-    uint2 result;
-    if(offset < 32) {
-        asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-        asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-    } else {
-        asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-        asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-    }
-    return  __double_as_longlong(__hiloint2double(result.y, result.x));
-}
-#else
-#define ROTR(x, n)        (((x) >> (n)) | ((x) << (64 - (n))))
-#endif
-
 #define G(a,b,c,d,e)          \
    v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\
    v[d] = ROTR( v[d] ^ v[a],32);        \
@ -125,14 +61,14 @@ __forceinline__ __device__ uint64_t ROTR(const uint64_t value, const int offset)
				@@ -125,14 +61,14 @@ __forceinline__ __device__ uint64_t ROTR(const uint64_t value, const int offset)
    v[b] = ROTR( v[b] ^ v[c],11);


-__device__ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits )
+__device__ static
+void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits )
 {
    uint64_t v[16], m[16], i;

 #pragma unroll 16
-    for( i = 0; i < 16; ++i )
-    {
-        m[i] = SWAP64(block[i]);
+    for( i = 0; i < 16; ++i ) {
+        m[i] = cuda_swab64(block[i]);
    }

 #pragma unroll 8
@ -169,24 +105,8 @@ __device__ void quark_blake512_compress( uint64_t *h, const uint64_t *block, con
				@@ -169,24 +105,8 @@ __device__ void quark_blake512_compress( uint64_t *h, const uint64_t *block, con
    for( i = 0; i < 16; ++i )  h[i % 8] ^= v[i];
 }

-// Endian Drehung für 32 Bit Typen
-
-static __device__ uint32_t cuda_swab32(uint32_t x)
-{
-	return __byte_perm(x, 0, 0x0123);
-}
-
-/*
-// Endian Drehung für 64 Bit Typen
-static __device__ uint64_t cuda_swab64(uint64_t x) {
-    uint32_t h = (x >> 32);
-    uint32_t l = (x & 0xFFFFFFFFULL);
-    return (((uint64_t)cuda_swab32(l)) << 32) | ((uint64_t)cuda_swab32(h));
-}
-*/
-
-static __constant__ uint64_t d_constMem[8];
-static const uint64_t h_constMem[8] = {
+__device__ __constant__
+static const uint64_t d_constMem[8] = {
 	0x6a09e667f3bcc908ULL,
 	0xbb67ae8584caa73bULL,
 	0x3c6ef372fe94f82bULL,
@ -197,8 +117,8 @@ static const uint64_t h_constMem[8] = {
				@@ -197,8 +117,8 @@ static const uint64_t h_constMem[8] = {
 	0x5be0cd19137e2179ULL };

 // Hash-Padding
-static __constant__ uint64_t d_constHashPadding[8];
-static const uint64_t h_constHashPadding[8] = {
+__device__ __constant__
+static const uint64_t d_constHashPadding[8] = {
 	0x0000000000000080ull,
 	0,
 	0,
@ -208,7 +128,8 @@ static const uint64_t h_constHashPadding[8] = {
				@@ -208,7 +128,8 @@ static const uint64_t h_constHashPadding[8] = {
 	0,
 	0x0002000000000000ull };

-__global__ __launch_bounds__(256, 2) void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash)
+__global__ __launch_bounds__(256, 4)
+void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);

@ -224,70 +145,49 @@ __global__ __launch_bounds__(256, 2) void quark_blake512_gpu_hash_64(int threads
				@@ -224,70 +145,49 @@ __global__ __launch_bounds__(256, 2) void quark_blake512_gpu_hash_64(int threads
 	if (thread < threads)
 #endif
 	{
+		uint8_t i;
 		// bestimme den aktuellen Zähler
 		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);

 		int hashPosition = nounce - startNounce;
-		//uint64_t *inpHash = &g_hash[8 * hashPosition];
-		uint64_t *inpHash = &g_hash[hashPosition<<3];
+		uint64_t *inpHash = &g_hash[hashPosition<<3]; // hashPosition * 8
+
+		// 128 Byte für die Message
+		uint64_t buf[16];

 		// State vorbereiten
 		uint64_t h[8];
-		/*
-		h[0] = 0x6a09e667f3bcc908ULL;
-		h[1] = 0xbb67ae8584caa73bULL;
-		h[2] = 0x3c6ef372fe94f82bULL;
-		h[3] = 0xa54ff53a5f1d36f1ULL;
-		h[4] = 0x510e527fade682d1ULL;
-		h[5] = 0x9b05688c2b3e6c1fULL;
-		h[6] = 0x1f83d9abfb41bd6bULL;
-		h[7] = 0x5be0cd19137e2179ULL;
-		*/
-#pragma unroll 8
-		for(int i=0;i<8;i++)
+		#pragma unroll 8
+		for (i=0;i<8;i++)
 			h[i] = d_constMem[i];

-		// 128 Byte für die Message
-		uint64_t buf[16];
-
 		// Message für die erste Runde in Register holen
-#pragma unroll 8
-		for (int i=0; i < 8; ++i) buf[i] = inpHash[i];
-
-		/*
-		buf[ 8] = 0x0000000000000080ull;
-		buf[ 9] = 0;
-		buf[10] = 0;
-		buf[11] = 0;
-		buf[12] = 0;
-		buf[13] = 0x0100000000000000ull;
-		buf[14] = 0;
-		buf[15] = 0x0002000000000000ull;
-		*/
-#pragma unroll 8
-		for(int i=0;i<8;i++)
+		#pragma unroll 8
+		for (i=0; i < 8; ++i)
+			buf[i] = inpHash[i];
+
+		#pragma unroll 8
+		for (i=0; i < 8; i++)
 			buf[i+8] = d_constHashPadding[i];

 		// die einzige Hashing-Runde
 		quark_blake512_compress( h, buf, c_sigma, c_u512, 512 );

-		// Hash rauslassen
 #if __CUDA_ARCH__ >= 130
 		// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind
 		uint32_t *outHash = (uint32_t*)&g_hash[8 * hashPosition];
-#pragma unroll 8
-		for (int i=0; i < 8; ++i) {
-			outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) );
-			outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) );
+		#pragma unroll 8
+		for (i=0; i < 8; ++i) {
+			outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
+			outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
 		}
 #else
 		// in dieser Version passieren auch ein paar 64 Bit Shifts
 		uint64_t *outHash = &g_hash[8 * hashPosition];
-#pragma unroll 8
-		for (int i=0; i < 8; ++i)
+		#pragma unroll 8
+		for (i=0; i < 8; ++i)
 		{
-			//outHash[i] = cuda_swab64( h[i] );
-			outHash[i] = SWAP64(h[i]);
+			outHash[i] = cuda_swab64(h[i]);
 		}
 #endif
 	}
@ -298,30 +198,21 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
				@@ -298,30 +198,21 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		// bestimme den aktuellen Zähler
-		uint32_t nounce = startNounce + thread;
-
 		// State vorbereiten
 		uint64_t h[8];
-		/*
-		h[0] = 0x6a09e667f3bcc908ULL;
-		h[1] = 0xbb67ae8584caa73bULL;
-		h[2] = 0x3c6ef372fe94f82bULL;
-		h[3] = 0xa54ff53a5f1d36f1ULL;
-		h[4] = 0x510e527fade682d1ULL;
-		h[5] = 0x9b05688c2b3e6c1fULL;
-		h[6] = 0x1f83d9abfb41bd6bULL;
-		h[7] = 0x5be0cd19137e2179ULL;
-		*/
-#pragma unroll 8
-		for(int i=0;i<8;i++)
-			h[i] = d_constMem[i];
 		// 128 Byte für die Message
 		uint64_t buf[16];
+		uint8_t i;
+		// bestimme den aktuellen Zähler
+		uint32_t nounce = startNounce + thread;
+
+		#pragma unroll 8
+		for(i=0;i<8;i++)
+			h[i] = d_constMem[i];

 		// Message für die erste Runde in Register holen
-#pragma unroll 16
-		for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i];
+		#pragma unroll 16
+		for (i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i];

 		// die Nounce durch die thread-spezifische ersetzen
 		buf[9] = REPLACE_HIWORD(buf[9], cuda_swab32(nounce));
@ -333,19 +224,17 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
				@@ -333,19 +224,17 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
 #if __CUDA_ARCH__ >= 130
 		// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind
 		uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
-#pragma unroll 8
-		for (int i=0; i < 8; ++i) {
-			outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) );
-			outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) );
+		#pragma unroll 8
+		for (i=0; i < 8; ++i) {
+			outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
+			outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
 		}
 #else
 		// in dieser Version passieren auch ein paar 64 Bit Shifts
 		uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
-#pragma unroll 8
-		for (int i=0; i < 8; ++i)
-		{
-			//outHash[i] = cuda_swab64( h[i] );
-			outHash[i] = SWAP64(h[i]);
+		#pragma unroll 8
+		for (i=0; i < 8; ++i) {
+			outHash[i] = cuda_swab64( h[i] );
 		}
 #endif
 	}
@ -362,21 +251,6 @@ __host__ void quark_blake512_cpu_init(int thr_id, int threads)
				@@ -362,21 +251,6 @@ __host__ void quark_blake512_cpu_init(int thr_id, int threads)
 						host_sigma,
 						sizeof(host_sigma),
 						0, cudaMemcpyHostToDevice);
-
-	cudaMemcpyToSymbol( c_u512,
-						host_u512,
-						sizeof(host_u512),
-						0, cudaMemcpyHostToDevice);
-
-	cudaMemcpyToSymbol( d_constMem,
-						h_constMem,
-						sizeof(h_constMem),
-						0, cudaMemcpyHostToDevice);
-
-	cudaMemcpyToSymbol( d_constHashPadding,
-						h_constHashPadding,
-						sizeof(h_constHashPadding),
-						0, cudaMemcpyHostToDevice);
 }

 // Blake512 für 80 Byte grosse Eingangsdaten
--- a/quark/cuda_quark_compactionTest.cu
+++ b/quark/cuda_quark_compactionTest.cu
@ -1,371 +1,368 @@
				@@ -1,371 +1,368 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-#include "sm_30_intrinsics.h"
-
-#include <stdio.h>
-#include <memory.h>
-#include <stdint.h>
-
-// aus cpu-miner.c
-extern int device_map[8];
-
-// diese Struktur wird in der Init Funktion angefordert
-static cudaDeviceProp props[8];
-
-static uint32_t *d_tempBranch1Nonces[8];
-static uint32_t *d_numValid[8];
-static uint32_t *h_numValid[8];
-
-static uint32_t *d_partSum[2][8]; // für bis zu vier partielle Summen
-
-// aus heavy.cu
-extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-
-// True/False tester
-typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash);
-
-__device__ uint32_t QuarkTrueTest(uint32_t *inpHash)
-{
-	return ((inpHash[0] & 0x08) == 0x08);
-}
-
-__device__ uint32_t QuarkFalseTest(uint32_t *inpHash)
-{
-	return ((inpHash[0] & 0x08) == 0);
-}
-
-__device__ cuda_compactTestFunction_t d_QuarkTrueFunction = QuarkTrueTest, d_QuarkFalseFunction = QuarkFalseTest;
-cuda_compactTestFunction_t h_QuarkTrueFunction[8], h_QuarkFalseFunction[8];
-
-// Setup-Funktionen
-__host__ void quark_compactTest_cpu_init(int thr_id, int threads)
-{
-	cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
-
-	cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t));
-	cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t));
-
-	// wir brauchen auch Speicherplatz auf dem Device
-	cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2);	
-	cudaMalloc(&d_numValid[thr_id], 2*sizeof(uint32_t));
-	cudaMallocHost(&h_numValid[thr_id], 2*sizeof(uint32_t));
-
-	uint32_t s1;
-	s1 = (threads / 256) * 2;
-
-	cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
-	cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
-}
-
-#if __CUDA_ARCH__ < 300
-/**
- * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
- */
-#undef __shfl_up
-#define __shfl_up(var, delta, width) (0)
-#endif
-
-// Die Summenfunktion (vom NVIDIA SDK)
-__global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
-{
-	extern __shared__ uint32_t sums[];
-	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
-	//int lane_id = id % warpSize;
-	int lane_id = id % width;
-	// determine a warp_id within a block
-	 //int warp_id = threadIdx.x / warpSize;
-	int warp_id = threadIdx.x / width;
-
-	sums[lane_id] = 0;
-
-	// Below is the basic structure of using a shfl instruction
-	// for a scan.
-	// Record "value" as a variable - we accumulate it along the way
-	uint32_t value;
-	if(testFunc != NULL)
-	{
-		if (id < threads)
-		{
-			uint32_t *inpHash;
-			if(d_validNonceTable == NULL)
-			{
-				// keine Nonce-Liste
-				inpHash = &inpHashes[id<<4];
-			}else
-			{
-				// Nonce-Liste verfügbar
-				int nonce = d_validNonceTable[id] - startNounce;
-				inpHash = &inpHashes[nonce<<4];
-			}			
-			value = (*testFunc)(inpHash);
-		}else
-		{
-			value = 0;
-		}
-	}else
-	{
-		value = data[id];
-	}
-
-	__syncthreads();
-
-	// Now accumulate in log steps up the chain
-	// compute sums, with another thread's value who is
-	// distance delta away (i).  Note
-	// those threads where the thread 'i' away would have
-	// been out of bounds of the warp are unaffected.  This
-	// creates the scan sum.
-#pragma unroll
-
-	for (int i=1; i<=width; i*=2)
-	{
-		uint32_t n = __shfl_up((int)value, i, width);
-
-		if (lane_id >= i) value += n;
-	}
-
-	// value now holds the scan value for the individual thread
-	// next sum the largest values for each warp
-
-	// write the sum of the warp to smem
-	//if (threadIdx.x % warpSize == warpSize-1)
-	if (threadIdx.x % width == width-1)
-	{
-		sums[warp_id] = value;
-	}
-
-	__syncthreads();
-
-	//
-	// scan sum the warp sums
-	// the same shfl scan operation, but performed on warp sums
-	//
-	if (warp_id == 0)
-	{
-		uint32_t warp_sum = sums[lane_id];
-
-		for (int i=1; i<=width; i*=2)
-		{
-			uint32_t n = __shfl_up((int)warp_sum, i, width);
-
-		if (lane_id >= i) warp_sum += n;
-		}
-
-		sums[lane_id] = warp_sum;
-	}
-
-	__syncthreads();
-
-	// perform a uniform add across warps in the block
-	// read neighbouring warp's sum and add it to threads value
-	uint32_t blockSum = 0;
-
-	if (warp_id > 0)
-	{
-		blockSum = sums[warp_id-1];
-	}
-
-	value += blockSum;
-
-	// Now write out our result
-	data[id] = value;
-
-	// last thread has sum, write write out the block's sum
-	if (partial_sums != NULL && threadIdx.x == blockDim.x-1)
-	{
-		partial_sums[blockIdx.x] = value;
-	}
-}
-
-// Uniform add: add partial sums array
-__global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len)
-{
-	__shared__ uint32_t buf;
-	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
-
-	if (id > len) return;
-
-	if (threadIdx.x == 0)
-	{
-		buf = partial_sums[blockIdx.x];
-	}
-
-	__syncthreads();
-	data[id] += buf;
-}
-
-// Der Scatter
-__global__ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
-{
-	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
-	uint32_t actNounce = id;
-	uint32_t value;
-	if (id < threads)
-	{
-//		uint32_t nounce = startNounce + id;
-		uint32_t *inpHash;
-		if(d_validNonceTable == NULL)
-		{
-			// keine Nonce-Liste
-			inpHash = &inpHashes[id<<4];
-		}else
-		{
-			// Nonce-Liste verfügbar
-			int nonce = d_validNonceTable[id] - startNounce;
-			actNounce = nonce;
-			inpHash = &inpHashes[nonce<<4];
-		}
-
-		value = (*testFunc)(inpHash);
-	}else
-	{
-		value = 0;
-	}
-
-	if( value )
-	{
-		int idx = sum[id];
-		if(idx > 0)
-			outp[idx-1] = startNounce + actNounce;
-	}
-}
-
-__host__ static uint32_t quark_compactTest_roundUpExp(uint32_t val)
-{
-	if(val == 0)
-		return 0;
-
-	uint32_t mask = 0x80000000;
-	while( (val & mask) == 0 ) mask = mask >> 1;
-
-	if( (val & (~mask)) != 0 )
-		return mask << 1;
-
-	return mask;
-}
-
-__host__ void quark_compactTest_cpu_singleCompaction(int thr_id, int threads, uint32_t *nrm,
-														uint32_t *d_nonces1, cuda_compactTestFunction_t function,
-														uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
-{
-	int orgThreads = threads;
-	threads = (int)quark_compactTest_roundUpExp((uint32_t)threads);
-	// threadsPerBlock ausrechnen
-	int blockSize = 256;
-	int nSummen = threads / blockSize;
-
-	int thr1 = (threads+blockSize-1) / blockSize;
-	int thr2 = threads / (blockSize*blockSize);
-	int blockSize2 = (nSummen < blockSize) ? nSummen : blockSize;
-	int thr3 = (nSummen + blockSize2-1) / blockSize2;
-
-	bool callThrid = (thr2 > 0) ? true : false;
-
-	// Erster Initialscan
-	quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(
-		d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable);	
-
-	// weitere Scans
-	if(callThrid)
-	{		
-		quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]);
-		quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2);
-	}else
-	{
-		quark_compactTest_gpu_SCAN<<<thr3,blockSize2, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2);
-	}
-
-	// Sync + Anzahl merken
-	cudaStreamSynchronize(NULL);
-
-	if(callThrid)
-		cudaMemcpy(nrm, &(d_partSum[1][thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
-	else
-		cudaMemcpy(nrm, &(d_partSum[0][thr_id])[nSummen-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
-
-	
-	// Addieren
-	if(callThrid)
-	{
-		quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2);
-	}
-	quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads);
-	
-	// Scatter
-	quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1, 
-		function, orgThreads, startNounce, inpHashes, d_validNonceTable);
-
-	// Sync
-	cudaStreamSynchronize(NULL);
-}
-
-////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048)
-__host__ void quark_compactTest_cpu_dualCompaction(int thr_id, int threads, uint32_t *nrm,
-													 uint32_t *d_nonces1, uint32_t *d_nonces2,
-													 uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
-{
-	quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
-	quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
-
-	/*
-	// threadsPerBlock ausrechnen
-	int blockSize = 256;
-	int thr1 = threads / blockSize;
-	int thr2 = threads / (blockSize*blockSize);
-
-	// 1
-	quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes);
-	quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
-	quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
-	cudaStreamSynchronize(NULL);
-	cudaMemcpy(&nrm[0], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
-	quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
-	quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);
-
-	// 2
-	quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes);
-	quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
-	quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
-	cudaStreamSynchronize(NULL);
-	cudaMemcpy(&nrm[1], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);	
-	quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
-	quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch2Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);
-	
-	// Hier ist noch eine Besonderheit: in d_tempBranch1Nonces sind die element von 1...nrm1 die Interessanten
-	// Schritt 3: Scatter
-	quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1, h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes);
-	quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch2Nonces[thr_id], d_nonces2, h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes);
-	cudaStreamSynchronize(NULL);
-	*/
-}
-
-__host__ void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
-											uint32_t *d_nonces1, size_t *nrm1,
-											uint32_t *d_nonces2, size_t *nrm2,
-											int order)
-{
-	// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind
-	// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen!
-	
-	quark_compactTest_cpu_dualCompaction(thr_id, threads,
-		h_numValid[thr_id], d_nonces1, d_nonces2,
-		startNounce, inpHashes, d_validNonceTable);
-
-	cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser
-	*nrm1 = (size_t)h_numValid[thr_id][0];
-	*nrm2 = (size_t)h_numValid[thr_id][1];
-}
-
-__host__ void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
-											uint32_t *d_nonces1, size_t *nrm1,
-											int order)
-{
-	// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind
-	// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen!
-
-	quark_compactTest_cpu_singleCompaction(thr_id, threads, h_numValid[thr_id], d_nonces1, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
-
-	cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser
-	*nrm1 = (size_t)h_numValid[thr_id][0];
-}
+#include <stdio.h>
+#include <memory.h>
+
+#include "cuda_helper.h"
+#include <sm_30_intrinsics.h>
+
+// aus cpu-miner.c
+extern int device_map[8];
+
+// diese Struktur wird in der Init Funktion angefordert
+static cudaDeviceProp props[8];
+
+static uint32_t *d_tempBranch1Nonces[8];
+static uint32_t *d_numValid[8];
+static uint32_t *h_numValid[8];
+
+static uint32_t *d_partSum[2][8]; // für bis zu vier partielle Summen
+
+// aus heavy.cu
+extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
+
+// True/False tester
+typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash);
+
+__device__ uint32_t QuarkTrueTest(uint32_t *inpHash)
+{
+	return ((inpHash[0] & 0x08) == 0x08);
+}
+
+__device__ uint32_t QuarkFalseTest(uint32_t *inpHash)
+{
+	return ((inpHash[0] & 0x08) == 0);
+}
+
+__device__ cuda_compactTestFunction_t d_QuarkTrueFunction = QuarkTrueTest, d_QuarkFalseFunction = QuarkFalseTest;
+cuda_compactTestFunction_t h_QuarkTrueFunction[8], h_QuarkFalseFunction[8];
+
+// Setup-Funktionen
+__host__ void quark_compactTest_cpu_init(int thr_id, int threads)
+{
+	cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
+
+	cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t));
+	cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t));
+
+	// wir brauchen auch Speicherplatz auf dem Device
+	cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2);	
+	cudaMalloc(&d_numValid[thr_id], 2*sizeof(uint32_t));
+	cudaMallocHost(&h_numValid[thr_id], 2*sizeof(uint32_t));
+
+	uint32_t s1;
+	s1 = (threads / 256) * 2;
+
+	cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
+	cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
+}
+
+#if __CUDA_ARCH__ < 300
+/**
+ * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
+ */
+#undef __shfl_up
+#define __shfl_up(var, delta, width) (0)
+#endif
+
+// Die Summenfunktion (vom NVIDIA SDK)
+__global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
+{
+	extern __shared__ uint32_t sums[];
+	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
+	//int lane_id = id % warpSize;
+	int lane_id = id % width;
+	// determine a warp_id within a block
+	 //int warp_id = threadIdx.x / warpSize;
+	int warp_id = threadIdx.x / width;
+
+	sums[lane_id] = 0;
+
+	// Below is the basic structure of using a shfl instruction
+	// for a scan.
+	// Record "value" as a variable - we accumulate it along the way
+	uint32_t value;
+	if(testFunc != NULL)
+	{
+		if (id < threads)
+		{
+			uint32_t *inpHash;
+			if(d_validNonceTable == NULL)
+			{
+				// keine Nonce-Liste
+				inpHash = &inpHashes[id<<4];
+			}else
+			{
+				// Nonce-Liste verfügbar
+				int nonce = d_validNonceTable[id] - startNounce;
+				inpHash = &inpHashes[nonce<<4];
+			}			
+			value = (*testFunc)(inpHash);
+		}else
+		{
+			value = 0;
+		}
+	}else
+	{
+		value = data[id];
+	}
+
+	__syncthreads();
+
+	// Now accumulate in log steps up the chain
+	// compute sums, with another thread's value who is
+	// distance delta away (i).  Note
+	// those threads where the thread 'i' away would have
+	// been out of bounds of the warp are unaffected.  This
+	// creates the scan sum.
+#pragma unroll
+
+	for (int i=1; i<=width; i*=2)
+	{
+		uint32_t n = __shfl_up((int)value, i, width);
+
+		if (lane_id >= i) value += n;
+	}
+
+	// value now holds the scan value for the individual thread
+	// next sum the largest values for each warp
+
+	// write the sum of the warp to smem
+	//if (threadIdx.x % warpSize == warpSize-1)
+	if (threadIdx.x % width == width-1)
+	{
+		sums[warp_id] = value;
+	}
+
+	__syncthreads();
+
+	//
+	// scan sum the warp sums
+	// the same shfl scan operation, but performed on warp sums
+	//
+	if (warp_id == 0)
+	{
+		uint32_t warp_sum = sums[lane_id];
+
+		for (int i=1; i<=width; i*=2)
+		{
+			uint32_t n = __shfl_up((int)warp_sum, i, width);
+
+		if (lane_id >= i) warp_sum += n;
+		}
+
+		sums[lane_id] = warp_sum;
+	}
+
+	__syncthreads();
+
+	// perform a uniform add across warps in the block
+	// read neighbouring warp's sum and add it to threads value
+	uint32_t blockSum = 0;
+
+	if (warp_id > 0)
+	{
+		blockSum = sums[warp_id-1];
+	}
+
+	value += blockSum;
+
+	// Now write out our result
+	data[id] = value;
+
+	// last thread has sum, write write out the block's sum
+	if (partial_sums != NULL && threadIdx.x == blockDim.x-1)
+	{
+		partial_sums[blockIdx.x] = value;
+	}
+}
+
+// Uniform add: add partial sums array
+__global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len)
+{
+	__shared__ uint32_t buf;
+	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
+
+	if (id > len) return;
+
+	if (threadIdx.x == 0)
+	{
+		buf = partial_sums[blockIdx.x];
+	}
+
+	__syncthreads();
+	data[id] += buf;
+}
+
+// Der Scatter
+__global__ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
+{
+	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
+	uint32_t actNounce = id;
+	uint32_t value;
+	if (id < threads)
+	{
+//		uint32_t nounce = startNounce + id;
+		uint32_t *inpHash;
+		if(d_validNonceTable == NULL)
+		{
+			// keine Nonce-Liste
+			inpHash = &inpHashes[id<<4];
+		}else
+		{
+			// Nonce-Liste verfügbar
+			int nonce = d_validNonceTable[id] - startNounce;
+			actNounce = nonce;
+			inpHash = &inpHashes[nonce<<4];
+		}
+
+		value = (*testFunc)(inpHash);
+	}else
+	{
+		value = 0;
+	}
+
+	if( value )
+	{
+		int idx = sum[id];
+		if(idx > 0)
+			outp[idx-1] = startNounce + actNounce;
+	}
+}
+
+__host__ static uint32_t quark_compactTest_roundUpExp(uint32_t val)
+{
+	if(val == 0)
+		return 0;
+
+	uint32_t mask = 0x80000000;
+	while( (val & mask) == 0 ) mask = mask >> 1;
+
+	if( (val & (~mask)) != 0 )
+		return mask << 1;
+
+	return mask;
+}
+
+__host__ void quark_compactTest_cpu_singleCompaction(int thr_id, int threads, uint32_t *nrm,
+														uint32_t *d_nonces1, cuda_compactTestFunction_t function,
+														uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
+{
+	int orgThreads = threads;
+	threads = (int)quark_compactTest_roundUpExp((uint32_t)threads);
+	// threadsPerBlock ausrechnen
+	int blockSize = 256;
+	int nSummen = threads / blockSize;
+
+	int thr1 = (threads+blockSize-1) / blockSize;
+	int thr2 = threads / (blockSize*blockSize);
+	int blockSize2 = (nSummen < blockSize) ? nSummen : blockSize;
+	int thr3 = (nSummen + blockSize2-1) / blockSize2;
+
+	bool callThrid = (thr2 > 0) ? true : false;
+
+	// Erster Initialscan
+	quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(
+		d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable);	
+
+	// weitere Scans
+	if(callThrid)
+	{		
+		quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]);
+		quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2);
+	}else
+	{
+		quark_compactTest_gpu_SCAN<<<thr3,blockSize2, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2);
+	}
+
+	// Sync + Anzahl merken
+	cudaStreamSynchronize(NULL);
+
+	if(callThrid)
+		cudaMemcpy(nrm, &(d_partSum[1][thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	else
+		cudaMemcpy(nrm, &(d_partSum[0][thr_id])[nSummen-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+
+	
+	// Addieren
+	if(callThrid)
+	{
+		quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2);
+	}
+	quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads);
+	
+	// Scatter
+	quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1, 
+		function, orgThreads, startNounce, inpHashes, d_validNonceTable);
+
+	// Sync
+	cudaStreamSynchronize(NULL);
+}
+
+////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048)
+__host__ void quark_compactTest_cpu_dualCompaction(int thr_id, int threads, uint32_t *nrm,
+													 uint32_t *d_nonces1, uint32_t *d_nonces2,
+													 uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
+{
+	quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
+	quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
+
+	/*
+	// threadsPerBlock ausrechnen
+	int blockSize = 256;
+	int thr1 = threads / blockSize;
+	int thr2 = threads / (blockSize*blockSize);
+
+	// 1
+	quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes);
+	quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
+	quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
+	cudaStreamSynchronize(NULL);
+	cudaMemcpy(&nrm[0], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
+	quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);
+
+	// 2
+	quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes);
+	quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
+	quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
+	cudaStreamSynchronize(NULL);
+	cudaMemcpy(&nrm[1], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);	
+	quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
+	quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch2Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);
+	
+	// Hier ist noch eine Besonderheit: in d_tempBranch1Nonces sind die element von 1...nrm1 die Interessanten
+	// Schritt 3: Scatter
+	quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1, h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes);
+	quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch2Nonces[thr_id], d_nonces2, h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes);
+	cudaStreamSynchronize(NULL);
+	*/
+}
+
+__host__ void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
+											uint32_t *d_nonces1, size_t *nrm1,
+											uint32_t *d_nonces2, size_t *nrm2,
+											int order)
+{
+	// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind
+	// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen!
+	
+	quark_compactTest_cpu_dualCompaction(thr_id, threads,
+		h_numValid[thr_id], d_nonces1, d_nonces2,
+		startNounce, inpHashes, d_validNonceTable);
+
+	cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser
+	*nrm1 = (size_t)h_numValid[thr_id][0];
+	*nrm2 = (size_t)h_numValid[thr_id][1];
+}
+
+__host__ void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
+											uint32_t *d_nonces1, size_t *nrm1,
+											int order)
+{
+	// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind
+	// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen!
+
+	quark_compactTest_cpu_singleCompaction(thr_id, threads, h_numValid[thr_id], d_nonces1, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
+
+	cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser
+	*nrm1 = (size_t)h_numValid[thr_id][0];
+}
--- a/quark/cuda_quark_groestl512.cu
+++ b/quark/cuda_quark_groestl512.cu
@ -1,23 +1,16 @@
				@@ -1,23 +1,16 @@
 // Auf QuarkCoin spezialisierte Version von Groestl inkl. Bitslice

-#include <cuda.h>
-#include <cuda_runtime.h>
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

+#include "cuda_helper.h"
+
 // aus cpu-miner.c
 extern int device_map[8];

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned char uint8_t;
-typedef unsigned short uint16_t;
-typedef unsigned int uint32_t;
-
 // diese Struktur wird in der Init Funktion angefordert
 static cudaDeviceProp props[8];

@ -25,8 +18,8 @@ static cudaDeviceProp props[8];
				@@ -25,8 +18,8 @@ static cudaDeviceProp props[8];
 #include "groestl_functions_quad.cu"
 #include "bitslice_transformations_quad.cu"

-__global__ void __launch_bounds__(256, 4)
- quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
+__global__ __launch_bounds__(256, 4)
+void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
 {
    // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
    int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
--- a/quark/cuda_quark_keccak512.cu
+++ b/quark/cuda_quark_keccak512.cu
@ -1,27 +1,19 @@
				@@ -1,27 +1,19 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
+#include "cuda_helper.h"

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-#include "cuda_helper.h"
-
 #define U32TO64_LE(p) \
    (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))

 #define U64TO32_LE(p, v) \
    *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32);

-static const uint64_t host_keccak_round_constants[24] = {
+__device__ __constant__
+static const uint64_t c_keccak_round_constants[24] = {
    0x0000000000000001ull, 0x0000000000008082ull,
    0x800000000000808aull, 0x8000000080008000ull,
    0x000000000000808bull, 0x0000000080000001ull,
@ -36,8 +28,6 @@ static const uint64_t host_keccak_round_constants[24] = {
				@@ -36,8 +28,6 @@ static const uint64_t host_keccak_round_constants[24] = {
    0x0000000080000001ull, 0x8000000080008008ull
 };

-__constant__ uint64_t c_keccak_round_constants[24];
-
 static __device__ __forceinline__ void
 keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) {
    size_t i;
@ -157,11 +147,6 @@ __global__ void quark_keccak512_gpu_hash_64(int threads, uint32_t startNounce, u
				@@ -157,11 +147,6 @@ __global__ void quark_keccak512_gpu_hash_64(int threads, uint32_t startNounce, u
 // Setup-Funktionen
 __host__ void quark_keccak512_cpu_init(int thr_id, int threads)
 {
-    // Kopiere die Hash-Tabellen in den GPU-Speicher
-    cudaMemcpyToSymbol( c_keccak_round_constants,
-                        host_keccak_round_constants,
-                        sizeof(host_keccak_round_constants),
-                        0, cudaMemcpyHostToDevice);
 }

 __host__ void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
--- a/quark/cuda_skein512.cu
+++ b/quark/cuda_skein512.cu
@ -1,16 +1,8 @@
				@@ -1,16 +1,8 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
+#include <stdint.h>
 #include <memory.h>

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
-
-#define SPH_C64(x)    ((uint64_t)(x ## ULL))
+#include "cuda_helper.h"

 // aus cpu-miner.c
 extern "C" extern int device_map[8];
@ -19,21 +11,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
				@@ -19,21 +11,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t

 // Take a look at: https://www.schneier.com/skein1.3.pdf

-#if __CUDA_ARCH__ >= 350
-__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
-    uint2 result;
-    if(offset >= 32) {
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-    } else {
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-    }
-    return  __double_as_longlong(__hiloint2double(result.y, result.x));
-}
-#else
-#define ROTL64(x, n)        (((x) << (n)) | ((x) >> (64 - (n))))
-#endif
 #define SHL(x, n)			((x) << (n))
 #define SHR(x, n)			((x) >> (n))

--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@ -1,4 +1,3 @@
				@@ -1,4 +1,3 @@
-
 extern "C"
 {
 #include "sph/sph_blake.h"
@ -8,9 +7,9 @@ extern "C"
				@@ -8,9 +7,9 @@ extern "C"
 #include "sph/sph_jh.h"
 #include "sph/sph_keccak.h"
 #include "miner.h"
-}

-#include <stdint.h>
+#include "cuda_helper.h"
+}

 // aus cpu-miner.c
 extern int device_map[8];
@ -45,9 +44,9 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN
				@@ -45,9 +44,9 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN
 extern void quark_jh512_cpu_init(int thr_id, int threads);
 extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);

 extern void quark_compactTest_cpu_init(int thr_id, int threads);
 extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
@ -171,18 +170,21 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
				@@ -171,18 +170,21 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,

 		// Konstanten kopieren, Speicher belegen
 		cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
+
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_groestl512_cpu_init(thr_id, throughput);
 		quark_skein512_cpu_init(thr_id, throughput);
 		quark_bmw512_cpu_init(thr_id, throughput);
 		quark_keccak512_cpu_init(thr_id, throughput);
 		quark_jh512_cpu_init(thr_id, throughput);
-		quark_check_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);
 		quark_compactTest_cpu_init(thr_id, throughput);
+
 		cudaMalloc(&d_quarkNonces[thr_id], sizeof(uint32_t)*throughput);
 		cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
 		cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
 		cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
+
 		init[thr_id] = true;
 	}

@ -191,7 +193,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
				@@ -191,7 +193,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);

 	quark_blake512_cpu_setBlock_80((void*)endiandata);
-	quark_check_cpu_setTarget(ptarget);
+	cuda_check_cpu_setTarget(ptarget);

 	do {
 		int order = 0;
@ -247,7 +249,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
				@@ -247,7 +249,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
 		quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);

 		// Scan nach Gewinner Hashes auf der GPU
-		uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
--- a/util.c
+++ b/util.c
@ -1352,6 +1352,10 @@ void print_hash_tests(void)
				@@ -1352,6 +1352,10 @@ void print_hash_tests(void)
 	myriadhash(&hash[0], &buf[0]);
 	printf("\nmyriad:  "); print_hash(hash);

+	memset(hash, 0, sizeof hash);
+	nist5hash(&hash[0], &buf[0]);
+	printf("\nnist5:   "); print_hash(hash);
+
 	memset(hash, 0, sizeof hash);
 	quarkhash(&hash[0], &buf[0]);
 	printf("\nquark:   "); print_hash(hash);
--- a/x11/cuda_x11_cubehash512.cu
+++ b/x11/cuda_x11_cubehash512.cu
@ -1,30 +1,13 @@
				@@ -1,30 +1,13 @@
-#include <cuda_runtime.h>
+#include "cuda_helper.h"

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

 typedef unsigned char BitSequence;
-typedef unsigned long long DataLength;
-
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
-
-#if 0
-__device__ static uint32_t cuda_swab32(uint32_t x)
-{
-	return __byte_perm(x, 0, 0x0123);
-}
-#endif
-
-typedef unsigned char BitSequence;
-typedef unsigned long long DataLength;

 #define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */
 #define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */

-typedef unsigned int uint32_t; /* must be exactly 32 bits */
-
 #define ROTATEUPWARDS7(a) (((a) << 7) | ((a) >> 25))
 #define ROTATEUPWARDS11(a) (((a) << 11) | ((a) >> 21))
 #define SWAP(a,b) { uint32_t u = a; a = b; b = u; }
--- a/x11/cuda_x11_echo.cu
+++ b/x11/cuda_x11_echo.cu
@ -1,33 +1,7 @@
				@@ -1,33 +1,7 @@
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
-#include <stdint.h>
 #include <memory.h>

-// das Hi Word aus einem 64 Bit Typen extrahieren
-#if 0
-static __device__ uint32_t HIWORD(const uint64_t &x) {
-#if __CUDA_ARCH__ >= 130
-	return (uint32_t)__double2hiint(__longlong_as_double(x));
-#else
-	return (uint32_t)(x >> 32);
-#endif
-}
-
-// das Lo Word aus einem 64 Bit Typen extrahieren
-static __device__ uint32_t LOWORD(const uint64_t &x) {
-#if __CUDA_ARCH__ >= 130
-	return (uint32_t)__double2loint(__longlong_as_double(x));
-#else
-	return (uint32_t)(x & 0xFFFFFFFFULL);
-#endif
-}
-#endif
-
-#define SPH_C64(x)    ((uint64_t)(x ## ULL))
-#define SPH_C32(x)    ((uint32_t)(x ## U))
+#include "cuda_helper.h"

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
--- a/x11/cuda_x11_luffa512.cu
+++ b/x11/cuda_x11_luffa512.cu
@ -18,28 +18,18 @@
				@@ -18,28 +18,18 @@
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

-#include <cuda_runtime.h>
+#include "cuda_helper.h"

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

 typedef unsigned char BitSequence;

-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
-
 typedef struct {
    uint32_t buffer[8]; /* Buffer to be hashed */
    uint32_t chainv[40];   /* Chaining values */
 } hashState;

- __device__ __forceinline__
-static uint32_t BYTES_SWAP32(uint32_t x)
-{
-	return __byte_perm(x, x, 0x0123);
-}
-
 #define MULT2(a,j)\
    tmp = a[7+(8*j)];\
    a[7+(8*j)] = a[6+(8*j)];\
@ -289,11 +279,11 @@ __device__ __forceinline__
				@@ -289,11 +279,11 @@ __device__ __forceinline__
 void Update512(hashState *state, const BitSequence *data)
 {
 #pragma unroll 8
-    for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)data)[i]);
+    for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)data)[i]);
    rnd512(state);

 #pragma unroll 8
-    for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+32))[i]);
+    for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)(data+32))[i]);
    rnd512(state);
 }

@ -321,7 +311,7 @@ void finalization512(hashState *state, uint32_t *b)
				@@ -321,7 +311,7 @@ void finalization512(hashState *state, uint32_t *b)
        for(j=0;j<5;j++) {
            b[i] ^= state->chainv[i+8*j];
        }
-        b[i] = BYTES_SWAP32((b[i]));
+        b[i] = cuda_swab32((b[i]));
    }

 #pragma unroll 8
@ -335,7 +325,7 @@ void finalization512(hashState *state, uint32_t *b)
				@@ -335,7 +325,7 @@ void finalization512(hashState *state, uint32_t *b)
        for(j=0;j<5;j++) {
            b[8+i] ^= state->chainv[i+8*j];
        }
-        b[8+i] = BYTES_SWAP32((b[8+i]));
+        b[8 + i] = cuda_swab32((b[8 + i]));
    }
 }

--- a/x11/cuda_x11_shavite512.cu
+++ b/x11/cuda_x11_shavite512.cu
@ -1,18 +1,13 @@
				@@ -1,18 +1,13 @@
-#include <stdint.h>
-#include <cuda_runtime.h>
+#include "cuda_helper.h"

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-typedef unsigned char BitSequence;
-typedef unsigned long long DataLength;
+//typedef unsigned char BitSequence;
+//typedef unsigned long long DataLength;

-#define SPH_C64(x)    ((uint64_t)(x ## ULL))
-#define SPH_C32(x)    ((uint32_t)(x ## U))
-#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
-
-static __constant__ uint32_t d_ShaviteInitVector[16];
-static const uint32_t h_ShaviteInitVector[] = {
+__device__ __constant__
+static const uint32_t d_ShaviteInitVector[16] = {
 	SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC),
 	SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC),
 	SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47),
@ -1304,18 +1299,18 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
				@@ -1304,18 +1299,18 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui

 	aes_gpu_init(sharedMemory);

-    int thread = (blockDim.x * blockIdx.x + threadIdx.x);
-    if (thread < threads)
-    {
-        uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);

-        int hashPosition = nounce - startNounce;
-        uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
+		int hashPosition = nounce - startNounce;
+		uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];

 		// kopiere init-state
 		uint32_t state[16];

-#pragma unroll 16
+		#pragma unroll 16
 		for(int i=0;i<16;i++)
 			state[i] = d_ShaviteInitVector[i];

@ -1323,13 +1318,13 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
				@@ -1323,13 +1318,13 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
 		uint32_t msg[32];

 		// fülle die Nachricht mit 64-byte (vorheriger Hash)
-#pragma unroll 16
+		#pragma unroll 16
 		for(int i=0;i<16;i++)
 			msg[i] = Hash[i];			

 		// Nachrichtenende
 		msg[16] = 0x80;
-#pragma unroll 10
+		#pragma unroll 10
 		for(int i=17;i<27;i++)
 			msg[i] = 0;

@ -1341,10 +1336,10 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
				@@ -1341,10 +1336,10 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui

 		c512(sharedMemory, state, msg);

-#pragma unroll 16
+		#pragma unroll 16
 		for(int i=0;i<16;i++)
 			Hash[i] = state[i];
-    }
+	}
 }


@ -1352,25 +1347,19 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
				@@ -1352,25 +1347,19 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
 __host__ void x11_shavite512_cpu_init(int thr_id, int threads)
 {
 	aes_cpu_init();
-
-	cudaMemcpyToSymbol( d_ShaviteInitVector,
-                        h_ShaviteInitVector,
-                        sizeof(h_ShaviteInitVector),
-                        0, cudaMemcpyHostToDevice);
 }

 __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-    const int threadsperblock = 256;
+	const int threadsperblock = 256;

-    // berechne wie viele Thread Blocks wir brauchen
-    dim3 grid((threads + threadsperblock-1)/threadsperblock);
-    dim3 block(threadsperblock);
+	// berechne wie viele Thread Blocks wir brauchen
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);

-    // Größe des dynamischen Shared Memory Bereichs
-    size_t shared_size = 0;
+	// Größe des dynamischen Shared Memory Bereichs
+	size_t shared_size = 0;

-    x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
-    MyStreamSynchronize(NULL, order, thr_id);
+	x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
+	MyStreamSynchronize(NULL, order, thr_id);
 }
-
--- a/x11/cuda_x11_simd512.cu
+++ b/x11/cuda_x11_simd512.cu
@ -7,29 +7,17 @@
				@@ -7,29 +7,17 @@

 #define TPB 256

+#include "cuda_helper.h"
+
 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
-
 int *d_state[8];
 uint4 *d_temp4[8];

 // texture bound to d_temp4[thr_id], for read access in Compaction kernel
 texture<uint4, 1, cudaReadModeElementType> texRef1D_128;

-#define C32(x)    ((uint32_t)(x ## U))
-#define T32(x) ((x) & C32(0xFFFFFFFF))
-
-#if __CUDA_ARCH__ < 350 
-    // Kepler (Compute 3.0)
-    #define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n))))
-#else
-    // Kepler (Compute 3.5)
-    #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
-#endif
-
 __device__ __constant__
 const uint32_t c_IV_512[32] = {
  0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
@ -166,7 +154,7 @@ X(j) = (u-v) << (2*n); \
				@@ -166,7 +154,7 @@ X(j) = (u-v) << (2*n); \
 #undef BUTTERFLY
 }

-#if __CUDA_ARCH__ < 300
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
 /**
 * __shfl() returns the value of var held by the thread whose ID is given by srcLane.
 * If srcLane is outside the range 0..width-1, the thread's own value of var is returned.
@ -177,7 +165,7 @@ X(j) = (u-v) << (2*n); \
				@@ -177,7 +165,7 @@ X(j) = (u-v) << (2*n); \

 __device__ __forceinline__ void FFT_16(int *y) {

-#if __CUDA_ARCH__ < 300
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
 #ifndef WIN32
 # warning FFT_16() function is not compatible with SM 2.1 devices!
 #endif
@ -346,7 +334,7 @@ __device__ __forceinline__ void FFT_256_halfzero(int y[256]) {
				@@ -346,7 +334,7 @@ __device__ __forceinline__ void FFT_256_halfzero(int y[256]) {
 __device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4)
 {
  int i;
-#if __CUDA_ARCH__ < 300
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
 #ifndef WIN32
 # warning Expansion() function is not compatible with SM 2.1 devices
 #endif
--- a/x11/x11.cu
+++ b/x11/x11.cu
@ -15,10 +15,11 @@ extern "C"
				@@ -15,10 +15,11 @@ extern "C"
 #include "sph/sph_echo.h"

 #include "miner.h"
-}
+#include "cuda_helper.h"

-#include <stdint.h>
-#include <cuda_helper.h>
+#include <stdio.h>
+#include <memory.h>
+}

 // aus cpu-miner.c
 extern int device_map[8];
@ -62,9 +63,9 @@ extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounc
				@@ -62,9 +63,9 @@ extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounc
 extern void x11_echo512_cpu_init(int thr_id, int threads);
 extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);

 extern void quark_compactTest_cpu_init(int thr_id, int threads);
 extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, 
@ -172,7 +173,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
				@@ -172,7 +173,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 		x11_shavite512_cpu_init(thr_id, throughput);
 		x11_simd512_cpu_init(thr_id, throughput);
 		x11_echo512_cpu_init(thr_id, throughput);
-		quark_check_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);

 		init[thr_id] = true;
 	}
@ -182,7 +183,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
				@@ -182,7 +183,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);

 	quark_blake512_cpu_setBlock_80((void*)endiandata);
-	quark_check_cpu_setTarget(ptarget);
+	cuda_check_cpu_setTarget(ptarget);

 	do {
 		uint32_t foundNonce;
@ -202,7 +203,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
				@@ -202,7 +203,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

 		// Scan nach Gewinner Hashes auf der GPU
-		foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
--- a/x13/cuda_x13_fugue512.cu
+++ b/x13/cuda_x13_fugue512.cu
@ -5,26 +5,11 @@
				@@ -5,26 +5,11 @@
 * heavily based on phm's sgminer
 *
 */
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include "device_launch_parameters.h"
-
-#include <stdint.h>
+#include "cuda_helper.h"

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-#define SPH_C32(x)    ((uint32_t)(x ## U))
-#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
-
-#if __CUDA_ARCH__ < 350
-// Kepler (Compute 3.0)
-#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
-#else
-// Kepler (Compute 3.5, 5.0)
-#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
-#endif
-
 /*
 * X13 kernel implementation.
 *
@ -56,8 +41,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
				@@ -56,8 +41,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
 * @author   phm <phm@inbox.com>
 */

-#define SWAB32(x) ( __byte_perm(x, x, 0x0123) )
-
 #define mixtab0(x) (*((uint32_t*)mixtabs + (    (x))))
 #define mixtab1(x) (*((uint32_t*)mixtabs + (256+(x))))
 #define mixtab2(x) (*((uint32_t*)mixtabs + (512+(x))))
@ -584,97 +567,97 @@ __global__ void x13_fugue512_gpu_hash_64(int threads, uint32_t startNounce, uint
				@@ -584,97 +567,97 @@ __global__ void x13_fugue512_gpu_hash_64(int threads, uint32_t startNounce, uint

 	__syncthreads();

-    int i;
-    int thread = (blockDim.x * blockIdx.x + threadIdx.x);
-    if (thread < threads)
-    {
-        uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
-
-        int hashPosition = nounce - startNounce;
-        uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
-
-        #pragma unroll 16
-        for( i = 0; i < 16; i++ )
-            Hash[i] = SWAB32(Hash[i]);
-
-        uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09;
-        uint32_t S10, S11, S12, S13, S14, S15, S16, S17, S18, S19;
-        uint32_t S20, S21, S22, S23, S24, S25, S26, S27, S28, S29;
-        uint32_t S30, S31, S32, S33, S34, S35;
-
-        uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35;
-        uint64_t bc = (uint64_t) 64 << 3;
-        uint32_t bclo = (uint32_t)(bc & 0xFFFFFFFFULL);
-        uint32_t bchi = (uint32_t)(bc >> 32);
-
-        S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0;
-        S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027); 
-        S24 = SPH_C32(0xd915f117); S25 = SPH_C32(0xb6eecc54); S26 = SPH_C32(0x06e8020b); S27 = SPH_C32(0x4a92efd1); 
-        S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f);
-        S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567);
-
-        FUGUE512_3((Hash[0x0]), (Hash[0x1]), (Hash[0x2]));
-        FUGUE512_3((Hash[0x3]), (Hash[0x4]), (Hash[0x5]));
-        FUGUE512_3((Hash[0x6]), (Hash[0x7]), (Hash[0x8]));
-        FUGUE512_3((Hash[0x9]), (Hash[0xA]), (Hash[0xB]));
-        FUGUE512_3((Hash[0xC]), (Hash[0xD]), (Hash[0xE]));
-        FUGUE512_3((Hash[0xF]), bchi, bclo);
-
-        #pragma unroll 32
-        for (i = 0; i < 32; i ++) {
-            ROR3;
-            CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20);
-            SMIX(S00, S01, S02, S03);
-        }
-        #pragma unroll 13
-        for (i = 0; i < 13; i ++) {
-            S04 ^= S00;
-            S09 ^= S00;
-            S18 ^= S00;
-            S27 ^= S00;
-            ROR9;
-            SMIX(S00, S01, S02, S03);
-            S04 ^= S00;
-            S10 ^= S00;
-            S18 ^= S00;
-            S27 ^= S00;
-            ROR9;
-            SMIX(S00, S01, S02, S03);
-            S04 ^= S00;
-            S10 ^= S00;
-            S19 ^= S00;
-            S27 ^= S00;
-            ROR9;
-            SMIX(S00, S01, S02, S03);
-            S04 ^= S00;
-            S10 ^= S00;
-            S19 ^= S00;
-            S28 ^= S00;
-            ROR8;
-            SMIX(S00, S01, S02, S03);
-        }
-        S04 ^= S00;
-        S09 ^= S00;
-        S18 ^= S00;
-        S27 ^= S00;
-
-        Hash[0] = SWAB32(S01);
-        Hash[1] = SWAB32(S02);
-        Hash[2] = SWAB32(S03);
-        Hash[3] = SWAB32(S04);
-        Hash[4] = SWAB32(S09);
-        Hash[5] = SWAB32(S10);
-        Hash[6] = SWAB32(S11);
-        Hash[7] = SWAB32(S12);
-        Hash[8] = SWAB32(S18);
-        Hash[9] = SWAB32(S19);
-        Hash[10] = SWAB32(S20);
-        Hash[11] = SWAB32(S21);
-        Hash[12] = SWAB32(S27);
-        Hash[13] = SWAB32(S28);
-        Hash[14] = SWAB32(S29);
-        Hash[15] = SWAB32(S30);
-    }
+	int i;
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
+
+		int hashPosition = nounce - startNounce;
+		uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
+
+		#pragma unroll 16
+		for( i = 0; i < 16; i++ )
+			Hash[i] = cuda_swab32(Hash[i]);
+
+		uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09;
+		uint32_t S10, S11, S12, S13, S14, S15, S16, S17, S18, S19;
+		uint32_t S20, S21, S22, S23, S24, S25, S26, S27, S28, S29;
+		uint32_t S30, S31, S32, S33, S34, S35;
+
+		uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35;
+		uint64_t bc = (uint64_t) 64 << 3;
+		uint32_t bclo = (uint32_t)(bc & 0xFFFFFFFFULL);
+		uint32_t bchi = (uint32_t)(bc >> 32);
+
+		S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0;
+		S20 = SPH_C32(0x8807a57e); S21 = SPH_C32(0xe616af75); S22 = SPH_C32(0xc5d3e4db); S23 = SPH_C32(0xac9ab027); 
+		S24 = SPH_C32(0xd915f117); S25 = SPH_C32(0xb6eecc54); S26 = SPH_C32(0x06e8020b); S27 = SPH_C32(0x4a92efd1); 
+		S28 = SPH_C32(0xaac6e2c9); S29 = SPH_C32(0xddb21398); S30 = SPH_C32(0xcae65838); S31 = SPH_C32(0x437f203f);
+		S32 = SPH_C32(0x25ea78e7); S33 = SPH_C32(0x951fddd6); S34 = SPH_C32(0xda6ed11d); S35 = SPH_C32(0xe13e3567);
+
+		FUGUE512_3((Hash[0x0]), (Hash[0x1]), (Hash[0x2]));
+		FUGUE512_3((Hash[0x3]), (Hash[0x4]), (Hash[0x5]));
+		FUGUE512_3((Hash[0x6]), (Hash[0x7]), (Hash[0x8]));
+		FUGUE512_3((Hash[0x9]), (Hash[0xA]), (Hash[0xB]));
+		FUGUE512_3((Hash[0xC]), (Hash[0xD]), (Hash[0xE]));
+		FUGUE512_3((Hash[0xF]), bchi, bclo);
+
+		#pragma unroll 32
+		for (i = 0; i < 32; i ++) {
+			ROR3;
+			CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20);
+			SMIX(S00, S01, S02, S03);
+		}
+		#pragma unroll 13
+		for (i = 0; i < 13; i ++) {
+			S04 ^= S00;
+			S09 ^= S00;
+			S18 ^= S00;
+			S27 ^= S00;
+			ROR9;
+			SMIX(S00, S01, S02, S03);
+			S04 ^= S00;
+			S10 ^= S00;
+			S18 ^= S00;
+			S27 ^= S00;
+			ROR9;
+			SMIX(S00, S01, S02, S03);
+			S04 ^= S00;
+			S10 ^= S00;
+			S19 ^= S00;
+			S27 ^= S00;
+			ROR9;
+			SMIX(S00, S01, S02, S03);
+			S04 ^= S00;
+			S10 ^= S00;
+			S19 ^= S00;
+			S28 ^= S00;
+			ROR8;
+			SMIX(S00, S01, S02, S03);
+		}
+		S04 ^= S00;
+		S09 ^= S00;
+		S18 ^= S00;
+		S27 ^= S00;
+
+		Hash[0] = cuda_swab32(S01);
+		Hash[1] = cuda_swab32(S02);
+		Hash[2] = cuda_swab32(S03);
+		Hash[3] = cuda_swab32(S04);
+		Hash[4] = cuda_swab32(S09);
+		Hash[5] = cuda_swab32(S10);
+		Hash[6] = cuda_swab32(S11);
+		Hash[7] = cuda_swab32(S12);
+		Hash[8] = cuda_swab32(S18);
+		Hash[9] = cuda_swab32(S19);
+		Hash[10] = cuda_swab32(S20);
+		Hash[11] = cuda_swab32(S21);
+		Hash[12] = cuda_swab32(S27);
+		Hash[13] = cuda_swab32(S28);
+		Hash[14] = cuda_swab32(S29);
+		Hash[15] = cuda_swab32(S30);
+	}
 }

 #define texDef(texname, texmem, texsource, texsize) \
@ -697,17 +680,17 @@ __host__ void x13_fugue512_cpu_init(int thr_id, int threads)
				@@ -697,17 +680,17 @@ __host__ void x13_fugue512_cpu_init(int thr_id, int threads)

 __host__ void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-    const int threadsperblock = 256;
+	const int threadsperblock = 256;

-    // berechne wie viele Thread Blocks wir brauchen
-    dim3 grid((threads + threadsperblock-1)/threadsperblock);
-    dim3 block(threadsperblock);
+	// berechne wie viele Thread Blocks wir brauchen
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);

-    // Größe des dynamischen Shared Memory Bereichs
+	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 4 * 256 * sizeof(uint32_t);

-//    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
+	// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);

-    x13_fugue512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
-    MyStreamSynchronize(NULL, order, thr_id);
+	x13_fugue512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
+	MyStreamSynchronize(NULL, order, thr_id);
 }
--- a/x13/cuda_x13_hamsi512.cu
+++ b/x13/cuda_x13_hamsi512.cu
@ -37,26 +37,11 @@
				@@ -37,26 +37,11 @@
 * @author   phm <phm@inbox.com>
 */

-#include <stdint.h>
-#include <cuda_runtime.h>
+#include "cuda_helper.h"

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-#define SPH_C64(x)    ((uint64_t)(x ## ULL))
-#define SPH_C32(x)    ((uint32_t)(x ## U))
-#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
-
-#define SWAB32(x) ( __byte_perm(x, x, 0x0123) )
-
-#if __CUDA_ARCH__ < 350
-    // Kepler (Compute 3.0)
-    #define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
-#else
-    // Kepler (Compute 3.5)
-    #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
-#endif
-
 __device__ __constant__
 static const uint32_t d_alpha_n[] = {
 	SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
@ -663,7 +648,7 @@ static const uint32_t d_T512[64][16] = {
				@@ -663,7 +648,7 @@ static const uint32_t d_T512[64][16] = {
 		mD = 0; \
 		mE = 0; \
 		mF = 0; \
-        for (u = 0; u < 8; u ++) { \
+		for (u = 0; u < 8; u ++) { \
 			unsigned db = buf(u); \
 			for (v = 0; v < 8; v ++, db >>= 1) { \
 				uint32_t dm = SPH_T32(-(uint32_t)(db & 1)); \
@ -692,45 +677,47 @@ static const uint32_t d_T512[64][16] = {
				@@ -692,45 +677,47 @@ static const uint32_t d_T512[64][16] = {
 // Die Hash-Funktion
 __global__ void x13_hamsi512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
 {
-    int thread = (blockDim.x * blockIdx.x + threadIdx.x);
-    if (thread < threads)
-    {
-        uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
-
-        int hashPosition = nounce - startNounce;
-        uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
-        unsigned char *h1 = (unsigned char *)Hash;
-
-        uint32_t c0 = SPH_C32(0x73746565), c1 = SPH_C32(0x6c706172), c2 = SPH_C32(0x6b204172), c3 = SPH_C32(0x656e6265);
-        uint32_t c4 = SPH_C32(0x72672031), c5 = SPH_C32(0x302c2062), c6 = SPH_C32(0x75732032), c7 = SPH_C32(0x3434362c);
-        uint32_t c8 = SPH_C32(0x20422d33), c9 = SPH_C32(0x30303120), cA = SPH_C32(0x4c657576), cB = SPH_C32(0x656e2d48);
-        uint32_t cC = SPH_C32(0x65766572), cD = SPH_C32(0x6c65652c), cE = SPH_C32(0x2042656c), cF = SPH_C32(0x6769756d);
-        uint32_t m0, m1, m2, m3, m4, m5, m6, m7;
-        uint32_t m8, m9, mA, mB, mC, mD, mE, mF;
-        uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
+
+		int hashPosition = nounce - startNounce;
+		uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
+		unsigned char *h1 = (unsigned char *)Hash;
+
+		uint32_t c0 = SPH_C32(0x73746565), c1 = SPH_C32(0x6c706172), c2 = SPH_C32(0x6b204172), c3 = SPH_C32(0x656e6265);
+		uint32_t c4 = SPH_C32(0x72672031), c5 = SPH_C32(0x302c2062), c6 = SPH_C32(0x75732032), c7 = SPH_C32(0x3434362c);
+		uint32_t c8 = SPH_C32(0x20422d33), c9 = SPH_C32(0x30303120), cA = SPH_C32(0x4c657576), cB = SPH_C32(0x656e2d48);
+		uint32_t cC = SPH_C32(0x65766572), cD = SPH_C32(0x6c65652c), cE = SPH_C32(0x2042656c), cF = SPH_C32(0x6769756d);
+		uint32_t m0, m1, m2, m3, m4, m5, m6, m7;
+		uint32_t m8, m9, mA, mB, mC, mD, mE, mF;
+		uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };

 #define buf(u) (h1[i+u])
 		#pragma unroll 8
-        for(int i = 0; i < 64; i += 8) {
-            INPUT_BIG;
-            P_BIG;
-            T_BIG;
-        }
+		for(int i = 0; i < 64; i += 8) {
+			INPUT_BIG;
+			P_BIG;
+			T_BIG;
+		}
+
 #undef buf
 #define buf(u) (u == 0 ? 0x80 : 0)
-        INPUT_BIG;
-        P_BIG;
-        T_BIG;
+		INPUT_BIG;
+		P_BIG;
+		T_BIG;
+
 #undef buf
 #define buf(u) (u == 6 ? 2 : 0)
-        INPUT_BIG;
-        PF_BIG;
-        T_BIG;
+		INPUT_BIG;
+		PF_BIG;
+		T_BIG;

 		#pragma unroll 16
-        for (int i = 0; i < 16; i++)
-            Hash[i] = SWAB32(h[i]);
-    }
+		for (int i = 0; i < 16; i++)
+			Hash[i] = cuda_swab32(h[i]);
+	}
 }


@ -740,18 +727,17 @@ __host__ void x13_hamsi512_cpu_init(int thr_id, int threads)
				@@ -740,18 +727,17 @@ __host__ void x13_hamsi512_cpu_init(int thr_id, int threads)

 __host__ void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-    const int threadsperblock = 256;
+	const int threadsperblock = 256;

-    // berechne wie viele Thread Blocks wir brauchen
-    dim3 grid((threads + threadsperblock-1)/threadsperblock);
-    dim3 block(threadsperblock);
+	// berechne wie viele Thread Blocks wir brauchen
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);

-    // Größe des dynamischen Shared Memory Bereichs
-    size_t shared_size = 0;
+	// Größe des dynamischen Shared Memory Bereichs
+	size_t shared_size = 0;

-//    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
+	// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);

-    x13_hamsi512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
-    MyStreamSynchronize(NULL, order, thr_id);
+	x13_hamsi512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
+	MyStreamSynchronize(NULL, order, thr_id);
 }
-
--- a/x13/x13.cu
+++ b/x13/x13.cu
@ -20,10 +20,9 @@ extern "C"
				@@ -20,10 +20,9 @@ extern "C"
 #include "sph/sph_fugue.h"

 #include "miner.h"
-}

-#include <stdint.h>
-#include <cuda_helper.h>
+#include "cuda_helper.h"
+}

 // aus cpu-miner.c
 extern int device_map[8];
@ -73,9 +72,9 @@ extern void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNoun
				@@ -73,9 +72,9 @@ extern void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNoun
 extern void x13_fugue512_cpu_init(int thr_id, int threads);
 extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);

 extern void quark_compactTest_cpu_init(int thr_id, int threads);
 extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, 
@ -194,7 +193,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
				@@ -194,7 +193,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
 		x11_echo512_cpu_init(thr_id, throughput);
 		x13_hamsi512_cpu_init(thr_id, throughput);
 		x13_fugue512_cpu_init(thr_id, throughput);
-		quark_check_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);

 		init[thr_id] = true;
 	}
@ -204,7 +203,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
				@@ -204,7 +203,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);

 	quark_blake512_cpu_setBlock_80((void*)endiandata);
-	quark_check_cpu_setTarget(ptarget);
+	cuda_check_cpu_setTarget(ptarget);

 	do {
 		uint32_t foundNonce;
@ -225,7 +224,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
				@@ -225,7 +224,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
 		x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

 		// Scan nach Gewinner Hashes auf der GPU
-		foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
--- a/x15/cuda_x14_shabal512.cu
+++ b/x15/cuda_x14_shabal512.cu
@ -1,26 +1,10 @@
				@@ -1,26 +1,10 @@
 /*
 * Shabal-512 for X14/X15 (STUB)
 */
-#include <stdint.h>
-#include <cuda_runtime.h>
+#include "cuda_helper.h"

 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-
-#define SPH_C64(x)    ((uint64_t)(x ## ULL))
-#define SPH_C32(x)    ((uint32_t)(x ## U))
-#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
-
-#define SWAB32(x) ( __byte_perm(x, x, 0x0123) )
-
-#if __CUDA_ARCH__ < 350
-	// Kepler (Compute 3.0)
-	#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
-#else
-	// Kepler (Compute 3.5)
-	#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
-#endif
-
 /* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */
 /*
 * Shabal implementation.
--- a/x15/cuda_x15_whirlpool.cu
+++ b/x15/cuda_x15_whirlpool.cu
@ -4,8 +4,8 @@
				@@ -4,8 +4,8 @@
 * tpruvot@github
 */
 #include <stdio.h>
-#include <stdint.h>
-#include <cuda_helper.h>
+
+#include "cuda_helper.h"

 #define NULLTEST 0

@ -14,8 +14,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
				@@ -14,8 +14,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
 #define SPH_64 (1)
 #define SPH_SMALL_FOOTPRINT_WHIRLPOOL (1)

-#define SPH_C64(x)    ((uint64_t)(x ## ULL))
-
 // defined in cuda_helper.h
 #define SPH_ROTL64(x,n) ROTL64(x,n)

--- a/x15/x14.cu
+++ b/x15/x14.cu
@ -22,10 +22,9 @@ extern "C" {
				@@ -22,10 +22,9 @@ extern "C" {
 #include "sph/sph_shabal.h"

 #include "miner.h"
-}

-#include <stdint.h>
-#include <cuda_helper.h>
+#include "cuda_helper.h"
+}

 // from cpu-miner.c
 extern int device_map[8];
@ -77,9 +76,9 @@ extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNoun
				@@ -77,9 +76,9 @@ extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNoun
 extern void x14_shabal512_cpu_init(int thr_id, int threads);
 extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);

 extern void quark_compactTest_cpu_init(int thr_id, int threads);
 extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
@ -203,7 +202,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
				@@ -203,7 +202,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
 		x13_fugue512_cpu_init(thr_id, throughput);
 		x14_shabal512_cpu_init(thr_id, throughput);

-		quark_check_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}

@ -211,7 +210,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
				@@ -211,7 +210,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);

 	quark_blake512_cpu_setBlock_80((void*)endiandata);
-	quark_check_cpu_setTarget(ptarget);
+	cuda_check_cpu_setTarget(ptarget);

 	do {
 		int order = 0;
@ -230,7 +229,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
				@@ -230,7 +229,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
 		x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

-		uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		if (foundNonce != 0xffffffff)
 		{
 			/* check now with the CPU to confirm */
--- a/x15/x15.cu
+++ b/x15/x15.cu
@ -23,10 +23,9 @@ extern "C" {
				@@ -23,10 +23,9 @@ extern "C" {
 #include "sph/sph_whirlpool.h"

 #include "miner.h"
-}

-#include <stdint.h>
-#include <cuda_helper.h>
+#include "cuda_helper.h"
+}

 // to test gpu hash on a null buffer
 #define NULLTEST 0
@ -84,9 +83,9 @@ extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNou
				@@ -84,9 +83,9 @@ extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNou
 extern void x15_whirlpool_cpu_init(int thr_id, int threads);
 extern void x15_whirlpool_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);

 extern void quark_compactTest_cpu_init(int thr_id, int threads);
 extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
@ -231,7 +230,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
				@@ -231,7 +230,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 		x14_shabal512_cpu_init(thr_id, throughput);
 		x15_whirlpool_cpu_init(thr_id, throughput);

-		quark_check_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}

@ -239,7 +238,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
				@@ -239,7 +238,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);

 	quark_blake512_cpu_setBlock_80((void*)endiandata);
-	quark_check_cpu_setTarget(ptarget);
+	cuda_check_cpu_setTarget(ptarget);

 	do {
 		int order = 0;
@ -266,7 +265,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
				@@ -266,7 +265,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 		print_hash((unsigned char*)buf); printf("\n");
 #endif
 		/* Scan with GPU */
-		uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

 		if (foundNonce != 0xffffffff)
 		{