Remove defines duplicated across the .cu files; use the ones in cuda_helper.h

Also add a cudaDeviceReset() call on Ctrl+C, for nvprof.
10 years ago · d9ea5f72ce
44 changed files with 1129 additions and 1619 deletions
--- a/JHA/cuda_jha_compactionTest.cu
+++ b/JHA/cuda_jha_compactionTest.cu
@ -1,11 +1,8 @@
				@@ -1,11 +1,8 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-#include "sm_30_intrinsics.h"
-
 #include <stdio.h>
 #include <memory.h>
-#include <stdint.h>
+
+#include "cuda_helper.h"
+#include <sm_30_intrinsics.h>

 // aus cpu-miner.c
 extern int device_map[8];
@ -60,7 +57,7 @@ __host__ void jackpot_compactTest_cpu_init(int thr_id, int threads)
				@@ -60,7 +57,7 @@ __host__ void jackpot_compactTest_cpu_init(int thr_id, int threads)
 	cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
 }

-#if __CUDA_ARCH__ < 300
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
 /**
 * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
 */
--- a/JHA/cuda_jha_keccak512.cu
+++ b/JHA/cuda_jha_keccak512.cu
@ -1,16 +1,7 @@
				@@ -1,16 +1,7 @@
-
-
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
+#include "cuda_helper.h"

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
@ -18,28 +9,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
				@@ -18,28 +9,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
 __constant__ uint64_t c_State[25];
 __constant__ uint32_t c_PaddedMessage[18];

-static __device__ uint32_t cuda_swab32(uint32_t x)
-{
-	return __byte_perm(x, 0, 0x0123);
-}
-
-// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
-#if __CUDA_ARCH__ >= 350
-__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
-    uint2 result;
-    if(offset >= 32) {
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-    } else {
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-    }
-    return  __double_as_longlong(__hiloint2double(result.y, result.x));
-}
-#else
-#define ROTL64(x, n)        (((x) << (n)) | ((x) >> (64 - (n))))
-#endif
-
 #define U32TO64_LE(p) \
    (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))

--- a/JHA/jackpotcoin.cu
+++ b/JHA/jackpotcoin.cu
@ -1,4 +1,3 @@
				@@ -1,4 +1,3 @@
-
 extern "C"
 {
 #include "sph/sph_keccak.h"
@ -7,10 +6,9 @@ extern "C"
				@@ -7,10 +6,9 @@ extern "C"
 #include "sph/sph_jh.h"
 #include "sph/sph_skein.h"
 #include "miner.h"
+#include "cuda_helper.h"
 }

-#include <stdint.h>
-
 // aus cpu-miner.c
 extern int device_map[8];

@ -33,9 +31,9 @@ extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounc
				@@ -33,9 +31,9 @@ extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounc
 extern void quark_skein512_cpu_init(int thr_id, int threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);

 extern void jackpot_compactTest_cpu_init(int thr_id, int threads);
 extern void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, 
@ -121,7 +119,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
				@@ -121,7 +119,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 		quark_groestl512_cpu_init(thr_id, throughput);
 		quark_jh512_cpu_init(thr_id, throughput);
 		quark_skein512_cpu_init(thr_id, throughput);
-		quark_check_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);
 		cudaMalloc(&d_jackpotNonces[thr_id], sizeof(uint32_t)*throughput*2);
 		cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput*2);
 		cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput*2);
@ -134,7 +132,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
				@@ -134,7 +132,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);

 	jackpot_keccak512_cpu_setBlock((void*)endiandata, 80);
-	quark_check_cpu_setTarget(ptarget);
+	cuda_check_cpu_setTarget(ptarget);

 	do {
 		int order = 0;
@ -214,14 +212,15 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
				@@ -214,14 +212,15 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 		}

 		// Scan nach Gewinner Hashes auf der GPU
-		uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
+			unsigned int rounds;
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);

 			// diese jackpothash Funktion gibt die Zahl der Runden zurück
-			unsigned int rounds = jackpothash(vhash64, endiandata);
+			rounds = jackpothash(vhash64, endiandata);

 			if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {

--- a/bitslice_transformations_quad.cu
+++ b/bitslice_transformations_quad.cu
@ -1,5 +1,4 @@
				@@ -1,5 +1,4 @@
-
-#if __CUDA_ARCH__ < 300
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
 /**
 * __shfl() returns the value of var held by the thread whose ID is given by srcLane.
 * If srcLane is outside the range 0..width-1, the thread's own value of var is returned.
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -175,7 +175,8 @@ copy "$(CudaToolkitBinDir)\cudart32*.dll" "$(OutDir)"</Command>
				@@ -175,7 +175,8 @@ copy "$(CudaToolkitBinDir)\cudart32*.dll" "$(OutDir)"</Command>
      <PtxAsOptionV>true</PtxAsOptionV>
      <Keep>false</Keep>
      <CodeGeneration>compute_50,sm_50</CodeGeneration>
-      <Defines></Defines>
+      <Defines>
+      </Defines>
    </CudaCompile>
    <CudaLink>
      <GPUDebugInfo>false</GPUDebugInfo>
@ -312,6 +313,9 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
				@@ -312,6 +313,9 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
    <ClInclude Include="uint256.h" />
  </ItemGroup>
  <ItemGroup>
+    <CudaCompile Include="bitslice_transformations_quad.cu">
+      <ExcludedFromBuild>true</ExcludedFromBuild>
+    </CudaCompile>
    <CudaCompile Include="cuda_fugue256.cu">
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalOptions)</AdditionalOptions>
@ -336,6 +340,9 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
				@@ -336,6 +340,9 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(AdditionalOptions)</AdditionalOptions>
    </CudaCompile>
+    <CudaCompile Include="groestl_functions_quad.cu">
+      <ExcludedFromBuild>true</ExcludedFromBuild>
+    </CudaCompile>
    <CudaCompile Include="heavy\cuda_blake512.cu">
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalOptions)</AdditionalOptions>
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -391,5 +391,11 @@
				@@ -391,5 +391,11 @@
    <CudaCompile Include="x15\cuda_x15_whirlpool.cu">
      <Filter>Source Files\CUDA\x15</Filter>
    </CudaCompile>
+    <CudaCompile Include="groestl_functions_quad.cu">
+      <Filter>Source Files\CUDA</Filter>
+    </CudaCompile>
+    <CudaCompile Include="bitslice_transformations_quad.cu">
+      <Filter>Source Files\CUDA</Filter>
+    </CudaCompile>
  </ItemGroup>
 </Project>
--- a/cuda_fugue256.cu
+++ b/cuda_fugue256.cu
@ -1,12 +1,11 @@
				@@ -1,12 +1,11 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

 #include "sph/sph_fugue.h"

+#include "cuda_helper.h"
+#include <host_defines.h>
+
 #define USE_SHARED 1

 // aus cpu-miner.c
@ -15,14 +14,6 @@ extern int device_map[8];
				@@ -15,14 +14,6 @@ extern int device_map[8];
 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
-
-// schon in sph_fugue.h definiert
-//#define SPH_C32(x)	((uint32_t)(x ## U))
-
 uint32_t *d_fugue256_hashoutput[8];
 uint32_t *d_resultNonce[8];

--- a/cuda_groestlcoin.cu
+++ b/cuda_groestlcoin.cu
@ -1,23 +1,17 @@
				@@ -1,23 +1,17 @@
 // Auf Groestlcoin spezialisierte Version von Groestl inkl. Bitslice

-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

+#include "cuda_helper.h"
+#include <host_defines.h>
+
 // aus cpu-miner.c
 extern int device_map[8];

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned char uint8_t;
-typedef unsigned short uint16_t;
-typedef unsigned int uint32_t;
-
 // diese Struktur wird in der Init Funktion angefordert
 static cudaDeviceProp props[8];

@ -31,10 +25,10 @@ __constant__ uint32_t groestlcoin_gpu_msg[32];
				@@ -31,10 +25,10 @@ __constant__ uint32_t groestlcoin_gpu_msg[32];
 #include "groestl_functions_quad.cu"
 #include "bitslice_transformations_quad.cu"

-#define SWAB32(x)        ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
+#define SWAB32(x) cuda_swab32(x)

-__global__ void __launch_bounds__(256, 4)
- groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resNounce)
+__global__ __launch_bounds__(256, 4)
+void groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resNounce)
 {
    // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
    int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4;
--- a/cuda_helper.h
+++ b/cuda_helper.h
@ -1,33 +1,78 @@
				@@ -1,33 +1,78 @@
 #ifndef CUDA_HELPER_H
 #define CUDA_HELPER_H

+#include <cuda.h>
 #include <cuda_runtime.h>

-static __device__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
-{
-#if __CUDA_ARCH__ >= 130
-    return __double_as_longlong(__hiloint2double(HI, LO));
+#if defined(_MSC_VER)
+/* reduce warnings */
+#include <device_functions.h>
+#include <device_launch_parameters.h>
+#endif
+
+#include <stdint.h>
+
+extern __device__ __device_builtin__ void __syncthreads(void);
+
+#ifndef __CUDA_ARCH__
+// define blockDim and threadIdx for host
+extern const dim3 blockDim;
+extern const uint3 threadIdx;
+#endif
+
+#ifndef SPH_C32
+#define SPH_C32(x) ((uint32_t)(x ## U))
+#endif
+
+#ifndef SPH_C64
+#define SPH_C64(x) ((uint64_t)(x ## ULL))
+#endif
+
+#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
+
+#if __CUDA_ARCH__ < 350
+// Kepler (Compute 3.0)
+#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
 #else
-	return (unsigned long long)LO | (((unsigned long long)HI) << 32);
+// Kepler (Compute 3.5, 5.0)
+#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
 #endif
-}

-// das Hi Word aus einem 64 Bit Typen extrahieren
-static __device__ uint32_t HIWORD(const uint64_t &x) {
+__device__ __forceinline__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
+{
 #if __CUDA_ARCH__ >= 130
-	return (uint32_t)__double2hiint(__longlong_as_double(x));
+	return __double_as_longlong(__hiloint2double(HI, LO));
 #else
-	return (uint32_t)(x >> 32);
+	return (unsigned long long)LO | (((unsigned long long)HI) << 32);
 #endif
 }

 // das Hi Word in einem 64 Bit Typen ersetzen
-static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
+__device__ __forceinline__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
 	return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL);
 }

+// das Lo Word in einem 64 Bit Typen ersetzen
+__device__ __forceinline__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
+	return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
+}
+
+// Endian Drehung für 32 Bit Typen
+#ifdef __CUDA_ARCH__
+__device__ __forceinline__ uint32_t cuda_swab32(uint32_t x)
+{
+	/* device */
+	return __byte_perm(x, x, 0x0123);
+}
+#else
+	/* host */
+	#define cuda_swab32(x) \
+	((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \
+		(((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
+#endif
+
 // das Lo Word aus einem 64 Bit Typen extrahieren
-static __device__ uint32_t LOWORD(const uint64_t &x) {
+__device__ __forceinline__ uint32_t _LOWORD(const uint64_t &x) {
 #if __CUDA_ARCH__ >= 130
 	return (uint32_t)__double2loint(__longlong_as_double(x));
 #else
@ -35,25 +80,42 @@ static __device__ uint32_t LOWORD(const uint64_t &x) {
				@@ -35,25 +80,42 @@ static __device__ uint32_t LOWORD(const uint64_t &x) {
 #endif
 }

-// das Lo Word in einem 64 Bit Typen ersetzen
-static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
-	return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
+// das Hi Word aus einem 64 Bit Typen extrahieren
+__device__ __forceinline__ uint32_t _HIWORD(const uint64_t &x) {
+#if __CUDA_ARCH__ >= 130
+	return (uint32_t)__double2hiint(__longlong_as_double(x));
+#else
+	return (uint32_t)(x >> 32);
+#endif
 }

-// Endian Drehung für 32 Bit Typen
-static __device__ uint32_t cuda_swab32(uint32_t x)
+#ifdef __CUDA_ARCH__
+__device__ __forceinline__ uint64_t cuda_swab64(uint64_t x)
 {
-	return __byte_perm(x, x, 0x0123);
-}
+	// Input:       77665544 33221100
+	// Output:      00112233 44556677
+	uint64_t temp[2];
+	temp[0] = __byte_perm(_HIWORD(x), 0, 0x0123);
+	temp[1] = __byte_perm(_LOWORD(x), 0, 0x0123);

-// Endian Drehung für 64 Bit Typen
-static __device__ uint64_t cuda_swab64(uint64_t x) {
-    return MAKE_ULONGLONG(cuda_swab32(HIWORD(x)), cuda_swab32(LOWORD(x)));
+	return temp[0] | (temp[1]<<32);
 }
+#else
+	/* host */
+	#define cuda_swab64(x) \
+		((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \
+			(((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \
+			(((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \
+			(((uint64_t)(x) & 0x000000ff00000000ULL) >>  8) | \
+			(((uint64_t)(x) & 0x00000000ff000000ULL) <<  8) | \
+			(((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \
+			(((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \
+			(((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
+#endif

 // diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
 #if __CUDA_ARCH__ >= 350
-__forceinline__ __device__ uint64_t ROTR64(const uint64_t value, const int offset) {
+__device__ __forceinline__ uint64_t ROTR64(const uint64_t value, const int offset) {
 	uint2 result;
 	if(offset < 32) {
 		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
@ -70,7 +132,7 @@ __forceinline__ __device__ uint64_t ROTR64(const uint64_t value, const int offse
				@@ -70,7 +132,7 @@ __forceinline__ __device__ uint64_t ROTR64(const uint64_t value, const int offse

 // diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
 #if __CUDA_ARCH__ >= 350
-__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
+__device__ __forceinline__ uint64_t ROTL64(const uint64_t value, const int offset) {
 	uint2 result;
 	if(offset >= 32) {
 		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
--- a/cuda_myriadgroestl.cu
+++ b/cuda_myriadgroestl.cu
@ -1,23 +1,16 @@
				@@ -1,23 +1,16 @@
 // Auf Myriadcoin spezialisierte Version von Groestl inkl. Bitslice

-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

+#include "cuda_helper.h"
+
 // aus cpu-miner.c
 extern int device_map[8];

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned char uint8_t;
-typedef unsigned short uint16_t;
-typedef unsigned int uint32_t;
-
 // diese Struktur wird in der Init Funktion angefordert
 static cudaDeviceProp props[8];

--- a/cuda_nist5.cu
+++ b/cuda_nist5.cu
@ -1,4 +1,3 @@
				@@ -1,4 +1,3 @@
-
 extern "C"
 {
 #include "sph/sph_blake.h"
@ -7,10 +6,9 @@ extern "C"
				@@ -7,10 +6,9 @@ extern "C"
 #include "sph/sph_jh.h"
 #include "sph/sph_keccak.h"
 #include "miner.h"
+#include "cuda_helper.h"
 }

-#include <stdint.h>
-
 // aus cpu-miner.c
 extern int device_map[8];

@ -33,12 +31,12 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN
				@@ -33,12 +31,12 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN
 extern void quark_skein512_cpu_init(int thr_id, int threads);
 extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);

 // Original nist5hash Funktion aus einem miner Quelltext
-inline void nist5hash(void *state, const void *input)
+extern "C" void nist5hash(void *state, const void *input)
 {
    sph_blake512_context ctx_blake;
    sph_groestl512_context ctx_groestl;
@ -104,7 +102,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
				@@ -104,7 +102,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
 		quark_jh512_cpu_init(thr_id, throughput);
 		quark_keccak512_cpu_init(thr_id, throughput);
 		quark_skein512_cpu_init(thr_id, throughput);
-		quark_check_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}

@ -113,28 +111,20 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
				@@ -113,28 +111,20 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);

 	quark_blake512_cpu_setBlock_80((void*)endiandata);
-	quark_check_cpu_setTarget(ptarget);
+	cuda_check_cpu_setTarget(ptarget);

 	do {
 		int order = 0;

-		// erstes Blake512 Hash mit CUDA
+		// Hash with CUDA
 		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-
-		// das ist der unbedingte Branch für Groestl512
 		quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-
-		// das ist der unbedingte Branch für JH512
 		quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-
-		// das ist der unbedingte Branch für Keccak512
 		quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-
-		// das ist der unbedingte Branch für Skein512
 		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

 		// Scan nach Gewinner Hashes auf der GPU
-		uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
--- a/groestl_functions_quad.cu
+++ b/groestl_functions_quad.cu
@ -1,3 +1,4 @@
				@@ -1,3 +1,4 @@
+#include "cuda_helper.h"

 __device__ __forceinline__ void G256_Mul2(uint32_t *regs)
 {
--- a/heavy/cuda_blake512.cu
+++ b/heavy/cuda_blake512.cu
@ -1,14 +1,7 @@
				@@ -1,14 +1,7 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
+#include "cuda_helper.h"

 // globaler Speicher für alle HeftyHashes aller Threads
 extern uint32_t *d_heftyHashes[8];
@ -20,7 +13,6 @@ uint32_t *d_hash5output[8];
				@@ -20,7 +13,6 @@ uint32_t *d_hash5output[8];
 // die Message (112 bzw. 116 Bytes) mit Padding zur Berechnung auf der GPU
 __constant__ uint64_t c_PaddedMessage[16]; // padded message (80/84+32 bytes + padding)

-#include "cuda_helper.h"

 // ---------------------------- BEGIN CUDA blake512 functions ------------------------------------

@ -46,21 +38,9 @@ const uint8_t host_sigma[16][16] =
				@@ -46,21 +38,9 @@ const uint8_t host_sigma[16][16] =
  { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
 };

-// Diese Makros besser nur für Compile Time Konstanten verwenden. Sie sind langsam.
-#define SWAP32(x) \
-    ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u)   | \
-      (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
-
-// Diese Makros besser nur für Compile Time Konstanten verwenden. Sie sind langsam.
-#define SWAP64(x) \
-    ((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \
-                (((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \
-                (((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \
-                (((uint64_t)(x) & 0x000000ff00000000ULL) >>  8) | \
-                (((uint64_t)(x) & 0x00000000ff000000ULL) <<  8) | \
-                (((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \
-                (((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \
-                (((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
+/* in cuda_helper */
+#define SWAP32(x) cuda_swab32(x)
+#define SWAP64(x) cuda_swab64(x)

 __constant__ uint64_t c_SecondRound[15];

--- a/heavy/cuda_combine.cu
+++ b/heavy/cuda_combine.cu
@ -1,9 +1,4 @@
				@@ -1,9 +1,4 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
-// Folgende Definitionen später durch header ersetzen
-typedef unsigned int uint32_t;
+#include "cuda_helper.h"

 // globaler Speicher für unsere Ergebnisse
 uint32_t *d_hashoutput[8];
--- a/heavy/cuda_groestl512.cu
+++ b/heavy/cuda_groestl512.cu
@ -1,14 +1,7 @@
				@@ -1,14 +1,7 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
+#include "cuda_helper.h"

 // globaler Speicher für alle HeftyHashes aller Threads
 extern uint32_t *d_heftyHashes[8];
@ -802,7 +795,6 @@ __host__ void groestl512_cpu_setBlock(void *data, int len)
				@@ -802,7 +795,6 @@ __host__ void groestl512_cpu_setBlock(void *data, int len)
 	cudaMemcpyToSymbol(	groestl_gpu_msg,
 						msgBlock,
 						128);
-	
 	BLOCKSIZE = len;
 }

--- a/heavy/cuda_hefty1.cu
+++ b/heavy/cuda_hefty1.cu
@ -1,10 +1,9 @@
				@@ -1,10 +1,9 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

+#include "cuda_helper.h"
+#include <device_functions.h>
+
 #define USE_SHARED 1

 // aus cpu-miner.c
@ -13,11 +12,6 @@ extern int device_map[8];
				@@ -13,11 +12,6 @@ extern int device_map[8];
 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned int uint32_t;
-typedef unsigned char uint8_t;
-typedef unsigned short uint16_t;
-
 // diese Struktur wird in der Init Funktion angefordert
 static cudaDeviceProp props[8];

--- a/heavy/cuda_keccak512.cu
+++ b/heavy/cuda_keccak512.cu
@ -1,14 +1,7 @@
				@@ -1,14 +1,7 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
+#include "cuda_helper.h"

 // globaler Speicher für alle HeftyHashes aller Threads
 extern uint32_t *d_heftyHashes[8];
@ -82,7 +75,7 @@ keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_const
				@@ -82,7 +75,7 @@ keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_const

    /* absorb input */
    #pragma unroll 9
-    for (i = 0; i < 72 / 8; i++, in += 2)
+    for (i = 0; i < 9 /* 72/8 */; i++, in += 2)
        s[i] ^= U32TO64_LE(in);
    
    for (i = 0; i < 24; i++) {
--- a/heavy/cuda_sha256.cu
+++ b/heavy/cuda_sha256.cu
@ -1,12 +1,7 @@
				@@ -1,12 +1,7 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned int uint32_t;
+#include "cuda_helper.h"

 // globaler Speicher für alle HeftyHashes aller Threads
 extern uint32_t *d_heftyHashes[8];
--- a/heavy/heavy.cu
+++ b/heavy/heavy.cu
@ -1,7 +1,3 @@
				@@ -1,7 +1,3 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>
 #include <string.h>
@ -34,6 +30,8 @@
				@@ -34,6 +30,8 @@
 #include "heavy/cuda_blake512.h"
 #include "heavy/cuda_combine.h"

+#include "cuda_helper.h"
+
 extern uint32_t *d_hash2output[8];
 extern uint32_t *d_hash3output[8];
 extern uint32_t *d_hash4output[8];
--- a/miner.h
+++ b/miner.h
@ -355,6 +355,7 @@ void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
				@@ -355,6 +355,7 @@ void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
 void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
 void groestlhash(void *state, const void *input);
 void myriadhash(void *state, const void *input);
+void nist5hash(void *state, const void *input);
 void quarkhash(void *state, const void *input);
 void x11hash(void *output, const void *input);
 void x13hash(void *output, const void *input);
--- a/quark/animecoin.cu
+++ b/quark/animecoin.cu
@ -1,4 +1,3 @@
				@@ -1,4 +1,3 @@
-
 extern "C"
 {
 #include "sph/sph_blake.h"
@ -8,10 +7,9 @@ extern "C"
				@@ -8,10 +7,9 @@ extern "C"
 #include "sph/sph_jh.h"
 #include "sph/sph_keccak.h"
 #include "miner.h"
+#include "cuda_helper.h"
 }

-#include <stdint.h>
-
 // aus cpu-miner.c
 extern int device_map[8];

@ -45,9 +43,9 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN
				@@ -45,9 +43,9 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN
 extern void quark_jh512_cpu_init(int thr_id, int threads);
 extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);

 extern void quark_compactTest_cpu_init(int thr_id, int threads);
 extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
@ -189,18 +187,21 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
				@@ -189,18 +187,21 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,

 		// Konstanten kopieren, Speicher belegen
 		cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
+
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_groestl512_cpu_init(thr_id, throughput);
 		quark_skein512_cpu_init(thr_id, throughput);
 		quark_bmw512_cpu_init(thr_id, throughput);
 		quark_keccak512_cpu_init(thr_id, throughput);
 		quark_jh512_cpu_init(thr_id, throughput);
-		quark_check_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);
 		quark_compactTest_cpu_init(thr_id, throughput);
+
 		cudaMalloc(&d_animeNonces[thr_id], sizeof(uint32_t)*throughput);
 		cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
 		cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
 		cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
+
 		init[thr_id] = true;
 	}

@ -209,7 +210,7 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
				@@ -209,7 +210,7 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);

 	quark_bmw512_cpu_setBlock_80((void*)endiandata);
-	quark_check_cpu_setTarget(ptarget);
+	cuda_check_cpu_setTarget(ptarget);

 	do {
 		int order = 0;
@ -265,7 +266,7 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
				@@ -265,7 +266,7 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
 		quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);

 		// Scan nach Gewinner Hashes auf der GPU
-		uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
--- a/quark/cuda_bmw512.cu
+++ b/quark/cuda_bmw512.cu
@ -1,140 +1,9 @@
				@@ -1,140 +1,9 @@
 #if 1

-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-
-// Endian Drehung für 32 Bit Typen
-/*
-static __device__ uint32_t cuda_swab32(uint32_t x)
-{
-    return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u)
-          | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu));
-}
-*/
-static __device__ uint32_t cuda_swab32(uint32_t x)
-{
-	return __byte_perm(x, 0, 0x0123);
-}
-
-// das Hi Word in einem 64 Bit Typen ersetzen
-static __device__ unsigned long long REPLACE_HIWORD(const unsigned long long &x, const uint32_t &y) {
-	return (x & 0xFFFFFFFFULL) | (((unsigned long long)y) << 32ULL);
-}
-
-#if 0
-// Endian Drehung für 64 Bit Typen
-static __device__ unsigned long long cuda_swab64(unsigned long long x) {
-    uint32_t h = (x >> 32);
-    uint32_t l = (x & 0xFFFFFFFFULL);
-    return (((unsigned long long)cuda_swab32(l)) << 32) | ((unsigned long long)cuda_swab32(h));
-}
-
-// das Hi Word aus einem 64 Bit Typen extrahieren
-static __device__ uint32_t HIWORD(const unsigned long long &x) {
-#if __CUDA_ARCH__ >= 130
-	return (uint32_t)__double2hiint(__longlong_as_double(x));
-#else
-	return (uint32_t)(x >> 32);
-#endif
-}
-
-// das Lo Word aus einem 64 Bit Typen extrahieren
-static __device__ uint32_t LOWORD(const unsigned long long &x) {
-#if __CUDA_ARCH__ >= 130
-	return (uint32_t)__double2loint(__longlong_as_double(x));
-#else
-	return (uint32_t)(x & 0xFFFFFFFFULL);
-#endif
-}
-
-static __device__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
-{
-#if __CUDA_ARCH__ >= 130
-    return __double_as_longlong(__hiloint2double(HI, LO));
-#else
-	return (unsigned long long)LO | (((unsigned long long)HI) << 32ULL);
-#endif
-}
-
-// das Lo Word in einem 64 Bit Typen ersetzen
-static __device__ unsigned long long REPLACE_LOWORD(const unsigned long long &x, const uint32_t &y) {
-	return (x & 0xFFFFFFFF00000000ULL) | ((unsigned long long)y);
-}
-#endif
-
-// der Versuch, einen Wrapper für einen aus 32 Bit Registern zusammengesetzten uin64_t Typen zu entferfen...
-#if 1
-typedef unsigned long long uint64_t;
-#else
-typedef class uint64
-{
-public:
-	__device__ uint64()
-	{
-	}
-	__device__ uint64(unsigned long long init)
-	{
-		val = make_uint2( LOWORD(init), HIWORD(init) );
-	}
-	__device__ uint64(uint32_t lo, uint32_t hi)
-	{
-		val = make_uint2( lo, hi );
-	}
-	__device__ const uint64 operator^(uint64 const& rhs) const
-	{
-		return uint64(val.x ^ rhs.val.x, val.y ^ rhs.val.y);
-	}
-	__device__ const uint64 operator|(uint64 const& rhs) const
-	{
-		return uint64(val.x | rhs.val.x, val.y | rhs.val.y);
-	}
-	__device__ const uint64 operator+(unsigned long long const& rhs) const
-	{
-		return *this+uint64(rhs);
-	}
-	__device__ const uint64 operator+(uint64 const& rhs) const
-	{
-		uint64 res;
-		asm ("add.cc.u32      %0, %2, %4;\n\t"
-			 "addc.cc.u32     %1, %3, %5;\n\t"
-			 : "=r"(res.val.x), "=r"(res.val.y)
-			 : "r"(    val.x), "r"(    val.y),
-			   "r"(rhs.val.x), "r"(rhs.val.y));
-		return res;
-	}
-	__device__ const uint64 operator-(uint64 const& rhs) const
-	{
-		uint64 res;
-		asm ("sub.cc.u32      %0, %2, %4;\n\t"
-			 "subc.cc.u32     %1, %3, %5;\n\t"
-			 : "=r"(res.val.x), "=r"(res.val.y)
-			 : "r"(    val.x), "r"(    val.y),
-			   "r"(rhs.val.x), "r"(rhs.val.y));
-		return res;
-	}
-	__device__ const uint64 operator<<(int n) const
-	{
-		return uint64(unsigned long long(*this)<<n);
-	}
-	__device__ const uint64 operator>>(int n) const
-	{
-		return uint64(unsigned long long(*this)>>n);
-	}
-	__device__ operator unsigned long long() const
-	{
-		return MAKE_ULONGLONG(val.x, val.y);
-	}
-	uint2 val;
-} uint64_t;
-#endif
+#include "cuda_helper.h"

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
@ -142,27 +11,9 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
				@@ -142,27 +11,9 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
 // die Message it Padding zur Berechnung auf der GPU
 __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)

-#define SPH_C64(x)    ((uint64_t)(x ## ULL))
-
 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
-#if __CUDA_ARCH__ >= 350
-__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
-    uint2 result;
-    if(offset >= 32) {
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-    } else {
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-    }
-    return  __double_as_longlong(__hiloint2double(result.y, result.x));
-}
-#else
-#define ROTL64(x, n)        (((x) << (n)) | ((x) >> (64 - (n))))
-#endif
 #define SHL(x, n)            ((x) << (n))
 #define SHR(x, n)            ((x) >> (n))

--- a/quark/cuda_checkhash.cu
+++ b/quark/cuda_checkhash.cu
@ -1,11 +1,8 @@
				@@ -1,11 +1,8 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
-#include <stdint.h>
 #include <memory.h>

+#include "cuda_helper.h"
+
 // Hash Target gegen das wir testen sollen
 __constant__ uint32_t pTarget[8];

@ -58,20 +55,20 @@ __global__ void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32
				@@ -58,20 +55,20 @@ __global__ void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32
 }

 // Setup-Funktionen
-__host__ void quark_check_cpu_init(int thr_id, int threads)
+__host__ void cuda_check_cpu_init(int thr_id, int threads)
 {
    cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t));
    cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t));
 }

 // Target Difficulty setzen
-__host__ void quark_check_cpu_setTarget(const void *ptarget)
+__host__ void cuda_check_cpu_setTarget(const void *ptarget)
 {
 	// die Message zur Berechnung auf der GPU
 	cudaMemcpyToSymbol( pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
 }

-__host__ uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
+__host__ uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
 {
 	uint32_t result = 0xffffffff;
 	cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
--- a/quark/cuda_jh512.cu
+++ b/quark/cuda_jh512.cu
@ -1,4 +1,4 @@
				@@ -1,4 +1,4 @@
-#include <stdint.h>
+#include "cuda_helper.h"

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
--- a/quark/cuda_quark_blake512.cu
+++ b/quark/cuda_quark_blake512.cu
@ -1,16 +1,11 @@
				@@ -1,16 +1,11 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

-#define USE_SHUFFLE 0
+#include "cuda_helper.h"

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
+#define ROTR(x,n) ROTR64(x,n)
+
+#define USE_SHUFFLE 0

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
@ -42,49 +37,8 @@ const uint8_t host_sigma[16][16] =
				@@ -42,49 +37,8 @@ const uint8_t host_sigma[16][16] =
  { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
 };

-// das Hi Word aus einem 64 Bit Typen extrahieren
-static __device__ uint32_t HIWORD(const uint64_t &x) {
-#if __CUDA_ARCH__ >= 130
-	return (uint32_t)__double2hiint(__longlong_as_double(x));
-#else
-	return (uint32_t)(x >> 32);
-#endif
-}
-
-// das Hi Word in einem 64 Bit Typen ersetzen
-static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
-	return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL);
-}
-
-// das Lo Word aus einem 64 Bit Typen extrahieren
-static __device__ uint32_t LOWORD(const uint64_t &x) {
-#if __CUDA_ARCH__ >= 130
-	return (uint32_t)__double2loint(__longlong_as_double(x));
-#else
-	return (uint32_t)(x & 0xFFFFFFFFULL);
-#endif
-}
-#if 0
-// das Lo Word in einem 64 Bit Typen ersetzen
-static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
-	return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
-}
-#endif
-
-__device__ __forceinline__ uint64_t SWAP64(uint64_t x)
-{
-	// Input:	77665544 33221100
-	// Output:	00112233 44556677
-	uint64_t temp[2];
-	temp[0] = __byte_perm(HIWORD(x), 0, 0x0123);
-	temp[1] = __byte_perm(LOWORD(x), 0, 0x0123);
-
-	return temp[0] | (temp[1]<<32);
-}
-
-__constant__ uint64_t c_u512[16];
-
-const uint64_t host_u512[16] =
+__device__ __constant__
+const uint64_t c_u512[16] =
 {
  0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, 
  0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
@ -96,24 +50,6 @@ const uint64_t host_u512[16] =
				@@ -96,24 +50,6 @@ const uint64_t host_u512[16] =
  0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
 };

-
-// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
-#if __CUDA_ARCH__ >= 350
-__forceinline__ __device__ uint64_t ROTR(const uint64_t value, const int offset) {
-    uint2 result;
-    if(offset < 32) {
-        asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-        asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-    } else {
-        asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-        asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-    }
-    return  __double_as_longlong(__hiloint2double(result.y, result.x));
-}
-#else
-#define ROTR(x, n)        (((x) >> (n)) | ((x) << (64 - (n))))
-#endif
-
 #define G(a,b,c,d,e)          \
    v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\
    v[d] = ROTR( v[d] ^ v[a],32);        \
@ -125,14 +61,14 @@ __forceinline__ __device__ uint64_t ROTR(const uint64_t value, const int offset)
				@@ -125,14 +61,14 @@ __forceinline__ __device__ uint64_t ROTR(const uint64_t value, const int offset)
    v[b] = ROTR( v[b] ^ v[c],11);


-__device__ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits )
+__device__ static
+void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits )
 {
    uint64_t v[16], m[16], i;

 #pragma unroll 16
-    for( i = 0; i < 16; ++i )
-    {
-        m[i] = SWAP64(block[i]);
+    for( i = 0; i < 16; ++i ) {
+        m[i] = cuda_swab64(block[i]);
    }

 #pragma unroll 8
@ -169,24 +105,8 @@ __device__ void quark_blake512_compress( uint64_t *h, const uint64_t *block, con
				@@ -169,24 +105,8 @@ __device__ void quark_blake512_compress( uint64_t *h, const uint64_t *block, con
    for( i = 0; i < 16; ++i )  h[i % 8] ^= v[i];
 }

-// Endian Drehung für 32 Bit Typen
-
-static __device__ uint32_t cuda_swab32(uint32_t x)
-{
-	return __byte_perm(x, 0, 0x0123);
-}
-
-/*
-// Endian Drehung für 64 Bit Typen
-static __device__ uint64_t cuda_swab64(uint64_t x) {
-    uint32_t h = (x >> 32);
-    uint32_t l = (x & 0xFFFFFFFFULL);
-    return (((uint64_t)cuda_swab32(l)) << 32) | ((uint64_t)cuda_swab32(h));
-}
-*/
-
-static __constant__ uint64_t d_constMem[8];
-static const uint64_t h_constMem[8] = {
+__device__ __constant__
+static const uint64_t d_constMem[8] = {
 	0x6a09e667f3bcc908ULL,
 	0xbb67ae8584caa73bULL,
 	0x3c6ef372fe94f82bULL,
@ -197,8 +117,8 @@ static const uint64_t h_constMem[8] = {
				@@ -197,8 +117,8 @@ static const uint64_t h_constMem[8] = {
 	0x5be0cd19137e2179ULL };

 // Hash-Padding
-static __constant__ uint64_t d_constHashPadding[8];
-static const uint64_t h_constHashPadding[8] = {
+__device__ __constant__
+static const uint64_t d_constHashPadding[8] = {
 	0x0000000000000080ull,
 	0,
 	0,
@ -208,7 +128,8 @@ static const uint64_t h_constHashPadding[8] = {
				@@ -208,7 +128,8 @@ static const uint64_t h_constHashPadding[8] = {
 	0,
 	0x0002000000000000ull };

-__global__ __launch_bounds__(256, 2) void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash)
+__global__ __launch_bounds__(256, 4)
+void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);

@ -224,70 +145,49 @@ __global__ __launch_bounds__(256, 2) void quark_blake512_gpu_hash_64(int threads
				@@ -224,70 +145,49 @@ __global__ __launch_bounds__(256, 2) void quark_blake512_gpu_hash_64(int threads
 	if (thread < threads)
 #endif
 	{
+		uint8_t i;
 		// bestimme den aktuellen Zähler
 		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);

 		int hashPosition = nounce - startNounce;
-		//uint64_t *inpHash = &g_hash[8 * hashPosition];
-		uint64_t *inpHash = &g_hash[hashPosition<<3];
+		uint64_t *inpHash = &g_hash[hashPosition<<3]; // hashPosition * 8
+
+		// 128 Byte für die Message
+		uint64_t buf[16];

 		// State vorbereiten
 		uint64_t h[8];
-		/*
-		h[0] = 0x6a09e667f3bcc908ULL;
-		h[1] = 0xbb67ae8584caa73bULL;
-		h[2] = 0x3c6ef372fe94f82bULL;
-		h[3] = 0xa54ff53a5f1d36f1ULL;
-		h[4] = 0x510e527fade682d1ULL;
-		h[5] = 0x9b05688c2b3e6c1fULL;
-		h[6] = 0x1f83d9abfb41bd6bULL;
-		h[7] = 0x5be0cd19137e2179ULL;
-		*/
 		#pragma unroll 8
-		for(int i=0;i<8;i++)
+		for (i=0;i<8;i++)
 			h[i] = d_constMem[i];

-		// 128 Byte für die Message
-		uint64_t buf[16];
-
 		// Message für die erste Runde in Register holen
 		#pragma unroll 8
-		for (int i=0; i < 8; ++i) buf[i] = inpHash[i];
-
-		/*
-		buf[ 8] = 0x0000000000000080ull;
-		buf[ 9] = 0;
-		buf[10] = 0;
-		buf[11] = 0;
-		buf[12] = 0;
-		buf[13] = 0x0100000000000000ull;
-		buf[14] = 0;
-		buf[15] = 0x0002000000000000ull;
-		*/
+		for (i=0; i < 8; ++i)
+			buf[i] = inpHash[i];
+
 		#pragma unroll 8
-		for(int i=0;i<8;i++)
+		for (i=0; i < 8; i++)
 			buf[i+8] = d_constHashPadding[i];

 		// die einzige Hashing-Runde
 		quark_blake512_compress( h, buf, c_sigma, c_u512, 512 );

-		// Hash rauslassen
 #if __CUDA_ARCH__ >= 130
 		// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind
 		uint32_t *outHash = (uint32_t*)&g_hash[8 * hashPosition];
 		#pragma unroll 8
-		for (int i=0; i < 8; ++i) {
-			outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) );
-			outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) );
+		for (i=0; i < 8; ++i) {
+			outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
+			outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
 		}
 #else
 		// in dieser Version passieren auch ein paar 64 Bit Shifts
 		uint64_t *outHash = &g_hash[8 * hashPosition];
 		#pragma unroll 8
-		for (int i=0; i < 8; ++i)
+		for (i=0; i < 8; ++i)
 		{
-			//outHash[i] = cuda_swab64( h[i] );
-			outHash[i] = SWAP64(h[i]);
+			outHash[i] = cuda_swab64(h[i]);
 		}
 #endif
 	}
@ -298,30 +198,21 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
				@@ -298,30 +198,21 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
+		// State vorbereiten
+		uint64_t h[8];
+		// 128 Byte für die Message
+		uint64_t buf[16];
+		uint8_t i;
 		// bestimme den aktuellen Zähler
 		uint32_t nounce = startNounce + thread;

-		// State vorbereiten
-		uint64_t h[8];
-		/*
-		h[0] = 0x6a09e667f3bcc908ULL;
-		h[1] = 0xbb67ae8584caa73bULL;
-		h[2] = 0x3c6ef372fe94f82bULL;
-		h[3] = 0xa54ff53a5f1d36f1ULL;
-		h[4] = 0x510e527fade682d1ULL;
-		h[5] = 0x9b05688c2b3e6c1fULL;
-		h[6] = 0x1f83d9abfb41bd6bULL;
-		h[7] = 0x5be0cd19137e2179ULL;
-		*/
 		#pragma unroll 8
-		for(int i=0;i<8;i++)
+		for(i=0;i<8;i++)
 			h[i] = d_constMem[i];
-		// 128 Byte für die Message
-		uint64_t buf[16];

 		// Message für die erste Runde in Register holen
 		#pragma unroll 16
-		for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i];
+		for (i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i];

 		// die Nounce durch die thread-spezifische ersetzen
 		buf[9] = REPLACE_HIWORD(buf[9], cuda_swab32(nounce));
@ -334,18 +225,16 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
				@@ -334,18 +225,16 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
 		// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind
 		uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
 		#pragma unroll 8
-		for (int i=0; i < 8; ++i) {
-			outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) );
-			outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) );
+		for (i=0; i < 8; ++i) {
+			outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
+			outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
 		}
 #else
 		// in dieser Version passieren auch ein paar 64 Bit Shifts
 		uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
 		#pragma unroll 8
-		for (int i=0; i < 8; ++i)
-		{
-			//outHash[i] = cuda_swab64( h[i] );
-			outHash[i] = SWAP64(h[i]);
+		for (i=0; i < 8; ++i) {
+			outHash[i] = cuda_swab64( h[i] );
 		}
 #endif
 	}
@ -362,21 +251,6 @@ __host__ void quark_blake512_cpu_init(int thr_id, int threads)
				@@ -362,21 +251,6 @@ __host__ void quark_blake512_cpu_init(int thr_id, int threads)
 						host_sigma,
 						sizeof(host_sigma),
 						0, cudaMemcpyHostToDevice);
-
-	cudaMemcpyToSymbol( c_u512,
-						host_u512,
-						sizeof(host_u512),
-						0, cudaMemcpyHostToDevice);
-
-	cudaMemcpyToSymbol( d_constMem,
-						h_constMem,
-						sizeof(h_constMem),
-						0, cudaMemcpyHostToDevice);
-
-	cudaMemcpyToSymbol( d_constHashPadding,
-						h_constHashPadding,
-						sizeof(h_constHashPadding),
-						0, cudaMemcpyHostToDevice);
 }

 // Blake512 für 80 Byte grosse Eingangsdaten
--- a/quark/cuda_quark_compactionTest.cu
+++ b/quark/cuda_quark_compactionTest.cu
@ -1,11 +1,8 @@
				@@ -1,11 +1,8 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-#include "sm_30_intrinsics.h"
-
 #include <stdio.h>
 #include <memory.h>
-#include <stdint.h>
+
+#include "cuda_helper.h"
+#include <sm_30_intrinsics.h>

 // aus cpu-miner.c
 extern int device_map[8];
--- a/quark/cuda_quark_groestl512.cu
+++ b/quark/cuda_quark_groestl512.cu
@ -1,23 +1,16 @@
				@@ -1,23 +1,16 @@
 // Auf QuarkCoin spezialisierte Version von Groestl inkl. Bitslice

-#include <cuda.h>
-#include <cuda_runtime.h>
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

+#include "cuda_helper.h"
+
 // aus cpu-miner.c
 extern int device_map[8];

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned char uint8_t;
-typedef unsigned short uint16_t;
-typedef unsigned int uint32_t;
-
 // diese Struktur wird in der Init Funktion angefordert
 static cudaDeviceProp props[8];

@ -25,8 +18,8 @@ static cudaDeviceProp props[8];
				@@ -25,8 +18,8 @@ static cudaDeviceProp props[8];
 #include "groestl_functions_quad.cu"
 #include "bitslice_transformations_quad.cu"

-__global__ void __launch_bounds__(256, 4)
- quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
+__global__ __launch_bounds__(256, 4)
+void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
 {
    // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
    int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
--- a/quark/cuda_quark_keccak512.cu
+++ b/quark/cuda_quark_keccak512.cu
@ -1,27 +1,19 @@
				@@ -1,27 +1,19 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
 #include <memory.h>

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
+#include "cuda_helper.h"

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-#include "cuda_helper.h"
-
 #define U32TO64_LE(p) \
    (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))

 #define U64TO32_LE(p, v) \
    *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32);

-static const uint64_t host_keccak_round_constants[24] = {
+__device__ __constant__
+static const uint64_t c_keccak_round_constants[24] = {
    0x0000000000000001ull, 0x0000000000008082ull,
    0x800000000000808aull, 0x8000000080008000ull,
    0x000000000000808bull, 0x0000000080000001ull,
@ -36,8 +28,6 @@ static const uint64_t host_keccak_round_constants[24] = {
				@@ -36,8 +28,6 @@ static const uint64_t host_keccak_round_constants[24] = {
    0x0000000080000001ull, 0x8000000080008008ull
 };

-__constant__ uint64_t c_keccak_round_constants[24];
-
 static __device__ __forceinline__ void
 keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) {
    size_t i;
@ -157,11 +147,6 @@ __global__ void quark_keccak512_gpu_hash_64(int threads, uint32_t startNounce, u
				@@ -157,11 +147,6 @@ __global__ void quark_keccak512_gpu_hash_64(int threads, uint32_t startNounce, u
 // Setup-Funktionen
 __host__ void quark_keccak512_cpu_init(int thr_id, int threads)
 {
-    // Kopiere die Hash-Tabellen in den GPU-Speicher
-    cudaMemcpyToSymbol( c_keccak_round_constants,
-                        host_keccak_round_constants,
-                        sizeof(host_keccak_round_constants),
-                        0, cudaMemcpyHostToDevice);
 }

 __host__ void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
--- a/quark/cuda_skein512.cu
+++ b/quark/cuda_skein512.cu
@ -1,16 +1,8 @@
				@@ -1,16 +1,8 @@
-#include <cuda.h>
-#include "cuda_runtime.h"
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
+#include <stdint.h>
 #include <memory.h>

-// Folgende Definitionen später durch header ersetzen
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
-
-#define SPH_C64(x)    ((uint64_t)(x ## ULL))
+#include "cuda_helper.h"

 // aus cpu-miner.c
 extern "C" extern int device_map[8];
@ -19,21 +11,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
				@@ -19,21 +11,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t

 // Take a look at: https://www.schneier.com/skein1.3.pdf

-#if __CUDA_ARCH__ >= 350
-__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
-    uint2 result;
-    if(offset >= 32) {
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-    } else {
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
-        asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
-    }
-    return  __double_as_longlong(__hiloint2double(result.y, result.x));
-}
-#else
-#define ROTL64(x, n)        (((x) << (n)) | ((x) >> (64 - (n))))
-#endif
 #define SHL(x, n)			((x) << (n))
 #define SHR(x, n)			((x) >> (n))

--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@ -1,4 +1,3 @@
				@@ -1,4 +1,3 @@
-
 extern "C"
 {
 #include "sph/sph_blake.h"
@ -8,9 +7,9 @@ extern "C"
				@@ -8,9 +7,9 @@ extern "C"
 #include "sph/sph_jh.h"
 #include "sph/sph_keccak.h"
 #include "miner.h"
-}

-#include <stdint.h>
+#include "cuda_helper.h"
+}

 // aus cpu-miner.c
 extern int device_map[8];
@ -45,9 +44,9 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN
				@@ -45,9 +44,9 @@ extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startN
 extern void quark_jh512_cpu_init(int thr_id, int threads);
 extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);

 extern void quark_compactTest_cpu_init(int thr_id, int threads);
 extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
@ -171,18 +170,21 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
				@@ -171,18 +170,21 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,

 		// Konstanten kopieren, Speicher belegen
 		cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
+
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_groestl512_cpu_init(thr_id, throughput);
 		quark_skein512_cpu_init(thr_id, throughput);
 		quark_bmw512_cpu_init(thr_id, throughput);
 		quark_keccak512_cpu_init(thr_id, throughput);
 		quark_jh512_cpu_init(thr_id, throughput);
-		quark_check_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);
 		quark_compactTest_cpu_init(thr_id, throughput);
+
 		cudaMalloc(&d_quarkNonces[thr_id], sizeof(uint32_t)*throughput);
 		cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
 		cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
 		cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
+
 		init[thr_id] = true;
 	}

@ -191,7 +193,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
				@@ -191,7 +193,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);

 	quark_blake512_cpu_setBlock_80((void*)endiandata);
-	quark_check_cpu_setTarget(ptarget);
+	cuda_check_cpu_setTarget(ptarget);

 	do {
 		int order = 0;
@ -247,7 +249,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
				@@ -247,7 +249,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
 		quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);

 		// Scan nach Gewinner Hashes auf der GPU
-		uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
--- a/util.c
+++ b/util.c
@ -1352,6 +1352,10 @@ void print_hash_tests(void)
				@@ -1352,6 +1352,10 @@ void print_hash_tests(void)
 	myriadhash(&hash[0], &buf[0]);
 	printf("\nmyriad:  "); print_hash(hash);

+	memset(hash, 0, sizeof hash);
+	nist5hash(&hash[0], &buf[0]);
+	printf("\nnist5:   "); print_hash(hash);
+
 	memset(hash, 0, sizeof hash);
 	quarkhash(&hash[0], &buf[0]);
 	printf("\nquark:   "); print_hash(hash);
--- a/x11/cuda_x11_cubehash512.cu
+++ b/x11/cuda_x11_cubehash512.cu
@ -1,30 +1,13 @@
				@@ -1,30 +1,13 @@
-#include <cuda_runtime.h>
+#include "cuda_helper.h"

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

 typedef unsigned char BitSequence;
-typedef unsigned long long DataLength;
-
-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
-
-#if 0
-__device__ static uint32_t cuda_swab32(uint32_t x)
-{
-	return __byte_perm(x, 0, 0x0123);
-}
-#endif
-
-typedef unsigned char BitSequence;
-typedef unsigned long long DataLength;

 #define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */
 #define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */

-typedef unsigned int uint32_t; /* must be exactly 32 bits */
-
 #define ROTATEUPWARDS7(a) (((a) << 7) | ((a) >> 25))
 #define ROTATEUPWARDS11(a) (((a) << 11) | ((a) >> 21))
 #define SWAP(a,b) { uint32_t u = a; a = b; b = u; }
--- a/x11/cuda_x11_echo.cu
+++ b/x11/cuda_x11_echo.cu
@ -1,33 +1,7 @@
				@@ -1,33 +1,7 @@
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include "device_launch_parameters.h"
-
 #include <stdio.h>
-#include <stdint.h>
 #include <memory.h>

-// das Hi Word aus einem 64 Bit Typen extrahieren
-#if 0
-static __device__ uint32_t HIWORD(const uint64_t &x) {
-#if __CUDA_ARCH__ >= 130
-	return (uint32_t)__double2hiint(__longlong_as_double(x));
-#else
-	return (uint32_t)(x >> 32);
-#endif
-}
-
-// das Lo Word aus einem 64 Bit Typen extrahieren
-static __device__ uint32_t LOWORD(const uint64_t &x) {
-#if __CUDA_ARCH__ >= 130
-	return (uint32_t)__double2loint(__longlong_as_double(x));
-#else
-	return (uint32_t)(x & 0xFFFFFFFFULL);
-#endif
-}
-#endif
-
-#define SPH_C64(x)    ((uint64_t)(x ## ULL))
-#define SPH_C32(x)    ((uint32_t)(x ## U))
+#include "cuda_helper.h"

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
--- a/x11/cuda_x11_luffa512.cu
+++ b/x11/cuda_x11_luffa512.cu
@ -18,28 +18,18 @@
				@@ -18,28 +18,18 @@
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

-#include <cuda_runtime.h>
+#include "cuda_helper.h"

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

 typedef unsigned char BitSequence;

-typedef unsigned char uint8_t;
-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
-
 typedef struct {
    uint32_t buffer[8]; /* Buffer to be hashed */
    uint32_t chainv[40];   /* Chaining values */
 } hashState;

- __device__ __forceinline__
-static uint32_t BYTES_SWAP32(uint32_t x)
-{
-	return __byte_perm(x, x, 0x0123);
-}
-
 #define MULT2(a,j)\
    tmp = a[7+(8*j)];\
    a[7+(8*j)] = a[6+(8*j)];\
@ -289,11 +279,11 @@ __device__ __forceinline__
				@@ -289,11 +279,11 @@ __device__ __forceinline__
 void Update512(hashState *state, const BitSequence *data)
 {
 #pragma unroll 8
-    for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)data)[i]);
+    for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)data)[i]);
    rnd512(state);

 #pragma unroll 8
-    for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+32))[i]);
+    for(int i=0;i<8;i++) state->buffer[i] = cuda_swab32(((uint32_t*)(data+32))[i]);
    rnd512(state);
 }

@ -321,7 +311,7 @@ void finalization512(hashState *state, uint32_t *b)
				@@ -321,7 +311,7 @@ void finalization512(hashState *state, uint32_t *b)
        for(j=0;j<5;j++) {
            b[i] ^= state->chainv[i+8*j];
        }
-        b[i] = BYTES_SWAP32((b[i]));
+        b[i] = cuda_swab32((b[i]));
    }

 #pragma unroll 8
@ -335,7 +325,7 @@ void finalization512(hashState *state, uint32_t *b)
				@@ -335,7 +325,7 @@ void finalization512(hashState *state, uint32_t *b)
        for(j=0;j<5;j++) {
            b[8+i] ^= state->chainv[i+8*j];
        }
-        b[8+i] = BYTES_SWAP32((b[8+i]));
+        b[8 + i] = cuda_swab32((b[8 + i]));
    }
 }

--- a/x11/cuda_x11_shavite512.cu
+++ b/x11/cuda_x11_shavite512.cu
@ -1,18 +1,13 @@
				@@ -1,18 +1,13 @@
-#include <stdint.h>
-#include <cuda_runtime.h>
+#include "cuda_helper.h"

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-typedef unsigned char BitSequence;
-typedef unsigned long long DataLength;
+//typedef unsigned char BitSequence;
+//typedef unsigned long long DataLength;

-#define SPH_C64(x)    ((uint64_t)(x ## ULL))
-#define SPH_C32(x)    ((uint32_t)(x ## U))
-#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
-
-static __constant__ uint32_t d_ShaviteInitVector[16];
-static const uint32_t h_ShaviteInitVector[] = {
+__device__ __constant__
+static const uint32_t d_ShaviteInitVector[16] = {
 	SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC),
 	SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC),
 	SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47),
@ -1352,11 +1347,6 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
				@@ -1352,11 +1347,6 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
 __host__ void x11_shavite512_cpu_init(int thr_id, int threads)
 {
 	aes_cpu_init();
-
-	cudaMemcpyToSymbol( d_ShaviteInitVector,
-                        h_ShaviteInitVector,
-                        sizeof(h_ShaviteInitVector),
-                        0, cudaMemcpyHostToDevice);
 }

 __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
@ -1373,4 +1363,3 @@ __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t start
				@@ -1373,4 +1363,3 @@ __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t start
 	x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 	MyStreamSynchronize(NULL, order, thr_id);
 }
-
--- a/x11/cuda_x11_simd512.cu
+++ b/x11/cuda_x11_simd512.cu
@ -7,29 +7,17 @@
				@@ -7,29 +7,17 @@

 #define TPB 256

+#include "cuda_helper.h"
+
 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-typedef unsigned int uint32_t;
-typedef unsigned long long uint64_t;
-
 int *d_state[8];
 uint4 *d_temp4[8];

 // texture bound to d_temp4[thr_id], for read access in Compaction kernel
 texture<uint4, 1, cudaReadModeElementType> texRef1D_128;

-#define C32(x)    ((uint32_t)(x ## U))
-#define T32(x) ((x) & C32(0xFFFFFFFF))
-
-#if __CUDA_ARCH__ < 350 
-    // Kepler (Compute 3.0)
-    #define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n))))
-#else
-    // Kepler (Compute 3.5)
-    #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
-#endif
-
 __device__ __constant__
 const uint32_t c_IV_512[32] = {
  0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
@ -166,7 +154,7 @@ X(j) = (u-v) << (2*n); \
				@@ -166,7 +154,7 @@ X(j) = (u-v) << (2*n); \
 #undef BUTTERFLY
 }

-#if __CUDA_ARCH__ < 300
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
 /**
 * __shfl() returns the value of var held by the thread whose ID is given by srcLane.
 * If srcLane is outside the range 0..width-1, the thread's own value of var is returned.
@ -177,7 +165,7 @@ X(j) = (u-v) << (2*n); \
				@@ -177,7 +165,7 @@ X(j) = (u-v) << (2*n); \

 __device__ __forceinline__ void FFT_16(int *y) {

-#if __CUDA_ARCH__ < 300
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
 #ifndef WIN32
 # warning FFT_16() function is not compatible with SM 2.1 devices!
 #endif
@ -346,7 +334,7 @@ __device__ __forceinline__ void FFT_256_halfzero(int y[256]) {
				@@ -346,7 +334,7 @@ __device__ __forceinline__ void FFT_256_halfzero(int y[256]) {
 __device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4)
 {
  int i;
-#if __CUDA_ARCH__ < 300
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
 #ifndef WIN32
 # warning Expansion() function is not compatible with SM 2.1 devices
 #endif
--- a/x11/x11.cu
+++ b/x11/x11.cu
@ -15,10 +15,11 @@ extern "C"
				@@ -15,10 +15,11 @@ extern "C"
 #include "sph/sph_echo.h"

 #include "miner.h"
-}
+#include "cuda_helper.h"

-#include <stdint.h>
-#include <cuda_helper.h>
+#include <stdio.h>
+#include <memory.h>
+}

 // aus cpu-miner.c
 extern int device_map[8];
@ -62,9 +63,9 @@ extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounc
				@@ -62,9 +63,9 @@ extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounc
 extern void x11_echo512_cpu_init(int thr_id, int threads);
 extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);

 extern void quark_compactTest_cpu_init(int thr_id, int threads);
 extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, 
@ -172,7 +173,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
				@@ -172,7 +173,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 		x11_shavite512_cpu_init(thr_id, throughput);
 		x11_simd512_cpu_init(thr_id, throughput);
 		x11_echo512_cpu_init(thr_id, throughput);
-		quark_check_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);

 		init[thr_id] = true;
 	}
@ -182,7 +183,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
				@@ -182,7 +183,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);

 	quark_blake512_cpu_setBlock_80((void*)endiandata);
-	quark_check_cpu_setTarget(ptarget);
+	cuda_check_cpu_setTarget(ptarget);

 	do {
 		uint32_t foundNonce;
@ -202,7 +203,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
				@@ -202,7 +203,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

 		// Scan nach Gewinner Hashes auf der GPU
-		foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
--- a/x13/cuda_x13_fugue512.cu
+++ b/x13/cuda_x13_fugue512.cu
@ -5,26 +5,11 @@
				@@ -5,26 +5,11 @@
 * heavily based on phm's sgminer
 *
 */
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include "device_launch_parameters.h"
-
-#include <stdint.h>
+#include "cuda_helper.h"

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-#define SPH_C32(x)    ((uint32_t)(x ## U))
-#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
-
-#if __CUDA_ARCH__ < 350
-// Kepler (Compute 3.0)
-#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
-#else
-// Kepler (Compute 3.5, 5.0)
-#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
-#endif
-
 /*
 * X13 kernel implementation.
 *
@ -56,8 +41,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
				@@ -56,8 +41,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
 * @author   phm <phm@inbox.com>
 */

-#define SWAB32(x) ( __byte_perm(x, x, 0x0123) )
-
 #define mixtab0(x) (*((uint32_t*)mixtabs + (    (x))))
 #define mixtab1(x) (*((uint32_t*)mixtabs + (256+(x))))
 #define mixtab2(x) (*((uint32_t*)mixtabs + (512+(x))))
@ -595,7 +578,7 @@ __global__ void x13_fugue512_gpu_hash_64(int threads, uint32_t startNounce, uint
				@@ -595,7 +578,7 @@ __global__ void x13_fugue512_gpu_hash_64(int threads, uint32_t startNounce, uint

 		#pragma unroll 16
 		for( i = 0; i < 16; i++ )
-            Hash[i] = SWAB32(Hash[i]);
+			Hash[i] = cuda_swab32(Hash[i]);

 		uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09;
 		uint32_t S10, S11, S12, S13, S14, S15, S16, S17, S18, S19;
@ -658,22 +641,22 @@ __global__ void x13_fugue512_gpu_hash_64(int threads, uint32_t startNounce, uint
				@@ -658,22 +641,22 @@ __global__ void x13_fugue512_gpu_hash_64(int threads, uint32_t startNounce, uint
 		S18 ^= S00;
 		S27 ^= S00;

-        Hash[0] = SWAB32(S01);
-        Hash[1] = SWAB32(S02);
-        Hash[2] = SWAB32(S03);
-        Hash[3] = SWAB32(S04);
-        Hash[4] = SWAB32(S09);
-        Hash[5] = SWAB32(S10);
-        Hash[6] = SWAB32(S11);
-        Hash[7] = SWAB32(S12);
-        Hash[8] = SWAB32(S18);
-        Hash[9] = SWAB32(S19);
-        Hash[10] = SWAB32(S20);
-        Hash[11] = SWAB32(S21);
-        Hash[12] = SWAB32(S27);
-        Hash[13] = SWAB32(S28);
-        Hash[14] = SWAB32(S29);
-        Hash[15] = SWAB32(S30);
+		Hash[0] = cuda_swab32(S01);
+		Hash[1] = cuda_swab32(S02);
+		Hash[2] = cuda_swab32(S03);
+		Hash[3] = cuda_swab32(S04);
+		Hash[4] = cuda_swab32(S09);
+		Hash[5] = cuda_swab32(S10);
+		Hash[6] = cuda_swab32(S11);
+		Hash[7] = cuda_swab32(S12);
+		Hash[8] = cuda_swab32(S18);
+		Hash[9] = cuda_swab32(S19);
+		Hash[10] = cuda_swab32(S20);
+		Hash[11] = cuda_swab32(S21);
+		Hash[12] = cuda_swab32(S27);
+		Hash[13] = cuda_swab32(S28);
+		Hash[14] = cuda_swab32(S29);
+		Hash[15] = cuda_swab32(S30);
 	}
 }

--- a/x13/cuda_x13_hamsi512.cu
+++ b/x13/cuda_x13_hamsi512.cu
@ -37,26 +37,11 @@
				@@ -37,26 +37,11 @@
 * @author   phm <phm@inbox.com>
 */

-#include <stdint.h>
-#include <cuda_runtime.h>
+#include "cuda_helper.h"

 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-#define SPH_C64(x)    ((uint64_t)(x ## ULL))
-#define SPH_C32(x)    ((uint32_t)(x ## U))
-#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
-
-#define SWAB32(x) ( __byte_perm(x, x, 0x0123) )
-
-#if __CUDA_ARCH__ < 350
-    // Kepler (Compute 3.0)
-    #define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
-#else
-    // Kepler (Compute 3.5)
-    #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
-#endif
-
 __device__ __constant__
 static const uint32_t d_alpha_n[] = {
 	SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
@ -716,11 +701,13 @@ __global__ void x13_hamsi512_gpu_hash_64(int threads, uint32_t startNounce, uint
				@@ -716,11 +701,13 @@ __global__ void x13_hamsi512_gpu_hash_64(int threads, uint32_t startNounce, uint
 			P_BIG;
 			T_BIG;
 		}
+
 #undef buf
 #define buf(u) (u == 0 ? 0x80 : 0)
 		INPUT_BIG;
 		P_BIG;
 		T_BIG;
+
 #undef buf
 #define buf(u) (u == 6 ? 2 : 0)
 		INPUT_BIG;
@ -729,7 +716,7 @@ __global__ void x13_hamsi512_gpu_hash_64(int threads, uint32_t startNounce, uint
				@@ -729,7 +716,7 @@ __global__ void x13_hamsi512_gpu_hash_64(int threads, uint32_t startNounce, uint

 		#pragma unroll 16
 		for (int i = 0; i < 16; i++)
-            Hash[i] = SWAB32(h[i]);
+			Hash[i] = cuda_swab32(h[i]);
 	}
 }

@ -754,4 +741,3 @@ __host__ void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNo
				@@ -754,4 +741,3 @@ __host__ void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNo
 	x13_hamsi512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 	MyStreamSynchronize(NULL, order, thr_id);
 }
-
--- a/x13/x13.cu
+++ b/x13/x13.cu
@ -20,10 +20,9 @@ extern "C"
				@@ -20,10 +20,9 @@ extern "C"
 #include "sph/sph_fugue.h"

 #include "miner.h"
-}

-#include <stdint.h>
-#include <cuda_helper.h>
+#include "cuda_helper.h"
+}

 // aus cpu-miner.c
 extern int device_map[8];
@ -73,9 +72,9 @@ extern void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNoun
				@@ -73,9 +72,9 @@ extern void x13_hamsi512_cpu_hash_64(int thr_id, int threads, uint32_t startNoun
 extern void x13_fugue512_cpu_init(int thr_id, int threads);
 extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);

 extern void quark_compactTest_cpu_init(int thr_id, int threads);
 extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, 
@ -194,7 +193,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
				@@ -194,7 +193,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
 		x11_echo512_cpu_init(thr_id, throughput);
 		x13_hamsi512_cpu_init(thr_id, throughput);
 		x13_fugue512_cpu_init(thr_id, throughput);
-		quark_check_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);

 		init[thr_id] = true;
 	}
@ -204,7 +203,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
				@@ -204,7 +203,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);

 	quark_blake512_cpu_setBlock_80((void*)endiandata);
-	quark_check_cpu_setTarget(ptarget);
+	cuda_check_cpu_setTarget(ptarget);

 	do {
 		uint32_t foundNonce;
@ -225,7 +224,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
				@@ -225,7 +224,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
 		x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

 		// Scan nach Gewinner Hashes auf der GPU
-		foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
--- a/x15/cuda_x14_shabal512.cu
+++ b/x15/cuda_x14_shabal512.cu
@ -1,26 +1,10 @@
				@@ -1,26 +1,10 @@
 /*
 * Shabal-512 for X14/X15 (STUB)
 */
-#include <stdint.h>
-#include <cuda_runtime.h>
+#include "cuda_helper.h"

 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-
-#define SPH_C64(x)    ((uint64_t)(x ## ULL))
-#define SPH_C32(x)    ((uint32_t)(x ## U))
-#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
-
-#define SWAB32(x) ( __byte_perm(x, x, 0x0123) )
-
-#if __CUDA_ARCH__ < 350
-	// Kepler (Compute 3.0)
-	#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
-#else
-	// Kepler (Compute 3.5)
-	#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
-#endif
-
 /* $Id: shabal.c 175 2010-05-07 16:03:20Z tp $ */
 /*
 * Shabal implementation.
--- a/x15/cuda_x15_whirlpool.cu
+++ b/x15/cuda_x15_whirlpool.cu
@ -4,8 +4,8 @@
				@@ -4,8 +4,8 @@
 * tpruvot@github
 */
 #include <stdio.h>
-#include <stdint.h>
-#include <cuda_helper.h>
+
+#include "cuda_helper.h"

 #define NULLTEST 0

@ -14,8 +14,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
				@@ -14,8 +14,6 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
 #define SPH_64 (1)
 #define SPH_SMALL_FOOTPRINT_WHIRLPOOL (1)

-#define SPH_C64(x)    ((uint64_t)(x ## ULL))
-
 // defined in cuda_helper.h
 #define SPH_ROTL64(x,n) ROTL64(x,n)

--- a/x15/x14.cu
+++ b/x15/x14.cu
@ -22,10 +22,9 @@ extern "C" {
				@@ -22,10 +22,9 @@ extern "C" {
 #include "sph/sph_shabal.h"

 #include "miner.h"
-}

-#include <stdint.h>
-#include <cuda_helper.h>
+#include "cuda_helper.h"
+}

 // from cpu-miner.c
 extern int device_map[8];
@ -77,9 +76,9 @@ extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNoun
				@@ -77,9 +76,9 @@ extern void x13_fugue512_cpu_hash_64(int thr_id, int threads, uint32_t startNoun
 extern void x14_shabal512_cpu_init(int thr_id, int threads);
 extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);

 extern void quark_compactTest_cpu_init(int thr_id, int threads);
 extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
@ -203,7 +202,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
				@@ -203,7 +202,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
 		x13_fugue512_cpu_init(thr_id, throughput);
 		x14_shabal512_cpu_init(thr_id, throughput);

-		quark_check_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}

@ -211,7 +210,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
				@@ -211,7 +210,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);

 	quark_blake512_cpu_setBlock_80((void*)endiandata);
-	quark_check_cpu_setTarget(ptarget);
+	cuda_check_cpu_setTarget(ptarget);

 	do {
 		int order = 0;
@ -230,7 +229,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
				@@ -230,7 +229,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
 		x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

-		uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		if (foundNonce != 0xffffffff)
 		{
 			/* check now with the CPU to confirm */
--- a/x15/x15.cu
+++ b/x15/x15.cu
@ -23,10 +23,9 @@ extern "C" {
				@@ -23,10 +23,9 @@ extern "C" {
 #include "sph/sph_whirlpool.h"

 #include "miner.h"
-}

-#include <stdint.h>
-#include <cuda_helper.h>
+#include "cuda_helper.h"
+}

 // to test gpu hash on a null buffer
 #define NULLTEST 0
@ -84,9 +83,9 @@ extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNou
				@@ -84,9 +83,9 @@ extern void x14_shabal512_cpu_hash_64(int thr_id, int threads, uint32_t startNou
 extern void x15_whirlpool_cpu_init(int thr_id, int threads);
 extern void x15_whirlpool_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);

-extern void quark_check_cpu_init(int thr_id, int threads);
-extern void quark_check_cpu_setTarget(const void *ptarget);
-extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);

 extern void quark_compactTest_cpu_init(int thr_id, int threads);
 extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
@ -231,7 +230,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
				@@ -231,7 +230,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 		x14_shabal512_cpu_init(thr_id, throughput);
 		x15_whirlpool_cpu_init(thr_id, throughput);

-		quark_check_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}

@ -239,7 +238,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
				@@ -239,7 +238,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);

 	quark_blake512_cpu_setBlock_80((void*)endiandata);
-	quark_check_cpu_setTarget(ptarget);
+	cuda_check_cpu_setTarget(ptarget);

 	do {
 		int order = 0;
@ -266,7 +265,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
				@@ -266,7 +265,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 		print_hash((unsigned char*)buf); printf("\n");
 #endif
 		/* Scan with GPU */
-		uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

 		if (foundNonce != 0xffffffff)
 		{