From 1b65cd05cc7e9bf5fa9a6ac10af3ea4c1d6285ba Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot
Date: Thu, 27 Nov 2014 01:57:12 +0100
Subject: [PATCH] heavy: add error checks, fix strict aliasing and linux build

The core problem was the CUDA hefty threads-per-block count being set
too high; it took me several hours to find that.

Btw, +25% on heavy 12500 with 256 threads per block vs. 128 and 512,
when the max register count is capped at 80.

---
 Makefile.am              |   3 +
 blake32.cu               |  10 ++-
 ccminer.vcxproj          |  10 +--
 ccminer.vcxproj.filters  |  24 +------
 heavy/cuda_blake512.cu   |  46 ++++++------
 heavy/cuda_blake512.h    |   7 --
 heavy/cuda_combine.cu    |  34 ++++-----
 heavy/cuda_combine.h     |   7 --
 heavy/cuda_groestl512.cu |  96 ++++++++++++------------
 heavy/cuda_groestl512.h  |   9 ---
 heavy/cuda_hefty1.cu     | 139 ++++++++++++++++++-----------------
 heavy/cuda_hefty1.h      |   8 ---
 heavy/cuda_keccak512.cu  |  57 +++++++--------
 heavy/cuda_keccak512.h   |   9 ---
 heavy/cuda_sha256.cu     |  51 ++++++-------
 heavy/cuda_sha256.h      |   8 ---
 heavy/heavy.cu           | 152 ++++++++++++++++++---------------------
 heavy/heavy.h            |  30 ++++++++
 stats.cpp                |   2 +-
 util.cpp                 |  18 +++--
 20 files changed, 339 insertions(+), 381 deletions(-)
 delete mode 100644 heavy/cuda_blake512.h
 delete mode 100644 heavy/cuda_combine.h
 delete mode 100644 heavy/cuda_groestl512.h
 delete mode 100644 heavy/cuda_hefty1.h
 delete mode 100644 heavy/cuda_keccak512.h
 delete mode 100644 heavy/cuda_sha256.h
 create mode 100644 heavy/heavy.h

diff --git a/Makefile.am b/Makefile.am
index 816d835..d7c27b5 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -80,6 +80,9 @@ nvcc_FLAGS += $(JANSSON_INCLUDES) --ptxas-options="-v"
 blake32.o: blake32.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=64 -o $@ -c $<
 
+heavy/cuda_hefty1.o: heavy/cuda_hefty1.cu
+	$(NVCC) $(nvcc_FLAGS) --maxrregcount=80 -o $@ -c $<
+
 keccak/cuda_keccak256.o: keccak/cuda_keccak256.cu
 	$(NVCC) $(nvcc_FLAGS) --maxrregcount=92 -o $@ -c $<
 
diff --git a/blake32.cu b/blake32.cu
index 75e656a..307d1ac 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -303,7 +303,11 @@ void blake256_gpu_hash_16(const uint32_t threads, const uint32_t startNonce, uin
 		ending[3] = nonce; /* our tested value */
 
 		blake256_compress(h, ending, 640, rounds);
-
+#if 0
+		if (trace) {
+			printf("blake hash[6][7]: %08x %08x\n", h[6], h[7]);
+		}
+#endif
 		//if (h[7] == 0 && high64 <= highTarget) {
 		if (h[7] == 0) {
 #if NBN == 2
@@ -318,14 +322,14 @@ void blake256_gpu_hash_16(const uint32_t threads, const uint32_t startNonce, uin
 #else
 			resNonce[0] = nonce;
 #endif
-			if (trace) {
 #ifdef _DEBUG
+			if (trace) {
 				uint64_t high64 = ((uint64_t*)h)[3];
 				printf("gpu: %16llx\n", high64);
 				printf("gpu: %08x.%08x\n", h[7], h[6]);
 				printf("tgt: %16llx\n", highTarget);
-#endif
 			}
+#endif
 		}
 	}
 }
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index e4ee1da..7392912 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -173,7 +173,7 @@
 		80
 		true
 		false
-		compute_30,sm_30;compute_50,sm_50
+		compute_50,sm_50
 		--ptxas-options="-O2" %(AdditionalOptions)
@@ -306,12 +306,7 @@
-
-
-
-
-
-
+
@@ -358,6 +353,7 @@
+		80
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index a50b1fa..c2bf060 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -43,9 +43,6 @@
 		{c3222908-22ba-4586-a637-6363f455b06d}
-
-		{3281db48-f394-49ea-a1ef-6ebd09828d50}
-
 		{f3ed23a2-8ce7-41a5-b051-6da56047dc35}
@@ -293,23 +290,8 @@
 		Header Files\sph
-
-		Header Files\CUDA\heavy
-
-		Header Files\CUDA\heavy
-
-		Header Files\CUDA\heavy
-
-		Header Files\CUDA\heavy
-
-		Header Files\CUDA\heavy
-
-		Header Files\CUDA\heavy
+
+		Header Files\CUDA
 		Header Files\CUDA
@@ -539,4 +521,4 @@
 		Source Files\CUDA\x11
-
+
\ No newline at end of file
diff --git a/heavy/cuda_blake512.cu b/heavy/cuda_blake512.cu
index fe58bc0..b177514 100644
--- a/heavy/cuda_blake512.cu
+++ b/heavy/cuda_blake512.cu
@@ -3,11 +3,11 @@
 
 #include "cuda_helper.h"
 
-// globaler Speicher für alle HeftyHashes aller Threads
-extern uint32_t *d_heftyHashes[8];
-extern uint32_t *d_nonceVector[8];
+// globaler Speicher für alle HeftyHashes aller Threads
+extern uint32_t *heavy_heftyHashes[8];
+extern uint32_t *heavy_nonceVector[8];
 
-// globaler Speicher für unsere Ergebnisse
+// globaler Speicher für unsere Ergebnisse
 uint32_t *d_hash5output[8];
 
 // die Message (112 bzw. 116 Bytes) mit Padding zur Berechnung auf der GPU
@@ -53,13 +53,13 @@
 __constant__ uint64_t c_u512[16];
 
 const uint64_t host_u512[16] =
{
-	0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
+	0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
 	0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
-	0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL,
+	0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL,
 	0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL,
-	0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL,
+	0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL,
 	0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL,
-	0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL,
+	0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL,
 	0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
};
@@ -123,7 +123,7 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		// bestimme den aktuellen Zähler
+		// bestimme den aktuellen Zähler
 		//uint32_t nounce = startNounce + thread;
 		uint32_t nounce = nonceVector[thread];
@@ -141,10 +141,10 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
 		h[6] = 0x1f83d9abfb41bd6bULL;
 		h[7] = 0x5be0cd19137e2179ULL;
 
-		// 128 Byte für die Message
+		// 128 Byte für die Message
 		uint64_t buf[16];
 
-		// Message für die erste Runde in Register holen
+		// Message für die erste Runde in Register holen
 #pragma unroll 16
 		for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage[i];
@@ -154,7 +154,7 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
 		uint32_t *hefty = heftyHashes + 8 * hashPosition;
 		if (BLOCKSIZE == 84) {
 			// den thread-spezifischen Hefty1 hash einsetzen
-			// aufwändig, weil das nicht mit uint64_t Wörtern aligned ist.
+			// aufwändig, weil das nicht mit uint64_t Wörtern aligned ist.
 			buf[10] = REPLACE_HIWORD(buf[10], hefty[0]);
 			buf[11] = REPLACE_LOWORD(buf[11], hefty[1]);
 			buf[11] = REPLACE_HIWORD(buf[11], hefty[2]);
@@ -173,14 +173,14 @@ template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t
 		// erste Runde
 		blake512_compress( h, buf, 0, c_sigma, c_u512 );
-		
-		
+
+
 		// zweite Runde
#pragma unroll 15
 		for (int i=0; i < 15; ++i) buf[i] = c_SecondRound[i];
 		buf[15] = SWAP64(8*(BLOCKSIZE+32)); // Blocksize in Bits einsetzen
 		blake512_compress( h, buf, 1, c_sigma, c_u512 );
-		
+
 		// Hash rauslassen
 		uint64_t *outHash = (uint64_t *)outputHash + 8 * hashPosition;
#pragma unroll 8
@@ -210,8 +210,8 @@ __host__ void blake512_cpu_init(int thr_id, int threads)
 		sizeof(host_SecondRound),
 		0, cudaMemcpyHostToDevice);
 
-	// Speicher für alle Ergebnisse belegen
-	cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads);
+	// Speicher für alle Ergebnisse belegen
+	CUDA_SAFE_CALL(cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads));
 }
 
 static int BLOCKSIZE = 84;
@@ -222,14 +222,14 @@ __host__ void blake512_cpu_setBlock(void *pdata, int len)
 {
 	unsigned char PaddedMessage[128];
 	if (len == 84) {
-		// Message mit Padding für erste Runde bereitstellen
+		// Message mit Padding für erste Runde bereitstellen
 		memcpy(PaddedMessage, pdata, 84);
-		memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen
+		memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen
 		memset(PaddedMessage+116, 0, 12);
 		PaddedMessage[116] = 0x80;
 	} else if (len == 80) {
 		memcpy(PaddedMessage, pdata, 80);
-		memset(PaddedMessage+80, 0, 32); // leeres Hefty Hash einfüllen
+		memset(PaddedMessage+80, 0, 32); // leeres Hefty Hash einfüllen
 		memset(PaddedMessage+112, 0, 16);
 		PaddedMessage[112] = 0x80;
 	}
@@ -246,11 +246,11 @@ __host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	// Größe des dynamischen Shared Memory Bereichs
+	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 
 	if (BLOCKSIZE == 80)
-		blake512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+		blake512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
 	else if (BLOCKSIZE == 84)
-		blake512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+		blake512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
 }
diff --git a/heavy/cuda_blake512.h b/heavy/cuda_blake512.h
deleted file mode 100644
index 7e24973..0000000
--- a/heavy/cuda_blake512.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _CUDA_BLAKE512_H
-#define _CUDA_BLAKE512_H
-
-void blake512_cpu_init(int thr_id, int threads);
-void blake512_cpu_setBlock(void *pdata, int len);
-void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
-#endif
diff --git a/heavy/cuda_combine.cu b/heavy/cuda_combine.cu
index 329c831..b0b2ead 100644
--- a/heavy/cuda_combine.cu
+++ b/heavy/cuda_combine.cu
@@ -1,16 +1,19 @@
-#include "cuda_helper.h"
+#include
 
-// globaler Speicher für unsere Ergebnisse
-uint32_t *d_hashoutput[8];
+#include "cuda_helper.h"
 
+// globaler Speicher für unsere Ergebnisse
+static uint32_t *d_hashoutput[8];
 extern uint32_t *d_hash2output[8];
 extern uint32_t *d_hash3output[8];
 extern uint32_t *d_hash4output[8];
 extern uint32_t *d_hash5output[8];
-extern uint32_t *d_nonceVector[8];
+
+extern uint32_t *heavy_nonceVector[8];
 
 /* Combines top 64-bits from each hash into a single hash */
-static void __device__ combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4)
+__device__
+static void combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4)
 {
 	uint32_t lout[8]; // Combining in Registern machen
@@ -98,7 +101,8 @@ static void __device__ combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *
 		out[i] = lout[i];
 }
 
-__global__ void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *out, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4, uint32_t *hash5, uint32_t *nonceVector)
+__global__
+void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *out, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4, uint32_t *hash5, uint32_t *nonceVector)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -116,13 +120,14 @@ __global__ void combine_gpu_hash(int threads, uint32_t startNounce, uint32_t *ou
 	}
 }
 
-// Setup-Funktionen
-__host__ void combine_cpu_init(int thr_id, int threads)
+__host__
+void combine_cpu_init(int thr_id, int threads)
 {
-	// Speicher für alle Ergebnisse belegen
-	cudaMalloc(&d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads);
+	// Speicher für alle Ergebnisse belegen
+	CUDA_SAFE_CALL(cudaMalloc(&d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads));
 }
 
+__host__
 void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash)
 {
 	// diese Kopien sind optional, da die Hashes jetzt bereits auf der GPU liegen sollten
@@ -133,11 +138,8 @@ void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *h
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	// Größe des dynamischen Shared Memory Bereichs
-	size_t shared_size = 0;
-
-	combine_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], d_nonceVector[thr_id]);
+	combine_gpu_hash <<<grid, block>>> (threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], heavy_nonceVector[thr_id]);
 
-	// da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden
-	cudaMemcpy(hash, d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost);
+	// da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden
+	CUDA_SAFE_CALL(cudaMemcpy(hash, d_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost));
 }
diff --git a/heavy/cuda_combine.h b/heavy/cuda_combine.h
deleted file mode 100644
index 5bb5832..0000000
--- a/heavy/cuda_combine.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _CUDA_COMBINE_H
-#define _CUDA_COMBINE_H
-
-void combine_cpu_init(int thr_id, int threads);
-void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *hash);
-
-#endif
diff --git a/heavy/cuda_groestl512.cu b/heavy/cuda_groestl512.cu
index 764b072..889002a 100644
--- a/heavy/cuda_groestl512.cu
+++ b/heavy/cuda_groestl512.cu
@@ -3,11 +3,11 @@
 
 #include "cuda_helper.h"
 
-// globaler Speicher für alle HeftyHashes aller Threads
-extern uint32_t *d_heftyHashes[8];
-extern uint32_t *d_nonceVector[8];
+// globaler Speicher für alle HeftyHashes aller Threads
+extern uint32_t *heavy_heftyHashes[8];
+extern uint32_t *heavy_nonceVector[8];
 
-// globaler Speicher für unsere Ergebnisse
+// globaler Speicher für unsere Ergebnisse
 uint32_t *d_hash4output[8];
 
 __constant__ uint32_t groestl_gpu_state[32];
@@ -603,22 +603,22 @@ __device__ void groestl512_perm_P(uint32_t *a)
#pragma unroll 16
 	for(int k=0;k<32;k+=2)
 	{
-		t[k + 0] =	T0up( B32_0(a[k & 0x1f]) ) ^
-					T1up( B32_1(a[(k + 2) & 0x1f]) ) ^
-					T2up( B32_2(a[(k + 4) & 0x1f]) ) ^
-					T3up( B32_3(a[(k + 6) & 0x1f]) ) ^
-					T0dn( B32_0(a[(k + 9) & 0x1f]) ) ^
-					T1dn( B32_1(a[(k + 11) & 0x1f]) ) ^
-					T2dn( B32_2(a[(k + 13) & 0x1f]) ) ^
+		t[k + 0] =	T0up( B32_0(a[k & 0x1f]) ) ^
+					T1up( B32_1(a[(k + 2) & 0x1f]) ) ^
+					T2up( B32_2(a[(k + 4) & 0x1f]) ) ^
+					T3up( B32_3(a[(k + 6) & 0x1f]) ) ^
+					T0dn( B32_0(a[(k + 9) & 0x1f]) ) ^
+					T1dn( B32_1(a[(k + 11) & 0x1f]) ) ^
+					T2dn( B32_2(a[(k + 13) & 0x1f]) ) ^
 					T3dn( B32_3(a[(k + 23) & 0x1f]) );
-		t[k + 1] =	T0dn( B32_0(a[k & 0x1f]) ) ^
-					T1dn( B32_1(a[(k + 2) & 0x1f]) ) ^
-					T2dn( B32_2(a[(k + 4) & 0x1f]) ) ^
-					T3dn( B32_3(a[(k + 6) & 0x1f]) ) ^
-					T0up( B32_0(a[(k + 9) & 0x1f]) ) ^
-					T1up( B32_1(a[(k + 11) & 0x1f]) ) ^
-					T2up( B32_2(a[(k + 13) & 0x1f]) ) ^
+		t[k + 1] =	T0dn( B32_0(a[k & 0x1f]) ) ^
+					T1dn( B32_1(a[(k + 2) & 0x1f]) ) ^
+					T2dn( B32_2(a[(k + 4) & 0x1f]) ) ^
+					T3dn( B32_3(a[(k + 6) & 0x1f]) ) ^
+					T0up( B32_0(a[(k + 9) & 0x1f]) ) ^
+					T1up( B32_1(a[(k + 11) & 0x1f]) ) ^
+					T2up( B32_2(a[(k + 13) & 0x1f]) ) ^
 					T3up( B32_3(a[(k + 23) & 0x1f]) );
 	}
#pragma unroll 32
@@ -645,22 +645,22 @@ __device__ void groestl512_perm_Q(uint32_t *a)
#pragma unroll 16
 	for(int k=0;k<32;k+=2)
 	{
-		t[k + 0] =	T0up( B32_0(a[(k + 2) & 0x1f]) ) ^
-					T1up( B32_1(a[(k + 6) & 0x1f]) ) ^
-					T2up( B32_2(a[(k + 10) & 0x1f]) ) ^
-					T3up( B32_3(a[(k + 22) & 0x1f]) ) ^
-					T0dn( B32_0(a[(k + 1) & 0x1f]) ) ^
-					T1dn( B32_1(a[(k + 5) & 0x1f]) ) ^
-					T2dn( B32_2(a[(k + 9) & 0x1f]) ) ^
+		t[k + 0] =	T0up( B32_0(a[(k + 2) & 0x1f]) ) ^
+					T1up( B32_1(a[(k + 6) & 0x1f]) ) ^
+					T2up( B32_2(a[(k + 10) & 0x1f]) ) ^
+					T3up( B32_3(a[(k + 22) & 0x1f]) ) ^
+					T0dn( B32_0(a[(k + 1) & 0x1f]) ) ^
+					T1dn( B32_1(a[(k + 5) & 0x1f]) ) ^
+					T2dn( B32_2(a[(k + 9) & 0x1f]) ) ^
 					T3dn( B32_3(a[(k + 13) & 0x1f]) );
-		t[k + 1] =	T0dn( B32_0(a[(k + 2) & 0x1f]) ) ^
-					T1dn( B32_1(a[(k + 6) & 0x1f]) ) ^
-					T2dn( B32_2(a[(k + 10) & 0x1f]) ) ^
-					T3dn( B32_3(a[(k + 22) & 0x1f]) ) ^
-					T0up( B32_0(a[(k + 1) & 0x1f]) ) ^
-					T1up( B32_1(a[(k + 5) & 0x1f]) ) ^
-					T2up( B32_2(a[(k + 9) & 0x1f]) ) ^
+		t[k + 1] =	T0dn( B32_0(a[(k + 2) & 0x1f]) ) ^
+					T1dn( B32_1(a[(k + 6) & 0x1f]) ) ^
+					T2dn( B32_2(a[(k + 10) & 0x1f]) ) ^
+					T3dn( B32_3(a[(k + 22) & 0x1f]) ) ^
+					T0up( B32_0(a[(k + 1) & 0x1f]) ) ^
+					T1up( B32_1(a[(k + 5) & 0x1f]) ) ^
+					T2up( B32_2(a[(k + 9) & 0x1f]) ) ^
 					T3up( B32_3(a[(k + 13) & 0x1f]) );
 	}
#pragma unroll 32
@@ -677,7 +677,7 @@ template <int BLOCKSIZE> __global__ void groestl512_gpu_hash(int threads, uint32
 		uint32_t message[32];
 		uint32_t state[32];
 
-		// lese message ein & verknüpfe diese mit dem hash1 von hefty1
+		// lese message ein & verknüpfe diese mit dem hash1 von hefty1
 		// lese den state ein
 
#pragma unroll 32
@@ -700,7 +700,7 @@ template <int BLOCKSIZE> __global__ void groestl512_gpu_hash(int threads, uint32
#pragma unroll 8
 		for (int k=0; k<8; ++k)
 			message[BLOCKSIZE/4+k] = heftyHash[k];
-		
+
 		uint32_t g[32];
#pragma unroll 32
 		for(int u=0;u<32;u++)
@@ -709,7 +709,7 @@ template <int BLOCKSIZE> __global__ void groestl512_gpu_hash(int threads, uint32
 		// Perm
 		groestl512_perm_P(g);
 		groestl512_perm_Q(message);
-		
+
#pragma unroll 32
 		for(int u=0;u<32;u++)
 		{
@@ -753,7 +753,7 @@ __host__ void groestl512_cpu_init(int thr_id, int threads)
 	texDef(t3up, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
 	texDef(t3dn, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
 
-	// Speicher für alle Ergebnisse belegen
+	// Speicher für alle Ergebnisse belegen
 	cudaMalloc(&d_hash4output[thr_id], 16 * sizeof(uint32_t) * threads);
 }
@@ -778,31 +778,27 @@ __host__ void groestl512_cpu_setBlock(void *data, int len)
 		msgBlock[28] = 0x80;
 		msgBlock[31] = 0x01000000;
 	}
-	// groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird
-	// auf der GPU ausgeführt)
+	// groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird
+	// auf der GPU ausgeführt)
 
-	// setze register
+	// setze register
 	uint32_t groestl_state_init[32];
 	memset(groestl_state_init, 0, sizeof(uint32_t) * 32);
 	groestl_state_init[31] = 0x20000;
 
 	// state speichern
-	cudaMemcpyToSymbol( groestl_gpu_state,
-						groestl_state_init,
-						128);
+	cudaMemcpyToSymbol(groestl_gpu_state, groestl_state_init, 128);
 
 	// Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch)
-	cudaMemcpyToSymbol( groestl_gpu_msg,
-						msgBlock,
-						128);
+	cudaMemcpyToSymbol(groestl_gpu_msg, msgBlock, 128);
 
 	BLOCKSIZE = len;
 }
 
 __host__ void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
 {
 	// Hefty1 Hashes kopieren (eigentlich nur zum debuggen)
-	if (copy)
-		cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice );
+	if (copy)
+		CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice));
 }
 
 __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
@@ -813,11 +809,11 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	// Größe des dynamischen Shared Memory Bereichs
+	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 
 	if (BLOCKSIZE == 84)
-		groestl512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+		groestl512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
 	else if (BLOCKSIZE == 80)
-		groestl512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+		groestl512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
 }
diff --git a/heavy/cuda_groestl512.h b/heavy/cuda_groestl512.h
deleted file mode 100644
index 0cdc13b..0000000
--- a/heavy/cuda_groestl512.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef _CUDA_GROESTL512_H
-#define _CUDA_GROESTL512_H
-
-void groestl512_cpu_init(int thr_id, int threads);
-void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
-void groestl512_cpu_setBlock(void *data, int len);
-void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
-
-#endif
\ No newline at end of file
diff --git a/heavy/cuda_hefty1.cu b/heavy/cuda_hefty1.cu
index 6d2b324..0ca3105 100644
--- a/heavy/cuda_hefty1.cu
+++ b/heavy/cuda_hefty1.cu
@@ -1,12 +1,14 @@
 #include
 #include
 
+#include "miner.h"
+
 #include "cuda_helper.h"
 
 #define USE_SHARED 1
 
-// globaler Speicher für alle HeftyHashes aller Threads
-uint32_t *d_heftyHashes[8];
+// globaler Speicher für alle HeftyHashes aller Threads
+uint32_t *heavy_heftyHashes[8];
 
 /* Hash-Tabellen */
 __constant__ uint32_t hefty_gpu_constantTable[64];
@@ -30,7 +32,7 @@ uint32_t hefty_cpu_hashTable[] = {
 	0x9b05688cUL,
 	0x1f83d9abUL,
 	0x5be0cd19UL };
-	
+
 uint32_t hefty_cpu_constantTable[] = {
 	0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
 	0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
@@ -50,11 +52,16 @@ uint32_t hefty_cpu_constantTable[] = {
 	0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
 };
 
-//#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
-static __host__ __device__ uint32_t S(uint32_t x, int n)
+#if 0
+#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
+#else
+__host__ __device__
+static uint32_t S(uint32_t x, int n)
 {
 	return (((x) >> (n)) | ((x) << (32 - (n))));
 }
+#endif
+
 #define R(x, n)		((x) >> (n))
 #define Ch(x, y, z)	((x & (y ^ z)) ^ z)
 #define Maj(x, y, z)	((x & (y | z)) | (y & z))
@@ -67,7 +74,9 @@ static uint32_t S(uint32_t x, int n)
 
 // uint8_t
 #define smoosh4(x)	( ((x)>>4) ^ ((x) & 0x0F) )
-__host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x)
+
+__host__ __forceinline__ __device__
+uint8_t smoosh2(uint32_t x)
 {
 	uint16_t w = (x >> 16) ^ (x & 0xffff);
 	uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) );
@@ -77,13 +86,14 @@ __host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x)
 #define smoosh4Quad(x) ( (((x)>>4) ^ (x)) & 0x0F0F0F0F )
 #define getByte(x,y) ( ((x) >> (y)) & 0xFF )
 
-__host__ __forceinline__ __device__ void Mangle(uint32_t *inp)
+__host__ __forceinline__ __device__
+void Mangle(uint32_t *inp)
 {
 	uint32_t r = smoosh4Quad(inp[0]);
 	uint32_t inp0org;
 	uint32_t tmp0Mask, tmp1Mask;
 	uint32_t in1, in2, isAddition;
-	uint32_t tmp;
+	int32_t tmp;
 	uint8_t b;
 
 	inp[1] = inp[1] ^ S(inp[0], getByte(r, 24));
@@ -92,24 +102,24 @@ __host__ __forceinline__ __device__ void Mangle(uint32_t *inp)
 	tmp = smoosh2(inp[1]);
 	b = getByte(r,tmp);
 	inp0org = S(inp[0], b);
-	tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0
-	tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0
-
-	in1 = (inp[2] & ~inp0org) |
+	tmp0Mask = (uint32_t) -((tmp >> 3) & 1); // Bit 3 an Position 0
+	tmp1Mask = (uint32_t) -((tmp >> 4) & 1); // Bit 4 an Position 0
+
+	in1 = (inp[2] & ~inp0org) |
 		(tmp1Mask & ~inp[2] & inp0org) |
 		(~tmp0Mask & ~inp[2] & inp0org);
 	in2 = inp[2] += ~inp0org;
 	isAddition = ~tmp0Mask & tmp1Mask;
 	inp[2] = isAddition ? in2 : in1;
-	
+
 	r += 0x01010101;
 	tmp = smoosh2(inp[1] ^ inp[2]);
 	b = getByte(r,tmp);
 	inp0org = S(inp[0], b);
-	tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0
-	tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0
+	tmp0Mask = (uint32_t) -((tmp >> 3) & 1); // Bit 3 an Position 0
+	tmp1Mask = (uint32_t) -((tmp >> 4) & 1); // Bit 4 an Position 0
 
-	in1 = (inp[3] & ~inp0org) |
+	in1 = (inp[3] & ~inp0org) |
 		(tmp1Mask & ~inp[3] & inp0org) |
 		(~tmp0Mask & ~inp[3] & inp0org);
 	in2 = inp[3] += ~inp0org;
@@ -119,20 +129,23 @@ __host__ __forceinline__ __device__ void Mangle(uint32_t *inp)
 	inp[0] ^= (inp[1] ^ inp[2]) + inp[3];
 }
 
-__host__ __forceinline__ __device__ void Absorb(uint32_t *inp, uint32_t x)
+__host__ __forceinline__ __device__
+void Absorb(uint32_t *inp, uint32_t x)
 {
 	inp[0] ^= x;
 	Mangle(inp);
 }
 
-__host__ __forceinline__ __device__ uint32_t Squeeze(uint32_t *inp)
+__host__ __forceinline__ __device__
+uint32_t Squeeze(uint32_t *inp)
 {
 	uint32_t y = inp[0];
 	Mangle(inp);
 	return y;
 }
 
-__host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x)
+__host__ __forceinline__ __device__
+uint32_t Br(uint32_t *sponge, uint32_t x)
 {
 	uint32_t r = Squeeze(sponge);
 	uint32_t t = ((r >> 8) & 0x1F);
@@ -146,11 +159,12 @@ __host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x)
 	return retVal;
 }
 
-__forceinline__ __device__ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
+__device__ __forceinline__
+void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
 {
 	uint32_t tmpBr;
 
-	uint32_t brG = Br(sponge, regs[6]); 
+	uint32_t brG = Br(sponge, regs[6]);
 	uint32_t brF = Br(sponge, regs[5]);
 	uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
 	uint32_t brE = Br(sponge, regs[4]);
@@ -169,11 +183,12 @@ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
 	regs[4] += tmpBr;
 }
 
-__host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
+__host__
+void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
 {
 	uint32_t tmpBr;
 
-	uint32_t brG = Br(sponge, regs[6]); 
+	uint32_t brG = Br(sponge, regs[6]);
 	uint32_t brF = Br(sponge, regs[5]);
 	uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
 	uint32_t brE = Br(sponge, regs[4]);
@@ -191,11 +206,11 @@ __host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *
 	regs[4] += tmpBr;
 }
 
-// Die Hash-Funktion
-__global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHash)
+__global__
+void hefty_gpu_hash(int threads, uint32_t startNounce, uint32_t *outputHash)
 {
-	#if USE_SHARED
-	extern __shared__ char heftytab[];
+#if USE_SHARED
 	extern __shared__ unsigned char heftytab[];
 	if(threadIdx.x < 64)
 	{
 		*((uint32_t*)heftytab + threadIdx.x) = hefty_gpu_constantTable[threadIdx.x];
@@ -207,9 +222,9 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		// bestimme den aktuellen Zähler
+		// bestimme den aktuellen Zähler
 		uint32_t nounce = startNounce + thread;
-		
+
 		// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
 		// reduktion von 256 byte auf 128 byte
 		uint32_t W1[16];
@@ -219,7 +234,7 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
 		uint32_t regs[8];
 		uint32_t hash[8];
 		uint32_t sponge[4];
-		
+
#pragma unroll 4
 		for(int k=0; k < 4; k++)
 			sponge[k] = hefty_gpu_sponge[k];
@@ -231,7 +246,7 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
 			regs[k] = hefty_gpu_register[k];
 			hash[k] = regs[k];
 		}
-		
+
 		//memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // verbleibende 20 bytes aus Block 2 plus padding
#pragma unroll 16
 		for(int k=0;k<16;k++)
@@ -252,7 +267,7 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
 		}
 
 		// Progress W2 (Bytes 64...127) then W3 (Bytes 128...191) ...
-		
+
#pragma unroll 3
 		for(int k=0;k<3;k++)
 		{
@@ -279,7 +294,7 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
 			for(int j=0;j<16;j++)
 				W1[j] = W2[j];
 		}
-		
+
#pragma unroll 8
 		for(int k=0;k<8;k++)
 			hash[k] += regs[k];
@@ -290,27 +305,28 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
 	}
 }
 
-// Setup-Funktionen
-__host__ void hefty_cpu_init(int thr_id, int threads)
+__host__
+void hefty_cpu_init(int thr_id, int threads)
 {
 	cudaSetDevice(device_map[thr_id]);
 
 	// Kopiere die Hash-Tabellen in den GPU-Speicher
-	cudaMemcpyToSymbol( hefty_gpu_constantTable,
+	cudaMemcpyToSymbol( hefty_gpu_constantTable,
 						hefty_cpu_constantTable,
 						sizeof(uint32_t) * 64 );
 
-	// Speicher für alle Hefty1 hashes belegen
-	cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads);
+	// Speicher für alle Hefty1 hashes belegen
+	CUDA_SAFE_CALL(cudaMalloc(&heavy_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads));
 }
 
-__host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
+__host__
+void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
 	// data muss 80/84-Byte haben!
 {
 	// Nachricht expandieren und setzen
 	uint32_t msgBlock[32];
-	memset(msgBlock, 0, sizeof(uint32_t) * 32);
+	memset(msgBlock, 0, sizeof(msgBlock));
 	memcpy(&msgBlock[0], data, len);
 	if (len == 84) {
 		msgBlock[21] |= 0x80;
@@ -319,17 +335,17 @@ __host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
 		msgBlock[20] |= 0x80;
 		msgBlock[31] = 640; // bitlen
 	}
-	
+
 	for(int i=0;i<31;i++) // Byteorder drehen
 		msgBlock[i] = SWAB32(msgBlock[i]);
 
-	// die erste Runde wird auf der CPU durchgeführt, da diese für
+	// die erste Runde wird auf der CPU durchgeführt, da diese für
 	// alle Threads gleich ist. Der Hash wird dann an die Threads
-	// übergeben
+	// übergeben
 
 	// Erstelle expandierten Block W
-	uint32_t W[64];	
-	memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);	
+	uint32_t W[64];
+	memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);
 	for(int j=16;j<64;j++)
 		W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
@@ -344,7 +360,7 @@ __host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
 	{
 		regs[k] = hefty_cpu_hashTable[k];
 		hash[k] = regs[k];
-	}	
+	}
 
 	// 1. Runde
 	for(int j=0;j<16;j++)
@@ -366,39 +382,30 @@ __host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
 		hash[k] += regs[k];
 
 	// sponge speichern
-
-	cudaMemcpyToSymbol( hefty_gpu_sponge,
-						sponge,
-						sizeof(uint32_t) * 4 );
+	cudaMemcpyToSymbol(hefty_gpu_sponge, sponge, 16);
 
 	// hash speichern
-	cudaMemcpyToSymbol( hefty_gpu_register,
-						hash,
-						sizeof(uint32_t) * 8 );
-
+	cudaMemcpyToSymbol(hefty_gpu_register, hash, 32);
 
 	// Blockheader setzen (korrekte Nonce fehlt da drin noch)
-	cudaMemcpyToSymbol( hefty_gpu_blockHeader,
-						&msgBlock[16],
-						64);
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(hefty_gpu_blockHeader, &msgBlock[16], 64));
 }
 
-__host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
+__host__
+void hefty_cpu_hash(int thr_id, int threads, int startNounce)
 {
-	// Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
-	// alle anderen mit 512 Threads.
-	int threadsperblock = (device_sm[device_map[thr_id]] >= 300) ? 768 : 512;
+	int threadsperblock = 256;
 
 	// berechne wie viele Thread Blocks wir brauchen
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	// Größe des dynamischen Shared Memory Bereichs
-	#if USE_SHARED
-	size_t shared_size = 8 * 64 * sizeof(uint32_t);
+	// Größe des dynamischen Shared Memory Bereichs
+#if USE_SHARED
+	int shared_size = 8 * 64 * sizeof(uint32_t);
#else
-	size_t shared_size = 0;
+	int shared_size = 0;
#endif
 
-	hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]);
+	hefty_gpu_hash <<< grid, block, shared_size >>> (threads, startNounce, heavy_heftyHashes[thr_id]);
 
 	// Strategisches Sleep Kommando zur Senkung der CPU Last
 	MyStreamSynchronize(NULL, 0, thr_id);
diff --git a/heavy/cuda_hefty1.h b/heavy/cuda_hefty1.h
deleted file mode 100644
index 17b196c..0000000
--- a/heavy/cuda_hefty1.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef _CUDA_HEFTY1_H
-#define _CUDA_HEFTY1_H
-
-void hefty_cpu_hash(int thr_id, int threads, int startNounce);
-void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len);
-void hefty_cpu_init(int thr_id, int threads);
-
-#endif
\ No newline at end of file
diff --git a/heavy/cuda_keccak512.cu b/heavy/cuda_keccak512.cu
index 94aadf9..8c96b66 100644
--- a/heavy/cuda_keccak512.cu
+++ b/heavy/cuda_keccak512.cu
@@ -3,11 +3,11 @@
 
 #include "cuda_helper.h"
 
-// globaler Speicher für alle HeftyHashes aller Threads
-extern uint32_t *d_heftyHashes[8];
-extern uint32_t *d_nonceVector[8];
+// globaler Speicher für alle HeftyHashes aller Threads
+extern uint32_t *heavy_heftyHashes[8];
+extern uint32_t *heavy_nonceVector[8];
 
-// globaler Speicher für unsere Ergebnisse
+// globaler Speicher für unsere Ergebnisse
 uint32_t *d_hash3output[8];
 extern uint32_t *d_hash4output[8];
 extern uint32_t *d_hash5output[8];
@@ -15,13 +15,11 @@ extern uint32_t *d_hash5output[8];
 // der Keccak512 State nach der ersten Runde (72 Bytes)
 __constant__ uint64_t c_State[25];
 
-// die Message (72 Bytes) für die zweite Runde auf der GPU
+// die Message (72 Bytes) für die zweite Runde auf der GPU
 __constant__ uint32_t c_PaddedMessage2[18]; // 44 bytes of remaining message (Nonce at offset 4) plus padding
 
 // ---------------------------- BEGIN CUDA keccak512 functions ------------------------------------
 
-#include "cuda_helper.h"
-
 #define U32TO64_LE(p) \
 	(((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))
@@ -144,7 +142,7 @@ template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(int threads, uint32_
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		// bestimme den aktuellen Zähler
+		// bestimme den aktuellen Zähler
 		//uint32_t nounce = startNounce + thread;
 		uint32_t nounce = nonceVector[thread];
@@ -156,7 +154,7 @@ template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(int threads, uint32_
#pragma unroll 25
 		for (int i=0; i < 25; ++i)
 			keccak_gpu_state[i] = c_State[i];
-		
+
 		// Message2 in den Puffer holen
 		uint32_t msgBlock[18];
 		mycpy72(msgBlock, c_PaddedMessage2);
@@ -167,7 +165,7 @@ template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(int threads, uint32_
 		// den individuellen Hefty1 Hash einsetzen
 		mycpy32(&msgBlock[(BLOCKSIZE-72)/sizeof(uint32_t)], &heftyHashes[8 * hashPosition]);
 
-		// den Block einmal gut durchschütteln
+		// den Block einmal gut durchschütteln
 		keccak_block(keccak_gpu_state, msgBlock, c_keccak_round_constants);
 
 		// das Hash erzeugen
@@ -187,8 +185,8 @@ template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(int threads, uint32_
 
 // ---------------------------- END CUDA keccak512 functions ------------------------------------
 
-// Setup-Funktionen
-__host__ void keccak512_cpu_init(int thr_id, int threads)
+__host__
+void keccak512_cpu_init(int thr_id, int threads)
 {
 	// Kopiere die Hash-Tabellen in den GPU-Speicher
 	cudaMemcpyToSymbol( c_keccak_round_constants,
@@ -196,7 +194,7 @@ __host__ void keccak512_cpu_init(int thr_id, int threads)
 						sizeof(host_keccak_round_constants),
 						0, cudaMemcpyHostToDevice);
 
-	// Speicher für alle Ergebnisse belegen
+	// Speicher für alle Ergebnisse belegen
 	cudaMalloc(&d_hash3output[thr_id], 16 * sizeof(uint32_t) * threads);
 }
@@ -212,23 +210,24 @@ __host__ void keccak512_cpu_init(int thr_id, int threads)
 
 static int BLOCKSIZE = 84;
 
-__host__ void keccak512_cpu_setBlock(void *data, int len)
+__host__
+void keccak512_cpu_setBlock(void *data, int len)
 	// data muss 80 oder 84-Byte haben!
 	// heftyHash hat 32-Byte
 {
 	// CH
-	// state init
+	// state init
 	uint64_t keccak_cpu_state[25];
 	memset(keccak_cpu_state, 0, sizeof(keccak_cpu_state));
 
-	// erste Runde
+	// erste Runde
 	keccak_block((uint64_t*)&keccak_cpu_state, (const uint32_t*)data, host_keccak_round_constants);
 
 	// state kopieren
 	cudaMemcpyToSymbol( c_State, keccak_cpu_state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
 
-	// keccak hat 72-Byte blöcke, d.h. in unserem Fall zwei Blöcke
-	// zu jeweils
+	// keccak hat 72-Byte blöcke, d.h. in unserem Fall zwei Blöcke
+	// zu jeweils
 	uint32_t msgBlock[18];
 	memset(msgBlock, 0, 18 * sizeof(uint32_t));
@@ -238,29 +237,31 @@ __host__ void keccak512_cpu_setBlock(void *data, int len)
 	else if (len == 80)
 		memcpy(&msgBlock[0], &((uint8_t*)data)[72], 8);
 
-	// Nachricht abschließen
+	// Nachricht abschließen
 	if (len == 84)
 		msgBlock[11] = 0x01;
 	else if (len == 80)
 		msgBlock[10] = 0x01;
 	msgBlock[17] = 0x80000000;
-	
-	// Message 2 ins Constant Memory kopieren (die variable Nonce und
+
+	// Message 2 ins Constant Memory kopieren (die variable Nonce und
 	// der Hefty1 Anteil muss aber auf der GPU erst noch ersetzt werden)
 	cudaMemcpyToSymbol( c_PaddedMessage2, msgBlock, 18*sizeof(uint32_t), 0, cudaMemcpyHostToDevice );
 
 	BLOCKSIZE = len;
 }
-
-__host__ void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
+
+__host__
+void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
 {
 	// Hefty1 Hashes kopieren
-	if (copy) cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice );
+	if (copy)
+		CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice));
 	//else cudaThreadSynchronize();
 }
 
-__host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
+__host__
+void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 {
 	const int threadsperblock = 128;
@@ -268,11 +269,11 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	// Größe des dynamischen Shared Memory Bereichs
+	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 
 	if (BLOCKSIZE==84)
-		keccak512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+		keccak512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
 	else if (BLOCKSIZE==80)
-		keccak512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+		keccak512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
 }
diff --git a/heavy/cuda_keccak512.h b/heavy/cuda_keccak512.h
deleted file mode 100644
index 1182447..0000000
--- a/heavy/cuda_keccak512.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef _CUDA_KECCAK512_H
-#define _CUDA_KECCAK512_H
-
-void keccak512_cpu_init(int thr_id, int threads);
-void keccak512_cpu_setBlock(void *data, int len);
-void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
-void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
-
-#endif
diff --git a/heavy/cuda_sha256.cu b/heavy/cuda_sha256.cu
index 043422b..3b63b76 100644
--- a/heavy/cuda_sha256.cu
+++ b/heavy/cuda_sha256.cu
@@ -3,11 +3,11 @@
 
 #include "cuda_helper.h"
 
-// globaler Speicher für alle HeftyHashes aller Threads
-extern uint32_t *d_heftyHashes[8];
-extern uint32_t *d_nonceVector[8];
+// globaler Speicher für alle HeftyHashes aller Threads
+extern uint32_t *heavy_heftyHashes[8];
+extern uint32_t *heavy_nonceVector[8];
 
-// globaler Speicher für unsere Ergebnisse
+// globaler Speicher für unsere Ergebnisse
 uint32_t *d_hash2output[8];
@@ -47,10 +47,10 @@ template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t s
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		// bestimme den aktuellen Zähler
+		// bestimme den aktuellen Zähler
 		uint32_t nounce = startNounce + thread;
 		nonceVector[thread] = nounce;
-		
+
 		// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
 		uint32_t W1[16];
 		uint32_t W2[16];
@@ -66,10 +66,10 @@ template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t s
 			regs[k] = sha256_gpu_register[k];
 			hash[k] = regs[k];
 		}
-		
+
 		// 2. Runde
-		//memcpy(W, &sha256_gpu_blockHeader[0], sizeof(uint32_t) * 16); // TODO: aufsplitten in zwei Teilblöcke
-		//memcpy(&W[5], &heftyHashes[8 * (blockDim.x * blockIdx.x + threadIdx.x)], sizeof(uint32_t) * 8); // den richtigen Hefty1 Hash holen
+		//memcpy(W, &sha256_gpu_blockHeader[0], sizeof(uint32_t) * 16); // TODO: aufsplitten in zwei Teilblöcke
+		//memcpy(&W[5], &heftyHashes[8 * (blockDim.x * blockIdx.x + threadIdx.x)], sizeof(uint32_t) * 8); // den richtigen Hefty1 Hash holen
#pragma unroll 16
 		for(int k=0;k<16;k++)
 			W1[k] = sha256_gpu_blockHeader[k];
@@ -90,7 +90,7 @@ template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t s
 			uint32_t T1, T2;
 			T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W1[j];
 			T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
-			
+
#pragma unroll 7
 			for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
 			regs[0] = T1 + T2;
@@ -121,7 +121,7 @@ template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t s
 				uint32_t T1, T2;
 				T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j];
 				T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
-				
+
#pragma unroll 7
 				for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
 				regs[0] = T1 + T2;
@@ -136,14 +136,14 @@ template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t s
/*
 		for(int j=16;j<64;j++)
 			W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
-		
+
#pragma unroll 64
 		for(int j=0;j<64;j++)
 		{
 			uint32_t T1, T2;
 			T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_gpu_constantTable[j] + W[j];
 			T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
-			
+
#pragma unroll 7
 			for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
 			regs[0] = T1 + T2;
@@ -168,7 +168,7 @@ __host__ void sha256_cpu_init(int thr_id, int threads)
 						sha256_cpu_constantTable,
 						sizeof(uint32_t) * 64 );
 
-	// Speicher für alle Ergebnisse belegen
+	// Speicher für alle Ergebnisse belegen
 	cudaMalloc(&d_hash2output[thr_id], 8 * sizeof(uint32_t) * threads);
 }
@@ -184,25 +184,25 @@ __host__ void sha256_cpu_setBlock(void *data, int len)
 	memset(msgBlock, 0, sizeof(uint32_t) * 32);
 	memcpy(&msgBlock[0], data, len);
 	if (len == 84) {
-		memset(&msgBlock[21], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen
+		memset(&msgBlock[21], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen
 		msgBlock[29] |= 0x80;
 		msgBlock[31] = 928; // bitlen
 	} else if (len == 80) {
-		memset(&msgBlock[20], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen
+		memset(&msgBlock[20], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen
 		msgBlock[28] |= 0x80;
 		msgBlock[31] = 896; // bitlen
 	}
-	
+
 	for(int i=0;i<31;i++) // Byteorder drehen
 		msgBlock[i] = SWAB32(msgBlock[i]);
 
-	// die erste Runde wird auf der CPU durchgeführt, da diese für
+	// die erste Runde wird auf der CPU durchgeführt, da diese für
 	// alle Threads gleich ist. Der Hash wird dann an die Threads
-	// übergeben
+	// übergeben
 	uint32_t W[64];
 
 	// Erstelle expandierten Block W
-	memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);	
+	memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);
 	for(int j=16;j<64;j++)
 		W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
@@ -223,7 +223,7 @@ __host__ void sha256_cpu_setBlock(void *data, int len)
 		uint32_t T1, T2;
 		T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_cpu_constantTable[j] + W[j];
 		T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
-		
+
 		//#pragma unroll 7
 		for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; // sollte mal noch durch memmov ersetzt werden!
@@ -251,7 +251,8 @@ __host__ void sha256_cpu_setBlock(void *data, int len)
 __host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
 {
 	// Hefty1 Hashes kopieren
-	if (copy) cudaMemcpy( d_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice );
+	if (copy)
+		CUDA_SAFE_CALL(cudaMemcpy(heavy_heftyHashes[thr_id], heftyHashes, 8 * sizeof(uint32_t) * threads, cudaMemcpyHostToDevice));
 	//else cudaThreadSynchronize();
 }
@@ -263,12 +264,12 @@ __host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce)
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	// Größe des dynamischen Shared Memory Bereichs
+	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 
 	if (BLOCKSIZE == 84)
-		sha256_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+		sha256_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
 	else if (BLOCKSIZE == 80) {
-		sha256_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+		sha256_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], heavy_heftyHashes[thr_id], heavy_nonceVector[thr_id]);
 	}
 }
diff --git a/heavy/cuda_sha256.h b/heavy/cuda_sha256.h
deleted file mode 100644
index 03385d1..0000000
--- a/heavy/cuda_sha256.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef _CUDA_SHA256_H
-#define _CUDA_SHA256_H
-
-void sha256_cpu_init(int thr_id, int threads);
-void sha256_cpu_setBlock(void *data, int len);
-void sha256_cpu_hash(int thr_id, int threads, int startNounce);
-void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
-#endif
diff --git a/heavy/heavy.cu b/heavy/heavy.cu
index f3891e8..eb9a03e 100644
--- a/heavy/heavy.cu
+++ b/heavy/heavy.cu
@@ -1,35 +1,19 @@
 #include
-#include
-#include
-
-#include
-
 #include
-
-#ifndef _WIN32
-#include
-#endif
-
+
 #include
 // include thrust
-#include
 #include
 #include
-#include
 
 #include "miner.h"
 
-#include "hefty1.h"
+extern "C" {
 #include "sph/sph_keccak.h"
 #include "sph/sph_blake.h"
 #include "sph/sph_groestl.h"
-
-#include "heavy/cuda_hefty1.h"
-#include "heavy/cuda_sha256.h"
-#include "heavy/cuda_keccak512.h"
-#include "heavy/cuda_groestl512.h"
-#include "heavy/cuda_blake512.h"
-#include "heavy/cuda_combine.h"
-
+}
+#include "hefty1.h"
+#include "heavy/heavy.h"
 #include "cuda_helper.h"
 
 extern uint32_t *d_hash2output[8];
@@ -37,11 +21,13 @@ extern uint32_t *d_hash3output[8];
 extern uint32_t *d_hash4output[8];
 extern uint32_t *d_hash5output[8];
 
-#define HEAVYCOIN_BLKHDR_SZ 84
-#define MNR_BLKHDR_SZ 80
+#define HEAVYCOIN_BLKHDR_SZ 84
+#define MNR_BLKHDR_SZ 80
+
+// nonce-array für die threads
+uint32_t *heavy_nonceVector[8];
 
-// nonce-array für die threads
-uint32_t *d_nonceVector[8];
+extern uint32_t *heavy_heftyHashes[8];
 
 /* Combines top 64-bits from each hash into a single hash */
 static void combine_hashes(uint32_t *out, const uint32_t *hash1, const uint32_t *hash2, const uint32_t *hash3, const uint32_t *hash4)
@@ -71,9 +57,9 @@ static void combine_hashes(uint32_t *out, const uint32_t *hash1, const uint32_t
 #include
 static uint32_t __inline bitsset( uint32_t x )
 {
-	DWORD r = 0;
-	_BitScanReverse(&r, x);
-	return r;
+	DWORD r = 0;
+	_BitScanReverse(&r, x);
+	return r;
 }
 #else
 static uint32_t bitsset( uint32_t x )
@@ -91,21 +77,21 @@ static int findhighbit(const uint32_t *ptarget, int words)
 	{
 		if (ptarget[i] != 0) {
 			highbit = i*32 + bitsset(ptarget[i])+1;
-			break; 
+			break;
 		}
 	}
 	return highbit;
 }
 
 // Generiere ein Multiword-Integer das die Zahl
-// (2 << highbit) - 1 repräsentiert.
+// (2 << highbit) - 1 repräsentiert.
 static void genmask(uint32_t *ptarget, int words, int highbit)
 {
 	int i;
 	for (i=words-1; i >= 0; --i)
 	{
 		if ((i+1)*32 <= highbit)
-			ptarget[i] = 0xffffffff;
+			ptarget[i] = UINT32_MAX;
 		else if (i*32 > highbit)
 			ptarget[i] = 0x00000000;
 		else
@@ -114,13 +100,18 @@ static void genmask(uint32_t *ptarget, int words, int highbit)
 }
 
 struct check_nonce_for_remove
-{ 
+{
 	check_nonce_for_remove(uint64_t target, uint32_t *hashes, uint32_t hashlen, uint32_t startNonce) :
 		m_target(target),
 		m_hashes(hashes),
 		m_hashlen(hashlen),
 		m_startNonce(startNonce) { }
 
+	uint64_t m_target;
+	uint32_t *m_hashes;
+	uint32_t m_hashlen;
+	uint32_t m_startNonce;
+
 	__device__
 	bool operator()(const uint32_t x)
 	{
@@ -129,53 +120,39 @@ struct check_nonce_for_remove
 		// Wert des Hashes (als uint64_t) auslesen.
 		// Steht im 6. und 7. Wort des Hashes (jeder dieser Hashes hat 512 Bits)
 		uint64_t hashValue = *((uint64_t*)(&m_hashes[m_hashlen*hashIndex + 6]));
-		// gegen das Target prüfen. Es dürfen nur Bits aus dem Target gesetzt sein.
-		return (hashValue & m_target) != hashValue;
+		bool res = (hashValue & m_target) != hashValue;
+		//printf("ndx=%x val=%08x target=%lx\n", hashIndex, hashValue, m_target);
+		// gegen das Target prüfen. Es dürfen nur Bits aus dem Target gesetzt sein.
+		return res;
 	}
-
-	uint64_t m_target;
-	uint32_t *m_hashes;
-	uint32_t m_hashlen;
-	uint32_t m_startNonce;
 };
 
-int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
-	const uint32_t *ptarget, uint32_t max_nonce,
-	unsigned long *hashes_done, uint32_t maxvote, int blocklen);
+static bool init[8] = {0,0,0,0,0,0,0,0};
 
-extern "C"
+__host__
 int scanhash_heavy(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done, uint32_t maxvote, int blocklen)
-{
-	return scanhash_heavy_cpp(thr_id, pdata,
-		ptarget, max_nonce, hashes_done, maxvote, blocklen);
-}
-
-
-int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
-	const uint32_t *ptarget, uint32_t max_nonce,
-	unsigned long *hashes_done, uint32_t maxvote, int blocklen)
 {
 	const uint32_t first_nonce = pdata[19];
 
 	/* to check */
 	// CUDA will process thousands of threads.
-	int throughput = opt_work_size ? opt_work_size : (1 << 19); // 128*4096
-	throughput = min(throughput, (int)(max_nonce - first_nonce));
+	int throughput = opt_work_size ? opt_work_size : (1 << 19); // 256*2048
+	throughput = min(throughput, (int)(max_nonce - first_nonce));
 
 	int rc = 0;
 	uint32_t *hash = NULL;
-	cudaMallocHost(&hash, throughput*8*sizeof(uint32_t));
 	uint32_t *cpu_nonceVector = NULL;
-	cudaMallocHost(&cpu_nonceVector, throughput*sizeof(uint32_t));
+	CUDA_SAFE_CALL(cudaMallocHost(&hash, throughput*8*sizeof(uint32_t)));
+	CUDA_SAFE_CALL(cudaMallocHost(&cpu_nonceVector, throughput*sizeof(uint32_t)));
 
 	int nrmCalls[6];
 	memset(nrmCalls, 0, sizeof(int) * 6);
 
-	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x000000ff;
+	if (opt_benchmark)
+		((uint32_t*)ptarget)[7] = 0x00ff;
 
-	// für jeden Hash ein individuelles Target erstellen basierend
-	// auf dem höchsten Bit, das in ptarget gesetzt ist.
+	// für jeden Hash ein individuelles Target erstellen basierend
+	// auf dem höchsten Bit, das in ptarget gesetzt ist.
 	int highbit = findhighbit(ptarget, 8);
 	uint32_t target2[2], target3[2], target4[2], target5[2];
 	genmask(target2, 2, highbit/4+(((highbit%4)>3)?1:0) ); // SHA256
@@ -183,7 +160,6 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
 	genmask(target4, 2, highbit/4+(((highbit%4)>1)?1:0) ); // groestl512
 	genmask(target5, 2, highbit/4+(((highbit%4)>0)?1:0) ); // blake512
 
-	static bool init[8] = {0,0,0,0,0,0,0,0};
 	if (!init[thr_id])
 	{
 		hefty_cpu_init(thr_id, throughput);
@@ -192,8 +168,10 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
 		groestl512_cpu_init(thr_id, throughput);
 		blake512_cpu_init(thr_id, throughput);
 		combine_cpu_init(thr_id, throughput);
+
+		CUDA_SAFE_CALL(cudaMalloc(&heavy_nonceVector[thr_id], sizeof(uint32_t) * throughput));
+
 		init[thr_id] = true;
-		cudaMalloc(&d_nonceVector[thr_id], sizeof(uint32_t) * throughput);
 	}
 
 	if (blocklen == HEAVYCOIN_BLKHDR_SZ)
 	{
 		uint16_t *ext = (uint16_t *)&pdata[20];
 
 		if (opt_vote > maxvote) {
-			printf("Warning: Your block reward vote (%hu) exceeds "
-				"the maxvote reported by the pool (%hu).\n",
+			applog(LOG_WARNING, "Your block reward vote (%hu) exceeds "
+				"the maxvote reported by the pool (%hu).",
 				opt_vote, maxvote);
 		}
 
 		if (opt_trust_pool && opt_vote > maxvote) {
-			printf("Warning: Capping block reward vote to maxvote reported by pool.\n");
+			applog(LOG_WARNING, "Capping block reward vote to maxvote reported by pool.");
 			ext[0] = maxvote;
 		}
 		else
@@ -222,32 +200,34 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
 	blake512_cpu_setBlock(pdata, blocklen);
 
 	do {
-		uint32_t i;
 		////// Compaction init
-		thrust::device_ptr<uint32_t> devNoncePtr(d_nonceVector[thr_id]);
-		thrust::device_ptr<uint32_t> devNoncePtrEnd((d_nonceVector[thr_id]) + throughput);
+		thrust::device_ptr<uint32_t> devNoncePtr(heavy_nonceVector[thr_id]);
+		thrust::device_ptr<uint32_t> devNoncePtrEnd((heavy_nonceVector[thr_id]) + throughput);
 		uint32_t actualNumberOfValuesInNonceVectorGPU = throughput;
+		uint64_t *t;
 
 		hefty_cpu_hash(thr_id, throughput, pdata[19]);
 		//cudaThreadSynchronize();
 		sha256_cpu_hash(thr_id, throughput, pdata[19]);
 		//cudaThreadSynchronize();
 
-		// Hier ist die längste CPU Wartephase. Deshalb ein strategisches MyStreamSynchronize() hier.
+		// Hier ist die längste CPU Wartephase. Deshalb ein strategisches MyStreamSynchronize() hier.
 		MyStreamSynchronize(NULL, 1, thr_id);
 
 		////// Compaction
-		devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target2), d_hash2output[thr_id], 8, pdata[19]));
+		t = (uint64_t*) target2;
+		devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash2output[thr_id], 8, pdata[19]));
 		actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
 		if(actualNumberOfValuesInNonceVectorGPU == 0)
 			goto emptyNonceVector;
-		
+
 		keccak512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]);
 		//cudaThreadSynchronize();
 
 		////// Compaction
-		devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target3), d_hash3output[thr_id], 16, pdata[19]));
+		t = (uint64_t*) target3;
+		devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash3output[thr_id], 16, pdata[19]));
 		actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
 		if(actualNumberOfValuesInNonceVectorGPU == 0)
 			goto emptyNonceVector;
@@ -256,7 +236,8 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
 		//cudaThreadSynchronize();
 
 		////// Compaction
-		devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target5), d_hash5output[thr_id], 16, pdata[19]));
+		t = (uint64_t*) target5;
+		devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash5output[thr_id], 16, pdata[19]));
 		actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
 		if(actualNumberOfValuesInNonceVectorGPU == 0)
 			goto emptyNonceVector;
@@ -265,25 +246,31 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
 		//cudaThreadSynchronize();
 
 		////// Compaction
-		devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*((uint64_t*)target4), d_hash4output[thr_id], 16, pdata[19]));
+		t = (uint64_t*) target4;
+		devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash4output[thr_id], 16, pdata[19]));
 		actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
 		if(actualNumberOfValuesInNonceVectorGPU == 0)
 			goto emptyNonceVector;
-		
+
 		// combine
 		combine_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19], hash);
 
+		if (opt_tracegpu) {
+			applog(LOG_BLUE, "heavy GPU hash:");
+			applog_hash((uchar*)hash);
+		}
+
 		// Ergebnisse kopieren
 		if(actualNumberOfValuesInNonceVectorGPU > 0)
 		{
-			cudaMemcpy(cpu_nonceVector, d_nonceVector[thr_id], sizeof(uint32_t) * actualNumberOfValuesInNonceVectorGPU, cudaMemcpyDeviceToHost);
+			size_t size = sizeof(uint32_t) * actualNumberOfValuesInNonceVectorGPU;
+			CUDA_SAFE_CALL(cudaMemcpy(cpu_nonceVector, heavy_nonceVector[thr_id], size, cudaMemcpyDeviceToHost));
+			cudaDeviceSynchronize();
 
-		for (i=0; isockbuf, "\n")) {
 		bool ret = true;
-		time_t rstart;
-
-		time(&rstart);
+		time_t rstart = time(NULL);
 		if (!socket_full(sctx->sock, 60)) {
 			applog(LOG_ERR, "stratum_recv_line timed out");
 			goto out;
@@ -1578,16 +1574,18 @@ void do_gpu_tests(void)
 	uchar buf[128];
 	uint32_t tgt[8] = { 0 };
 
-	memset(buf, 0, sizeof buf);
-	buf[0] = 1; buf[64] = 2;
-
 	opt_tracegpu = true;
 	work_restart = (struct work_restart*) malloc(sizeof(struct work_restart));
 	work_restart[0].restart = 1;
-	tgt[6] = 0xffff;
+	tgt[7] = 0xffff;
 
+	memset(buf, 0, sizeof buf);
+	// buf[0] = 1; buf[64] = 2; // for endian tests
 	scanhash_blake256(0, (uint32_t*)buf, tgt, 1, &done, 14);
 
+	memset(buf, 0, sizeof buf);
+	scanhash_heavy(0, (uint32_t*)buf, tgt, 1, &done, 1, 84); // HEAVYCOIN_BLKHDR_SZ=84
+
 	free(work_restart);
 	work_restart = NULL;
 	opt_tracegpu = false;
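-- 

Note on the CUDA_SAFE_CALL wrappers added throughout this patch: the macro
comes from cuda_helper.h, and the sketch below only illustrates the general
pattern (the definition in the tree may differ in details, e.g. it may call
proper_exit() instead of exit()):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Check the cudaError_t returned by a runtime call and abort on failure,
// reporting the failing file and line instead of silently continuing.
#define CUDA_SAFE_CALL(call) do { \
	cudaError_t err = (call); \
	if (err != cudaSuccess) { \
		fprintf(stderr, "CUDA error '%s' at %s:%d\n", \
			cudaGetErrorString(err), __FILE__, __LINE__); \
		exit(EXIT_FAILURE); \
	} \
} while (0)

// usage, as in hefty_cpu_init() above:
//   CUDA_SAFE_CALL(cudaMalloc(&heavy_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads));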
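Note on the strict-aliasing part of the fix: gcc's -Wstrict-aliasing flags
the old *((uint64_t*)target2), which reads a uint32_t[2] array through a
uint64_t lvalue; the patch routes the cast through the named pointer t,
which quiets the warning in practice. A fully defined alternative is to
copy the bytes out with memcpy, as in this sketch (load_u64 is a
hypothetical helper, not part of the patch):

#include <cstdint>
#include <cstring>

// Read two consecutive 32-bit words as one 64-bit value without violating
// the aliasing rules; compilers turn this memcpy into a single load.
static uint64_t load_u64(const uint32_t *words)
{
	uint64_t v;
	memcpy(&v, words, sizeof(v));
	return v;
}

// usage: check_nonce_for_remove(load_u64(target2), d_hash2output[thr_id], 8, pdata[19])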
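Note on the threads-per-block change from the commit message: the sketch
below is illustration only, with a hypothetical kernel, and the occupancy
arithmetic assumes a GPU with a 65536-register file per SM; the exact
numbers vary by architecture, which is why the +25% figure is an empirical
observation rather than a guarantee.

#include <cuda_runtime.h>

__global__ void dummy_kernel(int n) { (void) n; /* stand-in for hefty_gpu_hash */ }

void launch_sketch(int threads)
{
	// Compiled with nvcc --maxrregcount=80 (see the Makefile.am rule above),
	// each thread uses at most 80 registers. A 256-thread block then needs
	// 256 * 80 = 20480 registers, so three blocks fit in a 65536-register SM.
	// A 512-thread block needs 40960 registers, leaving room for only one
	// resident block, and 128-thread blocks give less latency hiding per
	// block - which may explain the +25% the commit message reports for 256.
	const int threadsperblock = 256;
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);
	dummy_kernel<<<grid, block>>>(threads);
}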
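Note on the thrust compaction used in scanhash_heavy: after each hash stage,
nonces whose intermediate hash already misses the per-stage target are
removed from the device-side nonce vector, so later (more expensive) stages
run on fewer elements. This simplified sketch uses a trivial predicate in
place of check_nonce_for_remove; the shape of the call is the same:

#include <cstdint>
#include <thrust/device_ptr.h>
#include <thrust/remove.h>

struct is_odd {
	__device__ bool operator()(uint32_t x) const { return (x & 1) != 0; }
};

// Compact d_vec in place on the GPU and return how many entries survive.
uint32_t compact(uint32_t *d_vec, uint32_t count)
{
	thrust::device_ptr<uint32_t> first(d_vec);
	thrust::device_ptr<uint32_t> last = first + count;
	last = thrust::remove_if(first, last, is_odd());
	return (uint32_t)(last - first);
}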