diff --git a/cuda_nist5.cu b/cuda_nist5.cu
index 2feb32e..419c1a5 100644
--- a/cuda_nist5.cu
+++ b/cuda_nist5.cu
@@ -5,9 +5,11 @@ extern "C"
 #include "sph/sph_skein.h"
 #include "sph/sph_jh.h"
 #include "sph/sph_keccak.h"
+}
+
 #include "miner.h"
+
 #include "cuda_helper.h"
-}
 
 // aus cpu-miner.c
 extern int device_map[8];
@@ -74,9 +76,6 @@ extern "C" void nist5hash(void *state, const void *input)
 	memcpy(state, hash, 32);
 }
 
-
-extern bool opt_benchmark;
-
 extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done)
@@ -84,7 +83,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
 	const uint32_t first_nonce = pdata[19];
 
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x0000ff;
+		((uint32_t*)ptarget)[7] = 0x00FF;
 
 	const uint32_t Htarg = ptarget[7];
 
diff --git a/quark/cuda_quark_blake512.cu b/quark/cuda_quark_blake512.cu
index e3d299d..787b8a0 100644
--- a/quark/cuda_quark_blake512.cu
+++ b/quark/cuda_quark_blake512.cu
@@ -50,59 +50,60 @@ const uint64_t c_u512[16] =
 	0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
 };
 
-#define G(a,b,c,d,e) \
-	v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\
-	v[d] = ROTR( v[d] ^ v[a],32); \
-	v[c] += v[d]; \
-	v[b] = ROTR( v[b] ^ v[c],25); \
-	v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b]; \
-	v[d] = ROTR( v[d] ^ v[a],16); \
-	v[c] += v[d]; \
-	v[b] = ROTR( v[b] ^ v[c],11);
-
+#define G(a,b,c,d,x) { \
+	uint32_t idx1 = sigma[i][x]; \
+	uint32_t idx2 = sigma[i][x+1]; \
+	v[a] += (m[idx1] ^ u512[idx2]) + v[b]; \
+	v[d] = ROTR( v[d] ^ v[a], 32); \
+	v[c] += v[d]; \
+	v[b] = ROTR( v[b] ^ v[c], 25); \
+	v[a] += (m[idx2] ^ u512[idx1]) + v[b]; \
+	v[d] = ROTR( v[d] ^ v[a], 16); \
+	v[c] += v[d]; \
+	v[b] = ROTR( v[b] ^ v[c], 11); \
+}
 
 __device__ static
-void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits )
+void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int T0)
 {
 	uint64_t v[16], m[16], i;
 
-#pragma unroll 16
-	for( i = 0; i < 16; ++i ) {
-		m[i] = cuda_swab64(block[i]);
-	}
-
-#pragma unroll 8
-	for( i = 0; i < 8; ++i ) v[i] = h[i];
-
-	v[ 8] = u512[0];
-	v[ 9] = u512[1];
-	v[10] = u512[2];
-	v[11] = u512[3];
-	v[12] = u512[4];
-	v[13] = u512[5];
-	v[14] = u512[6];
-	v[15] = u512[7];
-
-	v[12] ^= bits;
-	v[13] ^= bits;
-
-//#pragma unroll 16
-	for( i = 0; i < 16; ++i )
-	{
-		/* column step */
-		G( 0, 4, 8, 12, 0 );
-		G( 1, 5, 9, 13, 2 );
-		G( 2, 6, 10, 14, 4 );
-		G( 3, 7, 11, 15, 6 );
-		/* diagonal step */
-		G( 0, 5, 10, 15, 8 );
-		G( 1, 6, 11, 12, 10 );
-		G( 2, 7, 8, 13, 12 );
-		G( 3, 4, 9, 14, 14 );
-	}
-
-#pragma unroll 16
-	for( i = 0; i < 16; ++i ) h[i % 8] ^= v[i];
+	#pragma unroll 16
+	for( i = 0; i < 16; i++) {
+		m[i] = cuda_swab64(block[i]);
+	}
+
+	#pragma unroll 8
+	for (i = 0; i < 8; i++)
+		v[i] = h[i];
+
+	v[ 8] = u512[0];
+	v[ 9] = u512[1];
+	v[10] = u512[2];
+	v[11] = u512[3];
+	v[12] = u512[4] ^ T0;
+	v[13] = u512[5] ^ T0;
+	v[14] = u512[6];
+	v[15] = u512[7];
+
+	//#pragma unroll 16
+	for( i = 0; i < 16; ++i )
+	{
+		/* column step */
+		G( 0, 4, 8, 12, 0 );
+		G( 1, 5, 9, 13, 2 );
+		G( 2, 6, 10, 14, 4 );
+		G( 3, 7, 11, 15, 6 );
+		/* diagonal step */
+		G( 0, 5, 10, 15, 8 );
+		G( 1, 6, 11, 12, 10 );
+		G( 2, 7, 8, 13, 12 );
+		G( 3, 4, 9, 14, 14 );
+	}
+
+	#pragma unroll 16
+	for( i = 0; i < 16; ++i )
+		h[i % 8] ^= v[i];
 }
 
 __device__ __constant__
@@ -114,7 +115,8 @@ static const uint64_t d_constMem[8] = {
 	0x510e527fade682d1ULL,
 	0x9b05688c2b3e6c1fULL,
 	0x1f83d9abfb41bd6bULL,
-	0x5be0cd19137e2179ULL };
+	0x5be0cd19137e2179ULL
+};
 
 // Hash-Padding
 __device__ __constant__
@@ -126,7 +128,8 @@ static const uint64_t d_constHashPadding[8] = {
 	0,
 	0x0100000000000000ull,
 	0,
-	0x0002000000000000ull };
+	0x0002000000000000ull
+};
 
 __global__ __launch_bounds__(256, 4)
 void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash)
@@ -145,48 +148,42 @@ void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_n
 	if (thread < threads)
 #endif
 	{
-		uint8_t i;
-		// bestimme den aktuellen Zähler
 		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
 
 		int hashPosition = nounce - startNounce;
 		uint64_t *inpHash = &g_hash[hashPosition<<3]; // hashPosition * 8
 
-		// 128 Byte für die Message
+		// 128 Bytes
 		uint64_t buf[16];
 
-		// State vorbereiten
+		// State
 		uint64_t h[8];
 #pragma unroll 8
-		for (i=0;i<8;i++)
+		for (int i=0;i<8;i++)
 			h[i] = d_constMem[i];
 
-		// Message für die erste Runde in Register holen
+		// Message for first round
 #pragma unroll 8
-		for (i=0; i < 8; ++i)
+		for (int i=0; i < 8; ++i)
 			buf[i] = inpHash[i];
 
 #pragma unroll 8
-		for (i=0; i < 8; i++)
+		for (int i=0; i < 8; i++)
 			buf[i+8] = d_constHashPadding[i];
 
-		// die einzige Hashing-Runde
+		// Ending round
 		quark_blake512_compress( h, buf, c_sigma, c_u512, 512 );
-#if __CUDA_ARCH__ >= 130
-		// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind
+#if __CUDA_ARCH__ <= 350
 		uint32_t *outHash = (uint32_t*)&g_hash[8 * hashPosition];
 #pragma unroll 8
-		for (i=0; i < 8; ++i) {
+		for (int i=0; i < 8; i++) {
 			outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
 			outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
 		}
 #else
-		// in dieser Version passieren auch ein paar 64 Bit Shifts
 		uint64_t *outHash = &g_hash[8 * hashPosition];
-
 #pragma unroll 8
-		for (i=0; i < 8; ++i)
-		{
+		for (int i=0; i < 8; i++) {
 			outHash[i] = cuda_swab64(h[i]);
 		}
 #endif
@@ -198,45 +195,38 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		// State vorbereiten
 		uint64_t h[8];
-		// 128 Byte für die Message
 		uint64_t buf[16];
-		uint8_t i;
-		// bestimme den aktuellen Zähler
 		uint32_t nounce = startNounce + thread;
 
 #pragma unroll 8
-		for(i=0;i<8;i++)
+		for(int i=0; i<8; i++)
 			h[i] = d_constMem[i];
 
 		// Message für die erste Runde in Register holen
 #pragma unroll 16
-		for (i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i];
+		for (int i=0; i < 16; ++i)
+			buf[i] = c_PaddedMessage80[i];
 
-		// die Nounce durch die thread-spezifische ersetzen
-		buf[9] = REPLACE_HIWORD(buf[9], cuda_swab32(nounce));
+		// The test Nonce
+		((uint32_t*)buf)[19] = cuda_swab32(nounce);
 
-		// die einzige Hashing-Runde
 		quark_blake512_compress( h, buf, c_sigma, c_u512, 640 );
 
-		// Hash rauslassen
-#if __CUDA_ARCH__ >= 130
-		// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind
+#if __CUDA_ARCH__ <= 350
 		uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
 #pragma unroll 8
-		for (i=0; i < 8; ++i) {
-			outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
+		for (uint32_t i=0; i < 8; i++) {
+			outHash[2*i] = cuda_swab32( _HIWORD(h[i]) );
 			outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
 		}
 #else
-		// in dieser Version passieren auch ein paar 64 Bit Shifts
 		uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
-
 #pragma unroll 8
-		for (i=0; i < 8; ++i) {
+		for (uint32_t i=0; i < 8; i++) {
 			outHash[i] = cuda_swab64( h[i] );
 		}
 #endif
+
 	}
 }
diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu
index be6eda8..a905ec4 100644
--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@@ -6,12 +6,12 @@ extern "C"
 #include "sph/sph_skein.h"
 #include "sph/sph_jh.h"
 #include "sph/sph_keccak.h"
+}
+
 #include "miner.h"
 
 #include "cuda_helper.h"
-}
 
-// aus cpu-miner.c
 extern int device_map[8];
 
 // Speicher für Input/Output der verketteten Hashfunktionen
@@ -70,76 +70,64 @@ extern "C" void quarkhash(void *state, const void *input)
 	unsigned char hash[64];
 
 	sph_blake512_init(&ctx_blake);
-	// ZBLAKE;
 	sph_blake512 (&ctx_blake, input, 80);
 	sph_blake512_close(&ctx_blake, (void*) hash);
 
 	sph_bmw512_init(&ctx_bmw);
-	// ZBMW;
 	sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
 	sph_bmw512_close(&ctx_bmw, (void*) hash);
 
 	if (hash[0] & 0x8)
 	{
 		sph_groestl512_init(&ctx_groestl);
-		// ZGROESTL;
 		sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
 		sph_groestl512_close(&ctx_groestl, (void*) hash);
 	}
 	else
 	{
 		sph_skein512_init(&ctx_skein);
-		// ZSKEIN;
 		sph_skein512 (&ctx_skein, (const void*) hash, 64);
 		sph_skein512_close(&ctx_skein, (void*) hash);
 	}
 
 	sph_groestl512_init(&ctx_groestl);
-	// ZGROESTL;
 	sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
 	sph_groestl512_close(&ctx_groestl, (void*) hash);
 
 	sph_jh512_init(&ctx_jh);
-	// ZJH;
 	sph_jh512 (&ctx_jh, (const void*) hash, 64);
 	sph_jh512_close(&ctx_jh, (void*) hash);
 
 	if (hash[0] & 0x8)
 	{
 		sph_blake512_init(&ctx_blake);
-		// ZBLAKE;
 		sph_blake512 (&ctx_blake, (const void*) hash, 64);
 		sph_blake512_close(&ctx_blake, (void*) hash);
 	}
 	else
 	{
 		sph_bmw512_init(&ctx_bmw);
-		// ZBMW;
 		sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
 		sph_bmw512_close(&ctx_bmw, (void*) hash);
 	}
 
 	sph_keccak512_init(&ctx_keccak);
-	// ZKECCAK;
 	sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
 	sph_keccak512_close(&ctx_keccak, (void*) hash);
 
 	sph_skein512_init(&ctx_skein);
-	// SKEIN;
 	sph_skein512 (&ctx_skein, (const void*) hash, 64);
 	sph_skein512_close(&ctx_skein, (void*) hash);
 
 	if (hash[0] & 0x8)
 	{
 		sph_keccak512_init(&ctx_keccak);
-		// ZKECCAK;
 		sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
 		sph_keccak512_close(&ctx_keccak, (void*) hash);
 	}
 	else
 	{
 		sph_jh512_init(&ctx_jh);
-		// ZJH;
 		sph_jh512 (&ctx_jh, (const void*) hash, 64);
 		sph_jh512_close(&ctx_jh, (void*) hash);
 	}
@@ -147,23 +135,17 @@ extern "C" void quarkhash(void *state, const void *input)
 	memcpy(state, hash, 32);
 }
 
-
-extern bool opt_benchmark;
-
 extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
+	const int throughput = 256*4096; // 100;
+	static bool init[8] = {0,0,0,0,0,0,0,0};
 
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x0000ff;
-
-	const uint32_t Htarg = ptarget[7];
+		((uint32_t*)ptarget)[7] = 0x00FF;
 
-	const int throughput = 256*4096; // 100;
-
-	static bool init[8] = {0,0,0,0,0,0,0,0};
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
@@ -252,11 +234,12 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
 		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
 		if (foundNonce != 0xffffffff)
 		{
+			const uint32_t Htarg = ptarget[7];
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
 			quarkhash(vhash64, endiandata);
 
-			if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {
+			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
 
 				pdata[19] = foundNonce;
 				*hashes_done = (foundNonce - first_nonce + 1)/2;
diff --git a/x11/x11.cu b/x11/x11.cu
index 3c18030..dc2f97f 100644
--- a/x11/x11.cu
+++ b/x11/x11.cu
@@ -21,10 +21,9 @@ extern "C"
 #include
 }
 
-// aus cpu-miner.c
+// in cpu-miner.c
 extern int device_map[8];
 
-// Speicher für Input/Output der verketteten Hashfunktionen
 static uint32_t *d_hash[8];
 extern void quark_blake512_cpu_init(int thr_id, int threads);
 extern void quark_blake512_cpu_setBlock_80(void *pdata);
@@ -140,22 +139,17 @@ extern "C" void x11hash(void *output, const void *input)
 }
 
-extern bool opt_benchmark;
-
 extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
+	const int throughput = 256*256*8;
+	static bool init[8] = {0,0,0,0,0,0,0,0};
 
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
 
-	const uint32_t Htarg = ptarget[7];
-
-	const int throughput = 256*256*8;
-
-	static bool init[8] = {0,0,0,0,0,0,0,0};
 
 	if (!init[thr_id])
 	{
 		CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
@@ -186,8 +180,10 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 	cuda_check_cpu_setTarget(ptarget);
 
 	do {
-		uint32_t foundNonce;
+		const uint32_t Htarg = ptarget[7];
+
 		int order = 0;
+		uint32_t foundNonce;
 
 		// Hash with CUDA
 		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
@@ -204,7 +200,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 
 		// Scan nach Gewinner Hashes auf der GPU
 		foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		if  (foundNonce != 0xffffffff)
+		if (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
diff --git a/x15/x14.cu b/x15/x14.cu
index 0b56584..b3519cd 100644
--- a/x15/x14.cu
+++ b/x15/x14.cu
@@ -20,11 +20,11 @@ extern "C" {
 #include "sph/sph_hamsi.h"
 #include "sph/sph_fugue.h"
 #include "sph/sph_shabal.h"
+}
 
 #include "miner.h"
 #include "cuda_helper.h"
-}
 
 // from cpu-miner.c
 extern int device_map[8];
 
@@ -167,8 +167,6 @@ extern "C" void x14hash(void *output, const void *input)
 }
 
-extern bool opt_benchmark;
-
 extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done)
 
diff --git a/x15/x15.cu b/x15/x15.cu
index 50e2080..faea354 100644
--- a/x15/x15.cu
+++ b/x15/x15.cu
@@ -21,14 +21,11 @@ extern "C" {
 #include "sph/sph_fugue.h"
 #include "sph/sph_shabal.h"
 #include "sph/sph_whirlpool.h"
+}
 
 #include "miner.h"
 #include "cuda_helper.h"
-}
-
-// to test gpu hash on a null buffer
-#define NULLTEST 0
 
 // from cpu-miner.c
 extern int device_map[8];
 
@@ -92,8 +89,6 @@ extern void quark_compactTest_cpu_init(int thr_id, int threads);
 extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
 	uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order);
 
-extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-
 // X15 CPU Hash function
 extern "C" void x15hash(void *output, const void *input)
 {
@@ -181,17 +176,6 @@ extern "C" void x15hash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }
 
-#if NULLTEST
-static void print_hash(unsigned char *hash)
-{
-	for (int i=0; i < 32; i += 4) {
-		printf("%02x%02x%02x%02x ", hash[i], hash[i+1], hash[i+2], hash[i+3]);
-	}
-}
-#endif
-
-extern bool opt_benchmark;
-
 extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done)
@@ -203,12 +187,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 	uint32_t Htarg = ptarget[7];
 
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = Htarg = 0x0000ff;
-
-#if NULLTEST
-	for (int k=0; k < 20; k++)
-		pdata[k] = 0;
-#endif
+		((uint32_t*)ptarget)[7] = Htarg = 0x00FF;
 
 	if (!init[thr_id])
 	{
@@ -259,12 +238,6 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 		x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 
-#if NULLTEST
-		uint32_t buf[8]; memset(buf, 0, sizeof buf);
-		CUDA_SAFE_CALL(cudaMemcpy(buf, d_hash[thr_id], sizeof buf, cudaMemcpyDeviceToHost));
-		CUDA_SAFE_CALL(cudaThreadSynchronize());
-		print_hash((unsigned char*)buf); printf("\n");
-#endif
 		/* Scan with GPU */
 		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 
diff --git a/x17/x17.cu b/x17/x17.cu
index ffcd57a..65d2259 100644
--- a/x17/x17.cu
+++ b/x17/x17.cu
@@ -26,17 +26,15 @@ extern "C"
 #include "sph/sph_sha2.h"
 #include "sph/sph_haval.h"
+}
 
 #include "miner.h"
-}
+#include "cuda_helper.h"
 
 static uint32_t *d_hash[8];
 
-
-// cpu-miner.c
+// in cpu-miner.c
 extern int device_map[8];
 
-extern bool opt_benchmark;
-
 extern void quark_blake512_cpu_init(int thr_id, int threads);
 extern void quark_blake512_cpu_setBlock_80(void *pdata);
 
@@ -204,20 +202,12 @@ extern "C" int scanhash_x17(int thr_id, uint32_t *pdata,
 	unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
-
-	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x0000ff;
-
 	const int throughput = 256*256*8;
-
-	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x0000ff;
-
 	static bool init[8] = {0,0,0,0,0,0,0,0};
 	uint32_t Htarg = ptarget[7];
 
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = Htarg = 0x0000ff;
+		((uint32_t*)ptarget)[7] = Htarg = 0x00FF;
 
 	if (!init[thr_id])
 	{