bump to revision V1.1 with Killer Groestl

11 years ago · 3b21069504
35 changed files with 1301 additions and 1387 deletions
--- a/JHA/cuda_jha_keccak512.cu
+++ b/JHA/cuda_jha_keccak512.cu
@ -567,8 +567,6 @@ __host__ void jackpot_keccak512_cpu_hash(int thr_id, int threads, uint32_t start
    // Größe des dynamischen Shared Memory Bereichs
    size_t shared_size = 0;
 //    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
    jackpot_keccak512_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash);
    MyStreamSynchronize(NULL, order, thr_id);
 }
--- a/JHA/jackpotcoin.cu
+++ b/JHA/jackpotcoin.cu
@ -101,14 +101,12 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 {
 	const uint32_t first_nonce = pdata[19];
 	// TODO: entfernen für eine Release! Ist nur zum Testen!
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
 	const uint32_t Htarg = ptarget[7];
 	const int throughput = 256*4096*4; // 100;
 	//const int throughput = 256*256*2+100; // 100;
 	static bool init[8] = {0,0,0,0,0,0,0,0};
 	if (!init[thr_id])
@ -167,16 +165,18 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 			quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
 		}
-		// Runde 2 (ohne Gröstl)
+		// Runde 3 (komplett)
 		// jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01)
 		jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
 			d_branch1Nonces[thr_id], &nrm1,
-			d_branch3Nonces[thr_id], &nrm3,
+			d_branch2Nonces[thr_id], &nrm2,
 			order++);
-		// verfolge den skein-pfad weiter
+		if (nrm1+nrm2 == nrm3) {
-		quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+			quark_groestl512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
 			quark_skein512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
 		}
 		// jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01)
 		jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
@ -226,7 +226,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 			if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {
 				pdata[19] = foundNonce;
-				*hashes_done = (foundNonce - first_nonce + 1)/4;
+				*hashes_done = (foundNonce - first_nonce + 1)/2;
 				//applog(LOG_INFO, "GPU #%d: result for nonce $%08X does validate on CPU (%d rounds)!", thr_id, foundNonce, rounds);
 				return 1;
 			} else {
@ -238,6 +238,6 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
-	*hashes_done = (pdata[19] - first_nonce + 1)/4;
+	*hashes_done = (pdata[19] - first_nonce + 1)/2;
 	return 0;
 }
--- a/README.txt
+++ b/README.txt
@ -1,5 +1,5 @@
-ccMiner release 1.0 (May 10th 2014) - "Did anyone say X11?"
+ccMiner release 1.1 (June 14th 2014) - "Killer Groestl!"
 -------------------------------------------------------------
 ***************************************************************
@ -30,13 +30,12 @@ FugueCoin
 GroestlCoin & Myriad-Groestl
 JackpotCoin
 QuarkCoin family & AnimeCoin
 TalkCoin
 DarkCoin and other X11 coins
 where some of these coins have a VERY NOTABLE nVidia advantage
 over competing AMD (OpenCL) implementations.
 X11 algo is being worked on. It will be released when we
 have achieved a nice nVidia advantage.
 We did not put much effort into improving usability, so please set
 your parameters carefully.
@ -140,6 +139,12 @@ features.
 >>> RELEASE HISTORY <<<
  June 14th 2014  released Killer Groestl quad version which I deem
                  sufficiently hard to port over to AMD. It isn't
                  the fastest option for Compute 3.5 and 5.0 cards,
                  but it is still much faster than the table based
                  versions.
  May 10th 2014   added X11, but without the bells & whistles
                  (no killer Groestl, SIMD hash quite slow still)
--- a/bitslice_transformations_quad.cu
+++ b/bitslice_transformations_quad.cu
@ -0,0 +1,418 @@
 /*
  * Converts eight 32-bit words per thread into the bit-sliced form used by
  * the "quad" code path, where groups of four consecutive threads cooperate.
  *
  * input  : 8 words; OVERWRITTEN with shuffled intermediate values.
  * output : 8 words; output[b] receives bit b (low half) and bit b+8
  *          (high half) of every gathered word, interleaved as described
  *          in stage 2 below.
  *
  * All shuffles use a width of 4, so data only moves between the four
  * threads of one quad.
  * NOTE(review): __shfl is the legacy (non-_sync) shuffle intrinsic; newer
  * toolkits/architectures require __shfl_sync with an explicit mask --
  * confirm the intended toolkit before changing it.
  */
 __device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *output)
 {
    const int lane = threadIdx.x % 4;

    // Source lanes for the four exchanges performed on every word.
    const int  swappedLane = lane ^ (3*(lane >=1 && lane <=2)); // 0,1,2,3 -> 0,2,1,3
    const int  nextLane    = (threadIdx.x + 1) % 4;
    const int  pairLane    = threadIdx.x & 2;                   // first lane of this lane's pair
    const bool oddThread   = (threadIdx.x & 1) != 0;

    uint32_t other[8];

    // Stage 1: gather words from the other threads of the quad.
 #pragma unroll 8
    for (int w = 0; w < 8; w++) {
        input[w] = __shfl((int)input[w], swappedLane, 4);
        other[w] = __shfl((int)input[w], nextLane, 4);
        input[w] = __shfl((int)input[w], pairLane, 4);
        other[w] = __shfl((int)other[w], pairLane, 4);
        if (oddThread) {
            // selector 0x1032 swaps the two 16-bit halves of the word
            input[w] = __byte_perm(input[w], 0, 0x1032);
            other[w] = __byte_perm(other[w], 0, 0x1032);
        }
        output[w] = 0;
    }

    // Stage 2: bit transposition. For every bit index b of the low byte
    //   output[b] bit (2*w)      <- input[w] bit b
    //   output[b] bit (2*w + 1)  <- other[w] bit b
    //   output[b] bit (2*w + 16) <- input[w] bit (b + 8)
    //   output[b] bit (2*w + 17) <- other[w] bit (b + 8)
    // Only bits 0..15 of input[]/other[] are consumed here.
 #pragma unroll
    for (int b = 0; b < 8; b++) {
        // low half of output[b]: taken from byte 0 of each word
 #pragma unroll
        for (int w = 0; w < 8; w++) {
            output[b] |= ((input[w] >> b) & 0x00000001) << (2*w);
            output[b] |= ((other[w] >> b) & 0x00000001) << (2*w + 1);
        }
        // high half of output[b]: taken from byte 1 of each word
 #pragma unroll
        for (int w = 0; w < 8; w++) {
            output[b] |= ((input[w] >> (b + 8)) & 0x00000001) << (2*w + 16);
            output[b] |= ((other[w] >> (b + 8)) & 0x00000001) << (2*w + 17);
        }
    }
 }
 /*
  * Inverse of the quad bit-slicing step: rebuilds 16 output words from the
  * eight bit-sliced input words held by the four threads of a quad.
  *
  * input  : 8 bit-sliced words (read only; only bits 8..15 and 24..31 of
  *          each word are consumed).
  * output : 16 words. After the call only the first thread of each quad
  *          (threadIdx.x % 4 == 0) holds data; the other three threads
  *          have all 16 words set to zero.
  *
  * NOTE(review): __shfl is the legacy (non-_sync) shuffle intrinsic; newer
  * toolkits/architectures require __shfl_sync with an explicit mask --
  * confirm the intended toolkit before changing it.
  */
 __device__ __forceinline__ void from_bitslice_quad(uint32_t *input, uint32_t *output)
 {
    // Even-indexed words are accumulated with |=, so clear them first.
    // Odd-indexed words are assigned outright in the final loop.
 #pragma unroll 8
    for (int w = 0; w < 16; w += 2) output[w] = 0;

    // Bit transposition. For every b in 0..7 and slice s in 0..7:
    //   output[2*b] bit s       <- input[s] bit (b + 8)
    //   output[2*b] bit (s + 8) <- input[s] bit (b + 24)
 #pragma unroll
    for (int b = 0; b < 8; b++) {
 #pragma unroll
        for (int s = 0; s < 8; s++)
            output[2*b] |= ((input[s] >> (b + 8)) & 0x00000001) << s;
 #pragma unroll
        for (int s = 0; s < 8; s++)
            output[2*b] |= ((input[s] >> (b + 24)) & 0x00000001) << (s + 8);
    }

    const bool oddThread    = (threadIdx.x & 1) != 0;
    const int  nextLane     = (threadIdx.x + 1) % 4;
    const int  oppositeLane = (threadIdx.x + 2) % 4;
    const bool notFirstLane = (threadIdx.x % 4) != 0;

    // Merge the 16-bit partial results of the four threads of the quad.
 #pragma unroll 8
    for (int w = 0; w < 16; w += 2) {
        // odd threads move their 16 result bits into the upper half
        if (oddThread) output[w] = __byte_perm(output[w], 0, 0x1032);

        // selector 0x7610: low half from this thread, high half from the
        // next lane of the quad
        uint32_t neighbour = __shfl((int)output[w], nextLane, 4);
        output[w] = __byte_perm(output[w], neighbour, 0x7610);

        // the following word comes from the lane two positions ahead
        output[w+1] = __shfl((int)output[w], oppositeLane, 4);

        // only the first lane of each quad keeps the assembled words
        if (notFirstLane) output[w] = output[w+1] = 0;
    }
 }
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -287,6 +287,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
    <ClInclude Include="uint256.h" />
  </ItemGroup>
  <ItemGroup>
    <CudaCompile Include="bitslice_transformations_quad.cu">
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
    </CudaCompile>
    <CudaCompile Include="cuda_fugue256.cu">
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@ -311,6 +317,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
    </CudaCompile>
    <CudaCompile Include="groestl_functions_quad.cu">
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
    </CudaCompile>
    <CudaCompile Include="heavy\cuda_blake512.cu">
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -340,5 +340,11 @@
    <CudaCompile Include="x11\simd_functions.cu">
      <Filter>Source Files\CUDA\x11</Filter>
    </CudaCompile>
    <CudaCompile Include="bitslice_transformations_quad.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
    <CudaCompile Include="groestl_functions_quad.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
  </ItemGroup>
 </Project>
--- a/configure.ac
+++ b/configure.ac
@ -1,4 +1,4 @@
-AC_INIT([ccminer], [2014.05.10])
+AC_INIT([ccminer], [2014.06.14])
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@ -917,8 +917,8 @@ static void *miner_thread(void *userdata)
 			goto out;
 		}
-        if (opt_benchmark)
+//        if (opt_benchmark)
-            if (++rounds == 1) exit(0);
+//            if (++rounds == 1) exit(0);
 		/* record scanhash elapsed time */
 		gettimeofday(&tv_end, NULL);
@ -1469,7 +1469,7 @@ static void signal_handler(int sig)
 }
 #endif
-#define PROGRAM_VERSION "1.0"
+#define PROGRAM_VERSION "1.1"
 int main(int argc, char *argv[])
 {
 	struct thr_info *thr;
--- a/cpuminer-config.h
+++ b/cpuminer-config.h
@ -152,7 +152,7 @@
 #define PACKAGE_NAME "ccminer"
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "ccminer 2014.05.10"
+#define PACKAGE_STRING "ccminer 2014.06.14"
 /* Define to the one symbol short name of this package. */
 #undef PACKAGE_TARNAME
@ -161,7 +161,7 @@
 #undef PACKAGE_URL
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "2014.05.10"
+#define PACKAGE_VERSION "2014.06.14"
 /* If using the C implementation of alloca, define if you know the
   direction of stack growth for your system; otherwise it will be
--- a/cuda_groestlcoin.cu
+++ b/cuda_groestlcoin.cu
@ -1,4 +1,4 @@
-// Auf Groestlcoin spezialisierte Version von Groestl
+// Auf Groestlcoin spezialisierte Version von Groestl inkl. Bitslice
 #include <cuda.h>
 #include "cuda_runtime.h"
@ -7,9 +7,6 @@
 #include <stdio.h>
 #include <memory.h>
 // it's unfortunate that this is a compile time constant.
 #define MAXWELL_OR_FERMI 1
 // aus cpu-miner.c
 extern int device_map[8];
@ -18,456 +15,152 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
 // Folgende Definitionen später durch header ersetzen
 typedef unsigned char uint8_t;
 typedef unsigned short uint16_t;
 typedef unsigned int uint32_t;
 typedef unsigned long long uint64_t;
 // diese Struktur wird in der Init Funktion angefordert
-static cudaDeviceProp props;
+static cudaDeviceProp props[8];
 // globaler Speicher für alle HeftyHashes aller Threads
 __constant__ uint32_t pTarget[8]; // Single GPU
 extern uint32_t *d_resultNonce[8];
 __constant__ uint32_t groestlcoin_gpu_msg[32];
-#define SPH_C32(x)    ((uint32_t)(x ## U))
+// 64 Register Variante für Compute 3.0
-#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
+#include "groestl_functions_quad.cu"
-
+#include "bitslice_transformations_quad.cu"
-#define PC32up(j, r)   ((uint32_t)((j) + (r)))
+
-#define PC32dn(j, r)   0
+#define SWAB32(x)        ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
-#define QC32up(j, r)   0xFFFFFFFF
+
-#define QC32dn(j, r)   (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
+__global__ void __launch_bounds__(256, 4)
-
+ groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resNounce)
 // Byte-extraction helpers. B32_n(x) uses __byte_perm to place byte n of x
 // (n = 0 is the least significant byte) in the low byte of the result; the
 // equivalent shift/mask form is given in the comment under each macro.
 #define B32_0(x)    __byte_perm(x, 0, 0x4440)
 //((x) & 0xFF)
 #define B32_1(x)    __byte_perm(x, 0, 0x4441)
 //(((x) >> 8) & 0xFF)
 #define B32_2(x)    __byte_perm(x, 0, 0x4442)
 //(((x) >> 16) & 0xFF)
 #define B32_3(x)    __byte_perm(x, 0, 0x4443)
 //((x) >> 24)
 // Table-lookup macros T{0..3}{up,dn}(x). Depending on the compile-time switch
 // MAXWELL_OR_FERMI they read either from the array `mixtabs` (eight consecutive
 // 256-entry uint32_t tables, in the order T0up, T0dn, T1up, T1dn, T2up, T2dn,
 // T3up, T3dn) or from the texture references declared further down.
 // The macros expect a variable named `mixtabs` to be in scope at the use site.
 // Note that every branch below that defines USE_SHARED sets it to 1.
 #if MAXWELL_OR_FERMI
 #define USE_SHARED 1
 // Maxwell and Fermi cards get the best speed with SHARED access it seems.
 #if USE_SHARED
 #define T0up(x) (*((uint32_t*)mixtabs + (    (x))))
 #define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
 #define T1up(x) (*((uint32_t*)mixtabs + (512+(x))))
 #define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
 #define T2up(x) (*((uint32_t*)mixtabs + (1024+(x))))
 #define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
 #define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
 #define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x))))
 #else
 // All eight tables are fetched through the texture references.
 #define T0up(x) tex1Dfetch(t0up1, x)
 #define T0dn(x) tex1Dfetch(t0dn1, x)
 #define T1up(x) tex1Dfetch(t1up1, x)
 #define T1dn(x) tex1Dfetch(t1dn1, x)
 #define T2up(x) tex1Dfetch(t2up1, x)
 #define T2dn(x) tex1Dfetch(t2dn1, x)
 #define T3up(x) tex1Dfetch(t3up1, x)
 #define T3dn(x) tex1Dfetch(t3dn1, x)
 #endif
 #else
 #define USE_SHARED 1
 // a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5!
 // (T0up, T1dn, T2dn and T3up come from mixtabs; the other four are texture fetches.)
 #define T0up(x) (*((uint32_t*)mixtabs + (    (x))))
 #define T0dn(x) tex1Dfetch(t0dn1, x)
 #define T1up(x) tex1Dfetch(t1up1, x)
 #define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
 #define T2up(x) tex1Dfetch(t2up1, x)
 #define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
 #define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
 #define T3dn(x) tex1Dfetch(t3dn1, x)
 #endif
 // One 1D texture reference of unsigned int elements per lookup table.
 // NOTE(review): this is the legacy texture-reference API; confirm that the
 // CUDA toolkit used for the build still supports it.
 texture<unsigned int, 1, cudaReadModeElementType> t0up1;
 texture<unsigned int, 1, cudaReadModeElementType> t0dn1;
 texture<unsigned int, 1, cudaReadModeElementType> t1up1;
 texture<unsigned int, 1, cudaReadModeElementType> t1dn1;
 texture<unsigned int, 1, cudaReadModeElementType> t2up1;
 texture<unsigned int, 1, cudaReadModeElementType> t2dn1;
 texture<unsigned int, 1, cudaReadModeElementType> t3up1;
 texture<unsigned int, 1, cudaReadModeElementType> t3dn1;
 // Host-side copies of the eight lookup tables; defined in another
 // translation unit (sizes are not visible here).
 extern uint32_t T0up_cpu[];
 extern uint32_t T0dn_cpu[];
 extern uint32_t T1up_cpu[];
 extern uint32_t T1dn_cpu[];
 extern uint32_t T2up_cpu[];
 extern uint32_t T2dn_cpu[];
 extern uint32_t T3up_cpu[];
 extern uint32_t T3dn_cpu[];
 // Reverses the byte order of a 32-bit value.
 // NOTE(review): the argument x is not parenthesised inside the expansion, so
 // pass a plain variable or an already-parenthesised expression.
 #define SWAB32(x)		( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
 __device__ __forceinline__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
 {
-	uint32_t t[32];
+    // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
-
+    int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4;
-//#pragma unroll 14
+    if (thread < threads)
-	for(int r=0;r<14;r++)
+    {
-	{
+        // GROESTL
-		switch(r)
+        uint32_t paddedInput[8];
-		{
+#pragma unroll 8
-			case 0:
+        for(int k=0;k<8;k++) paddedInput[k] = groestlcoin_gpu_msg[4*k+threadIdx.x%4];
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break;
 			case 1:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break;
 			case 2:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break;
 			case 3:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break;
 			case 4:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break;
 			case 5:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break;
 			case 6:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break;
 			case 7:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break;
 			case 8:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break;
 			case 9:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break;
 			case 10:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break;
 			case 11:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break;
 			case 12:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break;
 			case 13:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break;
 		}
        // RBTT
 #pragma unroll 16
        for(int k=0;k<32;k+=2)
        {
            uint32_t t0_0 = B32_0(a[(k     ) & 0x1f]), t9_0  = B32_0(a[(k +  9) & 0x1f]);
            uint32_t t2_1 = B32_1(a[(k +  2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]);
            uint32_t t4_2 = B32_2(a[(k +  4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]);
            uint32_t t6_3 = B32_3(a[(k +  6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]);
-            t[k + 0] =  T0up( t0_0 ) ^ T1up(  t2_1 ) ^ T2up(  t4_2 ) ^ T3up(  t6_3 ) ^ 
+        uint32_t nounce = startNounce + thread;
-                        T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 );
+        if ((threadIdx.x % 4) == 3)
            paddedInput[4] = SWAB32(nounce);  // 4*4+3 = 19
-            t[k + 1] =  T0dn( t0_0 ) ^ T1dn(  t2_1 ) ^ T2dn(  t4_2 ) ^ T3dn(  t6_3 ) ^ 
+        uint32_t msgBitsliced[8];
-                        T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 );
+        to_bitslice_quad(paddedInput, msgBitsliced);
        }
 #pragma unroll 32
        for(int k=0;k<32;k++)
            a[k] = t[k];
    }
 }
-__device__ __forceinline__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
+        uint32_t state[8];
-{	
+        for (int round=0; round<2; round++)
 //#pragma unroll 14
 	for(int r=0;r<14;r++)
 	{
 		uint32_t t[32];
 		switch(r)
 		{
 			case 0:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break;
 			case 1:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break;
 			case 2:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break;
 			case 3:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break;
 			case 4:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break;
 			case 5:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break;
 			case 6:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break;
 			case 7:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break;
 			case 8:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break;
 			case 9:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break;
 			case 10:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break;
 			case 11:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break;
 			case 12:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break;
 			case 13:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break;
 		}
        // RBTT
 #pragma unroll 16
        for(int k=0;k<32;k+=2)
        {
-            uint32_t t2_0  = B32_0(a[(k +  2) & 0x1f]), t1_0  = B32_0(a[(k +  1) & 0x1f]);
+            groestl512_progressMessage_quad(state, msgBitsliced);
-            uint32_t t6_1  = B32_1(a[(k +  6) & 0x1f]), t5_1  = B32_1(a[(k +  5) & 0x1f]);
+
-            uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2  = B32_2(a[(k +  9) & 0x1f]);
+            if (round < 1)
-            uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]);
+            {
                // Verkettung zweier Runden inclusive Padding.
                msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + ((threadIdx.x%4)==3)*0x2000);
                msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341);
                msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341);
                msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341);
                msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341);
                msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341);
                msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341);
                msgBitsliced[ 7] = __byte_perm(state[ 7], 0x00800100, 0x4341 + ((threadIdx.x%4)==0)*0x0010);
            }
        }
-            t[k + 0] =  T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^ 
+        // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash
-                        T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn(  t9_2 ) ^ T3dn( t13_3 );
+        uint32_t out_state[16];
        from_bitslice_quad(state, out_state);
-            t[k + 1] =  T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^ 
+        if (threadIdx.x % 4 == 0)
-                        T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up(  t9_2 ) ^ T3up( t13_3 );
+        {
            int i, position = -1;
            bool rc = true;
    #pragma unroll 8
            for (i = 7; i >= 0; i--) {
                if (out_state[i] > pTarget[i]) {
                    if(position < i) {
                        position = i;
                        rc = false;
                    }
                 }
                 if (out_state[i] < pTarget[i]) {
                    if(position < i) {
                        position = i;
                        rc = true;
                    }
                 }
            }
            if(rc == true)
                if(resNounce[0] > nounce)
                    resNounce[0] = nounce;
        }
 #pragma unroll 32
        for(int k=0;k<32;k++)
            a[k] = t[k];
    }
 }
 // groestlcoin_gpu_hash: one thread per nonce.
 //
 // Each thread takes the 32-word message block from the constant array
 // groestlcoin_gpu_msg, replaces word 19 with the byte-swapped nonce
 // (startNounce + global thread index), runs two chained passes built from
 // groestlcoin_perm_P / groestlcoin_perm_Q and compares words 16..23 of the
 // final state against the constant array pTarget.
 //
 // Parameters:
 //   threads     - number of nonces to test; threads whose global index is
 //                 >= threads skip the hashing (they still help to fill the
 //                 shared tables).
 //   startNounce - nonce assigned to global thread index 0.
 //   resNounce   - device pointer; resNounce[0] is overwritten with the nonce
 //                 of a passing thread if that nonce is smaller than the value
 //                 currently stored there.
 //
 // Launch requirements when USE_SHARED is set: dynamic shared memory must hold
 // 8 * 256 uint32_t values, and blockDim.x must be at least 256 because each of
 // the threads 0..255 copies exactly one entry of every table.
 #if USE_SHARED
 __global__ void  /* __launch_bounds__(256) */
 #else
 __global__ void 
 #endif
 groestlcoin_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce)
 {
 #if USE_SHARED
 	// Stage the eight 256-entry lookup tables from the textures into dynamic
 	// shared memory, laid out back to back in the order the T*up/T*dn macros expect.
 	extern __shared__ char mixtabs[];
 	if (threadIdx.x < 256)
 	{
 		*((uint32_t*)mixtabs + (    threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x);
 		*((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x);
 		*((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x);
 		*((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x);
 		*((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x);
 		*((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x);
 		*((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x);
 		*((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x);
 	}
 	// Barrier is outside the branch so every thread of the block reaches it.
 	__syncthreads();
 #endif
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		// GROESTL
 		uint32_t message[32];
 		uint32_t state[32];
 #pragma unroll 32
 		for(int k=0;k<32;k++) message[k] = groestlcoin_gpu_msg[k];
 		// Insert this thread's nonce (byte-swapped) at word 19 of the message.
 		uint32_t nounce = startNounce + thread;
 		message[19] = SWAB32(nounce);
 #pragma unroll 32
 		for(int u=0;u<32;u++) state[u] = message[u];
 		// The same constant is XORed into word 31 before and after P, so P sees
 		// message ^ constant and the constant is applied once more to P's output.
 		state[31] ^= 0x20000;
 		// Perm
 #if USE_SHARED
 		groestlcoin_perm_P(state, mixtabs);
 		state[31] ^= 0x20000;
 		groestlcoin_perm_Q(message, mixtabs);
 #else
 		groestlcoin_perm_P(state, NULL);
 		state[31] ^= 0x20000;
 		groestlcoin_perm_Q(message, NULL);
 #endif
 		// Combine the P and Q results, then apply P to a copy of that and XOR
 		// it back into the state.
 #pragma unroll 32
 		for(int u=0;u<32;u++) state[u] ^= message[u];
 #pragma unroll 32
 		for(int u=0;u<32;u++) message[u] = state[u];
 #if USE_SHARED
 		groestlcoin_perm_P(message, mixtabs);
 #else
 		groestlcoin_perm_P(message, NULL);
 #endif
 #pragma unroll 32
 		for(int u=0;u<32;u++) state[u] ^= message[u];
 		////
 		//// Second Groestl pass
 		////
 		// The upper 16 state words of the first pass become the first 16 message
 		// words of the second pass; words 17..30 are zeroed and words 16 and 31
 		// receive fixed values (presumably the padding for a 64-byte input --
 		// the host code uses the same two constants for its padding).
 #pragma unroll 16
 		for(int k=0;k<16;k++) message[k] = state[k + 16];
 #pragma unroll 14
 		for(int k=1;k<15;k++)
 			message[k+16] = 0;
 		message[16] = 0x80;
 		message[31] = 0x01000000;
 #pragma unroll 32
 		for(int u=0;u<32;u++)
 			state[u] = message[u];
 		state[31] ^= 0x20000;
 		// Perm
 #if USE_SHARED
 		groestlcoin_perm_P(state, mixtabs);
 		state[31] ^= 0x20000;
 		groestlcoin_perm_Q(message, mixtabs);
 #else
 		groestlcoin_perm_P(state, NULL);
 		state[31] ^= 0x20000;
 		groestlcoin_perm_Q(message, NULL);
 #endif
 #pragma unroll 32
 		for(int u=0;u<32;u++) state[u] ^= message[u];
 #pragma unroll 32
 		for(int u=0;u<32;u++) message[u] = state[u];
 #if USE_SHARED
 		groestlcoin_perm_P(message, mixtabs);
 #else
 		groestlcoin_perm_P(message, NULL);
 #endif
 #pragma unroll 32
 		for(int u=0;u<32;u++) state[u] ^= message[u];
 		// Check the result: compare state[16..23] with pTarget[0..7], starting
 		// at index 7. `position` is only updated at the highest index where the
 		// two differ, so rc ends up true when the hash word there is below the
 		// target word (or when all eight words are equal) and false otherwise.
 		int i, position = -1;
 		bool rc = true;
 #pragma unroll 8
 		for (i = 7; i >= 0; i--) {
 			if (state[i+16] > pTarget[i]) {
 				if(position < i) {
 					position = i;
 					rc = false;
 				}
 	 		}
 	 		if (state[i+16] < pTarget[i]) {
 				if(position < i) {
 					position = i;
 					rc = true;
 				}
 	 		}
 		}
 		// Keep the smallest passing nonce in resNounce[0].
 		// NOTE(review): this is a plain read-compare-write, not an atomic
 		// operation, so simultaneous passing threads may race on resNounce[0].
 		if(rc == true)
 			if(resNounce[0] > nounce)
 				resNounce[0] = nounce;
 	}
 }
 // texDef(texname, texmem, texsource, texsize)
 //   Declares a device pointer named `texmem`, allocates `texsize` bytes for it,
 //   copies `texsize` bytes from the host buffer `texsource` into it, configures
 //   the texture reference `texname` (unnormalized coordinates, point filtering,
 //   clamp addressing) and binds the texture to the new device buffer.
 //   Because the macro declares a variable, each `texmem` name can be used only
 //   once per scope. The return codes of the CUDA calls are not checked, and the
 //   device buffer is not freed by the macro.
 //   The last line of the macro ends with a line-continuation backslash, so the
 //   source line that follows the macro becomes part of its definition.
 #define texDef(texname, texmem, texsource, texsize) \
 	unsigned int *texmem; \
 	cudaMalloc(&texmem, texsize); \
 	cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
 	texname.normalized = 0; \
 	texname.filterMode = cudaFilterModePoint; \
 	texname.addressMode[0] = cudaAddressModeClamp; \
 	{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
 	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
 // Setup-Funktionen
 __host__ void groestlcoin_cpu_init(int thr_id, int threads)
 {
-	cudaSetDevice(device_map[thr_id]);
+    cudaSetDevice(device_map[thr_id]);
-
+
-	cudaGetDeviceProperties(&props, device_map[thr_id]);
+    cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
-
+
-	// Texturen mit obigem Makro initialisieren
+    // Speicher für Gewinner-Nonce belegen
-	texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
+    cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); 
 	texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
 	texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
 	texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
 	texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
 	texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
 	texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
 	texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
 	// Speicher für Gewinner-Nonce belegen
 	cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); 
 }
 __host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
 {
-	// Nachricht expandieren und setzen
+    // Nachricht expandieren und setzen
-	uint32_t msgBlock[32];
+    uint32_t msgBlock[32];
-	memset(msgBlock, 0, sizeof(uint32_t) * 32);
+    memset(msgBlock, 0, sizeof(uint32_t) * 32);
-	memcpy(&msgBlock[0], data, 80);
+    memcpy(&msgBlock[0], data, 80);
-	// Erweitere die Nachricht auf den Nachrichtenblock (padding)
+    // Erweitere die Nachricht auf den Nachrichtenblock (padding)
-	// Unsere Nachricht hat 80 Byte
+    // Unsere Nachricht hat 80 Byte
-	msgBlock[20] = 0x80;
+    msgBlock[20] = 0x80;
-	msgBlock[31] = 0x01000000;
+    msgBlock[31] = 0x01000000;
-	// groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird
+    // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird
-	// auf der GPU ausgeführt)
+    // auf der GPU ausgeführt)
-	// Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch)
+    // Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch)
-	cudaMemcpyToSymbol(	groestlcoin_gpu_msg,
+    cudaMemcpyToSymbol( groestlcoin_gpu_msg,
-						msgBlock,
+                        msgBlock,
-						128);
+                        128);
-	cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
+    cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
-	cudaMemcpyToSymbol(	pTarget,
+    cudaMemcpyToSymbol( pTarget,
-						pTargetIn,
+                        pTargetIn,
-						sizeof(uint32_t) * 8 );
+                        sizeof(uint32_t) * 8 );
 }
 __host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
 {
-	// Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
+    int threadsperblock = 256;
-	// alle anderen mit 512 Threads.
+
-	int threadsperblock = (props.major >= 3) ? 768 : 512;
+    // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
-
+    // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
-	// berechne wie viele Thread Blocks wir brauchen
+    int factor = 4;
-	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+
-	dim3 block(threadsperblock);
+        // berechne wie viele Thread Blocks wir brauchen
-
+    dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
-	// Größe des dynamischen Shared Memory Bereichs
+    dim3 block(threadsperblock);
-#if USE_SHARED
+
-	size_t shared_size = 8 * 256 * sizeof(uint32_t);
+    // Größe des dynamischen Shared Memory Bereichs
-#else
+    size_t shared_size = 0;
-	size_t shared_size = 0;
+
-#endif
+    cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
-
+    groestlcoin_gpu_hash_quad<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
-//	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
+
-	//fprintf(stderr, "ThrID: %d\n", thr_id);
+    // Strategisches Sleep Kommando zur Senkung der CPU Last
-	cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
+    MyStreamSynchronize(NULL, 0, thr_id);
-	groestlcoin_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
+
-
+    cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	// Strategisches Sleep Kommando zur Senkung der CPU Last
 	MyStreamSynchronize(NULL, 0, thr_id);
 	cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 }
--- a/cuda_myriadgroestl.cu
+++ b/cuda_myriadgroestl.cu
@ -1,4 +1,4 @@
-// Auf Myriadcoin spezialisierte Version von Groestl
+// Auf Myriadcoin spezialisierte Version von Groestl inkl. Bitslice
 #include <cuda.h>
 #include "cuda_runtime.h"
@ -7,9 +7,6 @@
 #include <stdio.h>
 #include <memory.h>
 // it's unfortunate that this is a compile time constant.
 #define MAXWELL_OR_FERMI 1
 // aus cpu-miner.c
 extern int device_map[8];
@ -22,30 +19,49 @@ typedef unsigned short uint16_t;
 typedef unsigned int uint32_t;
 // diese Struktur wird in der Init Funktion angefordert
-static cudaDeviceProp props;
+static cudaDeviceProp props[8];
 // globaler Speicher für alle HeftyHashes aller Threads
 __constant__ uint32_t pTarget[8]; // Single GPU
 uint32_t *d_outputHashes[8];
 extern uint32_t *d_resultNonce[8];
 __constant__ uint32_t myriadgroestl_gpu_msg[32];
 // muss expandiert werden
 __constant__ uint32_t myr_sha256_gpu_constantTable[64];
 __constant__ uint32_t myr_sha256_gpu_constantTable2[64];
 __constant__ uint32_t myr_sha256_gpu_hashTable[8];
 uint32_t myr_sha256_cpu_hashTable[] = { 
-	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
+    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
 uint32_t myr_sha256_cpu_constantTable[] = {
-	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
-	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
-	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
-	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
-	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
-	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
-	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
-	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
+    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 };
 // Host-side table of 64 precomputed 32-bit words.
 // The first 16 entries are 0x80000000, fourteen zero words and 0x00000200.
 // NOTE(review): this looks like the SHA-256 padding block for a 64-byte
 // message followed by its expanded message schedule (words 16..63), prepared
 // for the second SHA-256 block -- confirm against the code that uploads it.
 uint32_t myr_sha256_cpu_w2Table[] = {
    0x80000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 
    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000200,
    0x80000000, 0x01400000, 0x00205000, 0x00005088, 0x22000800, 0x22550014, 0x05089742, 0xa0000020,
    0x5a880000, 0x005c9400, 0x0016d49d, 0xfa801f00, 0xd33225d0, 0x11675959, 0xf6e6bfda, 0xb30c1549,
    0x08b2b050, 0x9d7c4c27, 0x0ce2a393, 0x88e6e1ea, 0xa52b4335, 0x67a16f49, 0xd732016f, 0x4eeb2e91,
    0x5dbf55e5, 0x8eee2335, 0xe2bc5ec2, 0xa83f4394, 0x45ad78f7, 0x36f3d0cd, 0xd99c05e8, 0xb0511dc7,
    0x69bc7ac4, 0xbd11375b, 0xe3ba71e5, 0x3b209ff2, 0x18feee17, 0xe25ad9e7, 0x13375046, 0x0515089d,
    0x4f0d0f04, 0x2627484e, 0x310128d2, 0xc668b434, 0x420841cc, 0x62d311b8, 0xe59ba771, 0x85a7a484 };
 // 64 Register Variante für Compute 3.0
 #include "groestl_functions_quad.cu"
 #include "bitslice_transformations_quad.cu"
 #define SWAB32(x)        ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
 #if __CUDA_ARCH__ < 350 
    // Kepler (Compute 3.0)
    #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
@ -53,452 +69,219 @@ uint32_t myr_sha256_cpu_constantTable[] = {
    // Kepler (Compute 3.5)
    #define ROTR32(x, n) __funnelshift_r( (x), (x), (n) )
 #endif
-#define R(x, n)			((x) >> (n))
+#define R(x, n)            ((x) >> (n))
-#define Ch(x, y, z)		((x & (y ^ z)) ^ z)
+#define Ch(x, y, z)        ((x & (y ^ z)) ^ z)
-#define Maj(x, y, z)	((x & (y | z)) | (y & z))
+#define Maj(x, y, z)    ((x & (y | z)) | (y & z))
-#define S0(x)			(ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22))
+#define S0(x)            (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22))
-#define S1(x)			(ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25))
+#define S1(x)            (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25))
-#define s0(x)			(ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3))
+#define s0(x)            (ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3))
-#define s1(x)			(ROTR32(x, 17) ^ ROTR32(x, 19) ^ R(x, 10))
+#define s1(x)            (ROTR32(x, 17) ^ ROTR32(x, 19) ^ R(x, 10))
 #define SWAB32(x)		( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
 __device__ void myriadgroestl_gpu_sha256(uint32_t *message)
 {
-	uint32_t W1[16];
+    uint32_t W1[16];
-	uint32_t W2[16];
+    uint32_t W2[16];
-	// Initialisiere die register a bis h mit der Hash-Tabelle
+    // Initialisiere die register a bis h mit der Hash-Tabelle
-	uint32_t regs[8];
+    uint32_t regs[8];
-	uint32_t hash[8];
+    uint32_t hash[8];
-	// pre
+    // pre
 #pragma unroll 8
-	for (int k=0; k < 8; k++)
+    for (int k=0; k < 8; k++)
-	{
+    {
-		regs[k] = myr_sha256_gpu_hashTable[k];
+        regs[k] = myr_sha256_gpu_hashTable[k];
-		hash[k] = regs[k];
+        hash[k] = regs[k];
-	}
+    }
 #pragma unroll 16
-	for(int k=0;k<16;k++)
+    for(int k=0;k<16;k++)
-		W1[k] = SWAB32(message[k]);
+        W1[k] = SWAB32(message[k]);
 // Progress W1
 #pragma unroll 16
-	for(int j=0;j<16;j++)
+    for(int j=0;j<16;j++)
-	{
+    {
-		uint32_t T1, T2;
+        uint32_t T1, T2;
-		T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j];
+        T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j];
-		T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
+        T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
-		
+        
-		#pragma unroll 7
+        #pragma unroll 7
-		for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
+        for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
-		regs[0] = T1 + T2;
+        regs[0] = T1 + T2;
-		regs[4] += T1;
+        regs[4] += T1;
-	}
+    }
 // Progress W2...W3
-#pragma unroll 3
+////// PART 1
 	for(int k=0;k<3;k++)
 	{
 #pragma unroll 2
-		for(int j=0;j<2;j++)
+    for(int j=0;j<2;j++)
-			W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
+        W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
 #pragma unroll 5
-		for(int j=2;j<7;j++)
+    for(int j=2;j<7;j++)
-			W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
+        W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
 #pragma unroll 8
-		for(int j=7;j<15;j++)
+    for(int j=7;j<15;j++)
-			W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
+        W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
-		W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
+    W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
-		// Rundenfunktion
+    // Rundenfunktion
 #pragma unroll 16
-		for(int j=0;j<16;j++)
+    for(int j=0;j<16;j++)
-		{
+    {
-			uint32_t T1, T2;
+        uint32_t T1, T2;
-			T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j];
+        T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16] + W2[j];
-			T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
+        T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
-		
+        
-			#pragma unroll 7
+        #pragma unroll 7
-			for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
+        for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
-			regs[0] = T1 + T2;
+        regs[0] = T1 + T2;
-			regs[4] += T1;
+        regs[4] += T1;
-		}
+    }
 #pragma unroll 16
 		for(int j=0;j<16;j++)
 			W1[j] = W2[j];
 	}
-#pragma unroll 8
+////// PART 2
-	for(int k=0;k<8;k++)
+#pragma unroll 2
-		hash[k] += regs[k];
+    for(int j=0;j<2;j++)
        W1[j] = s1(W2[14+j]) + W2[9+j] + s0(W2[1+j]) + W2[j];
 #pragma unroll 5
    for(int j=2;j<7;j++)
        W1[j] = s1(W1[j-2]) + W2[9+j] + s0(W2[1+j]) + W2[j];
 	/////
 	///// Zweite Runde (wegen Msg-Padding)
 	/////
 #pragma unroll 8
-	for(int k=0;k<8;k++)
+    for(int j=7;j<15;j++)
-		regs[k] = hash[k];
+        W1[j] = s1(W1[j-2]) + W1[j-7] + s0(W2[1+j]) + W2[j];
-	W1[0] = SWAB32(0x80);
+    W1[15] = s1(W1[13]) + W1[8] + s0(W1[0]) + W2[15];
 #pragma unroll 14
 	for(int k=1;k<15;k++)
 		W1[k] = 0;
 	W1[15] = 512;
-// Progress W1
+    // Rundenfunktion
 #pragma unroll 16
-	for(int j=0;j<16;j++)
+    for(int j=0;j<16;j++)
-	{
+    {
-		uint32_t T1, T2;
+        uint32_t T1, T2;
-		T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j];
+        T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 32] + W1[j];
-		T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
+        T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
-		
+        
-		#pragma unroll 7
+        #pragma unroll 7
-		for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
+        for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
-		regs[0] = T1 + T2;
+        regs[0] = T1 + T2;
-		regs[4] += T1;
+        regs[4] += T1;
-	}
+    }
-// Progress W2...W3
+////// PART 3
 #pragma unroll 3
 	for(int k=0;k<3;k++)
 	{
 #pragma unroll 2
-		for(int j=0;j<2;j++)
+    for(int j=0;j<2;j++)
-			W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
+        W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
 #pragma unroll 5
-		for(int j=2;j<7;j++)
+    for(int j=2;j<7;j++)
-			W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
+        W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
 #pragma unroll 8
-		for(int j=7;j<15;j++)
+    for(int j=7;j<15;j++)
-			W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
+        W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
-		W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
+    W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
-		// Rundenfunktion
+    // Rundenfunktion
 #pragma unroll 16
-		for(int j=0;j<16;j++)
+    for(int j=0;j<16;j++)
-		{
+    {
-			uint32_t T1, T2;
+        uint32_t T1, T2;
-			T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j];
+        T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 48] + W2[j];
-			T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
+        T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
-		
+        
-			#pragma unroll 7
+        #pragma unroll 7
-			for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
+        for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
-			regs[0] = T1 + T2;
+        regs[0] = T1 + T2;
-			regs[4] += T1;
+        regs[4] += T1;
-		}
+    }
 #pragma unroll 16
 		for(int j=0;j<16;j++)
 			W1[j] = W2[j];
 	}
 #pragma unroll 8
-	for(int k=0;k<8;k++)
+    for(int k=0;k<8;k++)
-		hash[k] += regs[k];
+        hash[k] += regs[k];
 	//// FERTIG
    /////
    ///// Zweite Runde (wegen Msg-Padding)
    /////
 #pragma unroll 8
-	for(int k=0;k<8;k++)
+    for(int k=0;k<8;k++)
-		message[k] = SWAB32(hash[k]);
+        regs[k] = hash[k];
 }
-#define SPH_C32(x)    ((uint32_t)(x ## U))
+// Progress W1
-#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
+#pragma unroll 64
-
+    for(int j=0;j<64;j++)
-#define PC32up(j, r)   ((uint32_t)((j) + (r)))
+    {
-#define PC32dn(j, r)   0
+        uint32_t T1, T2;
-#define QC32up(j, r)   0xFFFFFFFF
+        T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable2[j];
-#define QC32dn(j, r)   (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
+        T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
-
+        
-#define B32_0(x)    __byte_perm(x, 0, 0x4440)
+        #pragma unroll 7
-//((x) & 0xFF)
+        for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
-#define B32_1(x)    __byte_perm(x, 0, 0x4441)
+        regs[0] = T1 + T2;
-//(((x) >> 8) & 0xFF)
+        regs[4] += T1;
-#define B32_2(x)    __byte_perm(x, 0, 0x4442)
+    }
 //(((x) >> 16) & 0xFF)
 #define B32_3(x)    __byte_perm(x, 0, 0x4443)
 //((x) >> 24)
 #if MAXWELL_OR_FERMI
 #define USE_SHARED 1
 // Maxwell and Fermi cards get the best speed with SHARED access it seems.
 #if USE_SHARED
 #define T0up(x) (*((uint32_t*)mixtabs + (    (x))))
 #define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
 #define T1up(x) (*((uint32_t*)mixtabs + (512+(x))))
 #define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
 #define T2up(x) (*((uint32_t*)mixtabs + (1024+(x))))
 #define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
 #define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
 #define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x))))
 #else
 #define T0up(x) tex1Dfetch(t0up1, x)
 #define T0dn(x) tex1Dfetch(t0dn1, x)
 #define T1up(x) tex1Dfetch(t1up1, x)
 #define T1dn(x) tex1Dfetch(t1dn1, x)
 #define T2up(x) tex1Dfetch(t2up1, x)
 #define T2dn(x) tex1Dfetch(t2dn1, x)
 #define T3up(x) tex1Dfetch(t3up1, x)
 #define T3dn(x) tex1Dfetch(t3dn1, x)
 #endif
 #else
 #define USE_SHARED 1
 // a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5!
 #define T0up(x) (*((uint32_t*)mixtabs + (    (x))))
 #define T0dn(x) tex1Dfetch(t0dn1, x)
 #define T1up(x) tex1Dfetch(t1up1, x)
 #define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
 #define T2up(x) tex1Dfetch(t2up1, x)
 #define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
 #define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
 #define T3dn(x) tex1Dfetch(t3dn1, x)
 #endif
 texture<unsigned int, 1, cudaReadModeElementType> t0up1;
 texture<unsigned int, 1, cudaReadModeElementType> t0dn1;
 texture<unsigned int, 1, cudaReadModeElementType> t1up1;
 texture<unsigned int, 1, cudaReadModeElementType> t1dn1;
 texture<unsigned int, 1, cudaReadModeElementType> t2up1;
 texture<unsigned int, 1, cudaReadModeElementType> t2dn1;
 texture<unsigned int, 1, cudaReadModeElementType> t3up1;
 texture<unsigned int, 1, cudaReadModeElementType> t3dn1;
-extern uint32_t T0up_cpu[];
+#pragma unroll 8
-extern uint32_t T0dn_cpu[];
+    for(int k=0;k<8;k++)
-extern uint32_t T1up_cpu[];
+        hash[k] += regs[k];
 extern uint32_t T1dn_cpu[];
 extern uint32_t T2up_cpu[];
 extern uint32_t T2dn_cpu[];
 extern uint32_t T3up_cpu[];
 extern uint32_t T3dn_cpu[];
-#define SWAB32(x)		( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
+    //// FERTIG
 #pragma unroll 8
    for(int k=0;k<8;k++)
        message[k] = SWAB32(hash[k]);
 }
-__device__ __forceinline__ void myriadgroestl_perm_P(uint32_t *a, char *mixtabs)
+__global__ void __launch_bounds__(256, 4)
 myriadgroestl_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *hashBuffer)
 {
-	uint32_t t[32];
+    // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
-
+    int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4;
-//#pragma unroll 14
+    if (thread < threads)
-	for(int r=0;r<14;r++)
+    {
-	{
+        // GROESTL
-		switch(r)
+        uint32_t paddedInput[8];
-		{
+#pragma unroll 8
-			case 0:
+        for(int k=0;k<8;k++) paddedInput[k] = myriadgroestl_gpu_msg[4*k+threadIdx.x%4];
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break;
 			case 1:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break;
 			case 2:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break;
 			case 3:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break;
 			case 4:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break;
 			case 5:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break;
 			case 6:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break;
 			case 7:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break;
 			case 8:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break;
 			case 9:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break;
 			case 10:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break;
 			case 11:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break;
 			case 12:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break;
 			case 13:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break;
 		}
-        // RBTT
+        uint32_t nounce = startNounce + thread;
-#pragma unroll 16
+        if ((threadIdx.x % 4) == 3)
-        for(int k=0;k<32;k+=2)
+            paddedInput[4] = SWAB32(nounce);  // 4*4+3 = 19
        {
            uint32_t t0_0 = B32_0(a[(k     ) & 0x1f]), t9_0  = B32_0(a[(k +  9) & 0x1f]);
            uint32_t t2_1 = B32_1(a[(k +  2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]);
            uint32_t t4_2 = B32_2(a[(k +  4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]);
            uint32_t t6_3 = B32_3(a[(k +  6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]);
-            t[k + 0] =  T0up( t0_0 ) ^ T1up(  t2_1 ) ^ T2up(  t4_2 ) ^ T3up(  t6_3 ) ^ 
+        uint32_t msgBitsliced[8];
-                        T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 );
+        to_bitslice_quad(paddedInput, msgBitsliced);
-            t[k + 1] =  T0dn( t0_0 ) ^ T1dn(  t2_1 ) ^ T2dn(  t4_2 ) ^ T3dn(  t6_3 ) ^ 
+        uint32_t state[8];
                        T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 );
        }
 #pragma unroll 32
        for(int k=0;k<32;k++)
            a[k] = t[k];
    }
 }
-__device__ __forceinline__ void myriadgroestl_perm_Q(uint32_t *a, char *mixtabs)
+        groestl512_progressMessage_quad(state, msgBitsliced);
 {	
 //#pragma unroll 14
 	for(int r=0;r<14;r++)
 	{
 		uint32_t t[32];
 		switch(r)
 		{
 			case 0:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break;
 			case 1:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break;
 			case 2:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break;
 			case 3:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break;
 			case 4:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break;
 			case 5:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break;
 			case 6:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break;
 			case 7:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break;
 			case 8:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break;
 			case 9:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break;
 			case 10:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break;
 			case 11:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break;
 			case 12:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break;
 			case 13:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break;
 		}
        // RBTT
 #pragma unroll 16
        for(int k=0;k<32;k+=2)
        {
            uint32_t t2_0  = B32_0(a[(k +  2) & 0x1f]), t1_0  = B32_0(a[(k +  1) & 0x1f]);
            uint32_t t6_1  = B32_1(a[(k +  6) & 0x1f]), t5_1  = B32_1(a[(k +  5) & 0x1f]);
            uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2  = B32_2(a[(k +  9) & 0x1f]);
            uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]);
-            t[k + 0] =  T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^ 
+        uint32_t out_state[16];
-                        T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn(  t9_2 ) ^ T3dn( t13_3 );
+        from_bitslice_quad(state, out_state);
-            t[k + 1] =  T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^ 
+        if ((threadIdx.x & 0x03) == 0)
-                        T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up(  t9_2 ) ^ T3up( t13_3 );
+        {
            uint32_t *outpHash = &hashBuffer[16 * thread];
 #pragma unroll 16
            for(int k=0;k<16;k++) outpHash[k] = out_state[k];
        }
 #pragma unroll 32
        for(int k=0;k<32;k++)
            a[k] = t[k];
    }
 }
 __global__ void
-myriadgroestl_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce)
+ myriadgroestl_gpu_hash_quad2(int threads, uint32_t startNounce, uint32_t *resNounce, uint32_t *hashBuffer)
 {
 #if USE_SHARED
 	extern __shared__ char mixtabs[];
 	if (threadIdx.x < 256)
 	{
 		*((uint32_t*)mixtabs + (    threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x);
 		*((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x);
 		*((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x);
 		*((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x);
 		*((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x);
 		*((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x);
 		*((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x);
 		*((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x);
 	}
 	__syncthreads();
 #endif
    int thread = (blockDim.x * blockIdx.x + threadIdx.x);
    if (thread < threads)
    {
-		// GROESTL
+        uint32_t nounce = startNounce + thread;
 		uint32_t message[32];
 		uint32_t state[32];
 #pragma unroll 32
 		for(int k=0;k<32;k++) message[k] = myriadgroestl_gpu_msg[k];
 		uint32_t nounce = startNounce + thread;
 		message[19] = SWAB32(nounce);
 #pragma unroll 32
 		for(int u=0;u<32;u++) state[u] = message[u];
 		state[31] ^= 0x20000;
 		// Perm
 #if USE_SHARED
 		myriadgroestl_perm_P(state, mixtabs);
 		state[31] ^= 0x20000;
 		myriadgroestl_perm_Q(message, mixtabs);
 #else
 		myriadgroestl_perm_P(state, NULL);
 		state[31] ^= 0x20000;
 		myriadgroestl_perm_Q(message, NULL);
 #endif
 #pragma unroll 32
 		for(int u=0;u<32;u++) state[u] ^= message[u];
 #pragma unroll 32
 		for(int u=0;u<32;u++) message[u] = state[u];
 #if USE_SHARED
 		myriadgroestl_perm_P(message, mixtabs);
 #else
 		myriadgroestl_perm_P(message, NULL);
 #endif
 #pragma unroll 32
 		for(int u=0;u<32;u++) state[u] ^= message[u];
        uint32_t out_state[16];
        uint32_t *inpHash = &hashBuffer[16 * thread];
 #pragma unroll 16
-		for(int u=0;u<16;u++) out_state[u] = state[u+16];
+        for (int i=0; i < 16; i++)
            out_state[i] = inpHash[i];
        myriadgroestl_gpu_sha256(out_state);
        int i, position = -1;
@ -526,43 +309,35 @@ myriadgroestl_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce)
    }
 }
 // texDef(texname, texmem, texsource, texsize)
 //   Declares a device pointer `texmem` in the enclosing scope, allocates
 //   `texsize` bytes for it with cudaMalloc, copies `texsource` (host memory)
 //   into it and binds the buffer to the texture reference `texname`.
 //   The texture is configured for unnormalized coordinates, point filtering
 //   and clamp addressing before it is bound.
 //   NOTE(review): none of the CUDA return codes are checked, and the buffer
 //   is not freed by this macro.
 //   NOTE(review): the last line of the macro ends with a continuation
 //   backslash, so the source line that follows the macro is spliced into it;
 //   keep that line a comment or a blank line.
 #define texDef(texname, texmem, texsource, texsize) \
 	unsigned int *texmem; \
 	cudaMalloc(&texmem, texsize); \
 	cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
 	texname.normalized = 0; \
 	texname.filterMode = cudaFilterModePoint; \
 	texname.addressMode[0] = cudaAddressModeClamp; \
 	{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
 	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
 // Setup-Funktionen
 __host__ void myriadgroestl_cpu_init(int thr_id, int threads)
 {
-	cudaSetDevice(device_map[thr_id]);
+    cudaSetDevice(device_map[thr_id]);
    cudaMemcpyToSymbol( myr_sha256_gpu_hashTable,
                        myr_sha256_cpu_hashTable,
                        sizeof(uint32_t) * 8 );
-	cudaMemcpyToSymbol(	myr_sha256_gpu_hashTable,
+    cudaMemcpyToSymbol( myr_sha256_gpu_constantTable,
-						myr_sha256_cpu_hashTable,
+                        myr_sha256_cpu_constantTable,
-						sizeof(uint32_t) * 8 );
+                        sizeof(uint32_t) * 64 );
-	cudaMemcpyToSymbol(	myr_sha256_gpu_constantTable,
+    // zweite CPU-Tabelle bauen und auf die GPU laden
-						myr_sha256_cpu_constantTable,
+    uint32_t temp[64];
-						sizeof(uint32_t) * 64 );
+    for(int i=0;i<64;i++)
        temp[i] = myr_sha256_cpu_w2Table[i] + myr_sha256_cpu_constantTable[i];
-    cudaGetDeviceProperties(&props, device_map[thr_id]);
+    cudaMemcpyToSymbol( myr_sha256_gpu_constantTable2,
                        temp,
                        sizeof(uint32_t) * 64 );
-	// Texturen mit obigem Makro initialisieren
+    cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
 	texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
 	texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
 	texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
 	texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
 	texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
 	texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
 	texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
 	texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
    // Speicher für Gewinner-Nonce belegen
    cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); 
    // Speicher für temporäreHashes
    cudaMalloc(&d_outputHashes[thr_id], 16*sizeof(uint32_t)*threads); 
 }
 __host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
@ -594,25 +369,23 @@ __host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn
 __host__ void myriadgroestl_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
 {
-	// Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
+    int threadsperblock = 256;
 	// alle anderen mit 512 Threads.
 	int threadsperblock = (props.major >= 3) ? 768 : 512;
-    // berechne wie viele Thread Blocks wir brauchen
+    // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
-    dim3 grid((threads + threadsperblock-1)/threadsperblock);
+    // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
-    dim3 block(threadsperblock);
+    const int factor=4;
-	// Größe des dynamischen Shared Memory Bereichs
+    // Größe des dynamischen Shared Memory Bereichs
-#if USE_SHARED
+    size_t shared_size = 0;
 	size_t shared_size = 8 * 256 * sizeof(uint32_t);
 #else
 	size_t shared_size = 0;
 #endif
 //    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
    //fprintf(stderr, "ThrID: %d\n", thr_id);
    cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
-    myriadgroestl_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
+    // berechne wie viele Thread Blocks wir brauchen
    dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
    dim3 block(threadsperblock);
    myriadgroestl_gpu_hash_quad<<<grid, block, shared_size>>>(threads, startNounce, d_outputHashes[thr_id]);
    dim3 grid2((threads + threadsperblock-1)/threadsperblock);
    myriadgroestl_gpu_hash_quad2<<<grid2, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id], d_outputHashes[thr_id]);
    // Strategisches Sleep Kommando zur Senkung der CPU Last
    MyStreamSynchronize(NULL, 0, thr_id);
--- a/cuda_nist5.cu
+++ b/cuda_nist5.cu
@ -85,7 +85,6 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
 {
 	const uint32_t first_nonce = pdata[19];
 	// TODO: entfernen für eine Release! Ist nur zum Testen!
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
--- a/groestl_functions_quad.cu
+++ b/groestl_functions_quad.cu
@ -0,0 +1,315 @@
 // Shifts the eight words regs[0..7] up by one index and feeds the word that
 // falls off the top (old regs[7]) back into positions 0, 1, 3 and 4.
 // Presumably this is a multiplication by 2 in GF(2^8) on bit-sliced data
 // (going by the name) -- TODO confirm the reduction polynomial.
 //   regs : in/out, exactly 8 words
 __device__ __forceinline__ void G256_Mul2(uint32_t *regs)
 {
    // word leaving the top position; it becomes the feedback term
    const uint32_t carry = regs[7];
 #pragma unroll 7
    for (int bit = 7; bit > 0; bit--)
        regs[bit] = regs[bit - 1];
    regs[0]  = carry;
    // feedback taps
    regs[1] ^= carry;
    regs[3] ^= carry;
    regs[4] ^= carry;
 }
 // Complements all eight words. In the thread whose index within its group of
 // four is 3, a round-dependent pattern is additionally XORed into the upper
 // 16 bits: x0..x3 receive bits 0..3 of `round` (each expanded to 0xFFFF0000
 // or 0), x4..x7 receive fixed patterns.
 //   x7..x0 : in/out words
 //   round  : round index; only its low four bits are used
 __device__ __forceinline__ void G256_AddRoundConstantQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, int round)
 {
    // 0xFFFF0000 in the thread that carries the extra pattern, 0 elsewhere
    const uint32_t upper = ((threadIdx.x & 0x03) == 3) ? 0xFFFF0000u : 0u;

    x0 = ~x0 ^ (((round & 0x01) ? 0xFFFFFFFFu : 0u) & upper);
    x1 = ~x1 ^ (((round & 0x02) ? 0xFFFFFFFFu : 0u) & upper);
    x2 = ~x2 ^ (((round & 0x04) ? 0xFFFFFFFFu : 0u) & upper);
    x3 = ~x3 ^ (((round & 0x08) ? 0xFFFFFFFFu : 0u) & upper);
    x4 = ~x4 ^ (0xAAAA0000u & upper);
    x5 = ~x5 ^ (0xCCCC0000u & upper);
    x6 = ~x6 ^ (0xF0F00000u & upper);
    x7 = ~x7 ^ (0xFF000000u & upper);
 }
 // XORs a round-dependent pattern into the lower 16 bits of the eight words,
 // but only in the thread whose index within its group of four is 0; the
 // other three threads leave their words untouched.
 //   x7..x0 : in/out words
 //   round  : round index; only its low four bits are used
 __device__ __forceinline__ void G256_AddRoundConstantP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, int round)
 {
    if ((threadIdx.x & 0x03) != 0)
        return;

    // bits 0..3 of the round index, each expanded to 0xFFFF or 0
    x0 ^= (round & 0x01) ? 0xFFFFu : 0u;
    x1 ^= (round & 0x02) ? 0xFFFFu : 0u;
    x2 ^= (round & 0x04) ? 0xFFFFu : 0u;
    x3 ^= (round & 0x08) ? 0xFFFFu : 0u;
    // fixed patterns
    x4 ^= 0xAAAAu;
    x5 ^= 0xCCCCu;
    x6 ^= 0xF0F0u;
    x7 ^= 0xFF00u;
 }
 // Combines the four-word operand (x3..x0) with (y3..y0) using only AND/XOR
 // and stores the result back into x3..x0; y3..y0 are only read.
 // Presumably a bit-sliced GF(2^4) multiplication (going by the name) --
 // TODO confirm the field representation.
 __device__ __forceinline__ void G16mul_quad(uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0,
                                       uint32_t &y3, uint32_t &y2, uint32_t &y1, uint32_t &y0)
 {
    // sums of the even-indexed and odd-indexed words of each operand
    const uint32_t xEven = x2 ^ x0, xOdd = x3 ^ x1;
    const uint32_t yEven = y2 ^ y0, yOdd = y3 ^ y1;

    // terms shared between the upper and the lower output pair
    const uint32_t cross   = (xEven ^ xOdd) & (yEven ^ yOdd);
    const uint32_t oddMix  = (xEven & yEven) ^ cross;
    const uint32_t evenMix = (xOdd & yOdd) ^ cross ^ oddMix;

    // upper pair
    const uint32_t hiPair = (x2 ^ x3) & (y2 ^ y3);
    x3 = (x3 & y3) ^ hiPair ^ oddMix;
    x2 = (x2 & y2) ^ hiPair ^ evenMix;

    // lower pair
    const uint32_t loPair = (x0 ^ x1) & (y0 ^ y1);
    x1 = (x1 & y1) ^ loPair ^ oddMix;
    x0 = (x0 & y0) ^ loPair ^ evenMix;
 }
 // Transforms the eight words x7..x0 in place using three G16mul_quad calls
 // and a block of AND/XOR logic in between.
 // Presumably a bit-sliced GF(2^8) inversion built on GF(2^4) arithmetic
 // (going by the names) -- TODO confirm against the reference S-box.
 //   x7..x4 : in/out, upper four words
 //   x3..x0 : in/out, lower four words
 __device__ __forceinline__ void G256_inv_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0)
 {
    uint32_t t0,t1,t2,t3,t4,t5,t6,a,b;
    // t3..t0 start as a copy of the upper half and are combined with the
    // lower half by G16mul_quad (result lands in t3..t0)
    t3 = x7;
    t2 = x6;
    t1 = x5;
    t0 = x4;
    G16mul_quad(t3, t2, t1, t0, x3, x2, x1, x0);
    // fold XOR combinations of the upper and lower half into that product
    a = (x4 ^ x0);
    t0 ^= a;
    t2 ^= (x7 ^ x3) ^ (x5 ^ x1); 
    t1 ^= (x5 ^ x1) ^ a;
    t3 ^= (x6 ^ x2) ^ a;
    // rework the four-word value t3..t0 in place; statement order matters
    // because t0..t3 are overwritten while t4..t6, a and b are still in use
    b = t0 ^ t1;
    t4 = (t2 ^ t3) & b;
    a = t4 ^ t3 ^ t1;
    t5 = (t3 & t1) ^ a;
    t6 = (t2 & t0) ^ a ^ (t2 ^ t0);
    t4 = (t5 ^ t6) & b;
    t1 = (t6 & t1) ^ t4;
    t0 = (t5 & t0) ^ t4;
    t4 = (t5 ^ t6) & (t2^t3);
    t3 = (t6 & t3) ^ t4;
    t2 = (t5 & t2) ^ t4;
    // both halves are combined with the same operand, passed in the order
    // (t1, t0, t3, t2)
    G16mul_quad(x3, x2, x1, x0, t1, t0, t3, t2);
    G16mul_quad(x7, x6, x5, x4, t1, t0, t3, t2);
 }
 // Replaces x1..x7 in place by XOR combinations of the eight input words;
 // x0 is only read and keeps its value.
 // Presumably the basis change applied before the field inversion of the
 // S-box (going by the name and by its use in sbox_quad) -- TODO confirm.
 // The statement order matters: every right-hand side must still see the
 // original value of any word that is overwritten later.
 __device__ __forceinline__ void transAtoX_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &x4, uint32_t &x5, uint32_t &x6, uint32_t &x7)
 {
    uint32_t t0, t1;
    // both temporaries are taken from the unmodified inputs
    t0 = x0 ^ x1 ^ x2;
    t1 = x5 ^ x6;
    // x2 was assigned this same value a second time further down; nothing
    // read it in between and the operands did not change, so the duplicate
    // store has been dropped
    x2 = t0 ^ t1 ^ x7;
    x6 = t0 ^ x3 ^ x6;
    x3 = x0 ^ x1 ^ x3 ^ x4 ^ x7;
    x4 = x0 ^ x4 ^ t1;
    x1 = x0 ^ x1 ^ t1;
    x7 = x0 ^ t1 ^ x7;
    x5 = x0 ^ t1;
 }
 // Replaces all eight words in place by XOR combinations of the input words.
 // Presumably the basis change applied after the field inversion of the
 // S-box (going by the name and by its use in sbox_quad) -- TODO confirm.
 __device__ __forceinline__ void transXtoA_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &x4, uint32_t &x5, uint32_t &x6, uint32_t &x7)
 {
    // pairwise sums of the inputs that appear in several outputs
    const uint32_t s35 = x3 ^ x5;
    const uint32_t s37 = x3 ^ x7;
    const uint32_t s14 = x1 ^ x4;
    const uint32_t s06 = x0 ^ x6;

    // outputs, all still expressed in terms of the original inputs
    const uint32_t out1 = s14 ^ x5;
    const uint32_t out2 = s35 ^ x7;
    const uint32_t out3 = s06 ^ x2 ^ s35;
    const uint32_t out6 = s14 ^ x6;
    const uint32_t out7 = out2 ^ x4 ^ x6;

    // write back only after every input has been consumed
    x0 = s35;
    x1 = out1;
    x2 = out2;
    x3 = out3;
    x4 = s37;
    x5 = s06;
    x6 = out6;
    x7 = out7;
 }
 // Applies the S-box step to the eight words r[0..7] in place:
 // transAtoX_quad, then G256_inv_quad, then transXtoA_quad, followed by
 // complementing r[0], r[1], r[5] and r[6].
 // The three helpers are called with differently permuted register orders;
 // the permutations are part of the transformation and must not be "tidied".
 //   r : in/out, exactly 8 words
 __device__ __forceinline__ void sbox_quad(uint32_t *r)
 {
    transAtoX_quad(r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7]);
    G256_inv_quad(r[2], r[4], r[1], r[7], r[3], r[0], r[5], r[6]);
    transXtoA_quad(r[7], r[1], r[4], r[2], r[6], r[5], r[0], r[3]);
    // NOTE(review): indices 0, 1, 5, 6 match the set bits of 0x63, presumably
    // the AES/Groestl S-box affine constant -- confirm
    r[0] = ~r[0];
    r[1] = ~r[1];
    r[5] = ~r[5];
    r[6] = ~r[6];
 }
 // Rotates the two 16-bit halves of each of the eight words to the right,
 // independently of each other. The rotation amounts depend on the thread's
 // position within its group of four:
 //   position 0: low 0 / high 1      position 1: low 2 / high 3
 //   position 2: low 4 / high 5      position 3: low 6 / high 11
 // Presumably the ShiftBytes step of the Groestl-512 P permutation on
 // bit-sliced data -- TODO confirm.
 __device__ __forceinline__ void G256_ShiftBytesP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0)
 {
    const int quadPos = threadIdx.x & 0x03;
    const int loShift = 2 * quadPos;
    const int hiShift = loShift + 1 + ((quadPos == 3) ? 4 : 0);

 // __byte_perm(.., 0x1010) / (.., 0x3232) duplicate the low / high halfword
 // into both halves, so the right shift yields that halfword rotated in the
 // low 16 bits; selector 0x5410 then packs the two low halfwords together.
 #define G256_SHIFT_ROW_P(x) \
    do { \
        const uint32_t lo16 = __byte_perm((x), 0, 0x1010) >> loShift; \
        const uint32_t hi16 = __byte_perm((x), 0, 0x3232) >> hiShift; \
        (x) = __byte_perm(lo16, hi16, 0x5410); \
    } while (0)

    G256_SHIFT_ROW_P(x0);
    G256_SHIFT_ROW_P(x1);
    G256_SHIFT_ROW_P(x2);
    G256_SHIFT_ROW_P(x3);
    G256_SHIFT_ROW_P(x4);
    G256_SHIFT_ROW_P(x5);
    G256_SHIFT_ROW_P(x6);
    G256_SHIFT_ROW_P(x7);

 #undef G256_SHIFT_ROW_P
 }
 // Rotates the two 16-bit halves of each of the eight words to the right,
 // independently of each other. The rotation amounts depend on the thread's
 // position within its group of four:
 //   position 0: low 1 / high 3      position 1: low 5 / high 11
 //   position 2: low 0 / high 2      position 3: low 4 / high 6
 // Presumably the ShiftBytes step of the Groestl-512 Q permutation on
 // bit-sliced data -- TODO confirm.
 __device__ __forceinline__ void G256_ShiftBytesQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0)
 {
    const int quadPos = threadIdx.x & 0x03;
    const int loShift = ((quadPos < 2) ? 1 : 0) + ((quadPos & 0x01) ? 4 : 0);
    const int hiShift = loShift + 2 + ((quadPos == 1) ? 4 : 0);

 // __byte_perm(.., 0x1010) / (.., 0x3232) duplicate the low / high halfword
 // into both halves, so the right shift yields that halfword rotated in the
 // low 16 bits; selector 0x5410 then packs the two low halfwords together.
 #define G256_SHIFT_ROW_Q(x) \
    do { \
        const uint32_t lo16 = __byte_perm((x), 0, 0x1010) >> loShift; \
        const uint32_t hi16 = __byte_perm((x), 0, 0x3232) >> hiShift; \
        (x) = __byte_perm(lo16, hi16, 0x5410); \
    } while (0)

    G256_SHIFT_ROW_Q(x0);
    G256_SHIFT_ROW_Q(x1);
    G256_SHIFT_ROW_Q(x2);
    G256_SHIFT_ROW_Q(x3);
    G256_SHIFT_ROW_Q(x4);
    G256_SHIFT_ROW_Q(x5);
    G256_SHIFT_ROW_Q(x6);
    G256_SHIFT_ROW_Q(x7);

 #undef G256_SHIFT_ROW_Q
 }
 // Mixes the eight words r[0..7] across the four threads of a quad: every
 // output word is an XOR of values fetched from the quad's threads with warp
 // shuffles (width 4), with two G256_Mul2 steps in between.
 // Presumably the MixBytes step of Groestl on bit-sliced data -- TODO confirm.
 //
 // Requirements:
 //   * all four threads of a quad must call this function together; a value
 //     shuffled from a thread that does not take part is undefined
 //   * warp shuffle support (compute capability 3.0 or newer)
 //   * NOTE(review): the participant mask used on CUDA 9+ assumes the lane id
 //     equals threadIdx.x & 31 (1-D thread block) -- confirm for all launches
 //
 //   r : in/out, exactly 8 words
 __device__ __forceinline__ void G256_MixFunction_quad(uint32_t *r)
 {
 #define SHIFT64_16(hi, lo)    __byte_perm(lo, hi, 0x5432)
 #if defined(CUDART_VERSION) && (CUDART_VERSION >= 9000)
 // The mask-less __shfl is deprecated since CUDA 9 and unavailable on Volta and
 // newer; use the _sync form with a mask that names exactly this quad.
 #define A(v, u)             __shfl_sync(quadMask, (int)r[v], ((threadIdx.x+u)&0x03), 4)
 #else
 #define A(v, u)             __shfl((int)r[v], ((threadIdx.x+u)&0x03), 4)
 #endif
 #define S(idx, l)            SHIFT64_16( A(idx, (l+1)), A(idx, l) )
 #define DOUBLE_ODD(i, bc)        ( S(i, (bc)) ^ A(i, (bc) + 1) )
 #define DOUBLE_EVEN(i, bc)        ( S(i, (bc)) ^ A(i, (bc)    ) )
 #define SINGLE_ODD(i, bc)        ( S(i, (bc)) )
 #define SINGLE_EVEN(i, bc)        ( A(i, (bc)) )
 #if defined(CUDART_VERSION) && (CUDART_VERSION >= 9000)
    // the four lanes of the warp that form this thread's quad
    const unsigned int quadMask = 0xFu << (threadIdx.x & 0x1C);
 #endif
    uint32_t b[8];
 #pragma unroll 8
    for(int i=0;i<8;i++)
        b[i] = DOUBLE_ODD(i, 1) ^ DOUBLE_EVEN(i, 3);
    G256_Mul2(b);
 #pragma unroll 8
    for(int i=0;i<8;i++)
        b[i] = b[i] ^ DOUBLE_ODD(i, 3) ^ DOUBLE_ODD(i, 4) ^ SINGLE_ODD(i, 6);
    G256_Mul2(b);
 #pragma unroll 8
    for(int i=0;i<8;i++)
        r[i] = b[i] ^ DOUBLE_EVEN(i, 2) ^ DOUBLE_EVEN(i, 3) ^ SINGLE_EVEN(i, 5);
 // drop every helper macro defined above so none of them leaks out of this
 // function (the old list undefined `t` and `X`, which were never defined,
 // and left the DOUBLE_*/SINGLE_* helpers behind)
 #undef SINGLE_EVEN
 #undef SINGLE_ODD
 #undef DOUBLE_EVEN
 #undef DOUBLE_ODD
 #undef S
 #undef A
 #undef SHIFT64_16
 }
 // Runs 14 rounds over the eight words r[0..7] in place. Each round applies,
 // in this order: G256_AddRoundConstantP_quad, sbox_quad,
 // G256_ShiftBytesP_quad and G256_MixFunction_quad.
 // Presumably the Groestl-512 P permutation, computed cooperatively by the
 // four threads of a quad -- TODO confirm.
 //   r : in/out, exactly 8 words (this thread's share of the state)
 __device__ __forceinline__ void groestl512_perm_P_quad(uint32_t *r)
 {
    for(int round=0;round<14;round++)
    {
        // the helpers take the words in descending order r[7]..r[0]
        G256_AddRoundConstantP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round);
        sbox_quad(r);
        G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]);
        G256_MixFunction_quad(r);
    }
 }
 // Runs 14 rounds over the eight words r[0..7] in place. Each round applies,
 // in this order: G256_AddRoundConstantQ_quad, sbox_quad,
 // G256_ShiftBytesQ_quad and G256_MixFunction_quad.
 // Presumably the Groestl-512 Q permutation, computed cooperatively by the
 // four threads of a quad -- TODO confirm.
 //   r : in/out, exactly 8 words (this thread's share of the state)
 __device__ __forceinline__ void groestl512_perm_Q_quad(uint32_t *r)
 {    
    for(int round=0;round<14;round++)
    {
        // the helpers take the words in descending order r[7]..r[0]
        G256_AddRoundConstantQ_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round);
        sbox_quad(r);
        G256_ShiftBytesQ_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]);
        G256_MixFunction_quad(r);
    }
 }
 // Processes one message block held as eight words per thread.
 //   1. state = message, with bit 0x00008000 of state[1] toggled in the
 //      thread at position 3 of its quad
 //   2. state   = P(state); the same bit is toggled once more afterwards
 //   3. message = Q(message)
 //   4. state ^= message, and message receives a copy of the new state
 //   5. message = P(message)
 //   6. state ^= message
 // Presumably the Groestl-512 compression followed by the output
 // transformation for a single padded block -- TODO confirm.
 //   state   : out, 8 words, receives the result
 //   message : in, 8 words; used as scratch space and overwritten
 __device__ __forceinline__ void groestl512_progressMessage_quad(uint32_t *state, uint32_t *message)
 {
    // only the last thread of a quad carries the toggled bit
    const uint32_t toggle = ((threadIdx.x & 0x03) == 3) ? 0x00008000u : 0u;

 #pragma unroll 8
    for (int i = 0; i < 8; i++)
        state[i] = message[i];

    state[1] ^= toggle;
    groestl512_perm_P_quad(state);
    state[1] ^= toggle;

    groestl512_perm_Q_quad(message);

 #pragma unroll 8
    for (int i = 0; i < 8; i++)
    {
        const uint32_t mixed = state[i] ^ message[i];
        state[i]   = mixed;
        message[i] = mixed;
    }

    groestl512_perm_P_quad(message);

 #pragma unroll 8
    for (int i = 0; i < 8; i++)
        state[i] ^= message[i];
 }
--- a/groestlcoin.cpp
+++ b/groestlcoin.cpp
@ -15,43 +15,42 @@
 void sha256func(unsigned char *hash, const unsigned char *data, int len)
 {
-	uint32_t S[16], T[16];
+    uint32_t S[16], T[16];
-	int i, r;
+    int i, r;
-
+
-	sha256_init(S);
+    sha256_init(S);
-	for (r = len; r > -9; r -= 64) {
+    for (r = len; r > -9; r -= 64) {
-		if (r < 64)
+        if (r < 64)
-			memset(T, 0, 64);
+            memset(T, 0, 64);
-		memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r));
+        memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r));
-		if (r >= 0 && r < 64)
+        if (r >= 0 && r < 64)
-			((unsigned char *)T)[r] = 0x80;
+            ((unsigned char *)T)[r] = 0x80;
-		for (i = 0; i < 16; i++)
+        for (i = 0; i < 16; i++)
-			T[i] = be32dec(T + i);
+            T[i] = be32dec(T + i);
-		if (r < 56)
+        if (r < 56)
-			T[15] = 8 * len;
+            T[15] = 8 * len;
-		sha256_transform(S, T, 0);
+        sha256_transform(S, T, 0);
-	}
+    }
-	/*
+    /*
-	memcpy(S + 8, sha256d_hash1 + 8, 32);
+    memcpy(S + 8, sha256d_hash1 + 8, 32);
-	sha256_init(T);
+    sha256_init(T);
-	sha256_transform(T, S, 0);
+    sha256_transform(T, S, 0);
-	*/
+    */
-	for (i = 0; i < 8; i++)
+    for (i = 0; i < 8; i++)
-		be32enc((uint32_t *)hash + i, T[i]);
+        be32enc((uint32_t *)hash + i, T[i]);
 }
 static void groestlhash(void *state, const void *input)
 {
-	// Tryout GPU-groestl
+    // Tryout GPU-groestl
    sph_groestl512_context     ctx_groestl[2];
    static unsigned char pblank[1];
 	int ii;
    uint32_t mask = 8;
    uint32_t zero = 0;
-	//these uint512 in the c++ source of the client are backed by an array of uint32
+    //these uint512 in the c++ source of the client are backed by an array of uint32
    uint32_t hashA[16], hashB[16];    
@ -59,119 +58,75 @@ static void groestlhash(void *state, const void *input)
    sph_groestl512 (&ctx_groestl[0], input, 80); //6
    sph_groestl512_close(&ctx_groestl[0], hashA); //7    
-	sph_groestl512_init(&ctx_groestl[1]);
+    sph_groestl512_init(&ctx_groestl[1]);
-	sph_groestl512 (&ctx_groestl[1], hashA, 64); //6
+    sph_groestl512 (&ctx_groestl[1], hashA, 64); //6
    sph_groestl512_close(&ctx_groestl[1], hashB); //7
-	memcpy(state, hashB, 32);
+    memcpy(state, hashB, 32);
 }
-
+extern bool opt_benchmark;
 extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
-	uint32_t max_nonce, unsigned long *hashes_done)
+    uint32_t max_nonce, unsigned long *hashes_done)
 {    
-	uint32_t start_nonce = pdata[19]++;
+    if (opt_benchmark)
-	const uint32_t Htarg = ptarget[7];
+        ((uint32_t*)ptarget)[7] = 0x000000ff;
-	const uint32_t throughPut = 4096 * 128;
+
-	//const uint32_t throughPut = 1;
+    uint32_t start_nonce = pdata[19]++;
-	int i;
+    const uint32_t Htarg = ptarget[7];
-	uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t));
+    const uint32_t throughPut = 4096 * 128;
-
+    //const uint32_t throughPut = 1;
-	// init
+    uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t));
-	static bool init[8] = { false, false, false, false, false, false, false, false };
+
-	if(!init[thr_id])
+    // init
-	{
+    static bool init[8] = { false, false, false, false, false, false, false, false };
-		groestlcoin_cpu_init(thr_id, throughPut);
+    if(!init[thr_id])
-		init[thr_id] = true;
+    {
-	}
+        groestlcoin_cpu_init(thr_id, throughPut);
-	
+        init[thr_id] = true;
-	// Endian Drehung ist notwendig
+    }
-	//char testdata[] = {"\x70\x00\x00\x00\x5d\x38\x5b\xa1\x14\xd0\x79\x97\x0b\x29\xa9\x41\x8f\xd0\x54\x9e\x7d\x68\xa9\x5c\x7f\x16\x86\x21\xa3\x14\x20\x10\x00\x00\x00\x00\x57\x85\x86\xd1\x49\xfd\x07\xb2\x2f\x3a\x8a\x34\x7c\x51\x6d\xe7\x05\x2f\x03\x4d\x2b\x76\xff\x68\xe0\xd6\xec\xff\x9b\x77\xa4\x54\x89\xe3\xfd\x51\x17\x32\x01\x1d\xf0\x73\x10\x00"};
+    
-	//pdata = (uint32_t*)testdata;
+    // Endian Drehung ist notwendig
-	uint32_t endiandata[32];
+    uint32_t endiandata[32];
-	for (int kk=0; kk < 32; kk++)
+    for (int kk=0; kk < 32; kk++)
-		be32enc(&endiandata[kk], pdata[kk]);
+        be32enc(&endiandata[kk], pdata[kk]);
-
+
-	// Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt)
+    // Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt)
-	groestlcoin_cpu_setBlock(thr_id, endiandata, (void*)ptarget);
+    groestlcoin_cpu_setBlock(thr_id, endiandata, (void*)ptarget);
-	
+    
-	do {
+    do {
-		// GPU
+        // GPU
-		uint32_t foundNounce = 0xFFFFFFFF;
+        uint32_t foundNounce = 0xFFFFFFFF;
-
+
-		groestlcoin_cpu_hash(thr_id, throughPut, pdata[19], outputHash, &foundNounce);
+        groestlcoin_cpu_hash(thr_id, throughPut, pdata[19], outputHash, &foundNounce);
-
+
-		/*
+        if(foundNounce < 0xffffffff)
-		{
+        {
-			for(i=0;i<throughPut;i++)
+            uint32_t tmpHash[8];
-			{
+            endiandata[19] = SWAP32(foundNounce);
-				uint32_t tmpHash[8];
+            groestlhash(tmpHash, endiandata);
-				endiandata[19] = SWAP32(pdata[19]);
+            if (tmpHash[7] <= Htarg && 
-				groestlhash(tmpHash, endiandata);
+                    fulltest(tmpHash, ptarget)) {
-				
+                        pdata[19] = foundNounce;
-				int ii;
+                        *hashes_done = foundNounce - start_nonce;
-				printf("result GPU: ");
+                        free(outputHash);
-				for (ii=0; ii < 32; ii++)
+                return true;
-				{
+            } else {
-					printf ("%.2x",((uint8_t*)&outputHash[8*i])[ii]);
+                applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNounce);
-				};
+            }
-				printf ("\n");	
+
-		
+            foundNounce = 0xffffffff;
-
+        }
-				groestlhash(tmpHash, endiandata);
+
-				printf("result CPU: ");
+        if (pdata[19] + throughPut < pdata[19])
-				for (ii=0; ii < 32; ii++)
+            pdata[19] = max_nonce;
-				{
+        else pdata[19] += throughPut;
-					printf ("%.2x",((uint8_t*)tmpHash)[ii]);
+
-				};
+    } while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
-				
+    
-				
+    *hashes_done = pdata[19] - start_nonce;
-			}
+    free(outputHash);
-			exit(0);
+    return 0;
 		}		
 		*/
 		if(foundNounce < 0xffffffff)
 		{
 			uint32_t tmpHash[8];
 			endiandata[19] = SWAP32(foundNounce);
 			groestlhash(tmpHash, endiandata);
 			if (tmpHash[7] <= Htarg && 
 					fulltest(tmpHash, ptarget)) {
 						pdata[19] = foundNounce;
 						*hashes_done = foundNounce - start_nonce;
 						free(outputHash);
 				return true;
 			} else {
 				applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNounce);
 			}
 			foundNounce = 0xffffffff;
 			/*
 			int ii;
 			printf("result GPU: ");
 			for (ii=0; ii < 32; ii++)
 			{
 				printf ("%.2x",((uint8_t*)&outputHash[0])[ii]);
 			};
 			printf ("\n");	
 			printf("result CPU: ");
 			for (ii=0; ii < 32; ii++)
 			{
 				printf ("%.2x",((uint8_t*)tmpHash)[ii]);
 			};
 			printf ("\n");	
 			*/
 		}
 		if (pdata[19] + throughPut < pdata[19])
 			pdata[19] = max_nonce;
 		else pdata[19] += throughPut;
 	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
 	*hashes_done = pdata[19] - start_nonce;
 	free(outputHash);
 	return 0;
 }
--- a/heavy/cuda_blake512.cu
+++ b/heavy/cuda_blake512.cu
@ -269,8 +269,6 @@ __host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 	if (BLOCKSIZE == 80)
 		blake512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
 	else if (BLOCKSIZE == 84)
--- a/heavy/cuda_combine.cu
+++ b/heavy/cuda_combine.cu
@ -141,8 +141,6 @@ void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *h
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 	combine_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], d_nonceVector[thr_id]);
 	// da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden
--- a/heavy/cuda_groestl512.cu
+++ b/heavy/cuda_groestl512.cu
@ -824,8 +824,6 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 	if (BLOCKSIZE == 84)
 		groestl512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);	
 	else if (BLOCKSIZE == 80)
--- a/heavy/cuda_hefty1.cu
+++ b/heavy/cuda_hefty1.cu
@ -416,8 +416,6 @@ __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
    size_t shared_size = 0;
 #endif
 //    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
    hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]);
    // Strategisches Sleep Kommando zur Senkung der CPU Last
--- a/heavy/cuda_keccak512.cu
+++ b/heavy/cuda_keccak512.cu
@ -279,7 +279,6 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 	if (BLOCKSIZE==84)
 		keccak512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
 	else if (BLOCKSIZE==80)
--- a/heavy/cuda_sha256.cu
+++ b/heavy/cuda_sha256.cu
@ -271,7 +271,6 @@ __host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce)
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 	if (BLOCKSIZE == 84)
 		sha256_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
 	else if (BLOCKSIZE == 80) {
--- a/myriadgroestl.cpp
+++ b/myriadgroestl.cpp
@ -35,17 +35,19 @@ static void myriadhash(void *state, const void *input)
    memcpy(state, hashB, 32);
 }
-
+extern bool opt_benchmark;
 extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 	uint32_t max_nonce, unsigned long *hashes_done)
 {	
    if (opt_benchmark)
        ((uint32_t*)ptarget)[7] = 0x000000ff;
 	uint32_t start_nonce = pdata[19]++;
 	const uint32_t throughPut = 128 * 1024;
-//	const uint32_t throughPut = 1;
+
 	uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t));
 	// TODO: entfernen für eine Release! Ist nur zum Testen!
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
--- a/quark/animecoin.cu
+++ b/quark/animecoin.cu
@ -175,7 +175,6 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
 {
 	const uint32_t first_nonce = pdata[19];
 	// TODO: entfernen für eine Release! Ist nur zum Testen!
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x00000f;
--- a/quark/cuda_bmw512.cu
+++ b/quark/cuda_bmw512.cu
@ -447,8 +447,6 @@ __host__ void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNo
    // Größe des dynamischen Shared Memory Bereichs
    size_t shared_size = 0;
 //    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
    quark_bmw512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
    MyStreamSynchronize(NULL, order, thr_id);
 }
@ -464,8 +462,6 @@ __host__ void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNo
    // Größe des dynamischen Shared Memory Bereichs
    size_t shared_size = 0;
 //    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
    quark_bmw512_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash);
    MyStreamSynchronize(NULL, order, thr_id);
 }
--- a/quark/cuda_jh512.cu
+++ b/quark/cuda_jh512.cu
@ -350,8 +350,6 @@ __host__ void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNou
    // Größe des dynamischen Shared Memory Bereichs
    size_t shared_size = 0;
 //    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
    quark_jh512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
    MyStreamSynchronize(NULL, order, thr_id);
 }
--- a/quark/cuda_quark_blake512.cu
+++ b/quark/cuda_quark_blake512.cu
@ -406,8 +406,6 @@ __host__ void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t start
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 	quark_blake512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash);
 	// Strategisches Sleep Kommando zur Senkung der CPU Last
@ -425,8 +423,6 @@ __host__ void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t start
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 	quark_blake512_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
 	// Strategisches Sleep Kommando zur Senkung der CPU Last
--- a/quark/cuda_quark_checkhash.cu
+++ b/quark/cuda_quark_checkhash.cu
@ -89,8 +89,6 @@ __host__ uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t star
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 	quark_check_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
 	// Strategisches Sleep Kommando zur Senkung der CPU Last
--- a/quark/cuda_quark_groestl512.cu
+++ b/quark/cuda_quark_groestl512.cu
@ -1,4 +1,4 @@
-// Auf QuarkCoin spezialisierte Version von Groestl
+// Auf QuarkCoin spezialisierte Version von Groestl inkl. Bitslice
 #include <cuda.h>
 #include "cuda_runtime.h"
@ -7,9 +7,6 @@
 #include <stdio.h>
 #include <memory.h>
 // it's unfortunate that this is a compile time constant.
 #define MAXWELL_OR_FERMI 1
 // aus cpu-miner.c
 extern int device_map[8];
@ -18,353 +15,137 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
 // Folgende Definitionen später durch header ersetzen
 typedef unsigned char uint8_t;
 typedef unsigned short uint16_t;
 typedef unsigned int uint32_t;
 typedef unsigned long long uint64_t;
 // diese Struktur wird in der Init Funktion angefordert
 static cudaDeviceProp props[8];
-#define SPH_C32(x)    ((uint32_t)(x ## U))
+// 64 Register Variante für Compute 3.0
-#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
+#include "groestl_functions_quad.cu"
-
+#include "bitslice_transformations_quad.cu"
 #define PC32up(j, r)   ((uint32_t)((j) + (r)))
 #define PC32dn(j, r)   0
 #define QC32up(j, r)   0xFFFFFFFF
 #define QC32dn(j, r)   (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
 #define B32_0(x)    __byte_perm(x, 0, 0x4440)
 //((x) & 0xFF)
 #define B32_1(x)    __byte_perm(x, 0, 0x4441)
 //(((x) >> 8) & 0xFF)
 #define B32_2(x)    __byte_perm(x, 0, 0x4442)
 //(((x) >> 16) & 0xFF)
 #define B32_3(x)    __byte_perm(x, 0, 0x4443)
 //((x) >> 24)
 #if MAXWELL_OR_FERMI
 #define USE_SHARED 1
 // Maxwell and Fermi cards get the best speed with SHARED access it seems.
 #if USE_SHARED
 #define T0up(x) (*((uint32_t*)mixtabs + (    (x))))
 #define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
 #define T1up(x) (*((uint32_t*)mixtabs + (512+(x))))
 #define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
 #define T2up(x) (*((uint32_t*)mixtabs + (1024+(x))))
 #define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
 #define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
 #define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x))))
 #else
 #define T0up(x) tex1Dfetch(t0up1, x)
 #define T0dn(x) tex1Dfetch(t0dn1, x)
 #define T1up(x) tex1Dfetch(t1up1, x)
 #define T1dn(x) tex1Dfetch(t1dn1, x)
 #define T2up(x) tex1Dfetch(t2up1, x)
 #define T2dn(x) tex1Dfetch(t2dn1, x)
 #define T3up(x) tex1Dfetch(t3up1, x)
 #define T3dn(x) tex1Dfetch(t3dn1, x)
 #endif
 #else
 #define USE_SHARED 1
 // a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5!
 #define T0up(x) (*((uint32_t*)mixtabs + (    (x))))
 #define T0dn(x) tex1Dfetch(t0dn1, x)
 #define T1up(x) tex1Dfetch(t1up1, x)
 #define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
 #define T2up(x) tex1Dfetch(t2up1, x)
 #define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
 #define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
 #define T3dn(x) tex1Dfetch(t3dn1, x)
 #endif
 texture<unsigned int, 1, cudaReadModeElementType> t0up1;
 texture<unsigned int, 1, cudaReadModeElementType> t0dn1;
 texture<unsigned int, 1, cudaReadModeElementType> t1up1;
 texture<unsigned int, 1, cudaReadModeElementType> t1dn1;
 texture<unsigned int, 1, cudaReadModeElementType> t2up1;
 texture<unsigned int, 1, cudaReadModeElementType> t2dn1;
 texture<unsigned int, 1, cudaReadModeElementType> t3up1;
 texture<unsigned int, 1, cudaReadModeElementType> t3dn1;
 extern uint32_t T0up_cpu[];
 extern uint32_t T0dn_cpu[];
 extern uint32_t T1up_cpu[];
 extern uint32_t T1dn_cpu[];
 extern uint32_t T2up_cpu[];
 extern uint32_t T2dn_cpu[];
 extern uint32_t T3up_cpu[];
 extern uint32_t T3dn_cpu[];
 __device__ __forceinline__ void quark_groestl512_perm_P(uint32_t *a, char *mixtabs)
 {
    uint32_t t[32];
-//#pragma unroll 14
+__global__ void __launch_bounds__(256, 4)
-    for(int r=0;r<14;r++)
+ quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
 {
    // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
    int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
    if (thread < threads)
    {
-        switch(r)
+        // GROESTL
-        {
+        uint32_t message[8];
-            case 0:
+        uint32_t state[8];
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 0); break;
            case 1:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 1); break;
            case 2:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 2); break;
            case 3:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 3); break;
            case 4:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 4); break;
            case 5:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 5); break;
            case 6:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 6); break;
            case 7:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 7); break;
            case 8:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 8); break;
            case 9:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 9); break;
            case 10:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 10); break;
            case 11:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 11); break;
            case 12:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 12); break;
            case 13:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 13); break;
        }
-        // RBTT
+        uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
 #pragma unroll 16
        for(int k=0;k<32;k+=2)
        {
            uint32_t t0_0 = B32_0(a[(k     ) & 0x1f]), t9_0  = B32_0(a[(k +  9) & 0x1f]);
            uint32_t t2_1 = B32_1(a[(k +  2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]);
            uint32_t t4_2 = B32_2(a[(k +  4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]);
            uint32_t t6_3 = B32_3(a[(k +  6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]);
-            t[k + 0] =  T0up( t0_0 ) ^ T1up(  t2_1 ) ^ T2up(  t4_2 ) ^ T3up(  t6_3 ) ^ 
+        int hashPosition = nounce - startNounce;
-                        T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 );
+        uint32_t *inpHash = &g_hash[hashPosition<<4];
-            t[k + 1] =  T0dn( t0_0 ) ^ T1dn(  t2_1 ) ^ T2dn(  t4_2 ) ^ T3dn(  t6_3 ) ^ 
+#pragma unroll 4
-                        T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 );
+        for(int k=0;k<4;k++) message[k] = inpHash[(k<<2) + (threadIdx.x&0x03)];
-        }
+#pragma unroll 4
-#pragma unroll 32
+        for(int k=4;k<8;k++) message[k] = 0;
        for(int k=0;k<32;k++)
            a[k] = t[k];
    }
 }
-__device__ __forceinline__ void quark_groestl512_perm_Q(uint32_t *a, char *mixtabs)
+        if ((threadIdx.x&0x03) == 0) message[4] = 0x80;
-{    
+        if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000;
 //#pragma unroll 14
    for(int r=0;r<14;r++)
    {
        uint32_t t[32];
-        switch(r)
+        uint32_t msgBitsliced[8];
-        {
+        to_bitslice_quad(message, msgBitsliced);
            case 0:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 0); a[(k*2)+1] ^= QC32dn(k<< 4, 0);} break;
            case 1:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 1); a[(k*2)+1] ^= QC32dn(k<< 4, 1);} break;
            case 2:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 2); a[(k*2)+1] ^= QC32dn(k<< 4, 2);} break;
            case 3:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 3); a[(k*2)+1] ^= QC32dn(k<< 4, 3);} break;
            case 4:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 4); a[(k*2)+1] ^= QC32dn(k<< 4, 4);} break;
            case 5:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 5); a[(k*2)+1] ^= QC32dn(k<< 4, 5);} break;
            case 6:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 6); a[(k*2)+1] ^= QC32dn(k<< 4, 6);} break;
            case 7:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 7); a[(k*2)+1] ^= QC32dn(k<< 4, 7);} break;
            case 8:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 8); a[(k*2)+1] ^= QC32dn(k<< 4, 8);} break;
            case 9:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 9); a[(k*2)+1] ^= QC32dn(k<< 4, 9);} break;
            case 10:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 10); a[(k*2)+1] ^= QC32dn(k<< 4, 10);} break;
            case 11:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 11); a[(k*2)+1] ^= QC32dn(k<< 4, 11);} break;
            case 12:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 12); a[(k*2)+1] ^= QC32dn(k<< 4, 12);} break;
            case 13:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 13); a[(k*2)+1] ^= QC32dn(k<< 4, 13);} break;
        }
-        // RBTT
+        groestl512_progressMessage_quad(state, msgBitsliced);
 #pragma unroll 16
        for(int k=0;k<32;k+=2)
        {
            uint32_t t2_0  = B32_0(a[(k +  2) & 0x1f]), t1_0  = B32_0(a[(k +  1) & 0x1f]);
            uint32_t t6_1  = B32_1(a[(k +  6) & 0x1f]), t5_1  = B32_1(a[(k +  5) & 0x1f]);
            uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2  = B32_2(a[(k +  9) & 0x1f]);
            uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]);
-            t[k + 0] =  T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^ 
+        // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash
-                        T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn(  t9_2 ) ^ T3dn( t13_3 );
+        uint32_t *outpHash = &g_hash[hashPosition<<4];
        uint32_t hash[16];
        from_bitslice_quad(state, hash);
-            t[k + 1] =  T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^ 
+        if ((threadIdx.x & 0x03) == 0)
-                        T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up(  t9_2 ) ^ T3up( t13_3 );
+        {
 #pragma unroll 16
            for(int k=0;k<16;k++) outpHash[k] = hash[k];
        }
 #pragma unroll 32
        for(int k=0;k<32;k++)
            a[k] = t[k];
    }
 }
 __global__ void  quark_groestl512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
 {
 #if USE_SHARED
    extern __shared__ char mixtabs[];
-    if (threadIdx.x < 256)
+__global__ void __launch_bounds__(256, 4)
-    {
+ quark_doublegroestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
-        *((uint32_t*)mixtabs + (    threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x);
+{
-        *((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x);
+    int thread = (blockDim.x * blockIdx.x + threadIdx.x)>>2;
        *((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x);
        *((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x);
        *((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x);
        *((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x);
        *((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x);
        *((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x);
    }
    __syncthreads();
 #endif
    int thread = (blockDim.x * blockIdx.x + threadIdx.x);
    if (thread < threads)
    {
        // GROESTL
-        uint32_t message[32];
+        uint32_t message[8];
-        uint32_t state[32];
+        uint32_t state[8];
        uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
        int hashPosition = nounce - startNounce;
-        uint32_t *inpHash = &g_hash[16 * hashPosition];
+        uint32_t *inpHash = &g_hash[hashPosition<<4];
-#pragma unroll 16
+#pragma unroll 4
-        for(int k=0;k<16;k++) message[k] = inpHash[k];
+        for(int k=0;k<4;k++) message[k] = inpHash[(k<<2)+(threadIdx.x&0x03)];
-#pragma unroll 14
+#pragma unroll 4
-        for(int k=1;k<15;k++)
+        for(int k=4;k<8;k++) message[k] = 0;
-            message[k+16] = 0;
+
-
+        if ((threadIdx.x&0x03) == 0) message[4] = 0x80;
-        message[16] = 0x80;
+        if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000;
-        message[31] = 0x01000000;
+
-
+        uint32_t msgBitsliced[8];
-#pragma unroll 32
+        to_bitslice_quad(message, msgBitsliced);
-        for(int u=0;u<32;u++) state[u] = message[u];
+
-        state[31] ^= 0x20000;
+        for (int round=0; round<2; round++)
-
+        {
-        // Perm
+            groestl512_progressMessage_quad(state, msgBitsliced);
-#if USE_SHARED
+
-        quark_groestl512_perm_P(state, mixtabs);
+            if (round < 1)
-        state[31] ^= 0x20000;
+            {
-        quark_groestl512_perm_Q(message, mixtabs);
+                // Verkettung zweier Runden inclusive Padding.
-#else
+                msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + (((threadIdx.x%4)==3)<<13));
-        quark_groestl512_perm_P(state, NULL);
+                msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341);
-        state[31] ^= 0x20000;
+                msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341);
-        quark_groestl512_perm_Q(message, NULL);
+                msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341);
-#endif
+                msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341);
-#pragma unroll 32
+                msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341);
-        for(int u=0;u<32;u++) state[u] ^= message[u];
+                msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341);
-
+                msgBitsliced[ 7] = __byte_perm(state[ 7], 0x00800100, 0x4341 + (((threadIdx.x%4)==0)<<4));
-#pragma unroll 32
+            }
-        for(int u=0;u<32;u++) message[u] = state[u];
+        }
-
+
-#if USE_SHARED
+        // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash
-        quark_groestl512_perm_P(message, mixtabs);
+        uint32_t *outpHash = &g_hash[hashPosition<<4];
-#else
+        uint32_t hash[16];
-        quark_groestl512_perm_P(message, NULL);
+        from_bitslice_quad(state, hash);
 #endif
 #pragma unroll 32
        for(int u=0;u<32;u++) state[u] ^= message[u];
        // Erzeugten Hash rausschreiben
        uint32_t *outpHash = &g_hash[16 * hashPosition];
        if ((threadIdx.x & 0x03) == 0)
        {
 #pragma unroll 16
-        for(int k=0;k<16;k++) outpHash[k] = state[k+16];
+            for(int k=0;k<16;k++) outpHash[k] = hash[k];
        }
    }
 }
 #define texDef(texname, texmem, texsource, texsize) \
    unsigned int *texmem; \
    cudaMalloc(&texmem, texsize); \
    cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
    texname.normalized = 0; \
    texname.filterMode = cudaFilterModePoint; \
    texname.addressMode[0] = cudaAddressModeClamp; \
    { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
      cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
 // Setup-Funktionen
 __host__ void quark_groestl512_cpu_init(int thr_id, int threads)
 {
    cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
 // Texturen mit obigem Makro initialisieren
    texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
    texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
    texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
    texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
    texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
    texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
    texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
    texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
 }
 __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-    // Compute 3.5 und 5.x Geräte am besten mit 768 Threads ansteuern,
+    int threadsperblock = 256;
-    // alle anderen mit 512 Threads.
+
-    int threadsperblock = ((props[thr_id].major == 3 && props[thr_id].minor == 5) || props[thr_id].major > 3) ? 768 : 512;
+    // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
    // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
    const int factor = 4;
    // berechne wie viele Thread Blocks wir brauchen
-    dim3 grid((threads + threadsperblock-1)/threadsperblock);
+    dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
    dim3 block(threadsperblock);
    // Größe des dynamischen Shared Memory Bereichs
 #if USE_SHARED
    size_t shared_size = 8 * 256 * sizeof(uint32_t);
 #else
    size_t shared_size = 0;
 #endif
-//    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
+    quark_groestl512_gpu_hash_64_quad<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
    //fprintf(stderr, "ThrID: %d\n", thr_id);
    quark_groestl512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
    // Strategisches Sleep Kommando zur Senkung der CPU Last
    MyStreamSynchronize(NULL, order, thr_id);
@ -372,25 +153,20 @@ __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t sta
 __host__ void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-    // Compute 3.5 und 5.x Geräte am besten mit 768 Threads ansteuern,
+    int threadsperblock = 256;
-    // alle anderen mit 512 Threads.
+
-    int threadsperblock = ((props[thr_id].major == 3 && props[thr_id].minor == 5) || props[thr_id].major > 3) ? 768 : 512;
+    // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
    // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
    const int factor = 4;
    // berechne wie viele Thread Blocks wir brauchen
-    dim3 grid((threads + threadsperblock-1)/threadsperblock);
+    dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
    dim3 block(threadsperblock);
    // Größe des dynamischen Shared Memory Bereichs
 #if USE_SHARED
    size_t shared_size = 8 * 256 * sizeof(uint32_t);
 #else
    size_t shared_size = 0;
 #endif
-//  fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
+    quark_doublegroestl512_gpu_hash_64_quad<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
    //fprintf(stderr, "ThrID: %d\n", thr_id);
    quark_groestl512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
    quark_groestl512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
    // Strategisches Sleep Kommando zur Senkung der CPU Last
    MyStreamSynchronize(NULL, order, thr_id);
--- a/quark/cuda_quark_keccak512.cu
+++ b/quark/cuda_quark_keccak512.cu
@ -175,8 +175,6 @@ __host__ void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t star
    // Größe des dynamischen Shared Memory Bereichs
    size_t shared_size = 0;
 //    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
    quark_keccak512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
    MyStreamSynchronize(NULL, order, thr_id);
 }
--- a/quark/cuda_skein512.cu
+++ b/quark/cuda_skein512.cu
@ -442,7 +442,6 @@ __host__ void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t start
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 	quark_skein512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 	// Strategisches Sleep Kommando zur Senkung der CPU Last
--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@ -157,7 +157,6 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
 {
 	const uint32_t first_nonce = pdata[19];
 	// TODO: entfernen für eine Release! Ist nur zum Testen!
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
--- a/x11/cuda_x11_cubehash512.cu
+++ b/x11/cuda_x11_cubehash512.cu
@ -307,8 +307,6 @@ __host__ void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t star
    // Größe des dynamischen Shared Memory Bereichs
    size_t shared_size = 0;
 //    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
    x11_cubehash512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
    MyStreamSynchronize(NULL, order, thr_id);
 }
--- a/x11/cuda_x11_echo.cu
+++ b/x11/cuda_x11_echo.cu
@ -225,8 +225,6 @@ __host__ void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNou
    // Größe des dynamischen Shared Memory Bereichs
    size_t shared_size = 0;
 //    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
    x11_echo512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
    MyStreamSynchronize(NULL, order, thr_id);
 }
--- a/x11/cuda_x11_luffa512.cu
+++ b/x11/cuda_x11_luffa512.cu
@ -376,8 +376,6 @@ __host__ void x11_luffa512_cpu_hash_64(int thr_id, int threads, uint32_t startNo
    // Größe des dynamischen Shared Memory Bereichs
    size_t shared_size = 0;
 //    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
    x11_luffa512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
    MyStreamSynchronize(NULL, order, thr_id);
 }
--- a/x11/cuda_x11_shavite512.cu
+++ b/x11/cuda_x11_shavite512.cu
@ -1372,8 +1372,6 @@ __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t start
    // Größe des dynamischen Shared Memory Bereichs
    size_t shared_size = 0;
 //    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
    x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
    MyStreamSynchronize(NULL, order, thr_id);
 }
--- a/x11/x11.cu
+++ b/x11/x11.cu
@ -162,11 +162,8 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 {
 	const uint32_t first_nonce = pdata[19];
-	// TODO: entfernen für eine Release! Ist nur zum Testen!
+	if (opt_benchmark)
 	if (opt_benchmark) {
 		((uint32_t*)ptarget)[7] = 0x0000ff;
        pdata[17] = 0;
    }
 	const uint32_t Htarg = ptarget[7];