bump to revision V1.1 with Killer Groestl

11 years ago · 3b21069504
35 changed files with 1301 additions and 1387 deletions
--- a/JHA/cuda_jha_keccak512.cu
+++ b/JHA/cuda_jha_keccak512.cu
@ -567,8 +567,6 @@ __host__ void jackpot_keccak512_cpu_hash(int thr_id, int threads, uint32_t start
    // Größe des dynamischen Shared Memory Bereichs
    size_t shared_size = 0;
 //    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
    jackpot_keccak512_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash);
    MyStreamSynchronize(NULL, order, thr_id);
 }
--- a/JHA/jackpotcoin.cu
+++ b/JHA/jackpotcoin.cu
@ -101,14 +101,12 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 {
 	const uint32_t first_nonce = pdata[19];
 	// TODO: entfernen für eine Release! Ist nur zum Testen!
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
 	const uint32_t Htarg = ptarget[7];
 	const int throughput = 256*4096*4; // 100;
 	//const int throughput = 256*256*2+100; // 100;
 	static bool init[8] = {0,0,0,0,0,0,0,0};
 	if (!init[thr_id])
@ -167,16 +165,18 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 			quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
 		}
-		// Runde 2 (ohne Gröstl)
+		// Runde 3 (komplett)
 		// jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01)
 		jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
 			d_branch1Nonces[thr_id], &nrm1,
-			d_branch3Nonces[thr_id], &nrm3,
+			d_branch2Nonces[thr_id], &nrm2,
 			order++);
-		// verfolge den skein-pfad weiter
+		if (nrm1+nrm2 == nrm3) {
-		quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+			quark_groestl512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
 			quark_skein512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
 		}
 		// jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01)
 		jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
@ -226,7 +226,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 			if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {
 				pdata[19] = foundNonce;
-				*hashes_done = (foundNonce - first_nonce + 1)/4;
+				*hashes_done = (foundNonce - first_nonce + 1)/2;
 				//applog(LOG_INFO, "GPU #%d: result for nonce $%08X does validate on CPU (%d rounds)!", thr_id, foundNonce, rounds);
 				return 1;
 			} else {
@ -238,6 +238,6 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
-	*hashes_done = (pdata[19] - first_nonce + 1)/4;
+	*hashes_done = (pdata[19] - first_nonce + 1)/2;
 	return 0;
 }
--- a/README.txt
+++ b/README.txt
@ -1,5 +1,5 @@
-ccMiner release 1.0 (May 10th 2014) - "Did anyone say X11?"
+ccMiner release 1.1 (June 14th 2014) - "Killer Groestl!"
 -------------------------------------------------------------
 ***************************************************************
@ -30,13 +30,12 @@ FugueCoin
 GroestlCoin & Myriad-Groestl
 JackpotCoin
 QuarkCoin family & AnimeCoin
 TalkCoin
 DarkCoin and other X11 coins
 where some of these coins have a VERY NOTABLE nVidia advantage
 over competing AMD (OpenCL) implementations.
 X11 algo is being worked on. It will be released when we
 have achieved a nice nVidia advantage.
 We did not put much effort into improving usability, so please set
 your parameters carefully.
@ -140,6 +139,12 @@ features.
 >>> RELEASE HISTORY <<<
  June 14th 2014  released Killer Groestl quad version which I deem
                  sufficiently hard to port over to AMD. It isn't
                  the fastest option for Compute 3.5 and 5.0 cards,
                  but it is still much faster than the table based
                  versions.
  May 10th 2014   added X11, but without the bells & whistles
                  (no killer Groestl, SIMD hash quite slow still)
--- a/bitslice_transformations_quad.cu
+++ b/bitslice_transformations_quad.cu
@ -0,0 +1,418 @@
 /**
  * Packs eight 32-bit words per lane into eight bit-sliced words, cooperating
  * with the other lanes of a 4-lane group through width-4 warp shuffles.
  *
  * All four lanes of a group have to execute this call together, because each
  * lane reads values held by its neighbours.
  *
  * @param input   8 words; overwritten in place with the shuffled values (on
  *                odd lanes additionally with their 16-bit halves swapped).
  * @param output  8 words; fully overwritten. With `input`/`other` denoting the
  *                post-shuffle values, for k = 0..7 and j = 0..7:
  *                  bit 2*j    of output[k] = bit k   of input[j]
  *                  bit 2*j+1  of output[k] = bit k   of other[j]
  *                  bit 16+2*j of output[k] = bit 8+k of input[j]
  *                  bit 17+2*j of output[k] = bit 8+k of other[j]
  *                Only bits 0..15 of the shuffled words are consumed.
  *
  * NOTE(review): relies on the legacy mask-less __shfl intrinsic, which is not
  * available when targeting Volta or newer - confirm the supported targets.
  */
 __device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *output)
 {
    const int  lane    = threadIdx.x % 4;
    // Lanes 1 and 2 fetch from each other; lanes 0 and 3 fetch their own value.
    const int  swapSrc = (lane == 1 || lane == 2) ? (lane ^ 3) : lane;
    const int  nextSrc = (threadIdx.x + 1) % 4;
    // Lanes 0/1 read from lane 0 of the group, lanes 2/3 read from lane 2.
    const int  pairSrc = threadIdx.x & 2;
    const bool oddLane = (threadIdx.x & 1) != 0;

    uint32_t other[8];

 #pragma unroll 8
    for (int w = 0; w < 8; w++) {
        input[w] = __shfl((int)input[w], swapSrc, 4);
        other[w] = __shfl((int)input[w], nextSrc, 4);
        input[w] = __shfl((int)input[w], pairSrc, 4);
        other[w] = __shfl((int)other[w], pairSrc, 4);
        if (oddLane) {
            // Selector 0x1032 exchanges the upper and lower 16-bit halves.
            input[w] = __byte_perm(input[w], 0, 0x1032);
            other[w] = __byte_perm(other[w], 0, 0x1032);
        }
        output[w] = 0;
    }

    // Bit-plane k of every source word is interleaved into output[k]:
    // first the planes taken from byte 0, then the planes taken from byte 1.
 #pragma unroll
    for (int k = 0; k < 8; k++) {
 #pragma unroll
        for (int j = 0; j < 8; j++) {
            output[k] |= ((input[j] >> k) & 1u) << (2 * j);
            output[k] |= ((other[j] >> k) & 1u) << (2 * j + 1);
        }
 #pragma unroll
        for (int j = 0; j < 8; j++) {
            output[k] |= ((input[j] >> (8 + k)) & 1u) << (16 + 2 * j);
            output[k] |= ((other[j] >> (8 + k)) & 1u) << (17 + 2 * j);
        }
    }
 }
 /**
  * Unpacks eight bit-sliced words into sixteen output words, combining the
  * partial results of a 4-lane group through width-4 warp shuffles.
  *
  * All four lanes of a group have to execute this call together.
  *
  * @param input   8 bit-sliced words; only bits 8..15 and 24..31 are read.
  * @param output  16 words; fully overwritten. Before the lane exchange, for
  *                k = 0..7 and j = 0..7:
  *                  bit j   of output[2*k] = bit 8+k  of input[j]
  *                  bit 8+j of output[2*k] = bit 24+k of input[j]
  *                After the exchange only the lane with threadIdx.x % 4 == 0
  *                keeps its words; every other lane ends with all zeros.
  *
  * NOTE(review): relies on the legacy mask-less __shfl intrinsic, which is not
  * available when targeting Volta or newer - confirm the supported targets.
  */
 __device__ __forceinline__ void from_bitslice_quad(uint32_t *input, uint32_t *output)
 {
    // Only the even-indexed words act as accumulators; the odd-indexed ones
    // are assigned outright during the lane exchange below.
 #pragma unroll
    for (int k = 0; k < 8; k++)
        output[2 * k] = 0;

    // Gather bit-plane k of byte 1 and of byte 3 from every input word.
 #pragma unroll
    for (int k = 0; k < 8; k++) {
 #pragma unroll
        for (int j = 0; j < 8; j++)
            output[2 * k] |= ((input[j] >> (8 + k)) & 1u) << j;
 #pragma unroll
        for (int j = 0; j < 8; j++)
            output[2 * k] |= ((input[j] >> (24 + k)) & 1u) << (8 + j);
    }

    const bool oddLane     = (threadIdx.x & 1) != 0;
    const bool groupLeader = (threadIdx.x % 4) == 0;
    const int  nextSrc     = (threadIdx.x + 1) % 4;
    const int  oppositeSrc = (threadIdx.x + 2) % 4;

 #pragma unroll 8
    for (int k = 0; k < 8; k++) {
        const int even = 2 * k;
        const int odd  = even + 1;

        // Odd lanes swap their 16-bit halves before the exchange.
        if (oddLane)
            output[even] = __byte_perm(output[even], 0, 0x1032);

        // Selector 0x7610: keep the own low half, take the high half of the
        // value held by the next lane in the group.
        const uint32_t fromNext = __shfl((int)output[even], nextSrc, 4);
        output[even] = __byte_perm(output[even], fromNext, 0x7610);

        // The odd word is the merged word of the lane two positions further.
        output[odd] = __shfl((int)output[even], oppositeSrc, 4);

        if (!groupLeader) {
            output[odd]  = 0;
            output[even] = 0;
        }
    }
 }
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -287,6 +287,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
    <ClInclude Include="uint256.h" />
  </ItemGroup>
  <ItemGroup>
    <CudaCompile Include="bitslice_transformations_quad.cu">
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
    </CudaCompile>
    <CudaCompile Include="cuda_fugue256.cu">
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@ -311,6 +317,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
    </CudaCompile>
    <CudaCompile Include="groestl_functions_quad.cu">
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
    </CudaCompile>
    <CudaCompile Include="heavy\cuda_blake512.cu">
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -340,5 +340,11 @@
    <CudaCompile Include="x11\simd_functions.cu">
      <Filter>Source Files\CUDA\x11</Filter>
    </CudaCompile>
    <CudaCompile Include="bitslice_transformations_quad.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
    <CudaCompile Include="groestl_functions_quad.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
  </ItemGroup>
 </Project>
--- a/configure.ac
+++ b/configure.ac
@ -1,4 +1,4 @@
-AC_INIT([ccminer], [2014.05.10])
+AC_INIT([ccminer], [2014.06.14])
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@ -917,8 +917,8 @@ static void *miner_thread(void *userdata)
 			goto out;
 		}
-        if (opt_benchmark)
+//        if (opt_benchmark)
-            if (++rounds == 1) exit(0);
+//            if (++rounds == 1) exit(0);
 		/* record scanhash elapsed time */
 		gettimeofday(&tv_end, NULL);
@ -1469,7 +1469,7 @@ static void signal_handler(int sig)
 }
 #endif
-#define PROGRAM_VERSION "1.0"
+#define PROGRAM_VERSION "1.1"
 int main(int argc, char *argv[])
 {
 	struct thr_info *thr;
--- a/cpuminer-config.h
+++ b/cpuminer-config.h
@ -152,7 +152,7 @@
 #define PACKAGE_NAME "ccminer"
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "ccminer 2014.05.10"
+#define PACKAGE_STRING "ccminer 2014.06.14"
 /* Define to the one symbol short name of this package. */
 #undef PACKAGE_TARNAME
@ -161,7 +161,7 @@
 #undef PACKAGE_URL
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "2014.05.10"
+#define PACKAGE_VERSION "2014.06.14"
 /* If using the C implementation of alloca, define if you know the
   direction of stack growth for your system; otherwise it will be
--- a/cuda_groestlcoin.cu
+++ b/cuda_groestlcoin.cu
@ -1,4 +1,4 @@
-// Auf Groestlcoin spezialisierte Version von Groestl
+// Auf Groestlcoin spezialisierte Version von Groestl inkl. Bitslice
 #include <cuda.h>
 #include "cuda_runtime.h"
@ -7,9 +7,6 @@
 #include <stdio.h>
 #include <memory.h>
 // it's unfortunate that this is a compile time constant.
 #define MAXWELL_OR_FERMI 1
 // aus cpu-miner.c
 extern int device_map[8];
@ -18,361 +15,80 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
 // Folgende Definitionen später durch header ersetzen
 typedef unsigned char uint8_t;
 typedef unsigned short uint16_t;
 typedef unsigned int uint32_t;
 typedef unsigned long long uint64_t;
 // diese Struktur wird in der Init Funktion angefordert
-static cudaDeviceProp props;
+static cudaDeviceProp props[8];
 // globaler Speicher für alle HeftyHashes aller Threads
 __constant__ uint32_t pTarget[8]; // Single GPU
 extern uint32_t *d_resultNonce[8];
 __constant__ uint32_t groestlcoin_gpu_msg[32];
-#define SPH_C32(x)    ((uint32_t)(x ## U))
+// 64 Register Variante für Compute 3.0
-#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
+#include "groestl_functions_quad.cu"
-
+#include "bitslice_transformations_quad.cu"
 #define PC32up(j, r)   ((uint32_t)((j) + (r)))
 #define PC32dn(j, r)   0
 #define QC32up(j, r)   0xFFFFFFFF
 #define QC32dn(j, r)   (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
 #define B32_0(x)    __byte_perm(x, 0, 0x4440)
 //((x) & 0xFF)
 #define B32_1(x)    __byte_perm(x, 0, 0x4441)
 //(((x) >> 8) & 0xFF)
 #define B32_2(x)    __byte_perm(x, 0, 0x4442)
 //(((x) >> 16) & 0xFF)
 #define B32_3(x)    __byte_perm(x, 0, 0x4443)
 //((x) >> 24)
 #if MAXWELL_OR_FERMI
 #define USE_SHARED 1
 // Maxwell and Fermi cards get the best speed with SHARED access it seems.
 #if USE_SHARED
 #define T0up(x) (*((uint32_t*)mixtabs + (    (x))))
 #define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
 #define T1up(x) (*((uint32_t*)mixtabs + (512+(x))))
 #define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
 #define T2up(x) (*((uint32_t*)mixtabs + (1024+(x))))
 #define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
 #define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
 #define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x))))
 #else
 #define T0up(x) tex1Dfetch(t0up1, x)
 #define T0dn(x) tex1Dfetch(t0dn1, x)
 #define T1up(x) tex1Dfetch(t1up1, x)
 #define T1dn(x) tex1Dfetch(t1dn1, x)
 #define T2up(x) tex1Dfetch(t2up1, x)
 #define T2dn(x) tex1Dfetch(t2dn1, x)
 #define T3up(x) tex1Dfetch(t3up1, x)
 #define T3dn(x) tex1Dfetch(t3dn1, x)
 #endif
 #else
 #define USE_SHARED 1
 // a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5!
 #define T0up(x) (*((uint32_t*)mixtabs + (    (x))))
 #define T0dn(x) tex1Dfetch(t0dn1, x)
 #define T1up(x) tex1Dfetch(t1up1, x)
 #define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
 #define T2up(x) tex1Dfetch(t2up1, x)
 #define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
 #define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
 #define T3dn(x) tex1Dfetch(t3dn1, x)
 #endif
 texture<unsigned int, 1, cudaReadModeElementType> t0up1;
 texture<unsigned int, 1, cudaReadModeElementType> t0dn1;
 texture<unsigned int, 1, cudaReadModeElementType> t1up1;
 texture<unsigned int, 1, cudaReadModeElementType> t1dn1;
 texture<unsigned int, 1, cudaReadModeElementType> t2up1;
 texture<unsigned int, 1, cudaReadModeElementType> t2dn1;
 texture<unsigned int, 1, cudaReadModeElementType> t3up1;
 texture<unsigned int, 1, cudaReadModeElementType> t3dn1;
 extern uint32_t T0up_cpu[];
 extern uint32_t T0dn_cpu[];
 extern uint32_t T1up_cpu[];
 extern uint32_t T1dn_cpu[];
 extern uint32_t T2up_cpu[];
 extern uint32_t T2dn_cpu[];
 extern uint32_t T3up_cpu[];
 extern uint32_t T3dn_cpu[];
 #define SWAB32(x)        ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
-
+__global__ void __launch_bounds__(256, 4)
-__device__ __forceinline__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
+ groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resNounce)
 {
 	uint32_t t[32];
 //#pragma unroll 14
 	for(int r=0;r<14;r++)
 	{
 		switch(r)
 {
-			case 0:
+    // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
-#pragma unroll 16
+    int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4;
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break;
+    if (thread < threads)
 			case 1:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break;
 			case 2:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break;
 			case 3:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break;
 			case 4:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break;
 			case 5:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break;
 			case 6:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break;
 			case 7:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break;
 			case 8:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break;
 			case 9:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break;
 			case 10:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break;
 			case 11:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break;
 			case 12:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break;
 			case 13:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break;
 		}
        // RBTT
 #pragma unroll 16
        for(int k=0;k<32;k+=2)
    {
-            uint32_t t0_0 = B32_0(a[(k     ) & 0x1f]), t9_0  = B32_0(a[(k +  9) & 0x1f]);
+        // GROESTL
-            uint32_t t2_1 = B32_1(a[(k +  2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]);
+        uint32_t paddedInput[8];
-            uint32_t t4_2 = B32_2(a[(k +  4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]);
+#pragma unroll 8
-            uint32_t t6_3 = B32_3(a[(k +  6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]);
+        for(int k=0;k<8;k++) paddedInput[k] = groestlcoin_gpu_msg[4*k+threadIdx.x%4];
-            t[k + 0] =  T0up( t0_0 ) ^ T1up(  t2_1 ) ^ T2up(  t4_2 ) ^ T3up(  t6_3 ) ^ 
+        uint32_t nounce = startNounce + thread;
-                        T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 );
+        if ((threadIdx.x % 4) == 3)
            paddedInput[4] = SWAB32(nounce);  // 4*4+3 = 19
-            t[k + 1] =  T0dn( t0_0 ) ^ T1dn(  t2_1 ) ^ T2dn(  t4_2 ) ^ T3dn(  t6_3 ) ^ 
+        uint32_t msgBitsliced[8];
-                        T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 );
+        to_bitslice_quad(paddedInput, msgBitsliced);
        }
 #pragma unroll 32
        for(int k=0;k<32;k++)
            a[k] = t[k];
    }
 }
-__device__ __forceinline__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
+        uint32_t state[8];
        for (int round=0; round<2; round++)
        {
-//#pragma unroll 14
+            groestl512_progressMessage_quad(state, msgBitsliced);
 	for(int r=0;r<14;r++)
 	{
 		uint32_t t[32];
-		switch(r)
+            if (round < 1)
            {
-			case 0:
+                // Verkettung zweier Runden inclusive Padding.
-	#pragma unroll 16
+                msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + ((threadIdx.x%4)==3)*0x2000);
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break;
+                msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341);
-			case 1:
+                msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341);
-	#pragma unroll 16
+                msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341);
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break;
+                msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341);
-			case 2:
+                msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341);
-	#pragma unroll 16
+                msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341);
-				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break;
+                msgBitsliced[ 7] = __byte_perm(state[ 7], 0x00800100, 0x4341 + ((threadIdx.x%4)==0)*0x0010);
 			case 3:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break;
 			case 4:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break;
 			case 5:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break;
 			case 6:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break;
 			case 7:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break;
 			case 8:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break;
 			case 9:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break;
 			case 10:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break;
 			case 11:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break;
 			case 12:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break;
 			case 13:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break;
 		}
        // RBTT
 #pragma unroll 16
        for(int k=0;k<32;k+=2)
        {
            uint32_t t2_0  = B32_0(a[(k +  2) & 0x1f]), t1_0  = B32_0(a[(k +  1) & 0x1f]);
            uint32_t t6_1  = B32_1(a[(k +  6) & 0x1f]), t5_1  = B32_1(a[(k +  5) & 0x1f]);
            uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2  = B32_2(a[(k +  9) & 0x1f]);
            uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]);
            t[k + 0] =  T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^ 
                        T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn(  t9_2 ) ^ T3dn( t13_3 );
            t[k + 1] =  T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^ 
                        T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up(  t9_2 ) ^ T3up( t13_3 );
        }
 #pragma unroll 32
        for(int k=0;k<32;k++)
            a[k] = t[k];
            }
        }
 #if USE_SHARED
 __global__ void  /* __launch_bounds__(256) */
 #else
 __global__ void 
 #endif
- groestlcoin_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce)
+        // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash
-{
+        uint32_t out_state[16];
-#if USE_SHARED
+        from_bitslice_quad(state, out_state);
 	extern __shared__ char mixtabs[];
-	if (threadIdx.x < 256)
+        if (threadIdx.x % 4 == 0)
 	{
 		*((uint32_t*)mixtabs + (    threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x);
 		*((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x);
 		*((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x);
 		*((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x);
 		*((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x);
 		*((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x);
 		*((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x);
 		*((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x);
 	}
 	__syncthreads();
 #endif
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
        {
 		// GROESTL
 		uint32_t message[32];
 		uint32_t state[32];
 #pragma unroll 32
 		for(int k=0;k<32;k++) message[k] = groestlcoin_gpu_msg[k];
 		uint32_t nounce = startNounce + thread;
 		message[19] = SWAB32(nounce);
 #pragma unroll 32
 		for(int u=0;u<32;u++) state[u] = message[u];
 		state[31] ^= 0x20000;
 		// Perm
 #if USE_SHARED
 		groestlcoin_perm_P(state, mixtabs);
 		state[31] ^= 0x20000;
 		groestlcoin_perm_Q(message, mixtabs);
 #else
 		groestlcoin_perm_P(state, NULL);
 		state[31] ^= 0x20000;
 		groestlcoin_perm_Q(message, NULL);
 #endif
 #pragma unroll 32
 		for(int u=0;u<32;u++) state[u] ^= message[u];
 #pragma unroll 32
 		for(int u=0;u<32;u++) message[u] = state[u];
 #if USE_SHARED
 		groestlcoin_perm_P(message, mixtabs);
 #else
 		groestlcoin_perm_P(message, NULL);
 #endif
 #pragma unroll 32
 		for(int u=0;u<32;u++) state[u] ^= message[u];
 		////
 		//// 2. Runde groestl
 		////
 #pragma unroll 16
 		for(int k=0;k<16;k++) message[k] = state[k + 16];
 #pragma unroll 14
 		for(int k=1;k<15;k++)
 			message[k+16] = 0;
 		message[16] = 0x80;
 		message[31] = 0x01000000;
 #pragma unroll 32
 		for(int u=0;u<32;u++)
 			state[u] = message[u];
 		state[31] ^= 0x20000;
 		// Perm
 #if USE_SHARED
 		groestlcoin_perm_P(state, mixtabs);
 		state[31] ^= 0x20000;
 		groestlcoin_perm_Q(message, mixtabs);
 #else
 		groestlcoin_perm_P(state, NULL);
 		state[31] ^= 0x20000;
 		groestlcoin_perm_Q(message, NULL);
 #endif
 #pragma unroll 32
 		for(int u=0;u<32;u++) state[u] ^= message[u];
 #pragma unroll 32
 		for(int u=0;u<32;u++) message[u] = state[u];
 #if USE_SHARED
 		groestlcoin_perm_P(message, mixtabs);
 #else
 		groestlcoin_perm_P(message, NULL);
 #endif
 #pragma unroll 32
 		for(int u=0;u<32;u++) state[u] ^= message[u];
 		// kopiere Ergebnis
            int i, position = -1;
            bool rc = true;
    #pragma unroll 8
            for (i = 7; i >= 0; i--) {
-			if (state[i+16] > pTarget[i]) {
+                if (out_state[i] > pTarget[i]) {
                    if(position < i) {
                        position = i;
                        rc = false;
                    }
                 }
-	 		if (state[i+16] < pTarget[i]) {
+                 if (out_state[i] < pTarget[i]) {
                    if(position < i) {
                        position = i;
                        rc = true;
@ -385,33 +101,14 @@ __global__ void
                    resNounce[0] = nounce;
        }
    }
-
+}
 #define texDef(texname, texmem, texsource, texsize) \
 	unsigned int *texmem; \
 	cudaMalloc(&texmem, texsize); \
 	cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
 	texname.normalized = 0; \
 	texname.filterMode = cudaFilterModePoint; \
 	texname.addressMode[0] = cudaAddressModeClamp; \
 	{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
 	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
 // Setup-Funktionen
 __host__ void groestlcoin_cpu_init(int thr_id, int threads)
 {
    cudaSetDevice(device_map[thr_id]);
-	cudaGetDeviceProperties(&props, device_map[thr_id]);
+    cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
 	// Texturen mit obigem Makro initialisieren
 	texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
 	texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
 	texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
 	texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
 	texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
 	texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
 	texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
 	texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
    // Speicher für Gewinner-Nonce belegen
    cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); 
@ -446,25 +143,21 @@ __host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
 __host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
 {
-	// Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
+    int threadsperblock = 256;
-	// alle anderen mit 512 Threads.
+
-	int threadsperblock = (props.major >= 3) ? 768 : 512;
+    // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
    // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
    int factor = 4;
        // berechne wie viele Thread Blocks wir brauchen
-	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+    dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
    dim3 block(threadsperblock);
    // Größe des dynamischen Shared Memory Bereichs
 #if USE_SHARED
 	size_t shared_size = 8 * 256 * sizeof(uint32_t);
 #else
    size_t shared_size = 0;
 #endif
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 	//fprintf(stderr, "ThrID: %d\n", thr_id);
    cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
-	groestlcoin_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
+    groestlcoin_gpu_hash_quad<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
    // Strategisches Sleep Kommando zur Senkung der CPU Last
    MyStreamSynchronize(NULL, 0, thr_id);
--- a/cuda_myriadgroestl.cu
+++ b/cuda_myriadgroestl.cu
@ -1,4 +1,4 @@
-// Auf Myriadcoin spezialisierte Version von Groestl
+// Auf Myriadcoin spezialisierte Version von Groestl inkl. Bitslice
 #include <cuda.h>
 #include "cuda_runtime.h"
@ -7,9 +7,6 @@
 #include <stdio.h>
 #include <memory.h>
 // it's unfortunate that this is a compile time constant.
 #define MAXWELL_OR_FERMI 1
 // aus cpu-miner.c
 extern int device_map[8];
@ -22,15 +19,18 @@ typedef unsigned short uint16_t;
 typedef unsigned int uint32_t;
 // diese Struktur wird in der Init Funktion angefordert
-static cudaDeviceProp props;
+static cudaDeviceProp props[8];
 // globaler Speicher für alle HeftyHashes aller Threads
 __constant__ uint32_t pTarget[8]; // Single GPU
 uint32_t *d_outputHashes[8];
 extern uint32_t *d_resultNonce[8];
 __constant__ uint32_t myriadgroestl_gpu_msg[32];
 // muss expandiert werden
 __constant__ uint32_t myr_sha256_gpu_constantTable[64];
 __constant__ uint32_t myr_sha256_gpu_constantTable2[64];
 __constant__ uint32_t myr_sha256_gpu_hashTable[8];
 uint32_t myr_sha256_cpu_hashTable[] = { 
@ -46,6 +46,22 @@ uint32_t myr_sha256_cpu_constantTable[] = {
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
 };
 uint32_t myr_sha256_cpu_w2Table[] = {
    0x80000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 
    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000200,
    0x80000000, 0x01400000, 0x00205000, 0x00005088, 0x22000800, 0x22550014, 0x05089742, 0xa0000020,
    0x5a880000, 0x005c9400, 0x0016d49d, 0xfa801f00, 0xd33225d0, 0x11675959, 0xf6e6bfda, 0xb30c1549,
    0x08b2b050, 0x9d7c4c27, 0x0ce2a393, 0x88e6e1ea, 0xa52b4335, 0x67a16f49, 0xd732016f, 0x4eeb2e91,
    0x5dbf55e5, 0x8eee2335, 0xe2bc5ec2, 0xa83f4394, 0x45ad78f7, 0x36f3d0cd, 0xd99c05e8, 0xb0511dc7,
    0x69bc7ac4, 0xbd11375b, 0xe3ba71e5, 0x3b209ff2, 0x18feee17, 0xe25ad9e7, 0x13375046, 0x0515089d,
    0x4f0d0f04, 0x2627484e, 0x310128d2, 0xc668b434, 0x420841cc, 0x62d311b8, 0xe59ba771, 0x85a7a484 };
 // 64 Register Variante für Compute 3.0
 #include "groestl_functions_quad.cu"
 #include "bitslice_transformations_quad.cu"
 #define SWAB32(x)        ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
 #if __CUDA_ARCH__ < 350 
    // Kepler (Compute 3.0)
    #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
@ -61,8 +77,6 @@ uint32_t myr_sha256_cpu_constantTable[] = {
 #define s0(x)            (ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3))
 #define s1(x)            (ROTR32(x, 17) ^ ROTR32(x, 19) ^ R(x, 10))
 #define SWAB32(x)		( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
 __device__ void myriadgroestl_gpu_sha256(uint32_t *message)
 {
    uint32_t W1[16];
@ -99,9 +113,7 @@ __device__ void myriadgroestl_gpu_sha256(uint32_t *message)
    }
 // Progress W2...W3
-#pragma unroll 3
+////// PART 1
 	for(int k=0;k<3;k++)
 	{
 #pragma unroll 2
    for(int j=0;j<2;j++)
        W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
@ -120,7 +132,7 @@ __device__ void myriadgroestl_gpu_sha256(uint32_t *message)
    for(int j=0;j<16;j++)
    {
        uint32_t T1, T2;
-			T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j];
+        T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16] + W2[j];
        T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
        #pragma unroll 7
@ -129,46 +141,35 @@ __device__ void myriadgroestl_gpu_sha256(uint32_t *message)
        regs[4] += T1;
    }
-#pragma unroll 16
+////// PART 2
-		for(int j=0;j<16;j++)
+#pragma unroll 2
-			W1[j] = W2[j];
+    for(int j=0;j<2;j++)
-	}
+        W1[j] = s1(W2[14+j]) + W2[9+j] + s0(W2[1+j]) + W2[j];
-
+#pragma unroll 5
-#pragma unroll 8
+    for(int j=2;j<7;j++)
-	for(int k=0;k<8;k++)
+        W1[j] = s1(W1[j-2]) + W2[9+j] + s0(W2[1+j]) + W2[j];
 		hash[k] += regs[k];
 	/////
 	///// Zweite Runde (wegen Msg-Padding)
 	/////
 #pragma unroll 8
-	for(int k=0;k<8;k++)
+    for(int j=7;j<15;j++)
-		regs[k] = hash[k];
+        W1[j] = s1(W1[j-2]) + W1[j-7] + s0(W2[1+j]) + W2[j];
-	W1[0] = SWAB32(0x80);
+    W1[15] = s1(W1[13]) + W1[8] + s0(W1[0]) + W2[15];
 #pragma unroll 14
 	for(int k=1;k<15;k++)
 		W1[k] = 0;
 	W1[15] = 512;
-// Progress W1
+    // Rundenfunktion
 #pragma unroll 16
    for(int j=0;j<16;j++)
    {
        uint32_t T1, T2;
-		T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j];
+        T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 32] + W1[j];
        T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
        #pragma unroll 7
-		for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
+        for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
        regs[0] = T1 + T2;
        regs[4] += T1;
    }
-// Progress W2...W3
+////// PART 3
 #pragma unroll 3
 	for(int k=0;k<3;k++)
 	{
 #pragma unroll 2
    for(int j=0;j<2;j++)
        W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
@ -187,7 +188,7 @@ __device__ void myriadgroestl_gpu_sha256(uint32_t *message)
    for(int j=0;j<16;j++)
    {
        uint32_t T1, T2;
-			T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j];
+        T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 48] + W2[j];
        T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
        #pragma unroll 7
@ -196,9 +197,29 @@ __device__ void myriadgroestl_gpu_sha256(uint32_t *message)
        regs[4] += T1;
    }
-#pragma unroll 16
+#pragma unroll 8
-		for(int j=0;j<16;j++)
+    for(int k=0;k<8;k++)
-			W1[j] = W2[j];
+        hash[k] += regs[k];
    /////
    ///// Zweite Runde (wegen Msg-Padding)
    /////
 #pragma unroll 8
    for(int k=0;k<8;k++)
        regs[k] = hash[k];
 // Progress W1
 #pragma unroll 64
    for(int j=0;j<64;j++)
    {
        uint32_t T1, T2;
        T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable2[j];
        T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
        #pragma unroll 7
        for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
        regs[0] = T1 + T2;
        regs[4] += T1;
    }
 #pragma unroll 8
@ -212,293 +233,55 @@ __device__ void myriadgroestl_gpu_sha256(uint32_t *message)
        message[k] = SWAB32(hash[k]);
 }
-#define SPH_C32(x)    ((uint32_t)(x ## U))
+__global__ void __launch_bounds__(256, 4)
-#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
+ myriadgroestl_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *hashBuffer)
 #define PC32up(j, r)   ((uint32_t)((j) + (r)))
 #define PC32dn(j, r)   0
 #define QC32up(j, r)   0xFFFFFFFF
 #define QC32dn(j, r)   (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
 #define B32_0(x)    __byte_perm(x, 0, 0x4440)
 //((x) & 0xFF)
 #define B32_1(x)    __byte_perm(x, 0, 0x4441)
 //(((x) >> 8) & 0xFF)
 #define B32_2(x)    __byte_perm(x, 0, 0x4442)
 //(((x) >> 16) & 0xFF)
 #define B32_3(x)    __byte_perm(x, 0, 0x4443)
 //((x) >> 24)
 #if MAXWELL_OR_FERMI
 #define USE_SHARED 1
 // Maxwell and Fermi cards get the best speed with SHARED access it seems.
 #if USE_SHARED
 #define T0up(x) (*((uint32_t*)mixtabs + (    (x))))
 #define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
 #define T1up(x) (*((uint32_t*)mixtabs + (512+(x))))
 #define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
 #define T2up(x) (*((uint32_t*)mixtabs + (1024+(x))))
 #define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
 #define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
 #define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x))))
 #else
 #define T0up(x) tex1Dfetch(t0up1, x)
 #define T0dn(x) tex1Dfetch(t0dn1, x)
 #define T1up(x) tex1Dfetch(t1up1, x)
 #define T1dn(x) tex1Dfetch(t1dn1, x)
 #define T2up(x) tex1Dfetch(t2up1, x)
 #define T2dn(x) tex1Dfetch(t2dn1, x)
 #define T3up(x) tex1Dfetch(t3up1, x)
 #define T3dn(x) tex1Dfetch(t3dn1, x)
 #endif
 #else
 #define USE_SHARED 1
 // a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5!
 #define T0up(x) (*((uint32_t*)mixtabs + (    (x))))
 #define T0dn(x) tex1Dfetch(t0dn1, x)
 #define T1up(x) tex1Dfetch(t1up1, x)
 #define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
 #define T2up(x) tex1Dfetch(t2up1, x)
 #define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
 #define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
 #define T3dn(x) tex1Dfetch(t3dn1, x)
 #endif
 texture<unsigned int, 1, cudaReadModeElementType> t0up1;
 texture<unsigned int, 1, cudaReadModeElementType> t0dn1;
 texture<unsigned int, 1, cudaReadModeElementType> t1up1;
 texture<unsigned int, 1, cudaReadModeElementType> t1dn1;
 texture<unsigned int, 1, cudaReadModeElementType> t2up1;
 texture<unsigned int, 1, cudaReadModeElementType> t2dn1;
 texture<unsigned int, 1, cudaReadModeElementType> t3up1;
 texture<unsigned int, 1, cudaReadModeElementType> t3dn1;
 extern uint32_t T0up_cpu[];
 extern uint32_t T0dn_cpu[];
 extern uint32_t T1up_cpu[];
 extern uint32_t T1dn_cpu[];
 extern uint32_t T2up_cpu[];
 extern uint32_t T2dn_cpu[];
 extern uint32_t T3up_cpu[];
 extern uint32_t T3dn_cpu[];
 #define SWAB32(x)		( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
 __device__ __forceinline__ void myriadgroestl_perm_P(uint32_t *a, char *mixtabs)
 {
-	uint32_t t[32];
+    // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
-
+    int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4;
-//#pragma unroll 14
+    if (thread < threads)
 	for(int r=0;r<14;r++)
 	{
 		switch(r)
    {
-			case 0:
+        // GROESTL
-#pragma unroll 16
+        uint32_t paddedInput[8];
-				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break;
+#pragma unroll 8
-			case 1:
+        for(int k=0;k<8;k++) paddedInput[k] = myriadgroestl_gpu_msg[4*k+threadIdx.x%4];
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break;
 			case 2:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break;
 			case 3:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break;
 			case 4:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break;
 			case 5:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break;
 			case 6:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break;
 			case 7:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break;
 			case 8:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break;
 			case 9:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break;
 			case 10:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break;
 			case 11:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break;
 			case 12:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break;
 			case 13:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break;
 		}
-        // RBTT
+        uint32_t nounce = startNounce + thread;
-#pragma unroll 16
+        if ((threadIdx.x % 4) == 3)
-        for(int k=0;k<32;k+=2)
+            paddedInput[4] = SWAB32(nounce);  // 4*4+3 = 19
        {
            uint32_t t0_0 = B32_0(a[(k     ) & 0x1f]), t9_0  = B32_0(a[(k +  9) & 0x1f]);
            uint32_t t2_1 = B32_1(a[(k +  2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]);
            uint32_t t4_2 = B32_2(a[(k +  4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]);
            uint32_t t6_3 = B32_3(a[(k +  6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]);
-            t[k + 0] =  T0up( t0_0 ) ^ T1up(  t2_1 ) ^ T2up(  t4_2 ) ^ T3up(  t6_3 ) ^ 
+        uint32_t msgBitsliced[8];
-                        T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 );
+        to_bitslice_quad(paddedInput, msgBitsliced);
-            t[k + 1] =  T0dn( t0_0 ) ^ T1dn(  t2_1 ) ^ T2dn(  t4_2 ) ^ T3dn(  t6_3 ) ^ 
+        uint32_t state[8];
                        T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 );
        }
 #pragma unroll 32
        for(int k=0;k<32;k++)
            a[k] = t[k];
    }
 }
-__device__ __forceinline__ void myriadgroestl_perm_Q(uint32_t *a, char *mixtabs)
+        groestl512_progressMessage_quad(state, msgBitsliced);
 {	
 //#pragma unroll 14
 	for(int r=0;r<14;r++)
 	{
 		uint32_t t[32];
-		switch(r)
+        uint32_t out_state[16];
-		{
+        from_bitslice_quad(state, out_state);
 			case 0:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break;
 			case 1:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break;
 			case 2:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break;
 			case 3:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break;
 			case 4:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break;
 			case 5:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break;
 			case 6:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break;
 			case 7:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break;
 			case 8:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break;
 			case 9:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break;
 			case 10:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break;
 			case 11:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break;
 			case 12:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break;
 			case 13:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break;
 		}
-        // RBTT
+        if ((threadIdx.x & 0x03) == 0)
 #pragma unroll 16
        for(int k=0;k<32;k+=2)
        {
-            uint32_t t2_0  = B32_0(a[(k +  2) & 0x1f]), t1_0  = B32_0(a[(k +  1) & 0x1f]);
+            uint32_t *outpHash = &hashBuffer[16 * thread];
-            uint32_t t6_1  = B32_1(a[(k +  6) & 0x1f]), t5_1  = B32_1(a[(k +  5) & 0x1f]);
+#pragma unroll 16
-            uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2  = B32_2(a[(k +  9) & 0x1f]);
+            for(int k=0;k<16;k++) outpHash[k] = out_state[k];
            uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]);
            t[k + 0] =  T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^ 
                        T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn(  t9_2 ) ^ T3dn( t13_3 );
            t[k + 1] =  T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^ 
                        T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up(  t9_2 ) ^ T3up( t13_3 );
        }
 #pragma unroll 32
        for(int k=0;k<32;k++)
            a[k] = t[k];
    }
 }
 __global__ void
-myriadgroestl_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce)
+ myriadgroestl_gpu_hash_quad2(int threads, uint32_t startNounce, uint32_t *resNounce, uint32_t *hashBuffer)
 {
 #if USE_SHARED
 	extern __shared__ char mixtabs[];
 	if (threadIdx.x < 256)
 	{
 		*((uint32_t*)mixtabs + (    threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x);
 		*((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x);
 		*((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x);
 		*((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x);
 		*((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x);
 		*((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x);
 		*((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x);
 		*((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x);
 	}
 	__syncthreads();
 #endif
    int thread = (blockDim.x * blockIdx.x + threadIdx.x);
    if (thread < threads)
    {
 		// GROESTL
 		uint32_t message[32];
 		uint32_t state[32];
 #pragma unroll 32
 		for(int k=0;k<32;k++) message[k] = myriadgroestl_gpu_msg[k];
        uint32_t nounce = startNounce + thread;
 		message[19] = SWAB32(nounce);
 #pragma unroll 32
 		for(int u=0;u<32;u++) state[u] = message[u];
 		state[31] ^= 0x20000;
 		// Perm
 #if USE_SHARED
 		myriadgroestl_perm_P(state, mixtabs);
 		state[31] ^= 0x20000;
 		myriadgroestl_perm_Q(message, mixtabs);
 #else
 		myriadgroestl_perm_P(state, NULL);
 		state[31] ^= 0x20000;
 		myriadgroestl_perm_Q(message, NULL);
 #endif
 #pragma unroll 32
 		for(int u=0;u<32;u++) state[u] ^= message[u];
 #pragma unroll 32
 		for(int u=0;u<32;u++) message[u] = state[u];
 #if USE_SHARED
 		myriadgroestl_perm_P(message, mixtabs);
 #else
 		myriadgroestl_perm_P(message, NULL);
 #endif
 #pragma unroll 32
 		for(int u=0;u<32;u++) state[u] ^= message[u];
        uint32_t out_state[16];
        uint32_t *inpHash = &hashBuffer[16 * thread];
 #pragma unroll 16
-		for(int u=0;u<16;u++) out_state[u] = state[u+16];
+        for (int i=0; i < 16; i++)
            out_state[i] = inpHash[i];
        myriadgroestl_gpu_sha256(out_state);
        int i, position = -1;
@ -526,16 +309,6 @@ myriadgroestl_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce)
    }
 }
 // texDef(texname, texmem, texsource, texsize)
 //   Declares a device pointer named `texmem`, allocates `texsize` bytes for it,
 //   copies the host table `texsource` into that buffer and binds the buffer to
 //   the texture reference `texname` (unnormalized coordinates, point filtering,
 //   clamp addressing, `unsigned int` channel format).
 //   The expansion is a declaration followed by several statements, so it can
 //   only be used as a full statement at block scope, and `texmem` must not be
 //   declared already in that scope.
 //   NOTE(review): none of the CUDA return codes are checked here.
 //   The last line deliberately carries no trailing backslash, so the macro
 //   ends here and never swallows whatever source line follows it.
 #define texDef(texname, texmem, texsource, texsize) \
 	unsigned int *texmem; \
 	cudaMalloc(&texmem, texsize); \
 	cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
 	texname.normalized = 0; \
 	texname.filterMode = cudaFilterModePoint; \
 	texname.addressMode[0] = cudaAddressModeClamp; \
 	{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
 	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); }
 // Setup-Funktionen
 __host__ void myriadgroestl_cpu_init(int thr_id, int threads)
 {
@ -549,20 +322,22 @@ __host__ void myriadgroestl_cpu_init(int thr_id, int threads)
                        myr_sha256_cpu_constantTable,
                        sizeof(uint32_t) * 64 );
-    cudaGetDeviceProperties(&props, device_map[thr_id]);
+    // zweite CPU-Tabelle bauen und auf die GPU laden
    uint32_t temp[64];
    for(int i=0;i<64;i++)
        temp[i] = myr_sha256_cpu_w2Table[i] + myr_sha256_cpu_constantTable[i];
    cudaMemcpyToSymbol( myr_sha256_gpu_constantTable2,
                        temp,
                        sizeof(uint32_t) * 64 );
-	// Texturen mit obigem Makro initialisieren
+    cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
 	texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
 	texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
 	texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
 	texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
 	texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
 	texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
 	texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
 	texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
    // Speicher für Gewinner-Nonce belegen
    cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); 
    // Speicher für temporäreHashes
    cudaMalloc(&d_outputHashes[thr_id], 16*sizeof(uint32_t)*threads); 
 }
 __host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
@ -594,25 +369,23 @@ __host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn
 __host__ void myriadgroestl_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
 {
-	// Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
+    int threadsperblock = 256;
 	// alle anderen mit 512 Threads.
 	int threadsperblock = (props.major >= 3) ? 768 : 512;
-    // berechne wie viele Thread Blocks wir brauchen
+    // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
-    dim3 grid((threads + threadsperblock-1)/threadsperblock);
+    // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
-    dim3 block(threadsperblock);
+    const int factor=4;
    // Größe des dynamischen Shared Memory Bereichs
 #if USE_SHARED
 	size_t shared_size = 8 * 256 * sizeof(uint32_t);
 #else
    size_t shared_size = 0;
 #endif
 //    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
    //fprintf(stderr, "ThrID: %d\n", thr_id);
    cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
-    myriadgroestl_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
+    // berechne wie viele Thread Blocks wir brauchen
    dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
    dim3 block(threadsperblock);
    myriadgroestl_gpu_hash_quad<<<grid, block, shared_size>>>(threads, startNounce, d_outputHashes[thr_id]);
    dim3 grid2((threads + threadsperblock-1)/threadsperblock);
    myriadgroestl_gpu_hash_quad2<<<grid2, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id], d_outputHashes[thr_id]);
    // Strategisches Sleep Kommando zur Senkung der CPU Last
    MyStreamSynchronize(NULL, 0, thr_id);
--- a/cuda_nist5.cu
+++ b/cuda_nist5.cu
@ -85,7 +85,6 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
 {
 	const uint32_t first_nonce = pdata[19];
 	// TODO: entfernen für eine Release! Ist nur zum Testen!
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
--- a/groestl_functions_quad.cu
+++ b/groestl_functions_quad.cu
@ -0,0 +1,315 @@
 // In-place "multiply by two" on eight 32-bit words regs[0..7].
 // Every word moves up one index; the word leaving index 7 wraps around to
 // index 0 and is additionally XORed into indices 1, 3 and 4.
 // NOTE(review): presumably regs[k] is bit plane k of bitsliced GF(2^8) bytes;
 // the feedback positions 0, 1, 3, 4 are consistent with reducing by
 // x^8 + x^4 + x^3 + x + 1 -- confirm against the bitslice layout.
 __device__ __forceinline__ void G256_Mul2(uint32_t *regs)
 {
    const uint32_t carry = regs[7];

    // shift every word up by one position (highest index first)
 #pragma unroll
    for (int plane = 7; plane > 0; --plane)
        regs[plane] = regs[plane - 1];

    // wrap-around plus feedback of the word that was shifted out
    regs[0]  = carry;
    regs[1] ^= carry;
    regs[3] ^= carry;
    regs[4] ^= carry;
 }
 // Round-constant step used by the Q permutation (see groestl512_perm_Q_quad).
 // All lanes complement their eight words. In addition, the lane with
 // (threadIdx.x & 3) == 3 XORs constants into the upper 16 bits:
 //   x0..x3 receive 0xFFFF0000 when bit 0..3 of `round` is set,
 //   x4..x7 receive the fixed patterns 0xAAAA0000, 0xCCCC0000, 0xF0F00000, 0xFF000000.
 __device__ __forceinline__ void G256_AddRoundConstantQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, int round)
 {
    // part common to every lane: bitwise complement of all eight words
    x7 = ~x7;
    x6 = ~x6;
    x5 = ~x5;
    x4 = ~x4;
    x3 = ~x3;
    x2 = ~x2;
    x1 = ~x1;
    x0 = ~x0;

    // only the last lane of each group of four carries the round-dependent part
    if ((threadIdx.x & 0x03) != 3)
        return;

    const uint32_t upperHalf = 0xFFFF0000;
    x0 ^= (round & 0x01) ? upperHalf : 0;
    x1 ^= (round & 0x02) ? upperHalf : 0;
    x2 ^= (round & 0x04) ? upperHalf : 0;
    x3 ^= (round & 0x08) ? upperHalf : 0;
    x4 ^= 0xAAAA0000;
    x5 ^= 0xCCCC0000;
    x6 ^= 0xF0F00000;
    x7 ^= 0xFF000000;
 }
 // Round-constant step used by the P permutation (see groestl512_perm_P_quad).
 // Only the lane with (threadIdx.x & 3) == 0 changes anything; it XORs
 // constants into the lower 16 bits:
 //   x0..x3 receive 0xFFFF when bit 0..3 of `round` is set,
 //   x4..x7 receive the fixed patterns 0xAAAA, 0xCCCC, 0xF0F0, 0xFF00.
 // All other lanes leave their words untouched.
 __device__ __forceinline__ void G256_AddRoundConstantP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, int round)
 {
    if ((threadIdx.x & 0x03) != 0)
        return;

    // round-dependent part: one word per bit of the round number
    const uint32_t lowerHalf = 0xFFFF;
    x0 ^= (round & 0x01) ? lowerHalf : 0;
    x1 ^= (round & 0x02) ? lowerHalf : 0;
    x2 ^= (round & 0x04) ? lowerHalf : 0;
    x3 ^= (round & 0x08) ? lowerHalf : 0;

    // round-independent part
    x4 ^= 0xAAAA;
    x5 ^= 0xCCCC;
    x6 ^= 0xF0F0;
    x7 ^= 0xFF00;
 }
 // Combines the four words x3..x0 with the four words y3..y0 using only AND/XOR
 // and stores the result back into x3..x0. y3..y0 are read but never written.
 // NOTE(review): going by the name, this is presumably a bitsliced GF(2^4)
 // multiplication x := x * y -- confirm against the reference S-box circuit.
 // The x arguments are updated in place, so the statement order below matters:
 // each x word is only overwritten after its last use as an input.
 __device__ __forceinline__ void G16mul_quad(uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0,
                                       uint32_t &y3, uint32_t &y2, uint32_t &y1, uint32_t &y0)
 {
    uint32_t t0,t1,t2;
    // terms shared between the upper (x3,x2) and lower (x1,x0) result pairs
    t0 = ((x2 ^ x0) ^ (x3 ^ x1)) & ((y2 ^ y0) ^ (y3 ^ y1));
    t1 = ((x2 ^ x0) & (y2 ^ y0)) ^ t0;
    t2 = ((x3 ^ x1) & (y3 ^ y1)) ^ t0 ^ t1;
    // upper pair; t0 is reused as a scratch value from here on
    t0 = (x2^x3) & (y2^y3);
    x3 = (x3 & y3) ^ t0 ^ t1;
    x2 = (x2 & y2) ^ t0 ^ t2;
    // lower pair
    t0 = (x0^x1) & (y0^y1);
    x1 = (x1 & y1) ^ t0 ^ t1;
    x0 = (x0 & y0) ^ t0 ^ t2;
 }
 // Transforms the eight words x7..x0 in place using G16mul_quad and AND/XOR logic.
 // NOTE(review): going by the name, this is presumably the bitsliced GF(2^8)
 // inversion at the core of the S-box, built from GF(2^4) operations on the
 // upper half (x7..x4) and lower half (x3..x0) -- confirm against the reference.
 __device__ __forceinline__ void G256_inv_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0)
 {
    uint32_t t0,t1,t2,t3,t4,t5,t6,a,b;
    // t3..t0 := copy of the upper half, then combined with the lower half
    t3 = x7;
    t2 = x6;
    t1 = x5;
    t0 = x4;
    G16mul_quad(t3, t2, t1, t0, x3, x2, x1, x0);
    // fold XOR combinations of upper and lower half into t3..t0
    a = (x4 ^ x0);
    t0 ^= a;
    t2 ^= (x7 ^ x3) ^ (x5 ^ x1); 
    t1 ^= (x5 ^ x1) ^ a;
    t3 ^= (x6 ^ x2) ^ a;
    // transform t3..t0 in place; t4, t5, t6, a and b are scratch values
    b = t0 ^ t1;
    t4 = (t2 ^ t3) & b;
    a = t4 ^ t3 ^ t1;
    t5 = (t3 & t1) ^ a;
    t6 = (t2 & t0) ^ a ^ (t2 ^ t0);
    t4 = (t5 ^ t6) & b;
    t1 = (t6 & t1) ^ t4;
    t0 = (t5 & t0) ^ t4;
    t4 = (t5 ^ t6) & (t2^t3);
    t3 = (t6 & t3) ^ t4;
    t2 = (t5 & t2) ^ t4;
    // combine both halves with the transformed value; note the argument
    // order (t1, t0, t3, t2) of the second operand
    G16mul_quad(x3, x2, x1, x0, t1, t0, t3, t2);
    G16mul_quad(x7, x6, x5, x4, t1, t0, t3, t2);
 }
 // Linear (XOR-only) in-place transformation of the eight words x0..x7.
 // It is the first step of sbox_quad, applied before G256_inv_quad.
 // x0 is only read and keeps its value; x1..x7 are overwritten.
 // NOTE(review): presumably the input basis change of the bitsliced S-box
 // ("A" to "X" representation, per the name) -- confirm against the reference.
 // The arguments must refer to eight distinct variables: each word is only
 // overwritten after its last use as an input to a later statement.
 __device__ __forceinline__ void transAtoX_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &x4, uint32_t &x5, uint32_t &x6, uint32_t &x7)
 {
    uint32_t t0, t1;
    // shared sub-expressions, taken from the untouched inputs
    t0 = x0 ^ x1 ^ x2;
    t1 = x5 ^ x6;
    // x2 is assigned exactly once; nothing below reads it again
    x2 = t0 ^ t1 ^ x7;
    x6 = t0 ^ x3 ^ x6;
    x3 = x0 ^ x1 ^ x3 ^ x4 ^ x7;
    x4 = x0 ^ x4 ^ t1;
    x1 = x0 ^ x1 ^ t1;
    x7 = x0 ^ t1 ^ x7;
    x5 = x0 ^ t1;
 }
 // Linear (XOR-only) in-place transformation of the eight words x0..x7.
 // It is applied by sbox_quad after G256_inv_quad.
 // NOTE(review): presumably the output basis change of the bitsliced S-box
 // ("X" back to "A" representation, per the name) -- confirm against the reference.
 // Several inputs are modified while still needed later, so the statement
 // order below is significant.
 __device__ __forceinline__ void transXtoA_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &x4, uint32_t &x5, uint32_t &x6, uint32_t &x7)
 {
    uint32_t t0,t2,t3,t5;
    x1 ^= x4;
    t0 = x1 ^ x6;   // becomes the new x6
    x1 ^= x5;       // final value of x1
    t2 = x0 ^ x2;
    x2 = x3 ^ x5;
    t2 ^= x2 ^ x6;  // becomes the new x3
    x2 ^= x7;       // final value of x2
    t3 = x4 ^ x2 ^ x6;  // becomes the new x7
    t5 = x0 ^ x6;       // becomes the new x5
    x4 = x3 ^ x7;
    x0 = x3 ^ x5;
    // write back the values that had to be held in temporaries
    x6 = t0;    
    x3 = t2;
    x7 = t3;    
    x5 = t5;    
 }
 // Applies the S-box step to the eight words r[0..7] in place:
 // transAtoX_quad, then G256_inv_quad, then transXtoA_quad (each with its own
 // permutation of the words as arguments), and finally a bitwise complement of
 // r[0], r[1], r[5] and r[6].
 // NOTE(review): complementing planes 0, 1, 5, 6 corresponds to XORing the
 // byte constant 0x63 if r[k] is bit plane k -- presumably the AES affine
 // constant; confirm against the bitslice layout.
 __device__ __forceinline__ void sbox_quad(uint32_t *r)
 {
    const uint32_t allOnes = 0xFFFFFFFF;

    // the argument order of each call is part of the algorithm
    transAtoX_quad(r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7]);
    G256_inv_quad(r[2], r[4], r[1], r[7], r[3], r[0], r[5], r[6]);
    transXtoA_quad(r[7], r[1], r[4], r[2], r[6], r[5], r[0], r[3]);

    // final complement of words 0, 1, 5 and 6
    r[0] ^= allOnes;
    r[1] ^= allOnes;
    r[5] ^= allOnes;
    r[6] ^= allOnes;
 }
 // Shift step used by the P permutation (see groestl512_perm_P_quad).
 // Each of the eight words is treated as two independent 16-bit halves; both
 // halves are rotated right by an amount that depends on the lane position
 // lane = threadIdx.x & 3 within its group of four threads:
 //   low half  : 2*lane                          (0, 2, 4, 6)
 //   high half : 2*lane + 1, plus 4 for lane 3   (1, 3, 5, 11)
 // All eight words get the same treatment.
 __device__ __forceinline__ void G256_ShiftBytesP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0)
 {
    const int lane  = threadIdx.x & 0x03;
    const int rotLo = lane << 1;
    const int rotHi = rotLo+1 + ((lane == 3)<<2);

    // __byte_perm(x, 0, 0x1010) duplicates the low half into both halves, so a
    // plain right shift leaves the rotated low half in bits 0..15 (same for
    // 0x3232 and the high half). Selector 0x5410 then glues the two rotated
    // 16-bit results together again.
 #define GR_ROTATE_HALVES(x) \
    x = __byte_perm(__byte_perm(x, 0, 0x1010)>>rotLo, \
                    __byte_perm(x, 0, 0x3232)>>rotHi, 0x5410)

    GR_ROTATE_HALVES(x0);
    GR_ROTATE_HALVES(x1);
    GR_ROTATE_HALVES(x2);
    GR_ROTATE_HALVES(x3);
    GR_ROTATE_HALVES(x4);
    GR_ROTATE_HALVES(x5);
    GR_ROTATE_HALVES(x6);
    GR_ROTATE_HALVES(x7);

 #undef GR_ROTATE_HALVES
 }
 // Shift step used by the Q permutation (see groestl512_perm_Q_quad).
 // Each of the eight words is treated as two independent 16-bit halves; both
 // halves are rotated right by an amount that depends on the lane position
 // lane = threadIdx.x & 3 within its group of four threads:
 //   low half  : (1 - (lane>>1)) + 4*(lane & 1)      (1, 5, 0, 4)
 //   high half : low amount + 2, plus 4 for lane 1   (3, 11, 2, 6)
 // All eight words get the same treatment.
 __device__ __forceinline__ void G256_ShiftBytesQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0)
 {
    const int lane  = threadIdx.x & 0x03;
    const int rotLo = (1-(lane>>1)) + ((lane & 0x01)<<2);
    const int rotHi = rotLo+2 + ((lane == 1)<<2);

    // __byte_perm(x, 0, 0x1010) duplicates the low half into both halves, so a
    // plain right shift leaves the rotated low half in bits 0..15 (same for
    // 0x3232 and the high half). Selector 0x5410 then glues the two rotated
    // 16-bit results together again.
 #define GR_ROTATE_HALVES(x) \
    x = __byte_perm(__byte_perm(x, 0, 0x1010)>>rotLo, \
                    __byte_perm(x, 0, 0x3232)>>rotHi, 0x5410)

    GR_ROTATE_HALVES(x0);
    GR_ROTATE_HALVES(x1);
    GR_ROTATE_HALVES(x2);
    GR_ROTATE_HALVES(x3);
    GR_ROTATE_HALVES(x4);
    GR_ROTATE_HALVES(x5);
    GR_ROTATE_HALVES(x6);
    GR_ROTATE_HALVES(x7);

 #undef GR_ROTATE_HALVES
 }
 // Mixing step shared by the P and Q permutations: replaces r[0..7] with XOR
 // combinations of words fetched from the lanes of the same 4-thread group,
 // interleaved with two G256_Mul2 doublings.
 //
 // Helper macros (local to this function, all removed again at the end):
 //   A(v, u)          r[v] as held by lane (threadIdx.x + u) & 3 of the group,
 //                    fetched with a width-4 warp shuffle
 //   SHIFT64_16       bytes 2..3 of `lo` followed by bytes 0..1 of `hi`, i.e.
 //                    the 64-bit value hi:lo shifted right by 16 bits
 //   S(idx, l)        SHIFT64_16 of lanes l+1 (hi) and l (lo) for word idx
 //   DOUBLE_* / SINGLE_*  the XOR combinations of S and A used below
 //
 // Requirements: uses the warp shuffle intrinsic, so every lane of a group
 // must execute this function together.
 // NOTE(review): __shfl is the legacy mask-less intrinsic (compute 3.0+, not
 // available for Volta and newer targets) -- porting to __shfl_sync needs a
 // toolkit decision for the whole project.
 __device__ __forceinline__ void G256_MixFunction_quad(uint32_t *r)
 {
 #define SHIFT64_16(hi, lo)    __byte_perm(lo, hi, 0x5432)
 #define A(v, u)             __shfl((int)r[v], ((threadIdx.x+u)&0x03), 4)
 #define S(idx, l)            SHIFT64_16( A(idx, (l+1)), A(idx, l) )
 #define DOUBLE_ODD(i, bc)        ( S(i, (bc)) ^ A(i, (bc) + 1) )
 #define DOUBLE_EVEN(i, bc)        ( S(i, (bc)) ^ A(i, (bc)    ) )
 #define SINGLE_ODD(i, bc)        ( S(i, (bc)) )
 #define SINGLE_EVEN(i, bc)        ( A(i, (bc)) )
    uint32_t b[8];
    // first accumulation, then doubled
 #pragma unroll 8
    for(int i=0;i<8;i++)
        b[i] = DOUBLE_ODD(i, 1) ^ DOUBLE_EVEN(i, 3);
    G256_Mul2(b);
    // second accumulation on top of the doubled value, then doubled again
 #pragma unroll 8
    for(int i=0;i<8;i++)
        b[i] = b[i] ^ DOUBLE_ODD(i, 3) ^ DOUBLE_ODD(i, 4) ^ SINGLE_ODD(i, 6);
    G256_Mul2(b);
    // final accumulation; r is only overwritten here, after all earlier
    // shuffles have read the old r values
 #pragma unroll 8
    for(int i=0;i<8;i++)
        r[i] = b[i] ^ DOUBLE_EVEN(i, 2) ^ DOUBLE_EVEN(i, 3) ^ SINGLE_EVEN(i, 5);
    // drop every helper macro defined above so none of them leaks into the
    // translation units that #include this file
 #undef SINGLE_EVEN
 #undef SINGLE_ODD
 #undef DOUBLE_EVEN
 #undef DOUBLE_ODD
 #undef S
 #undef A
 #undef SHIFT64_16
 }
 // P permutation on the eight words r[0..7], in place: 14 rounds, each made of
 // the P round constant, the S-box step, the P shift step and the mix step.
 // Must be executed by all four lanes of a group together, because the mix
 // step exchanges words between lanes.
 __device__ __forceinline__ void groestl512_perm_P_quad(uint32_t *r)
 {
    const int kRounds = 14;
    for (int rnd = 0; rnd != kRounds; ++rnd)
    {
        // round number goes in as the last argument; words are passed r[7] first
        G256_AddRoundConstantP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], rnd);
        sbox_quad(r);
        G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]);
        G256_MixFunction_quad(r);
    }
 }
 // Q permutation on the eight words r[0..7], in place: 14 rounds, each made of
 // the Q round constant, the S-box step, the Q shift step and the mix step.
 // Must be executed by all four lanes of a group together, because the mix
 // step exchanges words between lanes.
 __device__ __forceinline__ void groestl512_perm_Q_quad(uint32_t *r)
 {
    const int kRounds = 14;
    for (int rnd = 0; rnd != kRounds; ++rnd)
    {
        // round number goes in as the last argument; words are passed r[7] first
        G256_AddRoundConstantQ_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], rnd);
        sbox_quad(r);
        G256_ShiftBytesQ_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]);
        G256_MixFunction_quad(r);
    }
 }
 // Processes one message block held as eight words per lane.
 //   state   : output; overwritten with the result
 //   message : input block; used as scratch space and clobbered
 // Steps, in order:
 //   1. state = message, with 0x00008000 XORed into state[1] on lane 3
 //   2. state = P(state); the same lane-3 XOR is applied to state[1] again
 //   3. message = Q(message)
 //   4. state ^= message
 //   5. message = P(copy of state); state ^= message
 // Must be executed by all four lanes of a group together.
 __device__ __forceinline__ void groestl512_progressMessage_quad(uint32_t *state, uint32_t *message)
 {
    // value XORed into state[1]; zero (no effect) on every lane except lane 3
    const uint32_t laneTweak = ((threadIdx.x & 0x03) == 3) ? 0x00008000 : 0;

 #pragma unroll 8
    for (int i = 0; i < 8; i++)
        state[i] = message[i];

    state[1] ^= laneTweak;
    groestl512_perm_P_quad(state);
    state[1] ^= laneTweak;

    groestl512_perm_Q_quad(message);

 #pragma unroll 8
    for (int i = 0; i < 8; i++)
        state[i] ^= message[i];

    // second P pass works on a copy so that state can be folded in afterwards
 #pragma unroll 8
    for (int i = 0; i < 8; i++)
        message[i] = state[i];
    groestl512_perm_P_quad(message);

 #pragma unroll 8
    for (int i = 0; i < 8; i++)
        state[i] ^= message[i];
 }
--- a/groestlcoin.cpp
+++ b/groestlcoin.cpp
@ -46,7 +46,6 @@ static void groestlhash(void *state, const void *input)
    sph_groestl512_context     ctx_groestl[2];
    static unsigned char pblank[1];
 	int ii;
    uint32_t mask = 8;
    uint32_t zero = 0;
@ -66,16 +65,18 @@ static void groestlhash(void *state, const void *input)
    memcpy(state, hashB, 32);
 }
-
+extern bool opt_benchmark;
 extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
    uint32_t max_nonce, unsigned long *hashes_done)
 {    
    if (opt_benchmark)
        ((uint32_t*)ptarget)[7] = 0x000000ff;
    uint32_t start_nonce = pdata[19]++;
    const uint32_t Htarg = ptarget[7];
    const uint32_t throughPut = 4096 * 128;
    //const uint32_t throughPut = 1;
 	int i;
    uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t));
    // init
@ -87,8 +88,6 @@ extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t
    }
    // Endian Drehung ist notwendig
 	//char testdata[] = {"\x70\x00\x00\x00\x5d\x38\x5b\xa1\x14\xd0\x79\x97\x0b\x29\xa9\x41\x8f\xd0\x54\x9e\x7d\x68\xa9\x5c\x7f\x16\x86\x21\xa3\x14\x20\x10\x00\x00\x00\x00\x57\x85\x86\xd1\x49\xfd\x07\xb2\x2f\x3a\x8a\x34\x7c\x51\x6d\xe7\x05\x2f\x03\x4d\x2b\x76\xff\x68\xe0\xd6\xec\xff\x9b\x77\xa4\x54\x89\xe3\xfd\x51\x17\x32\x01\x1d\xf0\x73\x10\x00"};
 	//pdata = (uint32_t*)testdata;
    uint32_t endiandata[32];
    for (int kk=0; kk < 32; kk++)
        be32enc(&endiandata[kk], pdata[kk]);
@ -102,35 +101,6 @@ extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t
        groestlcoin_cpu_hash(thr_id, throughPut, pdata[19], outputHash, &foundNounce);
 		/*
 		{
 			for(i=0;i<throughPut;i++)
 			{
 				uint32_t tmpHash[8];
 				endiandata[19] = SWAP32(pdata[19]);
 				groestlhash(tmpHash, endiandata);
 				int ii;
 				printf("result GPU: ");
 				for (ii=0; ii < 32; ii++)
 				{
 					printf ("%.2x",((uint8_t*)&outputHash[8*i])[ii]);
 				};
 				printf ("\n");	
 				groestlhash(tmpHash, endiandata);
 				printf("result CPU: ");
 				for (ii=0; ii < 32; ii++)
 				{
 					printf ("%.2x",((uint8_t*)tmpHash)[ii]);
 				};
 			}
 			exit(0);
 		}		
 		*/
        if(foundNounce < 0xffffffff)
        {
            uint32_t tmpHash[8];
@ -147,21 +117,6 @@ extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t
            }
            foundNounce = 0xffffffff;
 			/*
 			int ii;
 			printf("result GPU: ");
 			for (ii=0; ii < 32; ii++)
 			{
 				printf ("%.2x",((uint8_t*)&outputHash[0])[ii]);
 			};
 			printf ("\n");	
 			printf("result CPU: ");
 			for (ii=0; ii < 32; ii++)
 			{
 				printf ("%.2x",((uint8_t*)tmpHash)[ii]);
 			};
 			printf ("\n");	
 			*/
        }
        if (pdata[19] + throughPut < pdata[19])
--- a/heavy/cuda_blake512.cu
+++ b/heavy/cuda_blake512.cu
@ -269,8 +269,6 @@ __host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 	if (BLOCKSIZE == 80)
 		blake512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
 	else if (BLOCKSIZE == 84)
--- a/heavy/cuda_combine.cu
+++ b/heavy/cuda_combine.cu
@ -141,8 +141,6 @@ void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *h
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 	combine_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], d_nonceVector[thr_id]);
 	// da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden
--- a/heavy/cuda_groestl512.cu
+++ b/heavy/cuda_groestl512.cu
@ -824,8 +824,6 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 	if (BLOCKSIZE == 84)
 		groestl512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);	
 	else if (BLOCKSIZE == 80)
--- a/heavy/cuda_hefty1.cu
+++ b/heavy/cuda_hefty1.cu
@ -416,8 +416,6 @@ __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
    size_t shared_size = 0;
 #endif
 //    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
    hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]);
    // Strategisches Sleep Kommando zur Senkung der CPU Last
--- a/heavy/cuda_keccak512.cu
+++ b/heavy/cuda_keccak512.cu
@ -279,7 +279,6 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 	if (BLOCKSIZE==84)
 		keccak512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
 	else if (BLOCKSIZE==80)
--- a/heavy/cuda_sha256.cu
+++ b/heavy/cuda_sha256.cu
@ -271,7 +271,6 @@ __host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce)
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 	if (BLOCKSIZE == 84)
 		sha256_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
 	else if (BLOCKSIZE == 80) {
--- a/myriadgroestl.cpp
+++ b/myriadgroestl.cpp
@ -35,17 +35,19 @@ static void myriadhash(void *state, const void *input)
    memcpy(state, hashB, 32);
 }
-
+extern bool opt_benchmark;
 extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 	uint32_t max_nonce, unsigned long *hashes_done)
 {	
    if (opt_benchmark)
        ((uint32_t*)ptarget)[7] = 0x000000ff;
 	uint32_t start_nonce = pdata[19]++;
 	const uint32_t throughPut = 128 * 1024;
-//	const uint32_t throughPut = 1;
+
 	uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t));
 	// TODO: entfernen für eine Release! Ist nur zum Testen!
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
--- a/quark/animecoin.cu
+++ b/quark/animecoin.cu
@ -175,7 +175,6 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
 {
 	const uint32_t first_nonce = pdata[19];
 	// TODO: entfernen für eine Release! Ist nur zum Testen!
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x00000f;
--- a/quark/cuda_bmw512.cu
+++ b/quark/cuda_bmw512.cu
@ -447,8 +447,6 @@ __host__ void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNo
    // Größe des dynamischen Shared Memory Bereichs
    size_t shared_size = 0;
 //    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
    quark_bmw512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
    MyStreamSynchronize(NULL, order, thr_id);
 }
@ -464,8 +462,6 @@ __host__ void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNo
    // Größe des dynamischen Shared Memory Bereichs
    size_t shared_size = 0;
 //    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
    quark_bmw512_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash);
    MyStreamSynchronize(NULL, order, thr_id);
 }
--- a/quark/cuda_jh512.cu
+++ b/quark/cuda_jh512.cu
@ -350,8 +350,6 @@ __host__ void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNou
    // Größe des dynamischen Shared Memory Bereichs
    size_t shared_size = 0;
 //    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
    quark_jh512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
    MyStreamSynchronize(NULL, order, thr_id);
 }
--- a/quark/cuda_quark_blake512.cu
+++ b/quark/cuda_quark_blake512.cu
@ -406,8 +406,6 @@ __host__ void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t start
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 	quark_blake512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash);
 	// Strategisches Sleep Kommando zur Senkung der CPU Last
@ -425,8 +423,6 @@ __host__ void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t start
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 	quark_blake512_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
 	// Strategisches Sleep Kommando zur Senkung der CPU Last
--- a/quark/cuda_quark_checkhash.cu
+++ b/quark/cuda_quark_checkhash.cu
@ -89,8 +89,6 @@ __host__ uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t star
 	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 	quark_check_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
 	// Strategisches Sleep Kommando zur Senkung der CPU Last
--- a/quark/cuda_quark_groestl512.cu
+++ b/quark/cuda_quark_groestl512.cu
@ -1,4 +1,4 @@
-// Auf QuarkCoin spezialisierte Version von Groestl
+// Auf QuarkCoin spezialisierte Version von Groestl inkl. Bitslice
 #include <cuda.h>
 #include "cuda_runtime.h"
@ -7,9 +7,6 @@
 #include <stdio.h>
 #include <memory.h>
 // it's unfortunate that this is a compile time constant.
 #define MAXWELL_OR_FERMI 1
 // aus cpu-miner.c
 extern int device_map[8];
@ -18,353 +15,137 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
 // Folgende Definitionen später durch header ersetzen
 typedef unsigned char uint8_t;
 typedef unsigned short uint16_t;
 typedef unsigned int uint32_t;
 typedef unsigned long long uint64_t;
 // diese Struktur wird in der Init Funktion angefordert
 static cudaDeviceProp props[8];
-#define SPH_C32(x)    ((uint32_t)(x ## U))
+// 64 Register Variante für Compute 3.0
-#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
+#include "groestl_functions_quad.cu"
-
+#include "bitslice_transformations_quad.cu"
 #define PC32up(j, r)   ((uint32_t)((j) + (r)))
 #define PC32dn(j, r)   0
 #define QC32up(j, r)   0xFFFFFFFF
 #define QC32dn(j, r)   (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
 #define B32_0(x)    __byte_perm(x, 0, 0x4440)
 //((x) & 0xFF)
 #define B32_1(x)    __byte_perm(x, 0, 0x4441)
 //(((x) >> 8) & 0xFF)
 #define B32_2(x)    __byte_perm(x, 0, 0x4442)
 //(((x) >> 16) & 0xFF)
 #define B32_3(x)    __byte_perm(x, 0, 0x4443)
 //((x) >> 24)
 #if MAXWELL_OR_FERMI
 #define USE_SHARED 1
 // Maxwell and Fermi cards get the best speed with SHARED access it seems.
 #if USE_SHARED
 #define T0up(x) (*((uint32_t*)mixtabs + (    (x))))
 #define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
 #define T1up(x) (*((uint32_t*)mixtabs + (512+(x))))
 #define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
 #define T2up(x) (*((uint32_t*)mixtabs + (1024+(x))))
 #define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
 #define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
 #define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x))))
 #else
 #define T0up(x) tex1Dfetch(t0up1, x)
 #define T0dn(x) tex1Dfetch(t0dn1, x)
 #define T1up(x) tex1Dfetch(t1up1, x)
 #define T1dn(x) tex1Dfetch(t1dn1, x)
 #define T2up(x) tex1Dfetch(t2up1, x)
 #define T2dn(x) tex1Dfetch(t2dn1, x)
 #define T3up(x) tex1Dfetch(t3up1, x)
 #define T3dn(x) tex1Dfetch(t3dn1, x)
 #endif
 #else
 #define USE_SHARED 1
 // a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5!
 #define T0up(x) (*((uint32_t*)mixtabs + (    (x))))
 #define T0dn(x) tex1Dfetch(t0dn1, x)
 #define T1up(x) tex1Dfetch(t1up1, x)
 #define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
 #define T2up(x) tex1Dfetch(t2up1, x)
 #define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
 #define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
 #define T3dn(x) tex1Dfetch(t3dn1, x)
 #endif
 texture<unsigned int, 1, cudaReadModeElementType> t0up1;
 texture<unsigned int, 1, cudaReadModeElementType> t0dn1;
 texture<unsigned int, 1, cudaReadModeElementType> t1up1;
 texture<unsigned int, 1, cudaReadModeElementType> t1dn1;
 texture<unsigned int, 1, cudaReadModeElementType> t2up1;
 texture<unsigned int, 1, cudaReadModeElementType> t2dn1;
 texture<unsigned int, 1, cudaReadModeElementType> t3up1;
 texture<unsigned int, 1, cudaReadModeElementType> t3dn1;
 extern uint32_t T0up_cpu[];
 extern uint32_t T0dn_cpu[];
 extern uint32_t T1up_cpu[];
 extern uint32_t T1dn_cpu[];
 extern uint32_t T2up_cpu[];
 extern uint32_t T2dn_cpu[];
 extern uint32_t T3up_cpu[];
 extern uint32_t T3dn_cpu[];
 __device__ __forceinline__ void quark_groestl512_perm_P(uint32_t *a, char *mixtabs)
 {
    uint32_t t[32];
-//#pragma unroll 14
+__global__ void __launch_bounds__(256, 4)
-    for(int r=0;r<14;r++)
+ quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
 {
-        switch(r)
+    // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
    int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
    if (thread < threads)
    {
-            case 0:
+        // GROESTL
-#pragma unroll 16
+        uint32_t message[8];
-                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 0); break;
+        uint32_t state[8];
            case 1:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 1); break;
            case 2:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 2); break;
            case 3:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 3); break;
            case 4:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 4); break;
            case 5:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 5); break;
            case 6:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 6); break;
            case 7:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 7); break;
            case 8:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 8); break;
            case 9:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 9); break;
            case 10:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 10); break;
            case 11:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 11); break;
            case 12:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 12); break;
            case 13:
 #pragma unroll 16
                for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 13); break;
        }
-        // RBTT
+        uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
 #pragma unroll 16
        for(int k=0;k<32;k+=2)
        {
            uint32_t t0_0 = B32_0(a[(k     ) & 0x1f]), t9_0  = B32_0(a[(k +  9) & 0x1f]);
            uint32_t t2_1 = B32_1(a[(k +  2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]);
            uint32_t t4_2 = B32_2(a[(k +  4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]);
            uint32_t t6_3 = B32_3(a[(k +  6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]);
-            t[k + 0] =  T0up( t0_0 ) ^ T1up(  t2_1 ) ^ T2up(  t4_2 ) ^ T3up(  t6_3 ) ^ 
+        int hashPosition = nounce - startNounce;
-                        T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 );
+        uint32_t *inpHash = &g_hash[hashPosition<<4];
-            t[k + 1] =  T0dn( t0_0 ) ^ T1dn(  t2_1 ) ^ T2dn(  t4_2 ) ^ T3dn(  t6_3 ) ^ 
+#pragma unroll 4
-                        T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 );
+        for(int k=0;k<4;k++) message[k] = inpHash[(k<<2) + (threadIdx.x&0x03)];
-        }
+#pragma unroll 4
-#pragma unroll 32
+        for(int k=4;k<8;k++) message[k] = 0;
        for(int k=0;k<32;k++)
            a[k] = t[k];
    }
 }
-__device__ __forceinline__ void quark_groestl512_perm_Q(uint32_t *a, char *mixtabs)
+        if ((threadIdx.x&0x03) == 0) message[4] = 0x80;
-{    
+        if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000;
 //#pragma unroll 14
    for(int r=0;r<14;r++)
    {
        uint32_t t[32];
-        switch(r)
+        uint32_t msgBitsliced[8];
-        {
+        to_bitslice_quad(message, msgBitsliced);
            case 0:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 0); a[(k*2)+1] ^= QC32dn(k<< 4, 0);} break;
            case 1:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 1); a[(k*2)+1] ^= QC32dn(k<< 4, 1);} break;
            case 2:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 2); a[(k*2)+1] ^= QC32dn(k<< 4, 2);} break;
            case 3:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 3); a[(k*2)+1] ^= QC32dn(k<< 4, 3);} break;
            case 4:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 4); a[(k*2)+1] ^= QC32dn(k<< 4, 4);} break;
            case 5:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 5); a[(k*2)+1] ^= QC32dn(k<< 4, 5);} break;
            case 6:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 6); a[(k*2)+1] ^= QC32dn(k<< 4, 6);} break;
            case 7:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 7); a[(k*2)+1] ^= QC32dn(k<< 4, 7);} break;
            case 8:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 8); a[(k*2)+1] ^= QC32dn(k<< 4, 8);} break;
            case 9:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 9); a[(k*2)+1] ^= QC32dn(k<< 4, 9);} break;
            case 10:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 10); a[(k*2)+1] ^= QC32dn(k<< 4, 10);} break;
            case 11:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 11); a[(k*2)+1] ^= QC32dn(k<< 4, 11);} break;
            case 12:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 12); a[(k*2)+1] ^= QC32dn(k<< 4, 12);} break;
            case 13:
    #pragma unroll 16
                for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 13); a[(k*2)+1] ^= QC32dn(k<< 4, 13);} break;
        }
-        // RBTT
+        groestl512_progressMessage_quad(state, msgBitsliced);
 #pragma unroll 16
        for(int k=0;k<32;k+=2)
        {
            uint32_t t2_0  = B32_0(a[(k +  2) & 0x1f]), t1_0  = B32_0(a[(k +  1) & 0x1f]);
            uint32_t t6_1  = B32_1(a[(k +  6) & 0x1f]), t5_1  = B32_1(a[(k +  5) & 0x1f]);
            uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2  = B32_2(a[(k +  9) & 0x1f]);
            uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]);
-            t[k + 0] =  T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^ 
+        // Only the first of every 4 threads gets the resulting hash
-                        T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn(  t9_2 ) ^ T3dn( t13_3 );
+        uint32_t *outpHash = &g_hash[hashPosition<<4];
        uint32_t hash[16];
        from_bitslice_quad(state, hash);
-            t[k + 1] =  T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^ 
+        if ((threadIdx.x & 0x03) == 0)
-                        T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up(  t9_2 ) ^ T3up( t13_3 );
+        {
 #pragma unroll 16
            for(int k=0;k<16;k++) outpHash[k] = hash[k];
        }
 #pragma unroll 32
        for(int k=0;k<32;k++)
            a[k] = t[k];
    }
 }
 __global__ void  quark_groestl512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
 {
 #if USE_SHARED
    extern __shared__ char mixtabs[];
-    if (threadIdx.x < 256)
+__global__ void __launch_bounds__(256, 4)
 quark_doublegroestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
 {
-        *((uint32_t*)mixtabs + (    threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x);
+    int thread = (blockDim.x * blockIdx.x + threadIdx.x)>>2;
        *((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x);
        *((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x);
        *((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x);
        *((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x);
        *((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x);
        *((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x);
        *((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x);
    }
    __syncthreads();
 #endif
    int thread = (blockDim.x * blockIdx.x + threadIdx.x);
    if (thread < threads)
    {
        // GROESTL
-        uint32_t message[32];
+        uint32_t message[8];
-        uint32_t state[32];
+        uint32_t state[8];
        uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
        int hashPosition = nounce - startNounce;
-        uint32_t *inpHash = &g_hash[16 * hashPosition];
+        uint32_t *inpHash = &g_hash[hashPosition<<4];
-#pragma unroll 16
+#pragma unroll 4
-        for(int k=0;k<16;k++) message[k] = inpHash[k];
+        for(int k=0;k<4;k++) message[k] = inpHash[(k<<2)+(threadIdx.x&0x03)];
-#pragma unroll 14
+#pragma unroll 4
-        for(int k=1;k<15;k++)
+        for(int k=4;k<8;k++) message[k] = 0;
            message[k+16] = 0;
        message[16] = 0x80;
        message[31] = 0x01000000;
 #pragma unroll 32
        for(int u=0;u<32;u++) state[u] = message[u];
        state[31] ^= 0x20000;
        // Perm
 #if USE_SHARED
        quark_groestl512_perm_P(state, mixtabs);
        state[31] ^= 0x20000;
        quark_groestl512_perm_Q(message, mixtabs);
 #else
        quark_groestl512_perm_P(state, NULL);
        state[31] ^= 0x20000;
        quark_groestl512_perm_Q(message, NULL);
 #endif
 #pragma unroll 32
        for(int u=0;u<32;u++) state[u] ^= message[u];
 #pragma unroll 32
        for(int u=0;u<32;u++) message[u] = state[u];
 #if USE_SHARED
        quark_groestl512_perm_P(message, mixtabs);
 #else
        quark_groestl512_perm_P(message, NULL);
 #endif
 #pragma unroll 32
        for(int u=0;u<32;u++) state[u] ^= message[u];
        // Write out the generated hash
        uint32_t *outpHash = &g_hash[16 * hashPosition];
-#pragma unroll 16
+        if ((threadIdx.x&0x03) == 0) message[4] = 0x80;
-        for(int k=0;k<16;k++) outpHash[k] = state[k+16];
+        if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000;
        uint32_t msgBitsliced[8];
        to_bitslice_quad(message, msgBitsliced);
        for (int round=0; round<2; round++)
        {
            groestl512_progressMessage_quad(state, msgBitsliced);
            if (round < 1)
            {
                // Chaining of the two rounds, including padding.
                msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + (((threadIdx.x%4)==3)<<13));
                msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341);
                msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341);
                msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341);
                msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341);
                msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341);
                msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341);
                msgBitsliced[ 7] = __byte_perm(state[ 7], 0x00800100, 0x4341 + (((threadIdx.x%4)==0)<<4));
            }
        }
-#define texDef(texname, texmem, texsource, texsize) \
+        // Only the first of every 4 threads gets the resulting hash
-    unsigned int *texmem; \
+        uint32_t *outpHash = &g_hash[hashPosition<<4];
-    cudaMalloc(&texmem, texsize); \
+        uint32_t hash[16];
-    cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
+        from_bitslice_quad(state, hash);
-    texname.normalized = 0; \
+
-    texname.filterMode = cudaFilterModePoint; \
+        if ((threadIdx.x & 0x03) == 0)
-    texname.addressMode[0] = cudaAddressModeClamp; \
+        {
-    { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
+#pragma unroll 16
-      cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
+            for(int k=0;k<16;k++) outpHash[k] = hash[k];
        }
    }
 }
 // Setup functions
 // Host-side setup for the Groestl-512 kernels of one mining thread.
 //
 // thr_id  : selects the props[] slot to fill and the device_map[] entry to query.
 // threads : not referenced in this function.
 //
 // NOTE(review): the return value of cudaGetDeviceProperties is not checked here.
 __host__ void quark_groestl512_cpu_init(int thr_id, int threads)
 {
    cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
 // Initialize the textures with the texDef macro; each call passes a texture
 // name, a device pointer name, a host table and a size of 256 uint32_t entries.
    texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
    texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
    texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
    texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
    texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
    texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
    texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
    texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
 }
 __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-    // Compute 3.5 und 5.x Geräte am besten mit 768 Threads ansteuern,
+    int threadsperblock = 256;
-    // alle anderen mit 512 Threads.
+
-    int threadsperblock = ((props[thr_id].major == 3 && props[thr_id].minor == 5) || props[thr_id].major > 3) ? 768 : 512;
+    // Compute 3.0 uses the register-optimized quad variant with warp shuffle
    // with the quad functions we now need 4 threads per hash, hence the factor of 4 in the block count
    const int factor = 4;
    // compute how many thread blocks we need
-    dim3 grid((threads + threadsperblock-1)/threadsperblock);
+    dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
    dim3 block(threadsperblock);
    // Size of the dynamic shared memory region
 #if USE_SHARED
    size_t shared_size = 8 * 256 * sizeof(uint32_t);
 #else
    size_t shared_size = 0;
 #endif
-//    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
+    quark_groestl512_gpu_hash_64_quad<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
    //fprintf(stderr, "ThrID: %d\n", thr_id);
    quark_groestl512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
    // Strategic sleep command to reduce CPU load
    MyStreamSynchronize(NULL, order, thr_id);
@ -372,25 +153,20 @@ __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t sta
 __host__ void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-    // Compute 3.5 und 5.x Geräte am besten mit 768 Threads ansteuern,
+    int threadsperblock = 256;
-    // alle anderen mit 512 Threads.
+
-    int threadsperblock = ((props[thr_id].major == 3 && props[thr_id].minor == 5) || props[thr_id].major > 3) ? 768 : 512;
+    // Compute 3.0 uses the register-optimized quad variant with warp shuffle
    // with the quad functions we now need 4 threads per hash, hence the factor of 4 in the block count
    const int factor = 4;
    // compute how many thread blocks we need
-    dim3 grid((threads + threadsperblock-1)/threadsperblock);
+    dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
    dim3 block(threadsperblock);
    // Size of the dynamic shared memory region
 #if USE_SHARED
    size_t shared_size = 8 * 256 * sizeof(uint32_t);
 #else
    size_t shared_size = 0;
 #endif
-//  fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
+    quark_doublegroestl512_gpu_hash_64_quad<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
    //fprintf(stderr, "ThrID: %d\n", thr_id);
    quark_groestl512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
    quark_groestl512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
    // Strategic sleep command to reduce CPU load
    MyStreamSynchronize(NULL, order, thr_id);
--- a/quark/cuda_quark_keccak512.cu
+++ b/quark/cuda_quark_keccak512.cu
@ -175,8 +175,6 @@ __host__ void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t star
    // Size of the dynamic shared memory region
    size_t shared_size = 0;
 //    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
    quark_keccak512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
    MyStreamSynchronize(NULL, order, thr_id);
 }
--- a/quark/cuda_skein512.cu
+++ b/quark/cuda_skein512.cu
@ -442,7 +442,6 @@ __host__ void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t start
 	// Size of the dynamic shared memory region
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 	quark_skein512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 	// Strategic sleep command to reduce CPU load
--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@ -157,7 +157,6 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
 {
 	const uint32_t first_nonce = pdata[19];
 	// TODO: remove for a release! This is for testing only!
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
--- a/x11/cuda_x11_cubehash512.cu
+++ b/x11/cuda_x11_cubehash512.cu
@ -307,8 +307,6 @@ __host__ void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t star
    // Size of the dynamic shared memory region
    size_t shared_size = 0;
 //    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
    x11_cubehash512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
    MyStreamSynchronize(NULL, order, thr_id);
 }
--- a/x11/cuda_x11_echo.cu
+++ b/x11/cuda_x11_echo.cu
@ -225,8 +225,6 @@ __host__ void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNou
    // Size of the dynamic shared memory region
    size_t shared_size = 0;
 //    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
    x11_echo512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
    MyStreamSynchronize(NULL, order, thr_id);
 }
--- a/x11/cuda_x11_luffa512.cu
+++ b/x11/cuda_x11_luffa512.cu
@ -376,8 +376,6 @@ __host__ void x11_luffa512_cpu_hash_64(int thr_id, int threads, uint32_t startNo
    // Size of the dynamic shared memory region
    size_t shared_size = 0;
 //    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
    x11_luffa512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
    MyStreamSynchronize(NULL, order, thr_id);
 }
--- a/x11/cuda_x11_shavite512.cu
+++ b/x11/cuda_x11_shavite512.cu
@ -1372,8 +1372,6 @@ __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t start
    // Size of the dynamic shared memory region
    size_t shared_size = 0;
 //    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
    x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
    MyStreamSynchronize(NULL, order, thr_id);
 }
--- a/x11/x11.cu
+++ b/x11/x11.cu
@ -162,11 +162,8 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 {
 	const uint32_t first_nonce = pdata[19];
-	// TODO: entfernen für eine Release! Ist nur zum Testen!
+	if (opt_benchmark)
 	if (opt_benchmark) {
 		((uint32_t*)ptarget)[7] = 0x0000ff;
        pdata[17] = 0;
    }
 	const uint32_t Htarg = ptarget[7];