diff --git a/Makefile.am b/Makefile.am
index 6543670..390e525 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -18,13 +18,13 @@ ccminer_SOURCES		= elist.h miner.h compat.h \
 			  compat/sys/time.h compat/getopt/getopt.h \
 			  cpu-miner.c util.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c hefty1.c scrypt.c sha2.c \
 			  sph/sph_blake.h sph/sph_groestl.h sph/sph_jh.h sph/sph_keccak.h sph/sph_skein.h sph/sph_types.h \
-			  heavy.cu \
-			  cuda_blake512.cu cuda_blake512.h \
-			  cuda_combine.cu cuda_combine.h \
-			  cuda_groestl512.cu cuda_groestl512.h \
-			  cuda_hefty1.cu cuda_hefty1.h \
-			  cuda_keccak512.cu cuda_keccak512.h \
-			  cuda_sha256.cu cuda_sha256.h \
+			  heavy/heavy.cu \
+			  heavy/cuda_blake512.cu heavy/cuda_blake512.h \
+			  heavy/cuda_combine.cu heavy/cuda_combine.h \
+			  heavy/cuda_groestl512.cu heavy/cuda_groestl512.h \
+			  heavy/cuda_hefty1.cu heavy/cuda_hefty1.h \
+			  heavy/cuda_keccak512.cu heavy/cuda_keccak512.h \
+			  heavy/cuda_sha256.cu heavy/cuda_sha256.h \
 			  fuguecoin.cpp cuda_fugue256.cu sph/fugue.c sph/sph_fugue.h uint256.h \
 			  groestlcoin.cpp cuda_groestlcoin.cu cuda_groestlcoin.h \
 			  JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \
diff --git a/README.txt b/README.txt
index fd05121..666101b 100644
--- a/README.txt
+++ b/README.txt
@@ -12,9 +12,8 @@ If you find this tool useful and like to support its continued
    VTC donation address: VrjeFzMgvteCGarLw85KivBzmsiH9fqp4a
    MAX donation address: mHrhQP9EFArechWxTFJ97s9D3jvcCvEEnt
   DOGE donation address: DT9ghsGmez6ojVdEZgvaZbT2Z3TruXG6yP
- PANDA donation address: PvgtxJ2ZKaudRogCXfUMLXVaWUMcKQgRed
-   MRC donation address: 1Lxc4JPDpQRJB8BN4YwhmSQ3Rcu8gjj2Kd
    HVC donation address: HNN3PyyTMkDo4RkEjkWSGMwqia1yD8mwJN
+   GRS donation address: FmJKJAhvyHWPeEVeLQHefr2naqgWc9ABTM
 ***************************************************************
 
 >>> Introduction <<<
@@ -35,6 +34,7 @@ its command line interface and options.
 
   -a, --algo=ALGO       specify the algorithm to use
                           heavy       use to mine Heavycoin
+                          mjollnir    use to mine Mjollnircoin
                           fugue256    use to mine Fuguecoin
                           groestl     use to mine Groestlcoin
                           myr-gr      use to mine Myriad-Groestl
@@ -45,11 +45,12 @@ its command line interface and options.
                         Alternatively give string names of your card like
                         gtx780ti or gt640#2 (matching 2nd gt640 in the PC).
 
+  -f, --diff            Divide difficulty by this factor (std is 1) \n\
+  -v, --vote            Heavycoin block vote (default: 512)
   -o, --url=URL         URL of mining server (default: " DEF_RPC_URL ")
   -O, --userpass=U:P    username:password pair for mining server
   -u, --user=USERNAME   username for mining server
   -p, --pass=PASSWORD   password for mining server
-  -v, --vote		Heavycoin block vote (default: 512)
       --cert=FILE       certificate for mining server using SSL
   -x, --proxy=[PROTOCOL://]HOST[:PORT]  connect through a proxy
   -t, --threads=N       number of miner threads (default: number of nVidia GPUs in your system)
@@ -116,6 +117,13 @@ from your old clunkers.
 
 >>> RELEASE HISTORY <<<
 
+  May 3rd 2014    add the MjollnirCoin hash algorithm for the upcomin
+                  MjollnirCoin relaunch.
+
+                  Add the -f (--diff) option to adjust the difficulty
+                  e.g. for the erebor Dwarfpool myr-gr SaffronCoin pool.
+                  Use -f 256 there.
+
   May 1st 2014    adapt the Jackpot algorithms to changes made by the
                   coin developers. We keep our unique nVidia advantage
                   because we have a way to break up the divergence.
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 8c58c9d..32094af 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -264,14 +264,15 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
     <ClInclude Include="compat\sys\time.h" />
     <ClInclude Include="compat\unistd.h" />
     <ClInclude Include="cpuminer-config.h" />
-    <ClInclude Include="cuda_blake512.h" />
-    <ClInclude Include="cuda_combine.h" />
-    <ClInclude Include="cuda_groestl512.h" />
     <ClInclude Include="cuda_groestlcoin.h" />
-    <ClInclude Include="cuda_hefty1.h" />
-    <ClInclude Include="cuda_keccak512.h" />
-    <ClInclude Include="cuda_sha256.h" />
+    <ClInclude Include="cuda_helper.h" />
     <ClInclude Include="elist.h" />
+    <ClInclude Include="heavy\cuda_blake512.h" />
+    <ClInclude Include="heavy\cuda_combine.h" />
+    <ClInclude Include="heavy\cuda_groestl512.h" />
+    <ClInclude Include="heavy\cuda_hefty1.h" />
+    <ClInclude Include="heavy\cuda_keccak512.h" />
+    <ClInclude Include="heavy\cuda_sha256.h" />
     <ClInclude Include="hefty1.h" />
     <ClInclude Include="miner.h" />
     <ClInclude Include="sph\sph_blake.h" />
@@ -290,16 +291,16 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
     <ClInclude Include="uint256.h" />
   </ItemGroup>
   <ItemGroup>
-    <CudaCompile Include="cuda_blake512.cu" />
-    <CudaCompile Include="cuda_combine.cu" />
     <CudaCompile Include="cuda_fugue256.cu" />
-    <CudaCompile Include="cuda_groestl512.cu" />
     <CudaCompile Include="cuda_groestlcoin.cu" />
-    <CudaCompile Include="cuda_hefty1.cu" />
-    <CudaCompile Include="cuda_keccak512.cu" />
     <CudaCompile Include="cuda_myriadgroestl.cu" />
-    <CudaCompile Include="cuda_sha256.cu" />
-    <CudaCompile Include="heavy.cu" />
+    <CudaCompile Include="heavy\cuda_blake512.cu" />
+    <CudaCompile Include="heavy\cuda_combine.cu" />
+    <CudaCompile Include="heavy\cuda_groestl512.cu" />
+    <CudaCompile Include="heavy\cuda_hefty1.cu" />
+    <CudaCompile Include="heavy\cuda_keccak512.cu" />
+    <CudaCompile Include="heavy\cuda_sha256.cu" />
+    <CudaCompile Include="heavy\heavy.cu" />
     <CudaCompile Include="JHA\cuda_jha_compactionTest.cu" />
     <CudaCompile Include="JHA\cuda_jha_keccak512.cu" />
     <CudaCompile Include="JHA\jackpotcoin.cu" />
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index f9ca3e1..c331f60 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -46,6 +46,12 @@
     <Filter Include="Header Files\sph">
       <UniqueIdentifier>{7c2a98c6-064c-4a69-b803-d6f6ff5edd0b}</UniqueIdentifier>
     </Filter>
+    <Filter Include="Source Files\CUDA\heavy">
+      <UniqueIdentifier>{c3222908-22ba-4586-a637-6363f455b06d}</UniqueIdentifier>
+    </Filter>
+    <Filter Include="Header Files\CUDA\heavy">
+      <UniqueIdentifier>{3281db48-f394-49ea-a1ef-6ebd09828d50}</UniqueIdentifier>
+    </Filter>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="compat\jansson\dump.c">
@@ -167,24 +173,6 @@
     <ClInclude Include="hefty1.h">
       <Filter>Header Files</Filter>
     </ClInclude>
-    <ClInclude Include="cuda_sha256.h">
-      <Filter>Header Files\CUDA</Filter>
-    </ClInclude>
-    <ClInclude Include="cuda_hefty1.h">
-      <Filter>Header Files\CUDA</Filter>
-    </ClInclude>
-    <ClInclude Include="cuda_keccak512.h">
-      <Filter>Header Files\CUDA</Filter>
-    </ClInclude>
-    <ClInclude Include="cuda_combine.h">
-      <Filter>Header Files\CUDA</Filter>
-    </ClInclude>
-    <ClInclude Include="cuda_blake512.h">
-      <Filter>Header Files\CUDA</Filter>
-    </ClInclude>
-    <ClInclude Include="cuda_groestl512.h">
-      <Filter>Header Files\CUDA</Filter>
-    </ClInclude>
     <ClInclude Include="uint256.h">
       <Filter>Header Files</Filter>
     </ClInclude>
@@ -230,29 +218,29 @@
     <ClInclude Include="sph\sph_types.h">
       <Filter>Header Files\sph</Filter>
     </ClInclude>
+    <ClInclude Include="heavy\cuda_blake512.h">
+      <Filter>Header Files\CUDA\heavy</Filter>
+    </ClInclude>
+    <ClInclude Include="heavy\cuda_combine.h">
+      <Filter>Header Files\CUDA\heavy</Filter>
+    </ClInclude>
+    <ClInclude Include="heavy\cuda_groestl512.h">
+      <Filter>Header Files\CUDA\heavy</Filter>
+    </ClInclude>
+    <ClInclude Include="heavy\cuda_hefty1.h">
+      <Filter>Header Files\CUDA\heavy</Filter>
+    </ClInclude>
+    <ClInclude Include="heavy\cuda_keccak512.h">
+      <Filter>Header Files\CUDA\heavy</Filter>
+    </ClInclude>
+    <ClInclude Include="heavy\cuda_sha256.h">
+      <Filter>Header Files\CUDA\heavy</Filter>
+    </ClInclude>
+    <ClInclude Include="cuda_helper.h">
+      <Filter>Header Files\CUDA</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
-    <CudaCompile Include="cuda_sha256.cu">
-      <Filter>Source Files\CUDA</Filter>
-    </CudaCompile>
-    <CudaCompile Include="cuda_blake512.cu">
-      <Filter>Source Files\CUDA</Filter>
-    </CudaCompile>
-    <CudaCompile Include="cuda_groestl512.cu">
-      <Filter>Source Files\CUDA</Filter>
-    </CudaCompile>
-    <CudaCompile Include="cuda_hefty1.cu">
-      <Filter>Source Files\CUDA</Filter>
-    </CudaCompile>
-    <CudaCompile Include="cuda_keccak512.cu">
-      <Filter>Source Files\CUDA</Filter>
-    </CudaCompile>
-    <CudaCompile Include="cuda_combine.cu">
-      <Filter>Source Files\CUDA</Filter>
-    </CudaCompile>
-    <CudaCompile Include="heavy.cu">
-      <Filter>Source Files\CUDA</Filter>
-    </CudaCompile>
     <CudaCompile Include="cuda_fugue256.cu">
       <Filter>Source Files\CUDA</Filter>
     </CudaCompile>
@@ -286,5 +274,26 @@
     <CudaCompile Include="quark\cuda_skein512.cu">
       <Filter>Source Files\CUDA\quark</Filter>
     </CudaCompile>
+    <CudaCompile Include="heavy\cuda_blake512.cu">
+      <Filter>Source Files\CUDA\heavy</Filter>
+    </CudaCompile>
+    <CudaCompile Include="heavy\cuda_combine.cu">
+      <Filter>Source Files\CUDA\heavy</Filter>
+    </CudaCompile>
+    <CudaCompile Include="heavy\cuda_groestl512.cu">
+      <Filter>Source Files\CUDA\heavy</Filter>
+    </CudaCompile>
+    <CudaCompile Include="heavy\cuda_hefty1.cu">
+      <Filter>Source Files\CUDA\heavy</Filter>
+    </CudaCompile>
+    <CudaCompile Include="heavy\cuda_keccak512.cu">
+      <Filter>Source Files\CUDA\heavy</Filter>
+    </CudaCompile>
+    <CudaCompile Include="heavy\cuda_sha256.cu">
+      <Filter>Source Files\CUDA\heavy</Filter>
+    </CudaCompile>
+    <CudaCompile Include="heavy\heavy.cu">
+      <Filter>Source Files\CUDA\heavy</Filter>
+    </CudaCompile>
   </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/configure.ac b/configure.ac
index 3b4e438..9fa301d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([ccminer], [2014.05.01])
+AC_INIT([ccminer], [2014.05.03])
 
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
diff --git a/cpu-miner.c b/cpu-miner.c
index 784aa1b..933b443 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -47,6 +47,7 @@
 #define PROGRAM_NAME		"minerd"
 #define LP_SCANTIME		60
 #define HEAVYCOIN_BLKHDR_SZ		84
+#define MNR_BLKHDR_SZ 80
 
 // from heavy.cu
 #ifdef __cplusplus
@@ -121,6 +122,7 @@ struct workio_cmd {
 
 typedef enum {
 	ALGO_HEAVY,		/* Heavycoin hash */
+	ALGO_MJOLLNIR,		/* Mjollnir hash */
 	ALGO_FUGUE256,		/* Fugue256 */
 	ALGO_GROESTL,
 	ALGO_MYR_GR,
@@ -129,6 +131,7 @@ typedef enum {
 
 static const char *algo_names[] = {
 	"heavy",
+	"mjollnir",
 	"fugue256",
 	"groestl",
 	"myr-gr",
@@ -154,6 +157,7 @@ static json_t *opt_config;
 static const bool opt_time = true;
 static sha256_algos opt_algo = ALGO_HEAVY;
 static int opt_n_threads = 0;
+static double opt_difficulty = 1; // CH
 bool opt_trust_pool = false;
 uint16_t opt_vote = 9999;
 static int num_processors;
@@ -195,6 +199,7 @@ Options:\n\
   -a, --algo=ALGO       specify the algorithm to use\n\
                         fugue256  Fuguecoin hash\n\
                         heavy     Heavycoin hash\n\
+                        mjollnir  Mjollnircoin hash\n\
                         groestl   Groestlcoin hash\n\
                         myr-gr    Myriad-Groestl hash\n\
                         jackpot   Jackpot hash\n\
@@ -244,7 +249,7 @@ static char const short_options[] =
 #ifdef HAVE_SYSLOG_H
 	"S"
 #endif
-	"a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vd:mv:";
+	"a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vd:f:mv:";
 
 static struct option const options[] = {
 	{ "algo", 1, NULL, 'a' },
@@ -277,6 +282,7 @@ static struct option const options[] = {
 	{ "userpass", 1, NULL, 'O' },
 	{ "version", 0, NULL, 'V' },
 	{ "devices", 1, NULL, 'd' },
+	{ "diff", 1, NULL, 'f' },
 	{ 0, 0, 0, 0 }
 };
 
@@ -684,7 +690,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 	memcpy(work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size);
 
 	/* Generate merkle root */
-	if (opt_algo == ALGO_HEAVY)
+	if (opt_algo == ALGO_HEAVY || opt_algo == ALGO_MJOLLNIR)
 		heavycoin_hash(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size);
 	else
 	if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL)
@@ -694,7 +700,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 
 	for (i = 0; i < sctx->job.merkle_count; i++) {
 		memcpy(merkle_root + 32, sctx->job.merkle[i], 32);
-		if (opt_algo == ALGO_HEAVY)
+		if (opt_algo == ALGO_HEAVY || opt_algo == ALGO_MJOLLNIR)
 			heavycoin_hash(merkle_root, merkle_root, 64);
 		else
 			sha256d(merkle_root, merkle_root, 64);
@@ -738,11 +744,11 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 	}
 
 	if (opt_algo == ALGO_JACKPOT)
-		diff_to_target(work->target, sctx->job.diff / 65536.0);
+		diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty));
 	else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL)
-		diff_to_target(work->target, sctx->job.diff / 256.0);
+		diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty));
 	else
-		diff_to_target(work->target, sctx->job.diff);
+		diff_to_target(work->target, sctx->job.diff / opt_difficulty);
 }
 
 static void *miner_thread(void *userdata)
@@ -836,7 +842,12 @@ static void *miner_thread(void *userdata)
 
 		case ALGO_HEAVY:
 			rc = scanhash_heavy(thr_id, work.data, work.target,
-			                      max_nonce, &hashes_done, work.maxvote);
+			                      max_nonce, &hashes_done, work.maxvote, HEAVYCOIN_BLKHDR_SZ);
+			break;
+
+		case ALGO_MJOLLNIR:
+			rc = scanhash_heavy(thr_id, work.data, work.target,
+			                      max_nonce, &hashes_done, 0, MNR_BLKHDR_SZ);
 			break;
 
 		case ALGO_FUGUE256:
@@ -1112,6 +1123,7 @@ static void parse_arg (int key, char *arg)
 {
 	char *p;
 	int v, i;
+	double d;
 
 	switch(key) {
 	case 'a':
@@ -1309,6 +1321,12 @@ static void parse_arg (int key, char *arg)
 			}
 		}
 		break;
+	case 'f': // CH - Divisor for Difficulty
+		d = atof(arg);
+		if (d == 0)	/* sanity check */
+			show_usage_and_exit(1);
+		opt_difficulty = d;
+		break;
 	case 'V':
 		show_version_and_exit();
 	case 'h':
@@ -1404,7 +1422,7 @@ static void signal_handler(int sig)
 }
 #endif
 
-#define PROGRAM_VERSION "0.7"
+#define PROGRAM_VERSION "0.8"
 int main(int argc, char *argv[])
 {
 	struct thr_info *thr;
diff --git a/cpuminer-config.h b/cpuminer-config.h
index 61cbc3a..31f94fc 100644
--- a/cpuminer-config.h
+++ b/cpuminer-config.h
@@ -152,7 +152,7 @@
 #define PACKAGE_NAME "ccminer"
 
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "ccminer 2014.05.01"
+#define PACKAGE_STRING "ccminer 2014.05.03"
 
 /* Define to the one symbol short name of this package. */
 #undef PACKAGE_TARNAME
@@ -161,7 +161,7 @@
 #undef PACKAGE_URL
 
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "2014.05.01"
+#define PACKAGE_VERSION "2014.05.03"
 
 /* If using the C implementation of alloca, define if you know the
    direction of stack growth for your system; otherwise it will be
diff --git a/cuda_groestlcoin.cu b/cuda_groestlcoin.cu
index 258f11b..e153e5c 100644
--- a/cuda_groestlcoin.cu
+++ b/cuda_groestlcoin.cu
@@ -16,9 +16,6 @@ extern int device_map[8];
 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 
-// aus driver.c
-extern "C" void set_device(int device);
-
 // Folgende Definitionen später durch header ersetzen
 typedef unsigned char uint8_t;
 typedef unsigned int uint32_t;
diff --git a/cuda_blake512.cu b/heavy/cuda_blake512.cu
similarity index 63%
rename from cuda_blake512.cu
rename to heavy/cuda_blake512.cu
index 013b7e1..dd1c737 100644
--- a/cuda_blake512.cu
+++ b/heavy/cuda_blake512.cu
@@ -17,8 +17,8 @@ extern uint32_t *d_nonceVector[8];
 // globaler Speicher für unsere Ergebnisse
 uint32_t *d_hash5output[8];
 
-// die Message (116 Bytes) mit Padding zur Berechnung auf der GPU
-__constant__ uint64_t c_PaddedMessage[16]; // padded message (84+32 bytes + padding)
+// die Message (112 bzw. 116 Bytes) mit Padding zur Berechnung auf der GPU
+__constant__ uint64_t c_PaddedMessage[16]; // padded message (80/84+32 bytes + padding)
 
 // ---------------------------- BEGIN CUDA blake512 functions ------------------------------------
 
@@ -44,10 +44,12 @@ const uint8_t host_sigma[16][16] =
   { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
 };
 
+// Diese Makros besser nur für Compile Time Konstanten verwenden. Sie sind langsam.
 #define SWAP32(x) \
     ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u)   | \
       (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
 
+// Diese Makros besser nur für Compile Time Konstanten verwenden. Sie sind langsam.
 #define SWAP64(x) \
     ((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \
                 (((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \
@@ -58,11 +60,11 @@ const uint8_t host_sigma[16][16] =
                 (((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \
                 (((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
 
-__constant__ uint64_t c_SecondRound[16];
+__constant__ uint64_t c_SecondRound[15];
 
-const uint64_t host_SecondRound[16] =
+const uint64_t host_SecondRound[15] =
 {
-  0,0,0,0,0,0,0,0,0,0,0,0,0,SWAP64(1),0,SWAP64(0x3A0)
+  0,0,0,0,0,0,0,0,0,0,0,0,0,SWAP64(1),0
 };
 
 __constant__ uint64_t c_u512[16];
@@ -80,24 +82,22 @@ const uint64_t host_u512[16] =
 };
 
 
-#define ROTR(x,n) (((x)<<(64-n))|( (x)>>(n)))
-
 #define G(a,b,c,d,e)          \
     v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\
-    v[d] = ROTR( v[d] ^ v[a],32);        \
+    v[d] = ROTR64( v[d] ^ v[a],32);        \
     v[c] += v[d];           \
-    v[b] = ROTR( v[b] ^ v[c],25);        \
+    v[b] = ROTR64( v[b] ^ v[c],25);        \
     v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b];  \
-    v[d] = ROTR( v[d] ^ v[a],16);        \
+    v[d] = ROTR64( v[d] ^ v[a],16);        \
     v[c] += v[d];           \
-    v[b] = ROTR( v[b] ^ v[c],11);
+    v[b] = ROTR64( v[b] ^ v[c],11);
 
-__device__ void blake512_compress( uint64_t *h, const uint64_t *block, int nullt, const uint8_t ((*sigma)[16]), const uint64_t *u512 )
+template <int BLOCKSIZE> __device__ void blake512_compress( uint64_t *h, const uint64_t *block, int nullt, const uint8_t ((*sigma)[16]), const uint64_t *u512 )
 {
     uint64_t v[16], m[16], i;
 
 #pragma unroll 16
-    for( i = 0; i < 16; ++i )  m[i] = SWAP64(block[i]);
+    for( i = 0; i < 16; ++i )  m[i] = cuda_swab64(block[i]);
 
 #pragma unroll 8
     for( i = 0; i < 8; ++i )  v[i] = h[i];
@@ -113,11 +113,11 @@ __device__ void blake512_compress( uint64_t *h, const uint64_t *block, int nullt
 
     /* don't xor t when the block is only padding */
     if ( !nullt ) {
-        v[12] ^= 928;
-        v[13] ^= 928;
+        v[12] ^= 8*(BLOCKSIZE+32);
+        v[13] ^= 8*(BLOCKSIZE+32);
     }
 
-#pragma unroll 16
+//#pragma unroll 16
     for( i = 0; i < 16; ++i )
     {
         /* column step */
@@ -136,49 +136,9 @@ __device__ void blake512_compress( uint64_t *h, const uint64_t *block, int nullt
     for( i = 0; i < 16; ++i )  h[i % 8] ^= v[i];
 }
 
-// Endian Drehung für 32 Bit Typen
-static __device__ uint32_t cuda_swab32(uint32_t x)
-{
-    return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u)
-          | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu));
-}
+#include "cuda_helper.h"
 
-// Endian Drehung für 64 Bit Typen
-static __device__ uint64_t cuda_swab64(uint64_t x) {
-    uint32_t h = (x >> 32);
-    uint32_t l = (x & 0xFFFFFFFFULL);
-    return (((uint64_t)cuda_swab32(l)) << 32) | ((uint64_t)cuda_swab32(h));
-}
-
-// das Hi Word aus einem 64 Bit Typen extrahieren
-static __device__ uint32_t HIWORD(const uint64_t &x) {
-#if __CUDA_ARCH__ >= 130
-	return (uint32_t)__double2hiint(__longlong_as_double(x));
-#else
-	return (uint32_t)(x >> 32);
-#endif
-}
-
-// das Hi Word in einem 64 Bit Typen ersetzen
-static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
-	return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL);
-}
-
-// das Lo Word aus einem 64 Bit Typen extrahieren
-static __device__ uint32_t LOWORD(const uint64_t &x) {
-#if __CUDA_ARCH__ >= 130
-	return (uint32_t)__double2loint(__longlong_as_double(x));
-#else
-	return (uint32_t)(x & 0xFFFFFFFFULL);
-#endif
-}
-
-// das Lo Word in einem 64 Bit Typen ersetzen
-static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
-	return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
-}
-
-__global__ void blake512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector)
+template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -211,40 +171,40 @@ __global__ void blake512_gpu_hash(int threads, uint32_t startNounce, void *outpu
 		// die Nounce durch die thread-spezifische ersetzen
 		buf[9] = REPLACE_HIWORD(buf[9], nounce);
 
-		// den thread-spezifischen Hefty1 hash einsetzen
 		uint32_t *hefty = heftyHashes + 8 * hashPosition;
-		buf[10] = REPLACE_HIWORD(buf[10], hefty[0]);
-		buf[11] = REPLACE_LOWORD(buf[11], hefty[1]);
-		buf[11] = REPLACE_HIWORD(buf[11], hefty[2]);
-		buf[12] = REPLACE_LOWORD(buf[12], hefty[3]);
-		buf[12] = REPLACE_HIWORD(buf[12], hefty[4]);
-		buf[13] = REPLACE_LOWORD(buf[13], hefty[5]);
-		buf[13] = REPLACE_HIWORD(buf[13], hefty[6]);
-		buf[14] = REPLACE_LOWORD(buf[14], hefty[7]);
+		if (BLOCKSIZE == 84) {
+			// den thread-spezifischen Hefty1 hash einsetzen
+			// aufwändig, weil das nicht mit uint64_t Wörtern aligned ist.
+			buf[10] = REPLACE_HIWORD(buf[10], hefty[0]);
+			buf[11] = REPLACE_LOWORD(buf[11], hefty[1]);
+			buf[11] = REPLACE_HIWORD(buf[11], hefty[2]);
+			buf[12] = REPLACE_LOWORD(buf[12], hefty[3]);
+			buf[12] = REPLACE_HIWORD(buf[12], hefty[4]);
+			buf[13] = REPLACE_LOWORD(buf[13], hefty[5]);
+			buf[13] = REPLACE_HIWORD(buf[13], hefty[6]);
+			buf[14] = REPLACE_LOWORD(buf[14], hefty[7]);
+		}
+		else if (BLOCKSIZE == 80) {
+			buf[10] = MAKE_ULONGLONG(hefty[0], hefty[1]);
+			buf[11] = MAKE_ULONGLONG(hefty[2], hefty[3]);
+			buf[12] = MAKE_ULONGLONG(hefty[4], hefty[5]);
+			buf[13] = MAKE_ULONGLONG(hefty[6], hefty[7]);
+		}
 
 		// erste Runde
-		blake512_compress( h, buf, 0, c_sigma, c_u512 );
-
+		blake512_compress<BLOCKSIZE>( h, buf, 0, c_sigma, c_u512 );
+		
+		
 		// zweite Runde
-#pragma unroll 16
-		for (int i=0; i < 16; ++i) buf[i] = c_SecondRound[i];
-		blake512_compress( h, buf, 1, c_sigma, c_u512 );
-
+#pragma unroll 15
+		for (int i=0; i < 15; ++i) buf[i] = c_SecondRound[i];
+		buf[15] = SWAP64(8*(BLOCKSIZE+32)); // Blocksize in Bits einsetzen
+		blake512_compress<BLOCKSIZE>( h, buf, 1, c_sigma, c_u512 );
+		
 		// Hash rauslassen
-#if 0
-		// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind
-		uint32_t *outHash = (uint32_t *)outputHash + 16 * hashPosition;
-#pragma unroll 8
-		for (int i=0; i < 8; ++i) {
-			outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) );
-			outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) );
-		}
-#else
-		// in dieser Version passieren auch ein paar 64 Bit Shifts
 		uint64_t *outHash = (uint64_t *)outputHash + 8 * hashPosition;
 #pragma unroll 8
 		for (int i=0; i < 8; ++i) outHash[i] = cuda_swab64( h[i] );
-#endif
 	}
 }
 
@@ -274,22 +234,30 @@ __host__ void blake512_cpu_init(int thr_id, int threads)
 	cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads);
 }
 
-__host__ void blake512_cpu_setBlock(void *pdata)
+static int BLOCKSIZE = 84;
+
+__host__ void blake512_cpu_setBlock(void *pdata, int len)
 	// data muss 84-Byte haben!
 	// heftyHash hat 32-Byte
 {
-	// Message mit Padding für erste Runde bereitstellen
 	unsigned char PaddedMessage[128];
-	memcpy(PaddedMessage, pdata, 84);
-	memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen
-	memset(PaddedMessage+116, 0, 12);
-	PaddedMessage[116] = 0x80;
-
+	if (len == 84) {
+		// Message mit Padding für erste Runde bereitstellen
+		memcpy(PaddedMessage, pdata, 84);
+		memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen
+		memset(PaddedMessage+116, 0, 12);
+		PaddedMessage[116] = 0x80;
+	} else if (len == 80) {
+		memcpy(PaddedMessage, pdata, 80);
+		memset(PaddedMessage+80, 0, 32); // leeres Hefty Hash einfüllen
+		memset(PaddedMessage+112, 0, 16);
+		PaddedMessage[112] = 0x80;
+	}
 	// die Message (116 Bytes) ohne Padding zur Berechnung auf der GPU
 	cudaMemcpyToSymbol( c_PaddedMessage, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
+	BLOCKSIZE = len;
 }
 
-
 __host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 {
 	const int threadsperblock = 256;
@@ -303,5 +271,8 @@ __host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 
-	blake512_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+	if (BLOCKSIZE == 80)
+		blake512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+	else if (BLOCKSIZE == 84)
+		blake512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
 }
diff --git a/cuda_blake512.h b/heavy/cuda_blake512.h
similarity index 78%
rename from cuda_blake512.h
rename to heavy/cuda_blake512.h
index 48bd3ff..7e24973 100644
--- a/cuda_blake512.h
+++ b/heavy/cuda_blake512.h
@@ -2,7 +2,6 @@
 #define _CUDA_BLAKE512_H
 
 void blake512_cpu_init(int thr_id, int threads);
-void blake512_cpu_setBlock(void *pdata);
+void blake512_cpu_setBlock(void *pdata, int len);
 void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
-
 #endif
diff --git a/cuda_combine.cu b/heavy/cuda_combine.cu
similarity index 100%
rename from cuda_combine.cu
rename to heavy/cuda_combine.cu
diff --git a/cuda_combine.h b/heavy/cuda_combine.h
similarity index 100%
rename from cuda_combine.h
rename to heavy/cuda_combine.h
diff --git a/cuda_groestl512.cu b/heavy/cuda_groestl512.cu
similarity index 98%
rename from cuda_groestl512.cu
rename to heavy/cuda_groestl512.cu
index 6875404..bf86105 100644
--- a/cuda_groestl512.cu
+++ b/heavy/cuda_groestl512.cu
@@ -676,7 +676,7 @@ __device__ void groestl512_perm_Q(uint32_t *a)
 	}
 }
 
-__global__ void groestl512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector)
+template <int BLOCKSIZE> __global__ void groestl512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -706,7 +706,7 @@ __global__ void groestl512_gpu_hash(int threads, uint32_t startNounce, void *out
 		uint32_t *heftyHash = &heftyHashes[8 * hashPosition];
 #pragma unroll 8
 		for (int k=0; k<8; ++k)
-			message[21+k] = heftyHash[k];
+			message[BLOCKSIZE/4+k] = heftyHash[k];
 		
 		uint32_t g[32];
 #pragma unroll 32
@@ -764,21 +764,27 @@ __host__ void groestl512_cpu_init(int thr_id, int threads)
 	cudaMalloc(&d_hash4output[thr_id], 16 * sizeof(uint32_t) * threads);
 }
 
-__host__ void groestl512_cpu_setBlock(void *data)
-	// data muss 84-Byte haben!
+static int BLOCKSIZE = 84;
+
+__host__ void groestl512_cpu_setBlock(void *data, int len)
+	// data muss 80/84-Byte haben!
 	// heftyHash hat 32-Byte
 {
 	// Nachricht expandieren und setzen
 	uint32_t msgBlock[32];
 
 	memset(msgBlock, 0, sizeof(uint32_t) * 32);
-	memcpy(&msgBlock[0], data, 84);
+	memcpy(&msgBlock[0], data, len);
 
 	// Erweitere die Nachricht auf den Nachrichtenblock (padding)
-	// Unsere Nachricht hat 116 Byte
-	msgBlock[29] = 0x80;
-	msgBlock[31] = 0x01000000;
-
+	// Unsere Nachricht hat 112/116 Byte
+	if (len == 84) {
+		msgBlock[29] = 0x80;
+		msgBlock[31] = 0x01000000;
+	} else if (len == 80) {
+		msgBlock[28] = 0x80;
+		msgBlock[31] = 0x01000000;
+	}
 	// groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird
 	// auf der GPU ausgeführt)
 
@@ -796,6 +802,8 @@ __host__ void groestl512_cpu_setBlock(void *data)
 	cudaMemcpyToSymbol(	groestl_gpu_msg,
 						msgBlock,
 						128);
+	
+	BLOCKSIZE = len;
 }
 
 __host__ void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
@@ -818,5 +826,8 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 
-	groestl512_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);	
+	if (BLOCKSIZE == 84)
+		groestl512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);	
+	else if (BLOCKSIZE == 80)
+		groestl512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);	
 }
diff --git a/cuda_groestl512.h b/heavy/cuda_groestl512.h
similarity index 82%
rename from cuda_groestl512.h
rename to heavy/cuda_groestl512.h
index 0e77f2f..0cdc13b 100644
--- a/cuda_groestl512.h
+++ b/heavy/cuda_groestl512.h
@@ -3,7 +3,7 @@
 
 void groestl512_cpu_init(int thr_id, int threads);
 void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
-void groestl512_cpu_setBlock(void *data);
+void groestl512_cpu_setBlock(void *data, int len);
 void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
 
 #endif
\ No newline at end of file
diff --git a/cuda_hefty1.cu b/heavy/cuda_hefty1.cu
similarity index 93%
rename from cuda_hefty1.cu
rename to heavy/cuda_hefty1.cu
index 239752f..db5fca3 100644
--- a/cuda_hefty1.cu
+++ b/heavy/cuda_hefty1.cu
@@ -2,21 +2,24 @@
 #include "cuda_runtime.h"
 #include "device_launch_parameters.h"
 
-// aus cpu-miner.c
-extern int device_map[8];
-
 #include <stdio.h>
 #include <memory.h>
 
 #define USE_SHARED 1
 
+// aus cpu-miner.c
+extern int device_map[8];
+
+// aus heavy.cu
+extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
+
 // Folgende Definitionen später durch header ersetzen
 typedef unsigned int uint32_t;
 typedef unsigned char uint8_t;
 typedef unsigned short uint16_t;
 
 // diese Struktur wird in der Init Funktion angefordert
-static cudaDeviceProp props;
+static cudaDeviceProp props[8];
 
 // globaler Speicher für alle HeftyHashes aller Threads
 uint32_t *d_heftyHashes[8];
@@ -286,7 +289,7 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
             for(int j=0;j<16;j++)
             {
                 Absorb(sponge, regs[3] + regs[7]);
-                hefty_gpu_round(regs, W2[j], heftyLookUp(j + 16 * (k+1)), sponge);
+                hefty_gpu_round(regs, W2[j], heftyLookUp(j + ((k+1)<<4)), sponge);
             }
     #pragma unroll 16
             for(int j=0;j<16;j++)
@@ -299,7 +302,7 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
 
 #pragma unroll 8
         for(int k=0;k<8;k++)
-            ((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]);
+            ((uint32_t*)outputHash)[(thread<<3)+k] = SWAB32(hash[k]);
     }
 }
 
@@ -308,7 +311,7 @@ __host__ void hefty_cpu_init(int thr_id, int threads)
 {
     cudaSetDevice(device_map[thr_id]);
 
-    cudaGetDeviceProperties(&props, device_map[thr_id]);
+    cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
 
     // Kopiere die Hash-Tabellen in den GPU-Speicher
     cudaMemcpyToSymbol(    hefty_gpu_constantTable,
@@ -319,16 +322,21 @@ __host__ void hefty_cpu_init(int thr_id, int threads)
     cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads);
 }
 
-__host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data)
-    // data muss 84-Byte haben!
+__host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
+// data muss 80/84-Byte haben!
 {
     // Nachricht expandieren und setzen
     uint32_t msgBlock[32];
 
     memset(msgBlock, 0, sizeof(uint32_t) * 32);
-    memcpy(&msgBlock[0], data, 84);
-    msgBlock[21] |= 0x80;
-    msgBlock[31] = 672; // bitlen
+    memcpy(&msgBlock[0], data, len);
+    if (len == 84) {
+        msgBlock[21] |= 0x80;
+        msgBlock[31] = 672; // bitlen
+    } else if (len == 80) {
+        msgBlock[20] |= 0x80;
+        msgBlock[31] = 640; // bitlen
+    }
     
     for(int i=0;i<31;i++) // Byteorder drehen
         msgBlock[i] = SWAB32(msgBlock[i]);
@@ -395,7 +403,7 @@ __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
 {
     // Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
     // alle anderen mit 512 Threads.
-    int threadsperblock = (props.major >= 3) ? 768 : 512;
+    int threadsperblock = (props[thr_id].major >= 3) ? 768 : 512;
 
     // berechne wie viele Thread Blocks wir brauchen
     dim3 grid((threads + threadsperblock-1)/threadsperblock);
@@ -411,4 +419,7 @@ __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
 //    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 
     hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]);
+
+    // Strategisches Sleep Kommando zur Senkung der CPU Last
+    MyStreamSynchronize(NULL, 0, thr_id);
 }
diff --git a/cuda_hefty1.h b/heavy/cuda_hefty1.h
similarity index 67%
rename from cuda_hefty1.h
rename to heavy/cuda_hefty1.h
index 9e72d3d..17b196c 100644
--- a/cuda_hefty1.h
+++ b/heavy/cuda_hefty1.h
@@ -2,7 +2,7 @@
 #define _CUDA_HEFTY1_H
 
 void hefty_cpu_hash(int thr_id, int threads, int startNounce);
-void hefty_cpu_setBlock(int thr_id, int threads, void *data);
+void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len);
 void hefty_cpu_init(int thr_id, int threads);
 
 #endif
\ No newline at end of file
diff --git a/cuda_keccak512.cu b/heavy/cuda_keccak512.cu
similarity index 88%
rename from cuda_keccak512.cu
rename to heavy/cuda_keccak512.cu
index 13e5255..9585793 100644
--- a/cuda_keccak512.cu
+++ b/heavy/cuda_keccak512.cu
@@ -16,6 +16,8 @@ extern uint32_t *d_nonceVector[8];
 
 // globaler Speicher für unsere Ergebnisse
 uint32_t *d_hash3output[8];
+extern uint32_t *d_hash4output[8];
+extern uint32_t *d_hash5output[8];
 
 // der Keccak512 State nach der ersten Runde (72 Bytes)
 __constant__ uint64_t c_State[25];
@@ -25,7 +27,7 @@ __constant__ uint32_t c_PaddedMessage2[18]; // 44 bytes of remaining message (No
 
 // ---------------------------- BEGIN CUDA keccak512 functions ------------------------------------
 
-#define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - b)))
+#include "cuda_helper.h"
 
 #define U32TO64_LE(p) \
     (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))
@@ -145,7 +147,7 @@ keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_const
 }
 
 // Die Hash-Funktion
-__global__ void keccak512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector)
+template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -171,7 +173,7 @@ __global__ void keccak512_gpu_hash(int threads, uint32_t startNounce, void *outp
 		msgBlock[1] = nounce;
 
 		// den individuellen Hefty1 Hash einsetzen
-		mycpy32(&msgBlock[3], &heftyHashes[8 * hashPosition]);
+		mycpy32(&msgBlock[(BLOCKSIZE-72)/sizeof(uint32_t)], &heftyHashes[8 * hashPosition]);
 
 		// den Block einmal gut durchschütteln
 		keccak_block(keccak_gpu_state, msgBlock, c_keccak_round_constants);
@@ -184,7 +186,6 @@ __global__ void keccak512_gpu_hash(int threads, uint32_t startNounce, void *outp
 			U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]);
 		}
 
-
 		// und ins Global Memory rausschreiben
 #pragma unroll 16
 		for(int k=0;k<16;k++)
@@ -217,38 +218,49 @@ __host__ void keccak512_cpu_init(int thr_id, int threads)
 
 // --------------- END keccak512 CPU version from scrypt-jane code --------------------
 
-__host__ void keccak512_cpu_setBlock(void *data)
-	// data muss 84-Byte haben!
+static int BLOCKSIZE = 84;
+
+__host__ void keccak512_cpu_setBlock(void *data, int len)
+	// data muss 80 oder 84-Byte haben!
 	// heftyHash hat 32-Byte
 {
 	// CH
 	// state init	
 	uint64_t keccak_cpu_state[25];
-	memset(keccak_cpu_state, 0, 200);
+	memset(keccak_cpu_state, 0, sizeof(keccak_cpu_state));
+
+	// erste Runde	
+	keccak_block((uint64_t*)&keccak_cpu_state, (const uint32_t*)data, host_keccak_round_constants);
+
+	// state kopieren
+	cudaMemcpyToSymbol( c_State, keccak_cpu_state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
 
 	// keccak hat 72-Byte blöcke, d.h. in unserem Fall zwei Blöcke
 	// zu jeweils 
 	uint32_t msgBlock[18];
 	memset(msgBlock, 0, 18 * sizeof(uint32_t));
 
-	// kopiere die Daten rein (aber nur alles nach Bit 72)
-	memcpy(&msgBlock[0], &((uint8_t*)data)[72], 12);
+	// kopiere die restlichen Daten rein (aber nur alles nach Byte 72)
+	if (len == 84)
+		memcpy(&msgBlock[0], &((uint8_t*)data)[72], 12);
+	else if (len == 80)
+		memcpy(&msgBlock[0], &((uint8_t*)data)[72], 8);
 
 	// Nachricht abschließen
-	msgBlock[11] = 0x01;
+	if (len == 84)
+		msgBlock[11] = 0x01;
+	else if (len == 80)
+		msgBlock[10] = 0x01;
 	msgBlock[17] = 0x80000000;
 	
-	// erste Runde	
-	keccak_block((uint64_t*)&keccak_cpu_state, (const uint32_t*)data, host_keccak_round_constants);
-
 	// Message 2 ins Constant Memory kopieren (die variable Nonce und 
 	// der Hefty1 Anteil muss aber auf der GPU erst noch ersetzt werden)
 	cudaMemcpyToSymbol( c_PaddedMessage2, msgBlock, 18*sizeof(uint32_t), 0, cudaMemcpyHostToDevice );
 
-	// state kopieren
-	cudaMemcpyToSymbol( c_State, keccak_cpu_state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
+	BLOCKSIZE = len;
 }
 
+
 __host__ void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
 {
 	// Hefty1 Hashes kopieren
@@ -268,6 +280,8 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 	size_t shared_size = 0;
 
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
-
-	keccak512_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+	if (BLOCKSIZE==84)
+		keccak512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+	else if (BLOCKSIZE==80)
+		keccak512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
 }
diff --git a/cuda_keccak512.h b/heavy/cuda_keccak512.h
similarity index 84%
rename from cuda_keccak512.h
rename to heavy/cuda_keccak512.h
index 003f40f..1182447 100644
--- a/cuda_keccak512.h
+++ b/heavy/cuda_keccak512.h
@@ -2,7 +2,7 @@
 #define _CUDA_KECCAK512_H
 
 void keccak512_cpu_init(int thr_id, int threads);
-void keccak512_cpu_setBlock(void *data);
+void keccak512_cpu_setBlock(void *data, int len);
 void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
 void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
 
diff --git a/cuda_sha256.cu b/heavy/cuda_sha256.cu
similarity index 86%
rename from cuda_sha256.cu
rename to heavy/cuda_sha256.cu
index b26021d..404a2a2 100644
--- a/cuda_sha256.cu
+++ b/heavy/cuda_sha256.cu
@@ -47,7 +47,7 @@ uint32_t sha256_cpu_constantTable[] = {
 #define SWAB32(x)		( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
 
 // Die Hash-Funktion
-__global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector)
+template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -82,11 +82,10 @@ __global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void *outputH
 		uint32_t offset = 8 * (blockDim.x * blockIdx.x + threadIdx.x);
 #pragma unroll 8
 		for(int k=0;k<8;k++)
-			W1[5+k] = heftyHashes[offset + k];
-
+			W1[((BLOCKSIZE-64)/4)+k] = heftyHashes[offset + k];
 
 #pragma unroll 8
-		for (int i=5; i <5+8; ++i) W1[i] = SWAB32(W1[i]); // die Hefty1 Hashes brauchen eine Drehung ;)
+		for (int i=((BLOCKSIZE-64)/4); i < ((BLOCKSIZE-64)/4)+8; ++i) W1[i] = SWAB32(W1[i]); // die Hefty1 Hashes brauchen eine Drehung ;)
 		W1[3] = SWAB32(nounce);
 
 // Progress W1
@@ -178,18 +177,26 @@ __host__ void sha256_cpu_init(int thr_id, int threads)
 	cudaMalloc(&d_hash2output[thr_id], 8 * sizeof(uint32_t) * threads);
 }
 
-__host__ void sha256_cpu_setBlock(void *data)
-	// data muss 84-Byte haben!
+static int BLOCKSIZE = 84;
+
+__host__ void sha256_cpu_setBlock(void *data, int len)
+	// data muss 80/84-Byte haben!
 	// heftyHash hat 32-Byte
 {
 	// Nachricht expandieren und setzen
 	uint32_t msgBlock[32];
 
 	memset(msgBlock, 0, sizeof(uint32_t) * 32);
-	memcpy(&msgBlock[0], data, 84);
-	memset(&msgBlock[21], 0, 32); // vorläufig  Nullen anstatt der Hefty1 Hashes einfüllen
-	msgBlock[29] |= 0x80;
-	msgBlock[31] = 928; // bitlen
+	memcpy(&msgBlock[0], data, len);
+	if (len == 84) {
+		memset(&msgBlock[21], 0, 32); // vorläufig  Nullen anstatt der Hefty1 Hashes einfüllen
+		msgBlock[29] |= 0x80;
+		msgBlock[31] = 928; // bitlen
+	} else if (len == 80) {
+		memset(&msgBlock[20], 0, 32); // vorläufig  Nullen anstatt der Hefty1 Hashes einfüllen
+		msgBlock[28] |= 0x80;
+		msgBlock[31] = 896; // bitlen
+	}
 	
 	for(int i=0;i<31;i++) // Byteorder drehen
 		msgBlock[i] = SWAB32(msgBlock[i]);
@@ -209,7 +216,7 @@ __host__ void sha256_cpu_setBlock(void *data)
 	uint32_t hash[8];
 
 	// pre
-    for (int k=0; k < 8; k++)
+	for (int k=0; k < 8; k++)
 	{
 		regs[k] = sha256_cpu_hashTable[k];
 		hash[k] = regs[k];
@@ -242,6 +249,8 @@ __host__ void sha256_cpu_setBlock(void *data)
 	cudaMemcpyToSymbol(	sha256_gpu_blockHeader,
 						&msgBlock[16],
 						64);
+
+	BLOCKSIZE = len;
 }
 
 __host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
@@ -263,6 +272,9 @@ __host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce)
 	size_t shared_size = 0;
 
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
-
-	sha256_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+	if (BLOCKSIZE == 84)
+		sha256_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+	else if (BLOCKSIZE == 80) {
+		sha256_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
+	}
 }
diff --git a/cuda_sha256.h b/heavy/cuda_sha256.h
similarity index 84%
rename from cuda_sha256.h
rename to heavy/cuda_sha256.h
index 9efd170..03385d1 100644
--- a/cuda_sha256.h
+++ b/heavy/cuda_sha256.h
@@ -2,7 +2,7 @@
 #define _CUDA_SHA256_H
 
 void sha256_cpu_init(int thr_id, int threads);
-void sha256_cpu_setBlock(void *data);
+void sha256_cpu_setBlock(void *data, int len);
 void sha256_cpu_hash(int thr_id, int threads, int startNounce);
 void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
 #endif
diff --git a/heavy.cu b/heavy/heavy.cu
similarity index 90%
rename from heavy.cu
rename to heavy/heavy.cu
index 266275a..de86b42 100644
--- a/heavy.cu
+++ b/heavy/heavy.cu
@@ -22,12 +22,12 @@
 #include "sph/sph_blake.h"
 #include "sph/sph_groestl.h"
 
-#include "cuda_hefty1.h"
-#include "cuda_sha256.h"
-#include "cuda_keccak512.h"
-#include "cuda_groestl512.h"
-#include "cuda_blake512.h"
-#include "cuda_combine.h"
+#include "heavy/cuda_hefty1.h"
+#include "heavy/cuda_sha256.h"
+#include "heavy/cuda_keccak512.h"
+#include "heavy/cuda_groestl512.h"
+#include "heavy/cuda_blake512.h"
+#include "heavy/cuda_combine.h"
 
 extern uint32_t *d_hash2output[8];
 extern uint32_t *d_hash3output[8];
@@ -35,6 +35,7 @@ extern uint32_t *d_hash4output[8];
 extern uint32_t *d_hash5output[8];
 
 #define HEAVYCOIN_BLKHDR_SZ        84
+#define MNR_BLKHDR_SZ		       80
 
 // nonce-array für die threads
 uint32_t *d_nonceVector[8];
@@ -230,24 +231,29 @@ cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)
 
 int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
  const uint32_t *ptarget, uint32_t max_nonce,
- unsigned long *hashes_done, uint32_t maxvote);
+ unsigned long *hashes_done, uint32_t maxvote, int blocklen);
 
 extern "C"
 int scanhash_heavy(int thr_id, uint32_t *pdata,
  const uint32_t *ptarget, uint32_t max_nonce,
- unsigned long *hashes_done, uint32_t maxvote)
+ unsigned long *hashes_done, uint32_t maxvote, int blocklen)
 {
  return scanhash_heavy_cpp(thr_id, pdata,
-  ptarget, max_nonce, hashes_done, maxvote);
+  ptarget, max_nonce, hashes_done, maxvote, blocklen);
 }
 
+extern bool opt_benchmark;
+
 int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
  const uint32_t *ptarget, uint32_t max_nonce,
- unsigned long *hashes_done, uint32_t maxvote)
+ unsigned long *hashes_done, uint32_t maxvote, int blocklen)
 {
     // CUDA will process thousands of threads.
     const int throughput = 4096 * 128;
 
+    if (opt_benchmark)
+        ((uint32_t*)ptarget)[7] = 0x000000ff;
+
     int rc = 0;
     uint32_t *hash = NULL;
     cudaMallocHost(&hash, throughput*8*sizeof(uint32_t));
@@ -258,7 +264,6 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
     memset(nrmCalls, 0, sizeof(int) * 6);
 
     uint32_t start_nonce = pdata[19];    
-    uint16_t *ext = (uint16_t *)&pdata[20];
 
     // für jeden Hash ein individuelles Target erstellen basierend
     // auf dem höchsten Bit, das in ptarget gesetzt ist.
@@ -282,26 +287,30 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
         cudaMalloc(&d_nonceVector[thr_id], sizeof(uint32_t) * throughput);
     }
 
+    if (blocklen == HEAVYCOIN_BLKHDR_SZ)
+    {
+        uint16_t *ext = (uint16_t *)&pdata[20];
 
-    if (opt_vote > maxvote) {
-        printf("Warning: Your block reward vote (%hu) exceeds "
-                "the maxvote reported by the pool (%hu).\n",
-                opt_vote, maxvote);
-    }
+        if (opt_vote > maxvote) {
+            printf("Warning: Your block reward vote (%hu) exceeds "
+                    "the maxvote reported by the pool (%hu).\n",
+                    opt_vote, maxvote);
+        }
 
-    if (opt_trust_pool && opt_vote > maxvote) {
-        printf("Warning: Capping block reward vote to maxvote reported by pool.\n");
-        ext[0] = maxvote;
+        if (opt_trust_pool && opt_vote > maxvote) {
+            printf("Warning: Capping block reward vote to maxvote reported by pool.\n");
+            ext[0] = maxvote;
+        }
+        else
+            ext[0] = opt_vote;
     }
-    else
-        ext[0] = opt_vote;
 
     // Setze die Blockdaten
-    hefty_cpu_setBlock(thr_id, throughput, pdata);
-    sha256_cpu_setBlock(pdata);
-    keccak512_cpu_setBlock(pdata);
-    groestl512_cpu_setBlock(pdata);
-    blake512_cpu_setBlock(pdata);
+    hefty_cpu_setBlock(thr_id, throughput, pdata, blocklen);
+    sha256_cpu_setBlock(pdata, blocklen);
+    keccak512_cpu_setBlock(pdata, blocklen);
+    groestl512_cpu_setBlock(pdata, blocklen);
+    blake512_cpu_setBlock(pdata, blocklen);
 
     do {
         int i;
@@ -370,7 +379,7 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
                     if (fulltest(foundhash, ptarget)) {
                         uint32_t verification[8];
                         pdata[19] += nonce - pdata[19];
-                        heavycoin_hash((unsigned char *)verification, (const unsigned char *)pdata, HEAVYCOIN_BLKHDR_SZ);
+                        heavycoin_hash((unsigned char *)verification, (const unsigned char *)pdata, blocklen);
                         if (memcmp(verification, foundhash, 8*sizeof(uint32_t))) {
                             applog(LOG_ERR, "hash for nonce=$%08X does not validate on CPU!\n", nonce);
                         }
diff --git a/miner.h b/miner.h
index ffea67c..7a68b37 100644
--- a/miner.h
+++ b/miner.h
@@ -205,7 +205,7 @@ extern int scanhash_scrypt(int thr_id, uint32_t *pdata,
 
 extern int scanhash_heavy(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
-	unsigned long *hashes_done, uint32_t maxvote);
+	unsigned long *hashes_done, uint32_t maxvote, int blocklen);
 
 extern int scanhash_fugue256(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,