update to version v0.5 (2014-03-27)

11 years ago · 2ca6ede92b
17 changed files with 588 additions and 465 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -34,4 +34,4 @@ ccminer_CPPFLAGS	= -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -f
 # we're now targeting all major compute architectures within one binary.
 .cu.o:
-	$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_10,code=\"sm_10,compute_10\" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
+	$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
--- a/Makefile.in
+++ b/Makefile.in
@ -1035,7 +1035,7 @@ uninstall-am: uninstall-binPROGRAMS
 # we're now targeting all major compute architectures within one binary.
 .cu.o:
-	$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_10,code=\"sm_10,compute_10\" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
+	$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
 # Tell versions [3.59,3.63) of GNU make to not export all variables.
 # Otherwise a system limit (for SysV at least) may be exceeded.
--- a/README.txt
+++ b/README.txt
@ -1,5 +1,5 @@
-ccMiner release 0.4 (Mar 24th 2014) - Groestlcoin Pool Release
+ccMiner release 0.5 (Mar 27th 2014) - "Hefty Optimization"
 -------------------------------------------------------------
 ***************************************************************
@ -38,6 +38,11 @@ its command line interface and options.
                          fugue256    use to mine Fuguecoin
                          groestl     use to mine Groestlcoin
  -d, --devices         gives a comma separated list of CUDA device IDs
                        to operate on. Device IDs start counting from 0!
                        Alternatively give string names of your card like
                        gtx780ti or gt640#2 (matching 2nd gt640 in the PC).
  -o, --url=URL         URL of mining server (default: " DEF_RPC_URL ")
  -O, --userpass=U:P    username:password pair for mining server
  -u, --user=USERNAME   username for mining server
@ -63,8 +68,10 @@ its command line interface and options.
  -V, --version         display version information and exit
  -h, --help            display this help text and exit
 >>> Examples <<<
 Example for Heavycoin Mining on heavycoinpool.com with a single gpu in your system
 ccminer.exe -t 1 -a heavy -o stratum+tcp://stratum01.heavycoinpool.com:5333 -u <<username.worker>> -p <<workerpassword>> -v 512
@ -107,6 +114,17 @@ from your old clunkers.
 >>> RELEASE HISTORY <<<
  March, 27 2014  Heavycoin exchange rates soar, and as a result this coin
                  gets some love: We greatly optimized the Hefty1 kernel
                  for speed. Expect some hefty gains, especially on 750Ti's!
                  By popular demand, we added the -d option as known from
                  cudaminer.
                  different compute capability builds are now provided until
                  we figure out how to pack everything into a single executable
                  in a Windows build.
  March, 24 2014  fixed Groestl pool support
                  went back to Compute 1.x for cuda_hefty1.cu kernel by
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -95,12 +95,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
      <CInterleavedPTX>true</CInterleavedPTX>
    </CudaCompile>
    <CudaCompile>
-      <MaxRegCount>63</MaxRegCount>
+      <MaxRegCount>80</MaxRegCount>
    </CudaCompile>
    <CudaCompile>
      <PtxAsOptionV>true</PtxAsOptionV>
      <Keep>true</Keep>
-      <CodeGeneration>compute_20,sm_20</CodeGeneration>
+      <CodeGeneration>compute_35,sm_35</CodeGeneration>
      <Include>
      </Include>
      <AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@ -127,12 +127,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
      <CInterleavedPTX>true</CInterleavedPTX>
    </CudaCompile>
    <CudaCompile>
-      <MaxRegCount>63</MaxRegCount>
+      <MaxRegCount>80</MaxRegCount>
    </CudaCompile>
    <CudaCompile>
      <PtxAsOptionV>true</PtxAsOptionV>
      <Keep>true</Keep>
-      <CodeGeneration>compute_20,sm_20</CodeGeneration>
+      <CodeGeneration>compute_35,sm_35</CodeGeneration>
      <Include>
      </Include>
      <AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@ -163,12 +163,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
      <CInterleavedPTX>true</CInterleavedPTX>
    </CudaCompile>
    <CudaCompile>
-      <MaxRegCount>63</MaxRegCount>
+      <MaxRegCount>80</MaxRegCount>
    </CudaCompile>
    <CudaCompile>
      <PtxAsOptionV>true</PtxAsOptionV>
      <Keep>true</Keep>
-      <CodeGeneration>compute_20,sm_20</CodeGeneration>
+      <CodeGeneration>compute_35,sm_35</CodeGeneration>
      <Include>
      </Include>
      <AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@ -199,12 +199,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
      <CInterleavedPTX>true</CInterleavedPTX>
    </CudaCompile>
    <CudaCompile>
-      <MaxRegCount>63</MaxRegCount>
+      <MaxRegCount>80</MaxRegCount>
    </CudaCompile>
    <CudaCompile>
      <PtxAsOptionV>true</PtxAsOptionV>
      <Keep>true</Keep>
-      <CodeGeneration>compute_20,sm_20</CodeGeneration>
+      <CodeGeneration>compute_35,sm_35</CodeGeneration>
      <Include>
      </Include>
      <AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@ -277,16 +277,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
    <CudaCompile Include="cuda_fugue256.cu" />
    <CudaCompile Include="cuda_groestl512.cu" />
    <CudaCompile Include="cuda_groestlcoin.cu" />
-    <CudaCompile Include="cuda_hefty1.cu">
+    <CudaCompile Include="cuda_hefty1.cu" />
      <CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">compute_10,sm_10</CodeGeneration>
      <CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">compute_10,sm_10</CodeGeneration>
      <MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">124</MaxRegCount>
      <MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">124</MaxRegCount>
      <CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">compute_10,sm_10</CodeGeneration>
      <MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">124</MaxRegCount>
      <CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Release|x64'">compute_10,sm_10</CodeGeneration>
      <MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Release|x64'">124</MaxRegCount>
    </CudaCompile>
    <CudaCompile Include="cuda_keccak512.cu" />
    <CudaCompile Include="cuda_sha256.cu" />
    <CudaCompile Include="heavy.cu" />
--- a/20
+++ b/20
@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.68 for ccminer 2014.03.24.
+# Generated by GNU Autoconf 2.68 for ccminer 2014.03.27.
 #
 #
 # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@ -557,8 +557,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='ccminer'
 PACKAGE_TARNAME='ccminer'
-PACKAGE_VERSION='2014.03.24'
+PACKAGE_VERSION='2014.03.27'
-PACKAGE_STRING='ccminer 2014.03.24'
+PACKAGE_STRING='ccminer 2014.03.27'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
@ -1297,7 +1297,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures ccminer 2014.03.24 to adapt to many kinds of systems.
+\`configure' configures ccminer 2014.03.27 to adapt to many kinds of systems.
 Usage: $0 [OPTION]... [VAR=VALUE]...
@ -1368,7 +1368,7 @@ fi
 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of ccminer 2014.03.24:";;
+     short | recursive ) echo "Configuration of ccminer 2014.03.27:";;
   esac
  cat <<\_ACEOF
@ -1469,7 +1469,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-ccminer configure 2014.03.24
+ccminer configure 2014.03.27
 generated by GNU Autoconf 2.68
 Copyright (C) 2010 Free Software Foundation, Inc.
@ -1972,7 +1972,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
-It was created by ccminer $as_me 2014.03.24, which was
+It was created by ccminer $as_me 2014.03.27, which was
 generated by GNU Autoconf 2.68.  Invocation command line was
  $ $0 $@
@ -2901,7 +2901,7 @@ fi
 # Define the identity of the package.
 PACKAGE='ccminer'
- VERSION='2014.03.24'
+ VERSION='2014.03.27'
 cat >>confdefs.h <<_ACEOF
@ -7118,7 +7118,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by ccminer $as_me 2014.03.24, which was
+This file was extended by ccminer $as_me 2014.03.27, which was
 generated by GNU Autoconf 2.68.  Invocation command line was
  CONFIG_FILES    = $CONFIG_FILES
@ -7184,7 +7184,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-ccminer config.status 2014.03.24
+ccminer config.status 2014.03.27
 configured by $0, generated by GNU Autoconf 2.68,
  with options \\"\$ac_cs_config\\"
--- a/configure.ac
+++ b/configure.ac
@ -1,4 +1,4 @@
-AC_INIT([ccminer], [2014.03.24])
+AC_INIT([ccminer], [2014.03.27])
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@ -51,8 +51,13 @@
 // from heavy.cu
 #ifdef __cplusplus
 extern "C"
 {
 #endif
 int cuda_num_devices();
 int cuda_finddevice(char *name);
 #ifdef __cplusplus
 }
 #endif
 #ifdef __linux /* Linux specific policy and affinity management */
@ -144,10 +149,11 @@ static int opt_scantime = 5;
 static json_t *opt_config;
 static const bool opt_time = true;
 static sha256_algos opt_algo = ALGO_HEAVY;
-static int opt_n_threads;
+static int opt_n_threads = 0;
 bool opt_trust_pool = false;
 uint16_t opt_vote = 9999;
 static int num_processors;
 int device_map[8] = {0,1,2,3,4,5,6,7}; // CB
 static char *rpc_url;
 static char *rpc_userpass;
 static char *rpc_user, *rpc_pass;
@ -185,7 +191,11 @@ Options:\n\
  -a, --algo=ALGO       specify the algorithm to use\n\
                        fugue256  Fuguecoin hash\n\
                        heavy     Heavycoin hash\n\
-  -v, --vote=VOTE       block reward vote\n\
+  -d, --devices         takes a comma separated list of CUDA devices to use.\n\
                        Device IDs start counting from 0! Alternatively takes\n\
                        string names of your cards like gtx780ti or gt640#2\n\
                        (matching 2nd gt640 in the PC)\n\
  -v, --vote=VOTE       block reward vote (for HeavyCoin)\n\
  -m, --trust-pool      trust the max block reward vote (maxvote) sent by the pool\n\
  -o, --url=URL         URL of mining server\n\
  -O, --userpass=U:P    username:password pair for mining server\n\
@ -227,7 +237,7 @@ static char const short_options[] =
 #ifdef HAVE_SYSLOG_H
 	"S"
 #endif
-	"a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vmv:";
+	"a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vd:mv:";
 static struct option const options[] = {
 	{ "algo", 1, NULL, 'a' },
@ -259,6 +269,7 @@ static struct option const options[] = {
 	{ "user", 1, NULL, 'u' },
 	{ "userpass", 1, NULL, 'O' },
 	{ "version", 0, NULL, 'V' },
 	{ "devices", 1, NULL, 'd' },
 	{ 0, 0, 0, 0 }
 };
@ -1251,6 +1262,32 @@ static void parse_arg (int key, char *arg)
 	case 'S':
 		use_syslog = true;
 		break;
 	case 'd': // CB
 		{
 			char * pch = strtok (arg,",");
 			opt_n_threads = 0;
 			while (pch != NULL) {
 				if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0')
 				{
 					if (atoi(pch) < num_processors)
 						device_map[opt_n_threads++] = atoi(pch);
 					else {
 						applog(LOG_ERR, "Non-existant CUDA device #%d specified in -d option", atoi(pch));
 						exit(1);
 					}
 				} else {
 					int device = cuda_finddevice(pch);
 					if (device >= 0 && device < num_processors)
 						device_map[opt_n_threads++] = device;
 					else {
 						applog(LOG_ERR, "Non-existant CUDA device '%s' specified in -d option", pch);
 						exit(1);
 					}
 				}
 				pch = strtok (NULL, ",");
 			}
 		}
 		break;
 	case 'V':
 		show_version_and_exit();
 	case 'h':
@ -1346,7 +1383,7 @@ static void signal_handler(int sig)
 }
 #endif
-#define PROGRAM_VERSION "0.4"
+#define PROGRAM_VERSION "0.5"
 int main(int argc, char *argv[])
 {
 	struct thr_info *thr;
@ -1370,6 +1407,9 @@ int main(int argc, char *argv[])
 	rpc_user = strdup("");
 	rpc_pass = strdup("");
 	pthread_mutex_init(&applog_lock, NULL);
 	num_processors = cuda_num_devices();
 	/* parse command line */
 	parse_cmdline(argc, argv);
@ -1385,7 +1425,6 @@ int main(int argc, char *argv[])
 		sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass);
 	}
 	pthread_mutex_init(&applog_lock, NULL);
 	pthread_mutex_init(&stats_lock, NULL);
 	pthread_mutex_init(&g_work_lock, NULL);
 	pthread_mutex_init(&stratum.sock_lock, NULL);
@ -1416,7 +1455,6 @@ int main(int argc, char *argv[])
 	}
 #endif
 	num_processors = cuda_num_devices();
 	if (num_processors == 0)
 	{
 		applog(LOG_ERR, "No CUDA devices found! terminating.");
--- a/cpuminer-config.h
+++ b/cpuminer-config.h
@ -152,7 +152,7 @@
 #define PACKAGE_NAME "ccminer"
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "ccminer 2014.03.24"
+#define PACKAGE_STRING "ccminer 2014.03.27"
 /* Define to the one symbol short name of this package. */
 #undef PACKAGE_TARNAME
@ -161,7 +161,7 @@
 #undef PACKAGE_URL
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "2014.03.24"
+#define PACKAGE_VERSION "2014.03.27"
 /* If using the C implementation of alloca, define if you know the
   direction of stack growth for your system; otherwise it will be
--- a/cuda_blake512.cu
+++ b/cuda_blake512.cu
@ -292,13 +292,13 @@ __host__ void blake512_cpu_setBlock(void *pdata)
 __host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 {
-	const int threadsperblock = 128;
+	const int threadsperblock = 256;
 	// berechne wie viele Thread Blocks wir brauchen
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
-	// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
+	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
--- a/cuda_combine.cu
+++ b/cuda_combine.cu
@ -138,7 +138,7 @@ void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *h
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
-	// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
+	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
--- a/cuda_fugue256.cu
+++ b/cuda_fugue256.cu
@ -9,7 +9,10 @@
 #define USE_SHARED 1
-// heavy.cu
+// aus cpu-miner.c
 extern int device_map[8];
 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 // Folgende Definitionen später durch header ersetzen
@ -732,7 +735,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
 void fugue256_cpu_init(int thr_id, int threads)
 {
-	cudaSetDevice(thr_id);
+    cudaSetDevice(device_map[thr_id]);
 	// Kopiere die Hash-Tabellen in den GPU-Speicher
 	texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
@ -774,7 +777,7 @@ __host__ void fugue256_cpu_hash(int thr_id, int threads, int startNounce, void *
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
-	// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
+	// Größe des dynamischen Shared Memory Bereichs
 #if USE_SHARED
 	size_t shared_size = 4 * 256 * sizeof(uint32_t);
 #else
--- a/cuda_groestl512.cu
+++ b/cuda_groestl512.cu
@ -813,7 +813,7 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
-	// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
+	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
--- a/cuda_groestlcoin.cu
+++ b/cuda_groestlcoin.cu
@ -9,6 +9,10 @@
 #define USE_SHARED 1
 // aus cpu-miner.c
 extern int device_map[8];
 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 // Folgende Definitionen später durch header ersetzen
@ -20,13 +24,7 @@ typedef unsigned long long uint64_t;
 __constant__ uint32_t pTarget[8]; // Single GPU
 extern uint32_t *d_resultNonce[8];
 // globaler Speicher für unsere Ergebnisse
 uint32_t *d_hashGROESTLCOINoutput[8];
 __constant__ uint32_t groestlcoin_gpu_state[32];
 __constant__ uint32_t groestlcoin_gpu_msg[32];
 __constant__ uint32_t sha256coin_gpu_constantTable[64];
 __constant__ uint32_t sha256coin_gpu_register[8];
 #define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
@ -83,7 +81,13 @@ extern uint32_t T2dn_cpu[];
 extern uint32_t T3up_cpu[];
 extern uint32_t T3dn_cpu[];
 #if __CUDA_ARCH__ < 350 
    // Kepler (Compute 3.0)
    #define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
 #else
    // Kepler (Compute 3.5)
    #define S(x, n) __funnelshift_r( x, x, n );
 #endif
 #define R(x, n)			((x) >> (n))
 #define Ch(x, y, z)		((x & (y ^ z)) ^ z)
 #define Maj(x, y, z)	((x & (y | z)) | (y & z))
@ -95,18 +99,57 @@ extern uint32_t T3dn_cpu[];
 #define SWAB32(x)		( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
-__device__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
+__device__ __forceinline__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
 {
 	uint32_t t[32];
 //#pragma unroll 14
 	for(int r=0;r<14;r++)
 	{
-#pragma unroll 16
+		switch(r)
 		for(int k=0;k<16;k++)
 		{
-			a[(k*2)+0] ^= PC32up(k * 0x10, r);
+			case 0:
-			//a[(k<<1)+1] ^= PC32dn(k * 0x10, r);
+#pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break;
 			case 1:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break;
 			case 2:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break;
 			case 3:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break;
 			case 4:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break;
 			case 5:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break;
 			case 6:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break;
 			case 7:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break;
 			case 8:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break;
 			case 9:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break;
 			case 10:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break;
 			case 11:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break;
 			case 12:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break;
 			case 13:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break;
 		}
 		// RBTT
@ -137,18 +180,57 @@ __device__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
 	}
 }
-__device__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
+__device__ __forceinline__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
 {	
 //#pragma unroll 14
 	for(int r=0;r<14;r++)
 	{
 		uint32_t t[32];
-#pragma unroll 16
+		switch(r)
 		for(int k=0;k<16;k++)
 		{
-			a[(k*2)+0] ^= QC32up(k * 0x10, r);
+			case 0:
-			a[(k*2)+1] ^= QC32dn(k * 0x10, r);
+	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break;
 			case 1:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break;
 			case 2:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break;
 			case 3:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break;
 			case 4:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break;
 			case 5:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break;
 			case 6:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break;
 			case 7:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break;
 			case 8:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break;
 			case 9:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break;
 			case 10:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break;
 			case 11:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break;
 			case 12:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break;
 			case 13:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break;
 		}
 		// RBTT
@ -179,12 +261,12 @@ __device__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
 	}
 }
 #if USE_SHARED
-__global__ void  __launch_bounds__(256) 
+__global__ void  /* __launch_bounds__(256) */
 #else
 __global__ void 
 #endif
- groestlcoin_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
+ groestlcoin_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce)
 {
 #if USE_SHARED
 	extern __shared__ char mixtabs[];
@ -204,72 +286,52 @@ __global__ void
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 	/////
 	///// Lieber groestl, mach, dass es abgeht!!!
 	/////
 		// GROESTL
 		uint32_t message[32];
 		uint32_t state[32];
 		uint32_t g[32];
 #pragma unroll 32
-		for(int k=0;k<32;k++)
+		for(int k=0;k<32;k++) message[k] = groestlcoin_gpu_msg[k];
 		{
                        // TODO: die Vorbelegung mit Nullen braucht nicht zwingend aus dem
                        //       constant Memory zu lesen. Das ist Verschwendung von Bandbreite.
 			state[k] = groestlcoin_gpu_state[k];
 			message[k] = groestlcoin_gpu_msg[k];
 		}
 		uint32_t nounce = startNounce + thread;
 		message[19] = SWAB32(nounce);
 #pragma unroll 32
-		for(int u=0;u<32;u++)
+		for(int u=0;u<32;u++) state[u] = message[u];
-			g[u] = message[u] ^ state[u];  // TODO: state ist fast ueberall 0.
+		state[31] ^= 0x20000;
 		// Perm
 #if USE_SHARED
-		groestlcoin_perm_P(g, mixtabs);        // TODO: g[] entspricht fast genau message[]
+		groestlcoin_perm_P(state, mixtabs);
-		groestlcoin_perm_Q(message, mixtabs);  //       kann man das ausnutzen?
+		state[31] ^= 0x20000;
 		groestlcoin_perm_Q(message, mixtabs);
 #else
-		groestlcoin_perm_P(g, NULL);
+		groestlcoin_perm_P(state, NULL);
 		state[31] ^= 0x20000;
 		groestlcoin_perm_Q(message, NULL);
 #endif
 #pragma unroll 32
 		for(int u=0;u<32;u++) state[u] ^= message[u];
 #pragma unroll 32
-		for(int u=0;u<32;u++)
+		for(int u=0;u<32;u++) message[u] = state[u];
 		{
                        // TODO: kann man evtl. das xor mit g[u] vorziehen hinter die groestlcoin_perm_P Funktion
                        //       was den Registerbedarf senken koennte?
 			state[u] ^= g[u] ^ message[u];
 			g[u] = state[u];
 		}
 #if USE_SHARED
-		groestlcoin_perm_P(g, mixtabs);
+		groestlcoin_perm_P(message, mixtabs);
 #else
-		groestlcoin_perm_P(g, NULL);
+		groestlcoin_perm_P(message, NULL);
 #endif
 #pragma unroll 32
-		for(int u=0;u<32;u++)
+		for(int u=0;u<32;u++) state[u] ^= message[u];
 			state[u] ^= g[u];
 		////
 		//// 2. Runde groestl
 		////
 #pragma unroll 16
-		for(int k=0;k<16;k++)
+		for(int k=0;k<16;k++) message[k] = state[k + 16];
-			message[k] = state[k + 16];
+#pragma unroll 14
-
+		for(int k=1;k<15;k++)
 #pragma unroll 32
 		for(int k=0;k<32;k++)
 			state[k] = groestlcoin_gpu_state[k];
 #pragma unroll 16
 		for(int k=0;k<16;k++)
 			message[k+16] = 0;
 		message[16] = 0x80;
@ -277,73 +339,58 @@ __global__ void
 #pragma unroll 32
 		for(int u=0;u<32;u++)
-			g[u] = message[u] ^ state[u];
+			state[u] = message[u];
 		state[31] ^= 0x20000;
 		// Perm
 #if USE_SHARED
-		groestlcoin_perm_P(g, mixtabs);
+		groestlcoin_perm_P(state, mixtabs);
 		state[31] ^= 0x20000;
 		groestlcoin_perm_Q(message, mixtabs);
 #else
-		groestlcoin_perm_P(g, NULL);
+		groestlcoin_perm_P(state, NULL);
 		state[31] ^= 0x20000;
 		groestlcoin_perm_Q(message, NULL);
 #endif
 #pragma unroll 32
-		for(int u=0;u<32;u++)
+		for(int u=0;u<32;u++) state[u] ^= message[u];
-		{
+
-			state[u] ^= g[u] ^ message[u];
+#pragma unroll 32
-			g[u] = state[u];
+		for(int u=0;u<32;u++) message[u] = state[u];
 		}
 #if USE_SHARED
-		groestlcoin_perm_P(g, mixtabs);
+		groestlcoin_perm_P(message, mixtabs);
 #else
-		groestlcoin_perm_P(g, NULL);
+		groestlcoin_perm_P(message, NULL);
 #endif
 #pragma unroll 32
-		for(int u=0;u<32;u++)
+		for(int u=0;u<32;u++) state[u] ^= message[u];
 			state[u] ^= g[u];
 /*
 	#pragma unroll 8
 		for(int k=0;k<8;k++)
 			hash[k] = state[k+16];
 */
 		// kopiere Ergebnis
-		/*
+		int i, position = -1;
 #pragma unroll 16
 		for(int k=0;k<16;k++)
 			((uint32_t*)outputHash)[16*thread+k] = state[k + 16];
 			*/
 		int i;
 		bool rc = true;
 #pragma unroll 8
 		for (i = 7; i >= 0; i--) {
 			if (state[i+16] > pTarget[i]) {
 				if(position < i) {
 					position = i;
 					rc = false;
-				break;
+				}
 	 		}
 	 		if (state[i+16] < pTarget[i]) {
 				if(position < i) {
 					position = i;
 					rc = true;
-				break;
+				}
 	 		}
 		}
 		if(rc == true)
 		{
 			if(resNounce[0] > nounce)
 			{
 				resNounce[0] = nounce;
 				/*
 				#pragma unroll 8
 				for(int k=0;k<8;k++)					
 					((uint32_t*)outputHash)[k] = (hash[k]);
 				*/
 			}
 		}
 	}
 }
@ -360,7 +407,7 @@ __global__ void
 // Setup-Funktionen
 __host__ void groestlcoin_cpu_init(int thr_id, int threads)
 {
-	cudaSetDevice(thr_id);
+    cudaSetDevice(device_map[thr_id]);
 	cudaDeviceSetCacheConfig( cudaFuncCachePreferShared );
 // Texturen mit obigem Makro initialisieren
 	texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
@ -372,23 +419,8 @@ __host__ void groestlcoin_cpu_init(int thr_id, int threads)
 	texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
 	texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
-	// setze register 
+	// Speicher für Gewinner-Nonce belegen
        // TODO: fast vollstaendige Vorbelegung mit Nullen.
        //       da besteht doch Optimierungspotenzial im GPU Kernel
        //       denn mit Nullen braucht man nicht wirklich rechnen.
 	uint32_t groestl_state_init[32];
 	memset(groestl_state_init, 0, sizeof(uint32_t) * 32);
 	groestl_state_init[31] = 0x20000;
 	// state speichern
 	cudaMemcpyToSymbol(	groestlcoin_gpu_state,
 						groestl_state_init,
 						128);
 	cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); 
 	// Speicher für alle Ergebnisse belegen (nur für Debug)
 	cudaMalloc(&d_hashGROESTLCOINoutput[thr_id], 8 * sizeof(uint32_t) * threads);
 }
 __host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
@ -430,7 +462,7 @@ __host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
-	// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
+	// Größe des dynamischen Shared Memory Bereichs
 #if USE_SHARED
 	size_t shared_size = 8 * 256 * sizeof(uint32_t);
 #else
@ -440,16 +472,10 @@ __host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 	//fprintf(stderr, "ThrID: %d\n", thr_id);
 	cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
-	groestlcoin_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hashGROESTLCOINoutput[thr_id], d_resultNonce[thr_id]);
+	groestlcoin_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
 	// Strategisches Sleep Kommando zur Senkung der CPU Last
 	MyStreamSynchronize(NULL, 0, thr_id);
 	cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	/// Debug
 	//cudaMemcpy(outputHashes, d_hashGROESTLCOINoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost);
 	// Nounce
 	//cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 }
--- a/cuda_hefty1.cu
+++ b/cuda_hefty1.cu
@ -2,26 +2,40 @@
 #include "cuda_runtime.h"
 #include "device_launch_parameters.h"
 // aus cpu-miner.c
 extern int device_map[8];
 #include <stdio.h>
 #include <memory.h>
 #define USE_SHARED 1
 // Folgende Definitionen später durch header ersetzen
 typedef unsigned int uint32_t;
 typedef unsigned char uint8_t;
 typedef unsigned short uint16_t;
 // diese Struktur wird in der Init Funktion angefordert
 static cudaDeviceProp props;
 // globaler Speicher für alle HeftyHashes aller Threads
 uint32_t *d_heftyHashes[8];
 /* Hash-Tabellen */
 __constant__ uint32_t hefty_gpu_constantTable[64];
 #if USE_SHARED
 #define heftyLookUp(x) (*((uint32_t*)heftytab + (x)))
 #else
 #define heftyLookUp(x) hefty_gpu_constantTable[x]
 #endif
 // muss expandiert werden
 __constant__ uint32_t hefty_gpu_blockHeader[16]; // 2x512 Bit Message
 __constant__ uint32_t hefty_gpu_register[8];
 __constant__ uint32_t hefty_gpu_sponge[4];
-uint32_t hefty_cpu_hashTable[] = { 0x6a09e667UL,
+uint32_t hefty_cpu_hashTable[] = {
    0x6a09e667UL,
    0xbb67ae85UL,
    0x3c6ef372UL,
    0xa54ff53aUL,
@ -29,6 +43,7 @@ uint32_t hefty_cpu_hashTable[] = { 0x6a09e667UL,
    0x9b05688cUL,
    0x1f83d9abUL,
    0x5be0cd19UL };
 uint32_t hefty_cpu_constantTable[] = {
    0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
    0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
@ -48,7 +63,11 @@ uint32_t hefty_cpu_constantTable[] = {
    0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
 };
-#define S(x, n)			(((x) >> (n)) | ((x) << (32 - (n))))
+//#define S(x, n)          (((x) >> (n)) | ((x) << (32 - (n))))
 static __host__ __device__ uint32_t S(uint32_t x, int n)
 {
    return (((x) >> (n)) | ((x) << (32 - (n))));
 }
 #define R(x, n)          ((x) >> (n))
 #define Ch(x, y, z)      ((x & (y ^ z)) ^ z)
 #define Maj(x, y, z)     ((x & (y | z)) | (y & z))
@ -65,36 +84,50 @@ __host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x)
 {
    uint16_t w = (x >> 16) ^ (x & 0xffff);
    uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) );
-	return (n >> 2) ^ (n & 0x03);
+    return 24 - (((n >> 2) ^ (n & 0x03)) << 3);
 }
 // 4 auf einmal
 #define smoosh4Quad(x)   ( (((x)>>4) ^ (x)) & 0x0F0F0F0F )
 #define getByte(x,y)     ( ((x) >> (y)) & 0xFF )
-__host__ __device__ void Mangle(uint32_t *inp)
+__host__ __forceinline__ __device__ void Mangle(uint32_t *inp)
 {
    uint32_t r = smoosh4Quad(inp[0]);
-	//uint8_t r0 = smoosh4( (uint8_t)(inp[0] >> 24) );
+    uint32_t inp0org;
-	//uint8_t r1 = smoosh4( (uint8_t)(inp[0] >> 16) );
+    uint32_t tmp0Mask, tmp1Mask;
-	//uint8_t r2 = smoosh4( (uint8_t)(inp[0] >> 8) );
+    uint32_t in1, in2, isAddition;
-	//uint8_t r3 = smoosh4( (uint8_t)(inp[0] & 0xFF) );
+    uint32_t tmp;
    uint8_t b;
    inp[1] = inp[1] ^ S(inp[0], getByte(r, 24));
-	switch (smoosh2(inp[1])) {
+    r += 0x01010101;
-      case 0: inp[2] ^= S(inp[0], 1 + getByte(r,24)); break;
+    tmp = smoosh2(inp[1]);
-      case 1: inp[2] += S(~inp[0], 1 + getByte(r,16)); break;
+    b = getByte(r,tmp);
-      case 2: inp[2] &= S(~inp[0], 1 + getByte(r,8)); break;
+    inp0org = S(inp[0], b);
-      case 3: inp[2] ^= S(inp[0], 1 + getByte(r,0)); break;
+    tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0
-    }
+    tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0
-    
+    
-	uint32_t tmp = smoosh2(inp[1] ^ inp[2]);
+    in1 =    (inp[2] & ~inp0org) | 
-    switch (tmp) {
+            (tmp1Mask & ~inp[2] & inp0org) |
-      case 0: inp[3] ^= S(inp[0], 2 + getByte(r,24)); break;
+            (~tmp0Mask & ~inp[2] & inp0org);
-      case 1: inp[3] += S(~inp[0], 2 + getByte(r,16)); break;
+    in2 = inp[2] += ~inp0org;
-      case 2: inp[3] &= S(~inp[0], 2 + getByte(r,8)); break;
+    isAddition = ~tmp0Mask & tmp1Mask;
-      case 3: inp[3] ^= S(inp[0], 2 + getByte(r,0)); break;
+    inp[2] = isAddition ? in2 : in1;
-    }
+    
    r += 0x01010101;
    tmp = smoosh2(inp[1] ^ inp[2]);
    b = getByte(r,tmp);
    inp0org = S(inp[0], b);
    tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0
    tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0
    in1 =    (inp[3] & ~inp0org) | 
            (tmp1Mask & ~inp[3] & inp0org) |
            (~tmp0Mask & ~inp[3] & inp0org);
    in2 = inp[3] += ~inp0org;
    isAddition = ~tmp0Mask & tmp1Mask;
    inp[3] = isAddition ? in2 : in1;
    inp[0] ^= (inp[1] ^ inp[2]) + inp[3];
 }
@ -115,38 +148,15 @@ __host__ __forceinline__ __device__ uint32_t Squeeze(uint32_t *inp)
 __host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x)
 {
    uint32_t r = Squeeze(sponge);
    uint32_t t = ((r >> 8) & 0x1F);
    uint32_t y = 1 << t;
-	//uint8_t r0 = r >> 8;
+    uint32_t a = (((r>>1) & 0x01) << t) & y;
-	uint8_t r1 = r & 0xFF;
+    uint32_t b = ((r & 0x01) << t) & y;
-	uint32_t y = 1 << ((r >> 8) & 0x1F);
+    uint32_t c = x & y;
 	//uint32_t retVal;
 	//retVal = x;
 	uint32_t resArr[4];
 	resArr[0] = x;
 	resArr[1] = x & ~y;
 	resArr[2] = x | y;
 	resArr[3] = x ^ y;
 	return resArr[r1 & 0x03];
-	/*
+    uint32_t retVal = (x & ~y) | (~b & c) | (a & ~c);
 	switch(r1 & 0x03)
 	{
 	case 0:
 		break;
    case 1:
        retVal = x & ~y;
 		break;
    case 2:
        retVal = x | y;
 		break;
    case 3:
        retVal = x ^ y;
 		break;
    }
    return retVal;
 	*/
 }
 __forceinline__ __device__ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
@ -197,6 +207,16 @@ __host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *
 // Die Hash-Funktion
 __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHash)
 {
    #if USE_SHARED
    extern __shared__ char heftytab[];
    if(threadIdx.x < 64)
    {
        *((uint32_t*)heftytab + threadIdx.x) = hefty_gpu_constantTable[threadIdx.x];
    }
    __syncthreads();
 #endif
    int thread = (blockDim.x * blockIdx.x + threadIdx.x);
    if (thread < threads)
    {
@ -204,14 +224,9 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
        uint32_t nounce = startNounce + thread;
        // jeder thread in diesem  Block bekommt sein eigenes W Array im Shared memory
 #if USE_SHARED
 		extern __shared__ unsigned char s[];
 		uint32_t *W = (uint32_t *)(&s[W_ALIGNMENT * sizeof(uint32_t) * threadIdx.x]);
 #else
        // reduktion von 256 byte auf 128 byte
        uint32_t W1[16];
        uint32_t W2[16];
 #endif
        // Initialisiere die register a bis h mit der Hash-Tabelle
        uint32_t regs[8];
@ -236,18 +251,17 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
            W1[k] = hefty_gpu_blockHeader[k];
        W1[3] = SWAB32(nounce);
        // 2. Runde
 #pragma unroll 16
        for(int j=0;j<16;j++)
-			Absorb(sponge, W1[j] ^ hefty_gpu_constantTable[j]);
+            Absorb(sponge, W1[j] ^ heftyLookUp(j));
 // Progress W1 (Bytes 0...63)
 #pragma unroll 16
        for(int j=0;j<16;j++)
        {
            Absorb(sponge, regs[3] ^ regs[7]);
-			hefty_gpu_round(regs, W1[j], hefty_gpu_constantTable[j], sponge);
+            hefty_gpu_round(regs, W1[j], heftyLookUp(j), sponge);
        }
 // Progress W2 (Bytes 64...127) then W3 (Bytes 128...191) ...
@ -272,14 +286,13 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
            for(int j=0;j<16;j++)
            {
                Absorb(sponge, regs[3] + regs[7]);
-				hefty_gpu_round(regs, W2[j], hefty_gpu_constantTable[j + 16 * (k+1)], sponge);
+                hefty_gpu_round(regs, W2[j], heftyLookUp(j + 16 * (k+1)), sponge);
            }
    #pragma unroll 16
            for(int j=0;j<16;j++)
                W1[j] = W2[j];
        }
 #pragma unroll 8
        for(int k=0;k<8;k++)
            hash[k] += regs[k];
@ -293,7 +306,9 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
 // Setup-Funktionen
 __host__ void hefty_cpu_init(int thr_id, int threads)
 {
-	cudaSetDevice(thr_id);
+    cudaSetDevice(device_map[thr_id]);
    cudaGetDeviceProperties(&props, device_map[thr_id]);
    // Kopiere die Hash-Tabellen in den GPU-Speicher
    cudaMemcpyToSymbol(    hefty_gpu_constantTable,
@ -378,15 +393,17 @@ __host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data)
 __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
 {
-	const int threadsperblock = 128;
+    // Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
    // alle anderen mit 512 Threads.
    int threadsperblock = (props.major >= 3) ? 768 : 512;
    // berechne wie viele Thread Blocks wir brauchen
    dim3 grid((threads + threadsperblock-1)/threadsperblock);
    dim3 block(threadsperblock);
-	// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
+    // Größe des dynamischen Shared Memory Bereichs
    #if USE_SHARED
-	size_t shared_size = W_ALIGNMENT*sizeof(uint32_t)*threadsperblock;  // ein uint32_t eingefügt gegen Bank Konflikte
+    size_t shared_size = 8 * 64 * sizeof(uint32_t);
 #else
    size_t shared_size = 0;
 #endif
--- a/cuda_keccak512.cu
+++ b/cuda_keccak512.cu
@ -264,7 +264,7 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
-	// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
+	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
--- a/cuda_sha256.cu
+++ b/cuda_sha256.cu
@ -5,8 +5,6 @@
 #include <stdio.h>
 #include <memory.h>
 #define W_ALIGNMENT 65
 // Folgende Definitionen später durch header ersetzen
 typedef unsigned int uint32_t;
@ -59,8 +57,6 @@ __global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void *outputH
 		nonceVector[thread] = nounce;
 		// jeder thread in diesem  Block bekommt sein eigenes W Array im Shared memory
 		//extern __shared__ unsigned char s[];
 		//uint32_t *W = (uint32_t *)(&s[W_ALIGNMENT * sizeof(uint32_t) * threadIdx.x]);
 		uint32_t W1[16];
 		uint32_t W2[16];
@ -257,14 +253,13 @@ __host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashe
 __host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce)
 {
-	const int threadsperblock = 128;
+	const int threadsperblock = 256;
 	// berechne wie viele Thread Blocks wir brauchen
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
-	// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
+	// Größe des dynamischen Shared Memory Bereichs
 	//size_t shared_size = W_ALIGNMENT*sizeof(uint32_t)*threadsperblock;  // ein uint32_t eingefügt gegen Bank Konflikte
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
--- a/heavy.cu
+++ b/heavy.cu
@ -163,6 +163,41 @@ extern "C" int cuda_num_devices()
    return GPU_N;
 }
 static bool substringsearch(const char *haystack, const char *needle, int &match)
 {
    int hlen = strlen(haystack);
    int nlen = strlen(needle);
    for (int i=0; i < hlen; ++i)
    {
        if (haystack[i] == ' ') continue;
        int j=0, x = 0;
        while(j < nlen)
        {
            if (haystack[i+x] == ' ') {++x; continue;}
            if (needle[j] == ' ') {++j; continue;}
            if (needle[j] == '#') return ++match == needle[j+1]-'0';
            if (tolower(haystack[i+x]) != tolower(needle[j])) break;
            ++j; ++x;
        }
        if (j == nlen) return true;
    }
    return false;
 }
 // CUDA Gerät nach Namen finden (gibt Geräte-Index zurück oder -1)
 extern "C" int cuda_finddevice(char *name)
 {
    int num = cuda_num_devices();
    int match = 0;
    for (int i=0; i < num; ++i)
    {
        cudaDeviceProp props;
        if (cudaGetDeviceProperties(&props, i) == cudaSuccess)
            if (substringsearch(props.name, name, match)) return i;
    }
    return -1;
 }
 // Zeitsynchronisations-Routine von cudaminer mit CPU sleep
 typedef struct { double value[8]; } tsumarray;
 cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)