update to version v0.5 (2014-03-27)

2014-03-27 00:47:12 +01:00 · 2014-03-27 00:47:12 +01:00 · 2ca6ede92b
commit 2ca6ede92b
parent 1bb78f0258
17 changed files with 566 additions and 443 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -34,4 +34,4 @@ ccminer_CPPFLAGS	= -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -f
 # we're now targeting all major compute architectures within one binary.
 .cu.o:
-	$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_10,code=\"sm_10,compute_10\" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
+	$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
--- a/Makefile.in
+++ b/Makefile.in
@ -1035,7 +1035,7 @@ uninstall-am: uninstall-binPROGRAMS
 # we're now targeting all major compute architectures within one binary.
 .cu.o:
-	$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_10,code=\"sm_10,compute_10\" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
+	$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
 # Tell versions [3.59,3.63) of GNU make to not export all variables.
 # Otherwise a system limit (for SysV at least) may be exceeded.
--- a/README.txt
+++ b/README.txt
@ -1,5 +1,5 @@
-ccMiner release 0.4 (Mar 24th 2014) - Groestlcoin Pool Release
+ccMiner release 0.5 (Mar 27th 2014) - "Hefty Optimization"
 -------------------------------------------------------------
 ***************************************************************
@ -38,6 +38,11 @@ its command line interface and options.
                          fugue256    use to mine Fuguecoin
                          groestl     use to mine Groestlcoin
  -d, --devices         gives a comma separated list of CUDA device IDs
                        to operate on. Device IDs start counting from 0!
                        Alternatively give string names of your card like
                        gtx780ti or gt640#2 (matching 2nd gt640 in the PC).
  -o, --url=URL         URL of mining server (default: " DEF_RPC_URL ")
  -O, --userpass=U:P    username:password pair for mining server
  -u, --user=USERNAME   username for mining server
@ -63,8 +68,10 @@ its command line interface and options.
  -V, --version         display version information and exit
  -h, --help            display this help text and exit
 >>> Examples <<<
 Example for Heavycoin Mining on heavycoinpool.com with a single gpu in your system
 ccminer.exe -t 1 -a heavy -o stratum+tcp://stratum01.heavycoinpool.com:5333 -u <<username.worker>> -p <<workerpassword>> -v 512
@ -107,22 +114,33 @@ from your old clunkers.
 >>> RELEASE HISTORY <<<
-  March, 24 2014 fixed Groestl pool support
+  March, 27 2014  Heavycoin exchange rates soar, and as a result this coin
                  gets some love: We greatly optimized the Hefty1 kernel
                  for speed. Expect some hefty gains, especially on 750Ti's!
-                 went back to Compute 1.x for cuda_hefty1.cu kernel by
+                  By popular demand, we added the -d option as known from
-                 default after numerous reports of ccminer v0.2/v0.3
+                  cudaminer.
                 not working with HeavyCoin for some people.
-  March, 23 2014 added Groestlcoin support. stratum status unknown
+                  Different compute capability builds are now provided until
-                 (the only pool is currently down for fixing issues)
+                  we figure out how to pack everything into a single executable
                  in a Windows build.
-  March, 21 2014 use of shared memory in Fugue256 kernel boosts hash rates
+  March, 24 2014  fixed Groestl pool support
                 on Fermi and Maxwell devices. Kepler may suffer slightly
                 (3-5%)
-                 Fixed Stratum for Fuguecoin. Tested on dwarfpool.
+                  went back to Compute 1.x for cuda_hefty1.cu kernel by
                  default after numerous reports of ccminer v0.2/v0.3
                  not working with HeavyCoin for some people.
-  March, 18 2014 initial release.
+  March, 23 2014  added Groestlcoin support. stratum status unknown
                  (the only pool is currently down for fixing issues)
  March, 21 2014  use of shared memory in Fugue256 kernel boosts hash rates
                  on Fermi and Maxwell devices. Kepler may suffer slightly
                  (3-5%)
                  Fixed Stratum for Fuguecoin. Tested on dwarfpool.
  March, 18 2014  initial release.
 >>> AUTHORS <<<
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -95,12 +95,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
      <CInterleavedPTX>true</CInterleavedPTX>
    </CudaCompile>
    <CudaCompile>
-      <MaxRegCount>63</MaxRegCount>
+      <MaxRegCount>80</MaxRegCount>
    </CudaCompile>
    <CudaCompile>
      <PtxAsOptionV>true</PtxAsOptionV>
      <Keep>true</Keep>
-      <CodeGeneration>compute_20,sm_20</CodeGeneration>
+      <CodeGeneration>compute_35,sm_35</CodeGeneration>
      <Include>
      </Include>
      <AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@ -127,12 +127,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
      <CInterleavedPTX>true</CInterleavedPTX>
    </CudaCompile>
    <CudaCompile>
-      <MaxRegCount>63</MaxRegCount>
+      <MaxRegCount>80</MaxRegCount>
    </CudaCompile>
    <CudaCompile>
      <PtxAsOptionV>true</PtxAsOptionV>
      <Keep>true</Keep>
-      <CodeGeneration>compute_20,sm_20</CodeGeneration>
+      <CodeGeneration>compute_35,sm_35</CodeGeneration>
      <Include>
      </Include>
      <AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@ -163,12 +163,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
      <CInterleavedPTX>true</CInterleavedPTX>
    </CudaCompile>
    <CudaCompile>
-      <MaxRegCount>63</MaxRegCount>
+      <MaxRegCount>80</MaxRegCount>
    </CudaCompile>
    <CudaCompile>
      <PtxAsOptionV>true</PtxAsOptionV>
      <Keep>true</Keep>
-      <CodeGeneration>compute_20,sm_20</CodeGeneration>
+      <CodeGeneration>compute_35,sm_35</CodeGeneration>
      <Include>
      </Include>
      <AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@ -199,12 +199,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
      <CInterleavedPTX>true</CInterleavedPTX>
    </CudaCompile>
    <CudaCompile>
-      <MaxRegCount>63</MaxRegCount>
+      <MaxRegCount>80</MaxRegCount>
    </CudaCompile>
    <CudaCompile>
      <PtxAsOptionV>true</PtxAsOptionV>
      <Keep>true</Keep>
-      <CodeGeneration>compute_20,sm_20</CodeGeneration>
+      <CodeGeneration>compute_35,sm_35</CodeGeneration>
      <Include>
      </Include>
      <AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@ -277,16 +277,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
    <CudaCompile Include="cuda_fugue256.cu" />
    <CudaCompile Include="cuda_groestl512.cu" />
    <CudaCompile Include="cuda_groestlcoin.cu" />
-    <CudaCompile Include="cuda_hefty1.cu">
+    <CudaCompile Include="cuda_hefty1.cu" />
      <CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">compute_10,sm_10</CodeGeneration>
      <CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">compute_10,sm_10</CodeGeneration>
      <MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">124</MaxRegCount>
      <MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">124</MaxRegCount>
      <CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">compute_10,sm_10</CodeGeneration>
      <MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">124</MaxRegCount>
      <CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Release|x64'">compute_10,sm_10</CodeGeneration>
      <MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Release|x64'">124</MaxRegCount>
    </CudaCompile>
    <CudaCompile Include="cuda_keccak512.cu" />
    <CudaCompile Include="cuda_sha256.cu" />
    <CudaCompile Include="heavy.cu" />
--- a/configure
+++ b/configure
@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.68 for ccminer 2014.03.24.
+# Generated by GNU Autoconf 2.68 for ccminer 2014.03.27.
 #
 #
 # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@ -557,8 +557,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='ccminer'
 PACKAGE_TARNAME='ccminer'
-PACKAGE_VERSION='2014.03.24'
+PACKAGE_VERSION='2014.03.27'
-PACKAGE_STRING='ccminer 2014.03.24'
+PACKAGE_STRING='ccminer 2014.03.27'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''
@ -1297,7 +1297,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures ccminer 2014.03.24 to adapt to many kinds of systems.
+\`configure' configures ccminer 2014.03.27 to adapt to many kinds of systems.
 Usage: $0 [OPTION]... [VAR=VALUE]...
@ -1368,7 +1368,7 @@ fi
 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of ccminer 2014.03.24:";;
+     short | recursive ) echo "Configuration of ccminer 2014.03.27:";;
   esac
  cat <<\_ACEOF
@ -1469,7 +1469,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-ccminer configure 2014.03.24
+ccminer configure 2014.03.27
 generated by GNU Autoconf 2.68
 Copyright (C) 2010 Free Software Foundation, Inc.
@ -1972,7 +1972,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
-It was created by ccminer $as_me 2014.03.24, which was
+It was created by ccminer $as_me 2014.03.27, which was
 generated by GNU Autoconf 2.68.  Invocation command line was
  $ $0 $@
@ -2901,7 +2901,7 @@ fi
 # Define the identity of the package.
 PACKAGE='ccminer'
- VERSION='2014.03.24'
+ VERSION='2014.03.27'
 cat >>confdefs.h <<_ACEOF
@ -7118,7 +7118,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by ccminer $as_me 2014.03.24, which was
+This file was extended by ccminer $as_me 2014.03.27, which was
 generated by GNU Autoconf 2.68.  Invocation command line was
  CONFIG_FILES    = $CONFIG_FILES
@ -7184,7 +7184,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-ccminer config.status 2014.03.24
+ccminer config.status 2014.03.27
 configured by $0, generated by GNU Autoconf 2.68,
  with options \\"\$ac_cs_config\\"
--- a/configure.ac
+++ b/configure.ac
@ -1,4 +1,4 @@
-AC_INIT([ccminer], [2014.03.24])
+AC_INIT([ccminer], [2014.03.27])
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@ -51,8 +51,13 @@
 // from heavy.cu
 #ifdef __cplusplus
 extern "C"
 {
 #endif
 int cuda_num_devices();
 int cuda_finddevice(char *name);
 #ifdef __cplusplus
 }
 #endif
 #ifdef __linux /* Linux specific policy and affinity management */
@ -144,10 +149,11 @@ static int opt_scantime = 5;
 static json_t *opt_config;
 static const bool opt_time = true;
 static sha256_algos opt_algo = ALGO_HEAVY;
-static int opt_n_threads;
+static int opt_n_threads = 0;
 bool opt_trust_pool = false;
 uint16_t opt_vote = 9999;
 static int num_processors;
 int device_map[8] = {0,1,2,3,4,5,6,7}; // CB
 static char *rpc_url;
 static char *rpc_userpass;
 static char *rpc_user, *rpc_pass;
@ -185,7 +191,11 @@ Options:\n\
  -a, --algo=ALGO       specify the algorithm to use\n\
                        fugue256  Fuguecoin hash\n\
                        heavy     Heavycoin hash\n\
-  -v, --vote=VOTE       block reward vote\n\
+  -d, --devices         takes a comma separated list of CUDA devices to use.\n\
                        Device IDs start counting from 0! Alternatively takes\n\
                        string names of your cards like gtx780ti or gt640#2\n\
                        (matching 2nd gt640 in the PC)\n\
  -v, --vote=VOTE       block reward vote (for HeavyCoin)\n\
  -m, --trust-pool      trust the max block reward vote (maxvote) sent by the pool\n\
  -o, --url=URL         URL of mining server\n\
  -O, --userpass=U:P    username:password pair for mining server\n\
@ -227,7 +237,7 @@ static char const short_options[] =
 #ifdef HAVE_SYSLOG_H
 	"S"
 #endif
-	"a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vmv:";
+	"a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vd:mv:";
 static struct option const options[] = {
 	{ "algo", 1, NULL, 'a' },
@ -259,6 +269,7 @@ static struct option const options[] = {
 	{ "user", 1, NULL, 'u' },
 	{ "userpass", 1, NULL, 'O' },
 	{ "version", 0, NULL, 'V' },
 	{ "devices", 1, NULL, 'd' },
 	{ 0, 0, 0, 0 }
 };
@ -1251,6 +1262,32 @@ static void parse_arg (int key, char *arg)
 	case 'S':
 		use_syslog = true;
 		break;
 	case 'd': // CB
 		{
 			char * pch = strtok (arg,",");
 			opt_n_threads = 0;
 			while (pch != NULL) {
 				if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0')
 				{
 					if (atoi(pch) < num_processors)
 						device_map[opt_n_threads++] = atoi(pch);
 					else {
 						applog(LOG_ERR, "Non-existant CUDA device #%d specified in -d option", atoi(pch));
 						exit(1);
 					}
 				} else {
 					int device = cuda_finddevice(pch);
 					if (device >= 0 && device < num_processors)
 						device_map[opt_n_threads++] = device;
 					else {
 						applog(LOG_ERR, "Non-existant CUDA device '%s' specified in -d option", pch);
 						exit(1);
 					}
 				}
 				pch = strtok (NULL, ",");
 			}
 		}
 		break;
 	case 'V':
 		show_version_and_exit();
 	case 'h':
@ -1346,7 +1383,7 @@ static void signal_handler(int sig)
 }
 #endif
-#define PROGRAM_VERSION "0.4"
+#define PROGRAM_VERSION "0.5"
 int main(int argc, char *argv[])
 {
 	struct thr_info *thr;
@ -1370,6 +1407,9 @@ int main(int argc, char *argv[])
 	rpc_user = strdup("");
 	rpc_pass = strdup("");
 	pthread_mutex_init(&applog_lock, NULL);
 	num_processors = cuda_num_devices();
 	/* parse command line */
 	parse_cmdline(argc, argv);
@ -1385,7 +1425,6 @@ int main(int argc, char *argv[])
 		sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass);
 	}
 	pthread_mutex_init(&applog_lock, NULL);
 	pthread_mutex_init(&stats_lock, NULL);
 	pthread_mutex_init(&g_work_lock, NULL);
 	pthread_mutex_init(&stratum.sock_lock, NULL);
@ -1416,7 +1455,6 @@ int main(int argc, char *argv[])
 	}
 #endif
 	num_processors = cuda_num_devices();
 	if (num_processors == 0)
 	{
 		applog(LOG_ERR, "No CUDA devices found! terminating.");
--- a/cpuminer-config.h
+++ b/cpuminer-config.h
@ -152,7 +152,7 @@
 #define PACKAGE_NAME "ccminer"
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "ccminer 2014.03.24"
+#define PACKAGE_STRING "ccminer 2014.03.27"
 /* Define to the one symbol short name of this package. */
 #undef PACKAGE_TARNAME
@ -161,7 +161,7 @@
 #undef PACKAGE_URL
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "2014.03.24"
+#define PACKAGE_VERSION "2014.03.27"
 /* If using the C implementation of alloca, define if you know the
   direction of stack growth for your system; otherwise it will be
--- a/cuda_blake512.cu
+++ b/cuda_blake512.cu
@ -292,13 +292,13 @@ __host__ void blake512_cpu_setBlock(void *pdata)
 __host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 {
-	const int threadsperblock = 128;
+	const int threadsperblock = 256;
 	// berechne wie viele Thread Blocks wir brauchen
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
-	// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
+	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
--- a/cuda_combine.cu
+++ b/cuda_combine.cu
@ -138,7 +138,7 @@ void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *h
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
-	// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
+	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
--- a/cuda_fugue256.cu
+++ b/cuda_fugue256.cu
@ -9,7 +9,10 @@
 #define USE_SHARED 1
-// heavy.cu
+// aus cpu-miner.c
 extern int device_map[8];
 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 // Folgende Definitionen später durch header ersetzen
@ -732,7 +735,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
 void fugue256_cpu_init(int thr_id, int threads)
 {
-	cudaSetDevice(thr_id);
+    cudaSetDevice(device_map[thr_id]);
 	// Kopiere die Hash-Tabellen in den GPU-Speicher
 	texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
@ -774,7 +777,7 @@ __host__ void fugue256_cpu_hash(int thr_id, int threads, int startNounce, void *
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
-	// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
+	// Größe des dynamischen Shared Memory Bereichs
 #if USE_SHARED
 	size_t shared_size = 4 * 256 * sizeof(uint32_t);
 #else
--- a/cuda_groestl512.cu
+++ b/cuda_groestl512.cu
@ -813,7 +813,7 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
-	// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
+	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
--- a/cuda_groestlcoin.cu
+++ b/cuda_groestlcoin.cu
@ -9,6 +9,10 @@
 #define USE_SHARED 1
 // aus cpu-miner.c
 extern int device_map[8];
 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 // Folgende Definitionen später durch header ersetzen
@ -20,13 +24,7 @@ typedef unsigned long long uint64_t;
 __constant__ uint32_t pTarget[8]; // Single GPU
 extern uint32_t *d_resultNonce[8];
 // globaler Speicher für unsere Ergebnisse
 uint32_t *d_hashGROESTLCOINoutput[8];
 __constant__ uint32_t groestlcoin_gpu_state[32];
 __constant__ uint32_t groestlcoin_gpu_msg[32];
 __constant__ uint32_t sha256coin_gpu_constantTable[64];
 __constant__ uint32_t sha256coin_gpu_register[8];
 #define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
@ -83,7 +81,13 @@ extern uint32_t T2dn_cpu[];
 extern uint32_t T3up_cpu[];
 extern uint32_t T3dn_cpu[];
-#define S(x, n)			(((x) >> (n)) | ((x) << (32 - (n))))
+#if __CUDA_ARCH__ < 350 
    // Kepler (Compute 3.0)
    #define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
 #else
    // Kepler (Compute 3.5)
    #define S(x, n) __funnelshift_r( x, x, n );
 #endif
 #define R(x, n)			((x) >> (n))
 #define Ch(x, y, z)		((x & (y ^ z)) ^ z)
 #define Maj(x, y, z)	((x & (y | z)) | (y & z))
@ -95,18 +99,57 @@ extern uint32_t T3dn_cpu[];
 #define SWAB32(x)		( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
-__device__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
+__device__ __forceinline__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
 {
 	uint32_t t[32];
 //#pragma unroll 14
 	for(int r=0;r<14;r++)
 	{
-#pragma unroll 16
+		switch(r)
 		for(int k=0;k<16;k++)
 		{
-			a[(k*2)+0] ^= PC32up(k * 0x10, r);
+			case 0:
-			//a[(k<<1)+1] ^= PC32dn(k * 0x10, r);
+#pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break;
 			case 1:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break;
 			case 2:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break;
 			case 3:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break;
 			case 4:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break;
 			case 5:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break;
 			case 6:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break;
 			case 7:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break;
 			case 8:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break;
 			case 9:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break;
 			case 10:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break;
 			case 11:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break;
 			case 12:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break;
 			case 13:
 #pragma unroll 16
 				for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break;
 		}
 		// RBTT
@ -137,18 +180,57 @@ __device__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
 	}
 }
-__device__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
+__device__ __forceinline__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
 {	
 //#pragma unroll 14
 	for(int r=0;r<14;r++)
 	{
 		uint32_t t[32];
-#pragma unroll 16
+		switch(r)
 		for(int k=0;k<16;k++)
 		{
-			a[(k*2)+0] ^= QC32up(k * 0x10, r);
+			case 0:
-			a[(k*2)+1] ^= QC32dn(k * 0x10, r);
+	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break;
 			case 1:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break;
 			case 2:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break;
 			case 3:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break;
 			case 4:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break;
 			case 5:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break;
 			case 6:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break;
 			case 7:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break;
 			case 8:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break;
 			case 9:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break;
 			case 10:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break;
 			case 11:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break;
 			case 12:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break;
 			case 13:
 	#pragma unroll 16
 				for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break;
 		}
 		// RBTT
@ -179,12 +261,12 @@ __device__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
 	}
 }
 #if USE_SHARED
-__global__ void  __launch_bounds__(256) 
+__global__ void  /* __launch_bounds__(256) */
 #else
 __global__ void 
 #endif
- groestlcoin_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
+ groestlcoin_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce)
 {
 #if USE_SHARED
 	extern __shared__ char mixtabs[];
@ -204,146 +286,111 @@ __global__ void
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 	/////
 	///// Lieber groestl, mach, dass es abgeht!!!
 	/////
 		// GROESTL
 		uint32_t message[32];
 		uint32_t state[32];
 		uint32_t g[32];
 #pragma unroll 32
-		for(int k=0;k<32;k++)
+		for(int k=0;k<32;k++) message[k] = groestlcoin_gpu_msg[k];
 		{
                        // TODO: die Vorbelegung mit Nullen braucht nicht zwingend aus dem
                        //       constant Memory zu lesen. Das ist Verschwendung von Bandbreite.
 			state[k] = groestlcoin_gpu_state[k];
 			message[k] = groestlcoin_gpu_msg[k];
 		}
 		uint32_t nounce = startNounce + thread;
 		message[19] = SWAB32(nounce);
 #pragma unroll 32
-		for(int u=0;u<32;u++)
+		for(int u=0;u<32;u++) state[u] = message[u];
-			g[u] = message[u] ^ state[u];  // TODO: state ist fast ueberall 0.
+		state[31] ^= 0x20000;
 		// Perm
 #if USE_SHARED
-		groestlcoin_perm_P(g, mixtabs);        // TODO: g[] entspricht fast genau message[]
+		groestlcoin_perm_P(state, mixtabs);
-		groestlcoin_perm_Q(message, mixtabs);  //       kann man das ausnutzen?
+		state[31] ^= 0x20000;
 		groestlcoin_perm_Q(message, mixtabs);
 #else
-		groestlcoin_perm_P(g, NULL);
+		groestlcoin_perm_P(state, NULL);
 		state[31] ^= 0x20000;
 		groestlcoin_perm_Q(message, NULL);
 #endif
 #pragma unroll 32
-		for(int u=0;u<32;u++)
+		for(int u=0;u<32;u++) state[u] ^= message[u];
-		{
+
-                        // TODO: kann man evtl. das xor mit g[u] vorziehen hinter die groestlcoin_perm_P Funktion
+#pragma unroll 32
-                        //       was den Registerbedarf senken koennte?
+		for(int u=0;u<32;u++) message[u] = state[u];
 			state[u] ^= g[u] ^ message[u];
 			g[u] = state[u];
 		}
 #if USE_SHARED
-		groestlcoin_perm_P(g, mixtabs);
+		groestlcoin_perm_P(message, mixtabs);
 #else
-		groestlcoin_perm_P(g, NULL);
+		groestlcoin_perm_P(message, NULL);
 #endif
 #pragma unroll 32
-		for(int u=0;u<32;u++)
+		for(int u=0;u<32;u++) state[u] ^= message[u];
 			state[u] ^= g[u];
 		////
 		//// 2. Runde groestl
 		////
 #pragma unroll 16
-		for(int k=0;k<16;k++)
+		for(int k=0;k<16;k++) message[k] = state[k + 16];
-			message[k] = state[k + 16];
+#pragma unroll 14
-
+		for(int k=1;k<15;k++)
 #pragma unroll 32
 		for(int k=0;k<32;k++)
 			state[k] = groestlcoin_gpu_state[k];
 #pragma unroll 16
 		for(int k=0;k<16;k++)
 			message[k+16] = 0;
-		message[16] = 0x80;		
+		message[16] = 0x80;
 		message[31] = 0x01000000;
 #pragma unroll 32
 		for(int u=0;u<32;u++)
-			g[u] = message[u] ^ state[u];
+			state[u] = message[u];
 		state[31] ^= 0x20000;
 		// Perm
 #if USE_SHARED
-		groestlcoin_perm_P(g, mixtabs);
+		groestlcoin_perm_P(state, mixtabs);
 		state[31] ^= 0x20000;
 		groestlcoin_perm_Q(message, mixtabs);
 #else
-		groestlcoin_perm_P(g, NULL);
+		groestlcoin_perm_P(state, NULL);
 		state[31] ^= 0x20000;
 		groestlcoin_perm_Q(message, NULL);
 #endif
 #pragma unroll 32
-		for(int u=0;u<32;u++)
+		for(int u=0;u<32;u++) state[u] ^= message[u];
-		{
+
-			state[u] ^= g[u] ^ message[u];
+#pragma unroll 32
-			g[u] = state[u];
+		for(int u=0;u<32;u++) message[u] = state[u];
 		}
 #if USE_SHARED
-		groestlcoin_perm_P(g, mixtabs);
+		groestlcoin_perm_P(message, mixtabs);
 #else
-		groestlcoin_perm_P(g, NULL);
+		groestlcoin_perm_P(message, NULL);
 #endif
 #pragma unroll 32
-		for(int u=0;u<32;u++)
+		for(int u=0;u<32;u++) state[u] ^= message[u];
 			state[u] ^= g[u];
 /*
 	#pragma unroll 8
 		for(int k=0;k<8;k++)
 			hash[k] = state[k+16];
 */
 		// kopiere Ergebnis
-		/*
+		int i, position = -1;
 #pragma unroll 16
 		for(int k=0;k<16;k++)
 			((uint32_t*)outputHash)[16*thread+k] = state[k + 16];
 			*/
 		int i;
 		bool rc = true;
-	
+
 #pragma unroll 8
 		for (i = 7; i >= 0; i--) {
 			if (state[i+16] > pTarget[i]) {
-				rc = false;
+				if(position < i) {
-				break;
+					position = i;
-			}
+					rc = false;
-			if (state[i+16] < pTarget[i]) {
+				}
-				rc = true;
+	 		}
-				break;
+	 		if (state[i+16] < pTarget[i]) {
-			}
+				if(position < i) {
 					position = i;
 					rc = true;
 				}
 	 		}
 		}
 		if(rc == true)
 		{
 			if(resNounce[0] > nounce)
 			{
 				resNounce[0] = nounce;
 				/*
 				#pragma unroll 8
 				for(int k=0;k<8;k++)					
 					((uint32_t*)outputHash)[k] = (hash[k]);
 				*/
 			}
 		}
 	}
 }
@ -360,7 +407,7 @@ __global__ void
 // Setup-Funktionen
 __host__ void groestlcoin_cpu_init(int thr_id, int threads)
 {
-	cudaSetDevice(thr_id);
+    cudaSetDevice(device_map[thr_id]);
 	cudaDeviceSetCacheConfig( cudaFuncCachePreferShared );
 // Texturen mit obigem Makro initialisieren
 	texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
@ -372,23 +419,8 @@ __host__ void groestlcoin_cpu_init(int thr_id, int threads)
 	texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
 	texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
-	// setze register 
+	// Speicher für Gewinner-Nonce belegen
        // TODO: fast vollstaendige Vorbelegung mit Nullen.
        //       da besteht doch Optimierungspotenzial im GPU Kernel
        //       denn mit Nullen braucht man nicht wirklich rechnen.
 	uint32_t groestl_state_init[32];
 	memset(groestl_state_init, 0, sizeof(uint32_t) * 32);
 	groestl_state_init[31] = 0x20000;
 	// state speichern
 	cudaMemcpyToSymbol(	groestlcoin_gpu_state,
 						groestl_state_init,
 						128);
 	cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); 
 	// Speicher für alle Ergebnisse belegen (nur für Debug)
 	cudaMalloc(&d_hashGROESTLCOINoutput[thr_id], 8 * sizeof(uint32_t) * threads);
 }
 __host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
@ -430,7 +462,7 @@ __host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
-	// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
+	// Größe des dynamischen Shared Memory Bereichs
 #if USE_SHARED
 	size_t shared_size = 8 * 256 * sizeof(uint32_t);
 #else
@ -440,16 +472,10 @@ __host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
 	//fprintf(stderr, "ThrID: %d\n", thr_id);
 	cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
-	groestlcoin_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hashGROESTLCOINoutput[thr_id], d_resultNonce[thr_id]);
+	groestlcoin_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
 	// Strategisches Sleep Kommando zur Senkung der CPU Last
 	MyStreamSynchronize(NULL, 0, thr_id);
 	cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	/// Debug
 	//cudaMemcpy(outputHashes, d_hashGROESTLCOINoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost);
 	// Nounce
 	//cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 }
--- a/cuda_hefty1.cu
+++ b/cuda_hefty1.cu
@ -2,26 +2,40 @@
 #include "cuda_runtime.h"
 #include "device_launch_parameters.h"
 // aus cpu-miner.c
 extern int device_map[8];
 #include <stdio.h>
 #include <memory.h>
 #define USE_SHARED 1
 // Folgende Definitionen später durch header ersetzen
 typedef unsigned int uint32_t;
 typedef unsigned char uint8_t;
 typedef unsigned short uint16_t;
 // diese Struktur wird in der Init Funktion angefordert
 static cudaDeviceProp props;
 // globaler Speicher für alle HeftyHashes aller Threads
 uint32_t *d_heftyHashes[8];
 /* Hash-Tabellen */
 __constant__ uint32_t hefty_gpu_constantTable[64];
 #if USE_SHARED
 #define heftyLookUp(x) (*((uint32_t*)heftytab + (x)))
 #else
 #define heftyLookUp(x) hefty_gpu_constantTable[x]
 #endif
 // muss expandiert werden
 __constant__ uint32_t hefty_gpu_blockHeader[16]; // 2x512 Bit Message
 __constant__ uint32_t hefty_gpu_register[8];
 __constant__ uint32_t hefty_gpu_sponge[4];
-uint32_t hefty_cpu_hashTable[] = { 0x6a09e667UL,
+uint32_t hefty_cpu_hashTable[] = {
    0x6a09e667UL,
    0xbb67ae85UL,
    0x3c6ef372UL,
    0xa54ff53aUL,
@ -29,8 +43,9 @@ uint32_t hefty_cpu_hashTable[] = { 0x6a09e667UL,
    0x9b05688cUL,
    0x1f83d9abUL,
    0x5be0cd19UL };
 uint32_t hefty_cpu_constantTable[] = {
-	0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
+    0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
    0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
    0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL,
    0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL,
@ -48,350 +63,352 @@ uint32_t hefty_cpu_constantTable[] = {
    0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
 };
-#define S(x, n)			(((x) >> (n)) | ((x) << (32 - (n))))
+//#define S(x, n)          (((x) >> (n)) | ((x) << (32 - (n))))
-#define R(x, n)			((x) >> (n))
 // S: rotates the 32-bit value x right by n bits. The added ('+') side turns the
 // former macro into a host/device function; n == 0 would shift left by 32.
+static __host__ __device__ uint32_t S(uint32_t x, int n)
-#define Ch(x, y, z)		((x & (y ^ z)) ^ z)
+{
-#define Maj(x, y, z)	((x & (y | z)) | (y & z))
+    return (((x) >> (n)) | ((x) << (32 - (n))));
-#define S0(x)			(S(x, 2) ^ S(x, 13) ^ S(x, 22))
+}
-#define S1(x)			(S(x, 6) ^ S(x, 11) ^ S(x, 25))
+#define R(x, n)          ((x) >> (n))
-#define s0(x)			(S(x, 7) ^ S(x, 18) ^ R(x, 3))
+#define Ch(x, y, z)      ((x & (y ^ z)) ^ z)
-#define s1(x)			(S(x, 17) ^ S(x, 19) ^ R(x, 10))
+#define Maj(x, y, z)     ((x & (y | z)) | (y & z))
 #define S0(x)            (S(x, 2) ^ S(x, 13) ^ S(x, 22))
 #define S1(x)            (S(x, 6) ^ S(x, 11) ^ S(x, 25))
 #define s0(x)            (S(x, 7) ^ S(x, 18) ^ R(x, 3))
 #define s1(x)            (S(x, 17) ^ S(x, 19) ^ R(x, 10))
-#define SWAB32(x)		( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
+#define SWAB32(x)        ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
 // uint8_t
 // smoosh4: XORs the high nibble of a byte into its low nibble.
-#define smoosh4(x)		( ((x)>>4) ^ ((x) & 0x0F) )
+#define smoosh4(x)       ( ((x)>>4) ^ ((x) & 0x0F) )
 // smoosh2: XOR-folds the 32 bits of x down to a 2-bit value v (32 -> 16 -> 8 -> 4 -> 2).
 // The removed ('-') version returned v itself (0..3); the added ('+') version returns
 // 24 - 8*v, i.e. 24, 16, 8 or 0, which Mangle passes to getByte as the shift amount.
 __host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x)
 {
-	uint16_t w = (x >> 16) ^ (x & 0xffff);
+    uint16_t w = (x >> 16) ^ (x & 0xffff);
-	uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) );
+    uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) );
-	return (n >> 2) ^ (n & 0x03);
+    return 24 - (((n >> 2) ^ (n & 0x03)) << 3);
 }
 // 4 at once: applies the smoosh4 nibble fold to each of the four bytes of x
-#define smoosh4Quad(x)	( (((x)>>4) ^ (x)) & 0x0F0F0F0F )
+#define smoosh4Quad(x)   ( (((x)>>4) ^ (x)) & 0x0F0F0F0F )
-#define getByte(x,y)	( ((x) >> (y)) & 0xFF )
+#define getByte(x,y)     ( ((x) >> (y)) & 0xFF )
 // Mangle: mixes the four-word state inp[0..3] in place.
 //   inp[1] is XORed with a rotation of inp[0]; inp[2] and inp[3] are then each
 //   combined with a rotated inp[0] using one of three operations (xor, add of the
 //   complement, and-with-complement) selected by smoosh2; finally
 //   inp[0] ^= (inp[1] ^ inp[2]) + inp[3].
 // The removed ('-') side selects the operation with a switch on smoosh2() in 0..3.
 // The added ('+') side is branch-free: smoosh2 now yields the byte shift
 // (24/16/8/0), and bits 3 and 4 of that shift are expanded to all-ones masks:
 //   shift 24 or 0 -> xor, shift 16 -> addition of ~rot, shift 8 -> and with ~rot.
 // 'r += 0x01010101' adds 1 to every byte of r, replacing the former '1 +' / '2 +'.
-__host__ __device__ void Mangle(uint32_t *inp)
+__host__ __forceinline__ __device__ void Mangle(uint32_t *inp)
 {
-	uint32_t r = smoosh4Quad(inp[0]);
+    uint32_t r = smoosh4Quad(inp[0]);
-	//uint8_t r0 = smoosh4( (uint8_t)(inp[0] >> 24) );
+    uint32_t inp0org;
-	//uint8_t r1 = smoosh4( (uint8_t)(inp[0] >> 16) );
+    uint32_t tmp0Mask, tmp1Mask;
-	//uint8_t r2 = smoosh4( (uint8_t)(inp[0] >> 8) );
+    uint32_t in1, in2, isAddition;
-	//uint8_t r3 = smoosh4( (uint8_t)(inp[0] & 0xFF) );
+    uint32_t tmp;
    uint8_t b;
-	inp[1] = inp[1] ^ S(inp[0], getByte(r, 24));
+    inp[1] = inp[1] ^ S(inp[0], getByte(r, 24));
-	
+
-	switch (smoosh2(inp[1])) {
+    r += 0x01010101;
-      case 0: inp[2] ^= S(inp[0], 1 + getByte(r,24)); break;
+    tmp = smoosh2(inp[1]);
-      case 1: inp[2] += S(~inp[0], 1 + getByte(r,16)); break;
+    b = getByte(r,tmp);
-      case 2: inp[2] &= S(~inp[0], 1 + getByte(r,8)); break;
+    inp0org = S(inp[0], b);
-      case 3: inp[2] ^= S(inp[0], 1 + getByte(r,0)); break;
+    tmp0Mask = -((tmp >> 3)&1); // bit 3 moved to position 0, then widened to a full mask
-    }
+    tmp1Mask = -((tmp >> 4)&1); // bit 4 moved to position 0, then widened to a full mask
-	uint32_t tmp = smoosh2(inp[1] ^ inp[2]);
+    in1 =    (inp[2] & ~inp0org) | 
-    switch (tmp) {
+            (tmp1Mask & ~inp[2] & inp0org) |
-      case 0: inp[3] ^= S(inp[0], 2 + getByte(r,24)); break;
+            (~tmp0Mask & ~inp[2] & inp0org);
-      case 1: inp[3] += S(~inp[0], 2 + getByte(r,16)); break;
+    in2 = inp[2] += ~inp0org;
-      case 2: inp[3] &= S(~inp[0], 2 + getByte(r,8)); break;
+    isAddition = ~tmp0Mask & tmp1Mask;
-      case 3: inp[3] ^= S(inp[0], 2 + getByte(r,0)); break;
+    inp[2] = isAddition ? in2 : in1;
-    }
+    
    r += 0x01010101;
    tmp = smoosh2(inp[1] ^ inp[2]);
    b = getByte(r,tmp);
    inp0org = S(inp[0], b);
    tmp0Mask = -((tmp >> 3)&1); // bit 3 moved to position 0, then widened to a full mask
    tmp1Mask = -((tmp >> 4)&1); // bit 4 moved to position 0, then widened to a full mask
-	inp[0] ^= (inp[1] ^ inp[2]) + inp[3];
+    in1 =    (inp[3] & ~inp0org) | 
            (tmp1Mask & ~inp[3] & inp0org) |
            (~tmp0Mask & ~inp[3] & inp0org);
    in2 = inp[3] += ~inp0org;
    isAddition = ~tmp0Mask & tmp1Mask;
    inp[3] = isAddition ? in2 : in1;
    inp[0] ^= (inp[1] ^ inp[2]) + inp[3];
 }
 // Absorb: XORs the word x into inp[0] and then runs Mangle on the state.
 __host__ __forceinline__ __device__ void Absorb(uint32_t *inp, uint32_t x)
 {
-	inp[0] ^= x;
+    inp[0] ^= x;
-	Mangle(inp);
+    Mangle(inp);
 }
 // Squeeze: returns the value inp[0] had on entry, then runs Mangle so the
 // next call sees an advanced state.
 __host__ __forceinline__ __device__ uint32_t Squeeze(uint32_t *inp)
 {
-	uint32_t y = inp[0];
+    uint32_t y = inp[0];
-	Mangle(inp);
+    Mangle(inp);
-	return y;
+    return y;
 }
 // Br: squeezes one word r from the sponge and uses it to alter a single bit of x.
 //   bits 8..12 of r pick the bit position t (y = 1 << t);
 //   the two low bits of r pick the operation:
 //     0 -> x unchanged, 1 -> x & ~y (clear), 2 -> x | y (set), 3 -> x ^ y (toggle).
 // The removed side picks the result from a 4-entry array indexed by r1 & 0x03;
 // the added side computes the same result branch-free in retVal.
 // NOTE(review): the unmarked resArr lines after 'return retVal;' use r1, which is
 // only declared on a '-' line; they look like removed-side lines that lost their
 // marker in this rendering -- confirm against the real file.
 __host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x)
 {
-	uint32_t r = Squeeze(sponge);
+    uint32_t r = Squeeze(sponge);
    uint32_t t = ((r >> 8) & 0x1F);
    uint32_t y = 1 << t;
-	//uint8_t r0 = r >> 8;
+    uint32_t a = (((r>>1) & 0x01) << t) & y;
-	uint8_t r1 = r & 0xFF;
+    uint32_t b = ((r & 0x01) << t) & y;
-	uint32_t y = 1 << ((r >> 8) & 0x1F);
+    uint32_t c = x & y;
-	//uint32_t retVal;
+    uint32_t retVal = (x & ~y) | (~b & c) | (a & ~c);
-	//retVal = x;
+    return retVal;
 	uint32_t resArr[4];
 	resArr[0] = x;
 	resArr[1] = x & ~y;
 	resArr[2] = x | y;
 	resArr[3] = x ^ y;
 	return resArr[r1 & 0x03];
 	/*
 	switch(r1 & 0x03)
 	{
 	case 0:
 		break;
    case 1:
        retVal = x & ~y;
 		break;
    case 2:
        retVal = x | y;
 		break;
    case 3:
        retVal = x ^ y;
 		break;
    }
 	return retVal;
 	*/
 }
 // hefty_gpu_round: one compression round on the device.
 //   regs   - the 8 working registers, updated in place
 //   W, K   - message word and round constant added into tmp1
 //   sponge - sponge state; every Br call squeezes it and therefore advances it
 // Each register is passed through Br before it enters Ch / Maj / S0 / S1.
 // regs[0] is passed through Br twice (brA and tmpBr); because the sponge advances
 // between the calls, the two results are not necessarily equal.
 // Afterwards the registers shift up by one slot, regs[0] = tmp2 + tmp4 and
 // regs[4] += Br(tmp2). Only the indentation differs between the '-' and '+' sides.
 __forceinline__ __device__ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
 {
-	uint32_t tmpBr;
+    uint32_t tmpBr;
-	uint32_t brG = Br(sponge, regs[6]);	
+    uint32_t brG = Br(sponge, regs[6]);    
-	uint32_t brF = Br(sponge, regs[5]);
+    uint32_t brF = Br(sponge, regs[5]);
-	uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
+    uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
-	uint32_t brE = Br(sponge, regs[4]);
+    uint32_t brE = Br(sponge, regs[4]);
-	uint32_t tmp2 = tmp1 + S1(brE);
+    uint32_t tmp2 = tmp1 + S1(brE);
-	uint32_t brC = Br(sponge, regs[2]);
+    uint32_t brC = Br(sponge, regs[2]);
-	uint32_t brB = Br(sponge, regs[1]);
+    uint32_t brB = Br(sponge, regs[1]);
-	uint32_t brA = Br(sponge, regs[0]);
+    uint32_t brA = Br(sponge, regs[0]);
-	uint32_t tmp3 = Maj(brA, brB, brC);
+    uint32_t tmp3 = Maj(brA, brB, brC);
-	tmpBr = Br(sponge, regs[0]);
+    tmpBr = Br(sponge, regs[0]);
-	uint32_t tmp4 = tmp3 + S0(tmpBr);
+    uint32_t tmp4 = tmp3 + S0(tmpBr);
-	tmpBr = Br(sponge, tmp2);
+    tmpBr = Br(sponge, tmp2);
-	#pragma unroll 7
+    #pragma unroll 7
-	for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
+    for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
-	regs[0] = tmp2 + tmp4;
+    regs[0] = tmp2 + tmp4;
-	regs[4] += tmpBr;
+    regs[4] += tmpBr;
 }
 // hefty_cpu_round: host-side counterpart of hefty_gpu_round with the same
 // statement sequence (without the unroll pragma).
 //   regs   - the 8 working registers, updated in place
 //   W, K   - message word and round constant added into tmp1
 //   sponge - sponge state; every Br call squeezes it and therefore advances it
 // Only the indentation differs between the '-' and '+' sides.
 __host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
 {
-	uint32_t tmpBr;
+    uint32_t tmpBr;
-	uint32_t brG = Br(sponge, regs[6]);	
+    uint32_t brG = Br(sponge, regs[6]);    
-	uint32_t brF = Br(sponge, regs[5]);
+    uint32_t brF = Br(sponge, regs[5]);
-	uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
+    uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
-	uint32_t brE = Br(sponge, regs[4]);
+    uint32_t brE = Br(sponge, regs[4]);
-	uint32_t tmp2 = tmp1 + S1(brE);
+    uint32_t tmp2 = tmp1 + S1(brE);
-	uint32_t brC = Br(sponge, regs[2]);
+    uint32_t brC = Br(sponge, regs[2]);
-	uint32_t brB = Br(sponge, regs[1]);
+    uint32_t brB = Br(sponge, regs[1]);
-	uint32_t brA = Br(sponge, regs[0]);
+    uint32_t brA = Br(sponge, regs[0]);
-	uint32_t tmp3 = Maj(brA, brB, brC);
+    uint32_t tmp3 = Maj(brA, brB, brC);
-	tmpBr = Br(sponge, regs[0]);
+    tmpBr = Br(sponge, regs[0]);
-	uint32_t tmp4 = tmp3 + S0(tmpBr);
+    uint32_t tmp4 = tmp3 + S0(tmpBr);
-	tmpBr = Br(sponge, tmp2);
+    tmpBr = Br(sponge, tmp2);
-	for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
+    for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
-	regs[0] = tmp2 + tmp4;
+    regs[0] = tmp2 + tmp4;
-	regs[4] += tmpBr;
+    regs[4] += tmpBr;
 }
 // The hash function
 // hefty_gpu_hash: one thread per nonce (nounce = startNounce + global thread index).
 //   threads     - number of nonces to process; threads beyond it do no hashing
 //   startNounce - first nonce of this launch
 //   outputHash  - receives 8 byte-swapped uint32_t words per thread at index 8*thread
 // Added ('+') side: the first 64 threads of each block copy hefty_gpu_constantTable
 // into the dynamic shared array heftytab, followed by __syncthreads() outside the
 // 'thread < threads' guard; heftyLookUp then reads the table from shared memory.
 // This needs at least 64 threads per block and at least 64 * sizeof(uint32_t)
 // bytes of dynamic shared memory.
 // Starting state (sponge, registers) and the second message block come from the
 // __constant__ symbols filled by hefty_cpu_setBlock; the nonce is placed in W1[3].
 // NOTE(review): the unmarked 'extern __shared__ unsigned char s[]' ... '#endif'
 // lines and the unmarked output loop just before the final braces look like
 // removed-side lines that lost their '-' marker in this rendering -- confirm.
 __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHash)
 {
-	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+    #if USE_SHARED
-	if (thread < threads)
+    extern __shared__ char heftytab[];
-	{
+    if(threadIdx.x < 64)
-		// determine the current counter
+    {
-		uint32_t nounce = startNounce + thread;
+        *((uint32_t*)heftytab + threadIdx.x) = hefty_gpu_constantTable[threadIdx.x];
-	
+    }
-		// each thread in this block gets its own W array in shared memory
+
-#if USE_SHARED
+    __syncthreads();
 		extern __shared__ unsigned char s[];
 		uint32_t *W = (uint32_t *)(&s[W_ALIGNMENT * sizeof(uint32_t) * threadIdx.x]);
 #else
 		// reduction from 256 bytes to 128 bytes
 		uint32_t W1[16];
 		uint32_t W2[16];
 #endif
-		// Initialise registers a through h with the hash table
+    int thread = (blockDim.x * blockIdx.x + threadIdx.x);
-		uint32_t regs[8];
+    if (thread < threads)
-		uint32_t hash[8];
+    {
-		uint32_t sponge[4];
+        // determine the current counter
-	
+        uint32_t nounce = startNounce + thread;
        // each thread in this block gets its own W array in shared memory
        // reduction from 256 bytes to 128 bytes
        uint32_t W1[16];
        uint32_t W2[16];
        // Initialise registers a through h with the hash table
        uint32_t regs[8];
        uint32_t hash[8];
        uint32_t sponge[4];
 #pragma unroll 4
-		for(int k=0; k < 4; k++)
+        for(int k=0; k < 4; k++)
-			sponge[k] = hefty_gpu_sponge[k];
+            sponge[k] = hefty_gpu_sponge[k];
-		// pre
+        // pre
 #pragma unroll 8
-		for (int k=0; k < 8; k++)
+        for (int k=0; k < 8; k++)
-		{
+        {
-			regs[k] = hefty_gpu_register[k];
+            regs[k] = hefty_gpu_register[k];
-			hash[k] = regs[k];
+            hash[k] = regs[k];
-		}
+        }
-	
+    
-		//memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // remaining 20 bytes from block 2 plus padding
+        //memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // remaining 20 bytes from block 2 plus padding
 #pragma unroll 16
-		for(int k=0;k<16;k++)
+        for(int k=0;k<16;k++)
-			W1[k] = hefty_gpu_blockHeader[k];
+            W1[k] = hefty_gpu_blockHeader[k];
-		W1[3] = SWAB32(nounce);
+        W1[3] = SWAB32(nounce);
-
+        // 2nd round
 		// 2nd round
 #pragma unroll 16
-		for(int j=0;j<16;j++)
+        for(int j=0;j<16;j++)
-			Absorb(sponge, W1[j] ^ hefty_gpu_constantTable[j]);
+            Absorb(sponge, W1[j] ^ heftyLookUp(j));
 // Progress W1 (Bytes 0...63)
 #pragma unroll 16
-		for(int j=0;j<16;j++)
+        for(int j=0;j<16;j++)
-		{
+        {
-			Absorb(sponge, regs[3] ^ regs[7]);
+            Absorb(sponge, regs[3] ^ regs[7]);
-			hefty_gpu_round(regs, W1[j], hefty_gpu_constantTable[j], sponge);
+            hefty_gpu_round(regs, W1[j], heftyLookUp(j), sponge);
-		}
+        }
 // Progress W2 (Bytes 64...127) then W3 (Bytes 128...191) ...
-		
+        
 #pragma unroll 3
-		for(int k=0;k<3;k++)
+        for(int k=0;k<3;k++)
-		{
+        {
-	#pragma unroll 2
+    #pragma unroll 2
-			for(int j=0;j<2;j++)
+            for(int j=0;j<2;j++)
-				W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
+                W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
-	#pragma unroll 5
+    #pragma unroll 5
-			for(int j=2;j<7;j++)
+            for(int j=2;j<7;j++)
-				W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
+                W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
-	#pragma unroll 8
+    #pragma unroll 8
-			for(int j=7;j<15;j++)
+            for(int j=7;j<15;j++)
-				W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
+                W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
-			W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
+            W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
-	#pragma unroll 16
+    #pragma unroll 16
-			for(int j=0;j<16;j++)
+            for(int j=0;j<16;j++)
-			{
+            {
-				Absorb(sponge, regs[3] + regs[7]);
+                Absorb(sponge, regs[3] + regs[7]);
-				hefty_gpu_round(regs, W2[j], hefty_gpu_constantTable[j + 16 * (k+1)], sponge);
+                hefty_gpu_round(regs, W2[j], heftyLookUp(j + 16 * (k+1)), sponge);
-			}
+            }
-	#pragma unroll 16
+    #pragma unroll 16
-			for(int j=0;j<16;j++)
+            for(int j=0;j<16;j++)
-				W1[j] = W2[j];
+                W1[j] = W2[j];
-		}
+        }
-		
+        
 #pragma unroll 8
        for(int k=0;k<8;k++)
            hash[k] += regs[k];
 #pragma unroll 8
-		for(int k=0;k<8;k++)
+        for(int k=0;k<8;k++)
-			hash[k] += regs[k];
+            ((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]);
-
+    }
 #pragma unroll 8
 		for(int k=0;k<8;k++)
 			((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]);
 	}
 }
 // Setup functions
 // hefty_cpu_init: per-worker initialisation.
 //   thr_id  - worker index; the added side maps it to a CUDA device through
 //             device_map[] instead of using it as the device number directly
 //   threads - number of hashes per launch; sizes the d_heftyHashes[thr_id] buffer
 //             (8 uint32_t per hash)
 // The added side also caches the device properties in the file-scope 'props',
 // which hefty_cpu_hash reads to choose its block size.
 // NOTE(review): the constant-table upload appears twice here (once unmarked, once
 // on the '+' side); the unmarked copy is probably a removed line that lost its
 // marker in this rendering -- confirm. Return codes of the CUDA calls are not checked.
 __host__ void hefty_cpu_init(int thr_id, int threads)
 {
-	cudaSetDevice(thr_id);
+    cudaSetDevice(device_map[thr_id]);
-	// Copy the hash tables into GPU memory
+    cudaGetDeviceProperties(&props, device_map[thr_id]);
 	cudaMemcpyToSymbol(	hefty_gpu_constantTable,
 						hefty_cpu_constantTable,
 						sizeof(uint32_t) * 64 );
-	// Allocate memory for all Hefty1 hashes
+    // Copy the hash tables into GPU memory
-	cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads);
+    cudaMemcpyToSymbol(    hefty_gpu_constantTable,
                        hefty_cpu_constantTable,
                        sizeof(uint32_t) * 64 );
    // Allocate memory for all Hefty1 hashes
    cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads);
 }
 __host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data)
-	// data must be 84 bytes!
+    // data must be 84 bytes!
 // Pads the 84-byte input to two 64-byte message blocks, processes the first block
 // on the host, and uploads the resulting sponge state, the intermediate hash and
 // the second message block to the __constant__ symbols read by hefty_gpu_hash.
 // thr_id and threads are not used in the visible body.
 // Only the indentation differs between the '-' and '+' sides.
 {
-	// Expand the message and set it
+    // Expand the message and set it
-	uint32_t msgBlock[32];
+    uint32_t msgBlock[32];
-	memset(msgBlock, 0, sizeof(uint32_t) * 32);
+    memset(msgBlock, 0, sizeof(uint32_t) * 32);
-	memcpy(&msgBlock[0], data, 84);
+    memcpy(&msgBlock[0], data, 84);
-	msgBlock[21] |= 0x80;
+    msgBlock[21] |= 0x80;
-	msgBlock[31] = 672; // bitlen
+    msgBlock[31] = 672; // bitlen
-	
+    
-	for(int i=0;i<31;i++) // swap byte order
+    for(int i=0;i<31;i++) // swap byte order
-		msgBlock[i] = SWAB32(msgBlock[i]);
+        msgBlock[i] = SWAB32(msgBlock[i]);
-	// the first round is performed on the CPU, since it is
+    // the first round is performed on the CPU, since it is
-	// the same for all threads. The hash is then handed
+    // the same for all threads. The hash is then handed
-	// to the threads
+    // to the threads
-	// Create expanded block W
+    // Create expanded block W
-	uint32_t W[64];	
+    uint32_t W[64];    
-	memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);	
+    memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);    
-	for(int j=16;j<64;j++)
+    for(int j=16;j<64;j++)
-		W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
+        W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
-	// Initialise registers a through h with the hash table
+    // Initialise registers a through h with the hash table
-	uint32_t regs[8];
+    uint32_t regs[8];
-	uint32_t hash[8];
+    uint32_t hash[8];
-	uint32_t sponge[4];
+    uint32_t sponge[4];
-	// pre
+    // pre
-	memset(sponge, 0, sizeof(uint32_t) * 4);
+    memset(sponge, 0, sizeof(uint32_t) * 4);
    for (int k=0; k < 8; k++)
-	{
+    {
-		regs[k] = hefty_cpu_hashTable[k];
+        regs[k] = hefty_cpu_hashTable[k];
-		hash[k] = regs[k];
+        hash[k] = regs[k];
-	}	
+    }    
-	// 1st round
+    // 1st round
-	for(int j=0;j<16;j++)
+    for(int j=0;j<16;j++)
-		Absorb(sponge, W[j] ^ hefty_cpu_constantTable[j]);
+        Absorb(sponge, W[j] ^ hefty_cpu_constantTable[j]);
-	for(int j=0;j<16;j++)
+    for(int j=0;j<16;j++)
-	{
+    {
-		Absorb(sponge, regs[3] ^ regs[7]);
+        Absorb(sponge, regs[3] ^ regs[7]);
-		hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge);
+        hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge);
-	}
+    }
-	for(int j=16;j<64;j++)
+    for(int j=16;j<64;j++)
-	{
+    {
-		Absorb(sponge, regs[3] + regs[7]);
+        Absorb(sponge, regs[3] + regs[7]);
-		hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge);
+        hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge);
-	}
+    }
-	for(int k=0;k<8;k++)
+    for(int k=0;k<8;k++)
-		hash[k] += regs[k];
+        hash[k] += regs[k];
-	// store sponge
+    // store sponge
-	cudaMemcpyToSymbol(	hefty_gpu_sponge,
+    cudaMemcpyToSymbol( hefty_gpu_sponge,
-						sponge,
+                        sponge,
-						sizeof(uint32_t) * 4 );
+                        sizeof(uint32_t) * 4 );
-	// store hash
+    // store hash
-	cudaMemcpyToSymbol(	hefty_gpu_register,
+    cudaMemcpyToSymbol( hefty_gpu_register,
-						hash,
+                        hash,
-						sizeof(uint32_t) * 8 );
+                        sizeof(uint32_t) * 8 );
-	// Set block header (the correct nonce is still missing in there)
+    // Set block header (the correct nonce is still missing in there)
-	cudaMemcpyToSymbol(	hefty_gpu_blockHeader,
+    cudaMemcpyToSymbol( hefty_gpu_blockHeader,
-						&msgBlock[16],
+                        &msgBlock[16],
-						64);
+                        64);
 }
 // hefty_cpu_hash: launches hefty_gpu_hash for 'threads' nonces starting at
 // startNounce, writing into d_heftyHashes[thr_id]. The launch is not followed by
 // a synchronisation or error check in this function.
 // The added side picks the block size from the cached device properties ('props',
 // filled in hefty_cpu_init) instead of the fixed 128 of the removed side.
 // NOTE(review): the added side requests 8 * 64 * sizeof(uint32_t) bytes of
 // dynamic shared memory, while the kernel only stores 64 uint32_t in heftytab.
 __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
 {
-	const int threadsperblock = 128;
+    // Compute 3.x and 5.x devices are best driven with 768 threads,
    // all others with 512 threads.
    int threadsperblock = (props.major >= 3) ? 768 : 512;
-	// compute how many thread blocks we need
+    // compute how many thread blocks we need
-	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+    dim3 grid((threads + threadsperblock-1)/threadsperblock);
-	dim3 block(threadsperblock);
+    dim3 block(threadsperblock);
-	// Size of the dynamic shared memory area (depends on the thread count)
+    // Size of the dynamic shared memory area
-#if USE_SHARED
+    #if USE_SHARED
-	size_t shared_size = W_ALIGNMENT*sizeof(uint32_t)*threadsperblock;  // one uint32_t inserted against bank conflicts
+    size_t shared_size = 8 * 64 * sizeof(uint32_t);
 #else
-	size_t shared_size = 0;
+    size_t shared_size = 0;
 #endif
-//	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
+//    fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
-	hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]);
+    hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]);
 }
--- a/cuda_keccak512.cu
+++ b/cuda_keccak512.cu
@ -264,7 +264,7 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
-	// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
+	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
--- a/cuda_sha256.cu
+++ b/cuda_sha256.cu
@ -5,8 +5,6 @@
 #include <stdio.h>
 #include <memory.h>
 #define W_ALIGNMENT 65
 // Folgende Definitionen später durch header ersetzen
 typedef unsigned int uint32_t;
@ -59,8 +57,6 @@ __global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void *outputH
 		nonceVector[thread] = nounce;
 		// jeder thread in diesem  Block bekommt sein eigenes W Array im Shared memory
 		//extern __shared__ unsigned char s[];
 		//uint32_t *W = (uint32_t *)(&s[W_ALIGNMENT * sizeof(uint32_t) * threadIdx.x]);
 		uint32_t W1[16];
 		uint32_t W2[16];
@ -257,14 +253,13 @@ __host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashe
 __host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce)
 {
-	const int threadsperblock = 128;
+	const int threadsperblock = 256;
 	// berechne wie viele Thread Blocks wir brauchen
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
-	// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
+	// Größe des dynamischen Shared Memory Bereichs
 	//size_t shared_size = W_ALIGNMENT*sizeof(uint32_t)*threadsperblock;  // ein uint32_t eingefügt gegen Bank Konflikte
 	size_t shared_size = 0;
 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
--- a/heavy.cu
+++ b/heavy.cu
@ -163,6 +163,41 @@ extern "C" int cuda_num_devices()
    return GPU_N;
 }
 // Case-insensitive substring search that ignores blanks in both strings.
 //   haystack - string to search in (here: a device name)
 //   needle   - pattern to look for; may end in "#<digit>" to select the n-th match
 //   match    - running count of '#' hits, shared across calls by the caller
 // Returns true when needle (blanks skipped) occurs in haystack. When a '#' is
 // reached in needle after everything before it matched, match is incremented and
 // the function returns immediately whether the new count equals the single digit
 // following '#'.
 static bool substringsearch(const char *haystack, const char *needle, int &match)
 {
    int hlen = (int)strlen(haystack);
    int nlen = (int)strlen(needle);
    for (int i=0; i < hlen; ++i)
    {
        if (haystack[i] == ' ') continue;
        int j=0, x = 0;
        while(j < nlen)
        {
            // haystack[i+x] may be the terminating NUL; it then fails the compare below
            if (haystack[i+x] == ' ') {++x; continue;}
            if (needle[j] == ' ') {++j; continue;}
            if (needle[j] == '#') return ++match == needle[j+1]-'0';
            // tolower() is only defined for EOF and unsigned char values, so go
            // through unsigned char to stay defined for bytes >= 0x80
            if (tolower((unsigned char)haystack[i+x]) != tolower((unsigned char)needle[j])) break;
            ++j; ++x;
        }
        if (j == nlen) return true;
    }
    return false;
 }
 // Find a CUDA device by name (returns the device index, or -1 if none matches).
 // Devices whose properties cannot be queried are skipped. The occurrence counter
 // is shared across all devices so a "#n" suffix in name can select the n-th hit.
 extern "C" int cuda_finddevice(char *name)
 {
    const int deviceCount = cuda_num_devices();
    int occurrence = 0;
    int dev = 0;
    while (dev < deviceCount)
    {
        cudaDeviceProp devProps;
        const bool queried = (cudaGetDeviceProperties(&devProps, dev) == cudaSuccess);
        if (queried && substringsearch(devProps.name, name, occurrence))
            return dev;
        ++dev;
    }
    return -1;
 }
 // Zeitsynchronisations-Routine von cudaminer mit CPU sleep
 typedef struct { double value[8]; } tsumarray;
 cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)