committing changes to v0.3 release - added Groestlcoin.

2025-08-27 14:22:24 +00:00 · 2014-03-23 21:39:26 +01:00 · 2014-03-23 21:39:26 +01:00 · b93669a99f
commit b93669a99f
parent a3f4e78a20
21 changed files with 934 additions and 261 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -25,13 +25,20 @@ ccminer_SOURCES		= elist.h miner.h compat.h \
 			  cuda_hefty1.cu cuda_hefty1.h \
 			  cuda_keccak512.cu cuda_keccak512.h \
 			  cuda_sha256.cu cuda_sha256.h \
-			  cuda_fugue256.cu \
-			  fuguecoin.cpp fugue.c sph_fugue.h uint256.h
-			  
+			  fuguecoin.cpp cuda_fugue256.cu fugue.c sph_fugue.h uint256.h \
+			  groestlcoin.cpp cuda_groestlcoin.cu cuda_groestlcoin.h

 ccminer_LDFLAGS		= $(PTHREAD_FLAGS) @CUDA_LDFLAGS@
 ccminer_LDADD		= @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@
 ccminer_CPPFLAGS	= -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) -DSCRYPT_KECCAK512 -DSCRYPT_CHACHA -DSCRYPT_CHOOSE_COMPILETIME

 .cu.o:
-	$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -arch=compute_20 --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
+	$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -arch=compute_35 --maxrregcount=124 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
+
+## Thrust needs Compute 2.0 minimum
+#heavy.o: heavy.cu
+#	$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -arch=compute_20 --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
+#
+#cuda_hefty1.o: cuda_hefty1.cu
+#	$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -arch=compute_20 --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
+
--- a/Makefile.in
+++ b/Makefile.in
@ -60,8 +60,9 @@ am_ccminer_OBJECTS = ccminer-cpu-miner.$(OBJEXT) \
 	ccminer-sha2.$(OBJEXT) heavy.$(OBJEXT) cuda_blake512.$(OBJEXT) \
 	cuda_combine.$(OBJEXT) cuda_groestl512.$(OBJEXT) \
 	cuda_hefty1.$(OBJEXT) cuda_keccak512.$(OBJEXT) \
-	cuda_sha256.$(OBJEXT) cuda_fugue256.$(OBJEXT) \
-	ccminer-fuguecoin.$(OBJEXT) ccminer-fugue.$(OBJEXT)
+	cuda_sha256.$(OBJEXT) ccminer-fuguecoin.$(OBJEXT) \
+	cuda_fugue256.$(OBJEXT) ccminer-fugue.$(OBJEXT) \
+	ccminer-groestlcoin.$(OBJEXT) cuda_groestlcoin.$(OBJEXT)
 ccminer_OBJECTS = $(am_ccminer_OBJECTS)
 ccminer_DEPENDENCIES =
 ccminer_LINK = $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) $(ccminer_LDFLAGS) \
@ -275,8 +276,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \
 			  cuda_hefty1.cu cuda_hefty1.h \
 			  cuda_keccak512.cu cuda_keccak512.h \
 			  cuda_sha256.cu cuda_sha256.h \
-			  cuda_fugue256.cu \
-			  fuguecoin.cpp fugue.c sph_fugue.h uint256.h
+			  fuguecoin.cpp cuda_fugue256.cu fugue.c sph_fugue.h uint256.h \
+			  groestlcoin.cpp cuda_groestlcoin.cu cuda_groestlcoin.h

 ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@
 ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@
@ -387,6 +388,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-fugue.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-fuguecoin.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-groestl.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-groestlcoin.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-hefty1.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-keccak.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-scrypt.Po@am__quote@
@ -561,6 +563,20 @@ ccminer-fuguecoin.obj: fuguecoin.cpp
@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCXX_FALSE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o ccminer-fuguecoin.obj `if test -f 'fuguecoin.cpp'; then $(CYGPATH_W) 'fuguecoin.cpp'; else $(CYGPATH_W) '$(srcdir)/fuguecoin.cpp'; fi`

+ccminer-groestlcoin.o: groestlcoin.cpp
+@am__fastdepCXX_TRUE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT ccminer-groestlcoin.o -MD -MP -MF $(DEPDIR)/ccminer-groestlcoin.Tpo -c -o ccminer-groestlcoin.o `test -f 'groestlcoin.cpp' || echo '$(srcdir)/'`groestlcoin.cpp
+@am__fastdepCXX_TRUE@	$(am__mv) $(DEPDIR)/ccminer-groestlcoin.Tpo $(DEPDIR)/ccminer-groestlcoin.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	source='groestlcoin.cpp' object='ccminer-groestlcoin.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o ccminer-groestlcoin.o `test -f 'groestlcoin.cpp' || echo '$(srcdir)/'`groestlcoin.cpp
+
+ccminer-groestlcoin.obj: groestlcoin.cpp
+@am__fastdepCXX_TRUE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT ccminer-groestlcoin.obj -MD -MP -MF $(DEPDIR)/ccminer-groestlcoin.Tpo -c -o ccminer-groestlcoin.obj `if test -f 'groestlcoin.cpp'; then $(CYGPATH_W) 'groestlcoin.cpp'; else $(CYGPATH_W) '$(srcdir)/groestlcoin.cpp'; fi`
+@am__fastdepCXX_TRUE@	$(am__mv) $(DEPDIR)/ccminer-groestlcoin.Tpo $(DEPDIR)/ccminer-groestlcoin.Po
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	source='groestlcoin.cpp' object='ccminer-groestlcoin.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCXX_FALSE@	DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCXX_FALSE@	$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o ccminer-groestlcoin.obj `if test -f 'groestlcoin.cpp'; then $(CYGPATH_W) 'groestlcoin.cpp'; else $(CYGPATH_W) '$(srcdir)/groestlcoin.cpp'; fi`
+
 # This directory's subdirectories are mostly independent; you can cd
 # into them and run `make' without going through this Makefile.
 # To change the values of `make' variables: instead of editing Makefiles,
@ -1018,7 +1034,7 @@ uninstall-am: uninstall-binPROGRAMS


 .cu.o:
-	$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -arch=sm_20 --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
+	$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -arch=compute_35 --maxrregcount=124 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<

 #heavy.o: heavy.cu
 #	$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -arch=compute_20 --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
--- a/README.txt
+++ b/README.txt
@ -1,5 +1,5 @@

-ccMiner release 0.2 (Mar 21th 2014) - Pool Mining Release
+ccMiner release 0.3 (Mar 23rd 2014) - Groestlcoin Release
 -------------------------------------------------------------

 ***************************************************************
@ -36,6 +36,7 @@ its command line interface and options.
  -a, --algo=ALGO       specify the algorithm to use
                          heavy       use to mine Heavycoin
                          fugue256    use to mine Fuguecoin
+                          groestl     use to mine Groestlcoin

  -o, --url=URL         URL of mining server (default: " DEF_RPC_URL ")
  -O, --userpass=U:P    username:password pair for mining server
@ -66,24 +67,29 @@ its command line interface and options.

 Example for Heavycoin Mining on heavycoinpool.com with a single gpu in your system

-cudaminer.exe -t 1 -a heavy -o stratum+tcp://stratum01.heavycoinpool.com:5333 -u <<username.worker>> -p <<workerpassword>> -v 512
+ccminer.exe -t 1 -a heavy -o stratum+tcp://stratum01.heavycoinpool.com:5333 -u <<username.worker>> -p <<workerpassword>> -v 512



 Example for Heavycoin Mining on hvc.1gh.com with a dual gpu in your system

-cudaminer.exe -t 2 -a heavy -o stratum+tcp://hvcpool.1gh.com:5333 -u <<WALLET>> -p x -v 512
+ccminer.exe -t 2 -a heavy -o stratum+tcp://hvcpool.1gh.com:5333 -u <<WALLET>> -p x -v 512



 Example for Fuguecoin solo-mining with 4 gpu's in your system and a Fuguecoin-wallet running on localhost

-cudaminer.exe -q -s 1 -t 4 -a fugue256 -o http://localhost:9089 -u <<myusername>> -p <<mypassword>>
+ccminer.exe -q -s 1 -t 4 -a fugue256 -o http://localhost:9089 -u <<myusername>> -p <<mypassword>>


 Example for Fuguecoin pool mining on dwarfpool.com with all your GPUs

-q -a fugue256 -o stratum+tcp://erebor.dwarfpool.com:3340 -u YOURWALLETADDRESS.1 -p YOUREMAILADDRESS
+ccminer.exe -q -a fugue256 -o stratum+tcp://erebor.dwarfpool.com:3340 -u YOURWALLETADDRESS.1 -p YOUREMAILADDRESS
+
+
+Example for Groestlcoin solo mining
+
+ccminer.exe -q -s 1 -a groestl -o http://127.0.0.1:1441 -u USERNAME -p PASSWORD


 For solo-mining you typically use -o 127.0.0.1:xxxx where xxxx represents
@ -101,6 +107,9 @@ from your old clunkers.

 >>> RELEASE HISTORY <<<

+  March, 23 2014 added Groestlcoin support. stratum status unknown
+                 (the only pool is currently down for fixing issues)
+
  March, 21 2014 use of shared memory in Fugue256 kernel boosts hash rates
                 on Fermi and Maxwell devices. Kepler may suffer slightly
                 (3-5%)
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -229,6 +229,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
    <ClCompile Include="fugue.c" />
    <ClCompile Include="fuguecoin.cpp" />
    <ClCompile Include="groestl.c" />
+    <ClCompile Include="groestlcoin.cpp" />
    <ClCompile Include="hefty1.c" />
    <ClCompile Include="keccak.c" />
    <ClCompile Include="scrypt.c" />
@ -256,6 +257,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
    <ClInclude Include="cuda_blake512.h" />
    <ClInclude Include="cuda_combine.h" />
    <ClInclude Include="cuda_groestl512.h" />
+    <ClInclude Include="cuda_groestlcoin.h" />
    <ClInclude Include="cuda_hefty1.h" />
    <ClInclude Include="cuda_keccak512.h" />
    <ClInclude Include="cuda_sha256.h" />
@ -274,6 +276,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
    <CudaCompile Include="cuda_combine.cu" />
    <CudaCompile Include="cuda_fugue256.cu" />
    <CudaCompile Include="cuda_groestl512.cu" />
+    <CudaCompile Include="cuda_groestlcoin.cu" />
    <CudaCompile Include="cuda_hefty1.cu" />
    <CudaCompile Include="cuda_keccak512.cu" />
    <CudaCompile Include="cuda_sha256.cu" />
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -90,6 +90,9 @@
    <ClCompile Include="fuguecoin.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
+    <ClCompile Include="groestlcoin.cpp">
+      <Filter>Source Files</Filter>
+    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="compat.h">
@ -158,6 +161,9 @@
    <ClInclude Include="uint256.h">
      <Filter>Header Files</Filter>
    </ClInclude>
+    <ClInclude Include="cuda_groestlcoin.h">
+      <Filter>Header Files\CUDA</Filter>
+    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <CudaCompile Include="cuda_sha256.cu">
@ -184,5 +190,8 @@
    <CudaCompile Include="cuda_fugue256.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
+    <CudaCompile Include="cuda_groestlcoin.cu">
+      <Filter>Source Files\CUDA</Filter>
+    </CudaCompile>
  </ItemGroup>
 </Project>
--- a/20
+++ b/20
@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.68 for ccminer 2014.03.21.
+# Generated by GNU Autoconf 2.68 for ccminer 2014.03.23.
 #
 #
 # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@ -557,8 +557,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='ccminer'
 PACKAGE_TARNAME='ccminer'
-PACKAGE_VERSION='2014.03.21'
-PACKAGE_STRING='ccminer 2014.03.21'
+PACKAGE_VERSION='2014.03.23'
+PACKAGE_STRING='ccminer 2014.03.23'
 PACKAGE_BUGREPORT=''
 PACKAGE_URL=''

@ -1297,7 +1297,7 @@ if test "$ac_init_help" = "long"; then
  # Omit some internal or obsolete options to make the list less imposing.
  # This message is too long to be a string in the A/UX 3.1 sh.
  cat <<_ACEOF
-\`configure' configures ccminer 2014.03.21 to adapt to many kinds of systems.
+\`configure' configures ccminer 2014.03.23 to adapt to many kinds of systems.

 Usage: $0 [OPTION]... [VAR=VALUE]...

@ -1368,7 +1368,7 @@ fi

 if test -n "$ac_init_help"; then
  case $ac_init_help in
-     short | recursive ) echo "Configuration of ccminer 2014.03.21:";;
+     short | recursive ) echo "Configuration of ccminer 2014.03.23:";;
   esac
  cat <<\_ACEOF

@ -1469,7 +1469,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
  cat <<\_ACEOF
-ccminer configure 2014.03.21
+ccminer configure 2014.03.23
 generated by GNU Autoconf 2.68

 Copyright (C) 2010 Free Software Foundation, Inc.
@ -1972,7 +1972,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.

-It was created by ccminer $as_me 2014.03.21, which was
+It was created by ccminer $as_me 2014.03.23, which was
 generated by GNU Autoconf 2.68.  Invocation command line was

  $ $0 $@
@ -2901,7 +2901,7 @@ fi

 # Define the identity of the package.
 PACKAGE='ccminer'
- VERSION='2014.03.21'
+ VERSION='2014.03.23'


 cat >>confdefs.h <<_ACEOF
@ -7118,7 +7118,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by ccminer $as_me 2014.03.21, which was
+This file was extended by ccminer $as_me 2014.03.23, which was
 generated by GNU Autoconf 2.68.  Invocation command line was

  CONFIG_FILES    = $CONFIG_FILES
@ -7184,7 +7184,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-ccminer config.status 2014.03.21
+ccminer config.status 2014.03.23
 configured by $0, generated by GNU Autoconf 2.68,
  with options \\"\$ac_cs_config\\"

--- a/configure.ac
+++ b/configure.ac
@ -1,4 +1,4 @@
-AC_INIT([ccminer], [2014.03.21])
+AC_INIT([ccminer], [2014.03.23])

 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
--- a/cpu-miner.c
+++ b/cpu-miner.c
@ -117,11 +117,13 @@ struct workio_cmd {
 typedef enum {
 	ALGO_HEAVY,		/* Heavycoin hash */
 	ALGO_FUGUE256,		/* Fugue256 */
+	ALGO_GROESTL,		/* Groestl, used for Groestlcoin ("groestl" in algo_names) */
 } sha256_algos;

 static const char *algo_names[] = {
 	"heavy",
-	"fugue256"
+	"fugue256",
+	"groestl"
 };

 bool opt_debug = false;
@ -667,7 +669,11 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 	if (opt_algo == ALGO_HEAVY)
 		heavycoin_hash(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size);
 	else
+	if (opt_algo == ALGO_FUGUE256)
 		SHA256((unsigned char*)sctx->job.coinbase, sctx->job.coinbase_size, (unsigned char*)merkle_root);
+	else
+		sha256d(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size);
+
 	for (i = 0; i < sctx->job.merkle_count; i++) {
 		memcpy(merkle_root + 32, sctx->job.merkle[i], 32);
 		if (opt_algo == ALGO_HEAVY)
@ -817,7 +823,10 @@ static void *miner_thread(void *userdata)
 			rc = scanhash_fugue256(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
 			break;
-
+		case ALGO_GROESTL:
+			rc = scanhash_groestlcoin(thr_id, work.data, work.target,
+			                      max_nonce, &hashes_done);
+			break;
 		default:
 			/* should never happen */
 			goto out;
--- a/cpuminer-config.h
+++ b/cpuminer-config.h
@ -1,174 +1,167 @@
-/* cpuminer-config.h.  Generated from cpuminer-config.h.in by configure.  */
 /* cpuminer-config.h.in.  Generated from configure.ac by autoheader.  */

 /* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP
   systems. This function is required for `alloca.c' support on those systems.
   */
-/* #undef CRAY_STACKSEG_END */
+#undef CRAY_STACKSEG_END

 /* Define to 1 if using `alloca.c'. */
-/* #undef C_ALLOCA */
+#undef C_ALLOCA

 /* Define to 1 if you have `alloca', as a function or macro. */
-#define HAVE_ALLOCA 1
+#undef HAVE_ALLOCA

 /* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix).
   */
-#define HAVE_ALLOCA_H 1
+#undef HAVE_ALLOCA_H

 /* Define to 1 if you have the declaration of `be32dec', and to 0 if you
   don't. */
-#define HAVE_DECL_BE32DEC 0
+#undef HAVE_DECL_BE32DEC

 /* Define to 1 if you have the declaration of `be32enc', and to 0 if you
   don't. */
-#define HAVE_DECL_BE32ENC 0
+#undef HAVE_DECL_BE32ENC

 /* Define to 1 if you have the declaration of `le32dec', and to 0 if you
   don't. */
-#define HAVE_DECL_LE32DEC 0
+#undef HAVE_DECL_LE32DEC

 /* Define to 1 if you have the declaration of `le32enc', and to 0 if you
   don't. */
-#define HAVE_DECL_LE32ENC 0
+#undef HAVE_DECL_LE32ENC

 /* Define to 1 if you have the `getopt_long' function. */
 #define HAVE_GETOPT_LONG 1

 /* Define to 1 if you have the <inttypes.h> header file. */
-#define HAVE_INTTYPES_H 1
-
-/* Define to 1 if you have the `crypto' library (-lcrypto). */
-#define HAVE_LIBCRYPTO 1
+#undef HAVE_INTTYPES_H

 /* Define to 1 if you have a functional curl library. */
-#define HAVE_LIBCURL 1
-
-/* Define to 1 if you have the `ssl' library (-lssl). */
-#define HAVE_LIBSSL 1
+#undef HAVE_LIBCURL

 /* Define to 1 if you have the <memory.h> header file. */
-#define HAVE_MEMORY_H 1
+#undef HAVE_MEMORY_H

 /* Define to 1 if you have the <stdint.h> header file. */
-#define HAVE_STDINT_H 1
+#undef HAVE_STDINT_H

 /* Define to 1 if you have the <stdlib.h> header file. */
-#define HAVE_STDLIB_H 1
+#undef HAVE_STDLIB_H

 /* Define to 1 if you have the <strings.h> header file. */
-#define HAVE_STRINGS_H 1
+#undef HAVE_STRINGS_H

 /* Define to 1 if you have the <string.h> header file. */
-#define HAVE_STRING_H 1
+#undef HAVE_STRING_H

 /* Define to 1 if you have the <syslog.h> header file. */
-#define HAVE_SYSLOG_H 1
+#undef HAVE_SYSLOG_H

 /* Define to 1 if you have the <sys/endian.h> header file. */
-/* #undef HAVE_SYS_ENDIAN_H */
+#undef HAVE_SYS_ENDIAN_H

 /* Define to 1 if you have the <sys/param.h> header file. */
-#define HAVE_SYS_PARAM_H 1
+#undef HAVE_SYS_PARAM_H

 /* Define to 1 if you have the <sys/stat.h> header file. */
-#define HAVE_SYS_STAT_H 1
+#undef HAVE_SYS_STAT_H

 /* Define to 1 if you have the <sys/sysctl.h> header file. */
-#define HAVE_SYS_SYSCTL_H 1
+#undef HAVE_SYS_SYSCTL_H

 /* Define to 1 if you have the <sys/types.h> header file. */
-#define HAVE_SYS_TYPES_H 1
+#undef HAVE_SYS_TYPES_H

 /* Define to 1 if you have the <unistd.h> header file. */
-#define HAVE_UNISTD_H 1
+#undef HAVE_UNISTD_H

 /* Defined if libcurl supports AsynchDNS */
-/* #undef LIBCURL_FEATURE_ASYNCHDNS */
+#undef LIBCURL_FEATURE_ASYNCHDNS

 /* Defined if libcurl supports IDN */
-#define LIBCURL_FEATURE_IDN 1
+#undef LIBCURL_FEATURE_IDN

 /* Defined if libcurl supports IPv6 */
-#define LIBCURL_FEATURE_IPV6 1
+#undef LIBCURL_FEATURE_IPV6

 /* Defined if libcurl supports KRB4 */
-/* #undef LIBCURL_FEATURE_KRB4 */
+#undef LIBCURL_FEATURE_KRB4

 /* Defined if libcurl supports libz */
-#define LIBCURL_FEATURE_LIBZ 1
+#undef LIBCURL_FEATURE_LIBZ

 /* Defined if libcurl supports NTLM */
-#define LIBCURL_FEATURE_NTLM 1
+#undef LIBCURL_FEATURE_NTLM

 /* Defined if libcurl supports SSL */
-#define LIBCURL_FEATURE_SSL 1
+#undef LIBCURL_FEATURE_SSL

 /* Defined if libcurl supports SSPI */
-/* #undef LIBCURL_FEATURE_SSPI */
+#undef LIBCURL_FEATURE_SSPI

 /* Defined if libcurl supports DICT */
-#define LIBCURL_PROTOCOL_DICT 1
+#undef LIBCURL_PROTOCOL_DICT

 /* Defined if libcurl supports FILE */
-#define LIBCURL_PROTOCOL_FILE 1
+#undef LIBCURL_PROTOCOL_FILE

 /* Defined if libcurl supports FTP */
-#define LIBCURL_PROTOCOL_FTP 1
+#undef LIBCURL_PROTOCOL_FTP

 /* Defined if libcurl supports FTPS */
-#define LIBCURL_PROTOCOL_FTPS 1
+#undef LIBCURL_PROTOCOL_FTPS

 /* Defined if libcurl supports HTTP */
-#define LIBCURL_PROTOCOL_HTTP 1
+#undef LIBCURL_PROTOCOL_HTTP

 /* Defined if libcurl supports HTTPS */
-#define LIBCURL_PROTOCOL_HTTPS 1
+#undef LIBCURL_PROTOCOL_HTTPS

 /* Defined if libcurl supports IMAP */
-#define LIBCURL_PROTOCOL_IMAP 1
+#undef LIBCURL_PROTOCOL_IMAP

 /* Defined if libcurl supports LDAP */
-#define LIBCURL_PROTOCOL_LDAP 1
+#undef LIBCURL_PROTOCOL_LDAP

 /* Defined if libcurl supports POP3 */
-#define LIBCURL_PROTOCOL_POP3 1
+#undef LIBCURL_PROTOCOL_POP3

 /* Defined if libcurl supports RTSP */
-#define LIBCURL_PROTOCOL_RTSP 1
+#undef LIBCURL_PROTOCOL_RTSP

 /* Defined if libcurl supports SMTP */
-#define LIBCURL_PROTOCOL_SMTP 1
+#undef LIBCURL_PROTOCOL_SMTP

 /* Defined if libcurl supports TELNET */
-#define LIBCURL_PROTOCOL_TELNET 1
+#undef LIBCURL_PROTOCOL_TELNET

 /* Defined if libcurl supports TFTP */
-#define LIBCURL_PROTOCOL_TFTP 1
+#undef LIBCURL_PROTOCOL_TFTP

 /* Define to 1 if your C compiler doesn't accept -c and -o together. */
-/* #undef NO_MINUS_C_MINUS_O */
+#undef NO_MINUS_C_MINUS_O

 /* Name of package */
-#define PACKAGE "ccminer"
+#undef PACKAGE

 /* Define to the address where bug reports for this package should be sent. */
-#define PACKAGE_BUGREPORT ""
+#undef PACKAGE_BUGREPORT

 /* Define to the full name of this package. */
 #define PACKAGE_NAME "ccminer"

 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "ccminer 2014.03.21"
+#define PACKAGE_STRING "ccminer 2014.03.23"

 /* Define to the one symbol short name of this package. */
-#define PACKAGE_TARNAME "ccminer"
+#undef PACKAGE_TARNAME

 /* Define to the home page for this package. */
-#define PACKAGE_URL ""
+#undef PACKAGE_URL

 /* Define to the version of this package. */
-#define PACKAGE_VERSION "2014.03.21"
+#define PACKAGE_VERSION "2014.03.23"

 /* If using the C implementation of alloca, define if you know the
   direction of stack growth for your system; otherwise it will be
@ -176,25 +169,22 @@
 	STACK_DIRECTION > 0 => grows toward higher addresses
 	STACK_DIRECTION < 0 => grows toward lower addresses
 	STACK_DIRECTION = 0 => direction of growth unknown */
-/* #undef STACK_DIRECTION */
+#undef STACK_DIRECTION

 /* Define to 1 if you have the ANSI C header files. */
-#define STDC_HEADERS 1
+#undef STDC_HEADERS

 /* Define to 1 if AVX assembly is available. */
-#define USE_AVX 1
-
-/* Define to 1 if AVX2 assembly is available. */
-#define USE_AVX2 1
+#undef USE_AVX

 /* Define to 1 if XOP assembly is available. */
-#define USE_XOP 1
+#undef USE_XOP

 /* Version number of package */
-#define VERSION "2014.03.21"
+#undef VERSION

 /* Define curl_free() as free() if our version of curl lacks curl_free. */
-/* #undef curl_free */
+#undef curl_free

 /* Define to `unsigned int' if <sys/types.h> does not define. */
-/* #undef size_t */
+#undef size_t
--- a/cuda_blake512.cu
+++ b/cuda_blake512.cu
@ -1,4 +1,3 @@
-/* Diese Funktion ist auf 84+32-Byte große Eingabedaten ausgerichtet (Heavycoin) */
 #include <cuda.h>
 #include "cuda_runtime.h"
 #include "device_launch_parameters.h"
--- a/cuda_combine.cu
+++ b/cuda_combine.cu
@ -1,4 +1,3 @@
-/* Diese Funktion ist auf 84+32 Byte große Eingabedaten ausgerichtet (Heavycoin) */
 #include <cuda.h>
 #include "cuda_runtime.h"
 #include "device_launch_parameters.h"
--- a/cuda_fugue256.cu
+++ b/cuda_fugue256.cu
@ -1,5 +1,3 @@
-#if 1
-/* Diese Funktion ist auf 84+32 Byte große Eingabedaten ausgerichtet (Heavycoin) */
 #include <cuda.h>
 #include "cuda_runtime.h"
 #include "device_launch_parameters.h"
@ -571,6 +569,8 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
 	*((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(mixTab1Tex, threadIdx.x);
 	*((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(mixTab2Tex, threadIdx.x);
 	*((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(mixTab3Tex, threadIdx.x);
+
+	__syncthreads();
 #endif

 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
@ -788,5 +788,3 @@ __host__ void fugue256_cpu_hash(int thr_id, int threads, int startNounce, void *
 	//cudaMemcpy(outputHashes, d_fugue256_hashoutput[thr_id], 8 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 }
-
-#endif
--- a/cuda_groestl512.cu
+++ b/cuda_groestl512.cu
@ -1,4 +1,3 @@
-/* Diese Funktion ist auf 84+32-Byte große Eingabedaten ausgerichtet (Heavycoin) */
 #include <cuda.h>
 #include "cuda_runtime.h"
 #include "device_launch_parameters.h"
@ -6,9 +5,6 @@
 #include <stdio.h>
 #include <memory.h>

-#define USE_SHARED 0
-#define W_ALIGNMENT 65
-
 // Folgende Definitionen später durch header ersetzen
 typedef unsigned char uint8_t;
 typedef unsigned int uint32_t;
@ -60,7 +56,7 @@ texture<unsigned int, 1, cudaReadModeElementType> t2dn;
 texture<unsigned int, 1, cudaReadModeElementType> t3up;
 texture<unsigned int, 1, cudaReadModeElementType> t3dn;

-static const uint32_t T0up_cpu[] = {
+uint32_t T0up_cpu[] = {
 	C32e(0xc632f4a5), C32e(0xf86f9784), C32e(0xee5eb099), C32e(0xf67a8c8d),
 	C32e(0xffe8170d), C32e(0xd60adcbd), C32e(0xde16c8b1), C32e(0x916dfc54),
 	C32e(0x6090f050), C32e(0x02070503), C32e(0xce2ee0a9), C32e(0x56d1877d),
@ -127,7 +123,7 @@ static const uint32_t T0up_cpu[] = {
 	C32e(0x7b3d46cb), C32e(0xa8b71ffc), C32e(0x6d0c61d6), C32e(0x2c624e3a)
 };

-static const uint32_t T0dn_cpu[] = {
+uint32_t T0dn_cpu[] = {
 	C32e(0xf497a5c6), C32e(0x97eb84f8), C32e(0xb0c799ee), C32e(0x8cf78df6),
 	C32e(0x17e50dff), C32e(0xdcb7bdd6), C32e(0xc8a7b1de), C32e(0xfc395491),
 	C32e(0xf0c05060), C32e(0x05040302), C32e(0xe087a9ce), C32e(0x87ac7d56),
@ -194,7 +190,7 @@ static const uint32_t T0dn_cpu[] = {
 	C32e(0x46f6cb7b), C32e(0x1f4bfca8), C32e(0x61dad66d), C32e(0x4e583a2c)
 };

-static const uint32_t T1up_cpu[] = {
+uint32_t T1up_cpu[] = {
 	C32e(0xc6c632f4), C32e(0xf8f86f97), C32e(0xeeee5eb0), C32e(0xf6f67a8c),
 	C32e(0xffffe817), C32e(0xd6d60adc), C32e(0xdede16c8), C32e(0x91916dfc),
 	C32e(0x606090f0), C32e(0x02020705), C32e(0xcece2ee0), C32e(0x5656d187),
@ -261,7 +257,7 @@ static const uint32_t T1up_cpu[] = {
 	C32e(0x7b7b3d46), C32e(0xa8a8b71f), C32e(0x6d6d0c61), C32e(0x2c2c624e)
 };

-static const uint32_t T1dn_cpu[] = {
+uint32_t T1dn_cpu[] = {
 	C32e(0xa5f497a5), C32e(0x8497eb84), C32e(0x99b0c799), C32e(0x8d8cf78d),
 	C32e(0x0d17e50d), C32e(0xbddcb7bd), C32e(0xb1c8a7b1), C32e(0x54fc3954),
 	C32e(0x50f0c050), C32e(0x03050403), C32e(0xa9e087a9), C32e(0x7d87ac7d),
@ -328,7 +324,7 @@ static const uint32_t T1dn_cpu[] = {
 	C32e(0xcb46f6cb), C32e(0xfc1f4bfc), C32e(0xd661dad6), C32e(0x3a4e583a)
 };

-static const uint32_t T2up_cpu[] = {
+uint32_t T2up_cpu[] = {
 	C32e(0xa5c6c632), C32e(0x84f8f86f), C32e(0x99eeee5e), C32e(0x8df6f67a),
 	C32e(0x0dffffe8), C32e(0xbdd6d60a), C32e(0xb1dede16), C32e(0x5491916d),
 	C32e(0x50606090), C32e(0x03020207), C32e(0xa9cece2e), C32e(0x7d5656d1),
@ -395,7 +391,7 @@ static const uint32_t T2up_cpu[] = {
 	C32e(0xcb7b7b3d), C32e(0xfca8a8b7), C32e(0xd66d6d0c), C32e(0x3a2c2c62)
 };

-static const uint32_t T2dn_cpu[] = {
+uint32_t T2dn_cpu[] = {
 	C32e(0xf4a5f497), C32e(0x978497eb), C32e(0xb099b0c7), C32e(0x8c8d8cf7),
 	C32e(0x170d17e5), C32e(0xdcbddcb7), C32e(0xc8b1c8a7), C32e(0xfc54fc39),
 	C32e(0xf050f0c0), C32e(0x05030504), C32e(0xe0a9e087), C32e(0x877d87ac),
@ -462,7 +458,7 @@ static const uint32_t T2dn_cpu[] = {
 	C32e(0x46cb46f6), C32e(0x1ffc1f4b), C32e(0x61d661da), C32e(0x4e3a4e58)
 };

-static const uint32_t T3up_cpu[] = {
+uint32_t T3up_cpu[] = {
 	C32e(0x97a5c6c6), C32e(0xeb84f8f8), C32e(0xc799eeee), C32e(0xf78df6f6),
 	C32e(0xe50dffff), C32e(0xb7bdd6d6), C32e(0xa7b1dede), C32e(0x39549191),
 	C32e(0xc0506060), C32e(0x04030202), C32e(0x87a9cece), C32e(0xac7d5656),
@ -529,7 +525,7 @@ static const uint32_t T3up_cpu[] = {
 	C32e(0xf6cb7b7b), C32e(0x4bfca8a8), C32e(0xdad66d6d), C32e(0x583a2c2c)
 };

-static const uint32_t T3dn_cpu[] = {
+uint32_t T3dn_cpu[] = {
 	C32e(0x32f4a5f4), C32e(0x6f978497), C32e(0x5eb099b0), C32e(0x7a8c8d8c),
 	C32e(0xe8170d17), C32e(0x0adcbddc), C32e(0x16c8b1c8), C32e(0x6dfc54fc),
 	C32e(0x90f050f0), C32e(0x07050305), C32e(0x2ee0a9e0), C32e(0xd1877d87),
@ -685,15 +681,8 @@ __global__ void groestl512_gpu_hash(int threads, uint32_t startNounce, void *out
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-#if USE_SHARED
-		extern __shared__ unsigned char s[];
-		uint32_t offset = W_ALIGNMENT * sizeof(uint32_t) * threadIdx.x;
-		uint32_t *message = (uint32_t*)(&s[offset + 0]); // 128 Byte
-		uint32_t *state = (uint32_t*)(&s[offset + 128]); // 128 Byte
-#else
 		uint32_t message[32];
 		uint32_t state[32];
-#endif

 		// lese message ein & verknüpfe diese mit dem hash1 von hefty1
 		// lese den state ein
@ -825,11 +814,7 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
 	dim3 block(threadsperblock);

 	// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
-#if USE_SHARED
-	size_t shared_size = W_ALIGNMENT*sizeof(uint32_t)*threadsperblock;  // ein uint32_t eingefügt gegen Bank Konflikte
-#else
 	size_t shared_size = 0;
-#endif

 //	fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);

--- a/cuda_groestlcoin.cu
+++ b/cuda_groestlcoin.cu
@ -0,0 +1,463 @@
+// Groestl variant specialized for Groestlcoin
+
+#include <cuda.h>
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+#include <stdio.h>
+#include <memory.h>
+
+#define USE_SHARED 1
+
+extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
+
+// TODO: replace the following definitions with a proper header later
+typedef unsigned char uint8_t;
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+
+// Target the hashes are compared against (constant memory) and the per-GPU result-nonce buffers (defined elsewhere)
+__constant__ uint32_t pTarget[8]; // Single GPU
+extern uint32_t *d_resultNonce[8];
+
+// Global memory for our results
+uint32_t *d_hashGROESTLCOINoutput[8];
+
+__constant__ uint32_t groestlcoin_gpu_state[32];
+__constant__ uint32_t groestlcoin_gpu_msg[32];
+__constant__ uint32_t sha256coin_gpu_constantTable[64];
+__constant__ uint32_t sha256coin_gpu_register[8];
+
+// Truncate a value to 32 bits.
+#define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
+
+// Round constants XORed into the state at the start of every round of the
+// P and Q permutations. j selects the column (callers pass k * 0x10), r is the
+// round number. The "up"/"dn" variants apply to the even/odd 32-bit word of a column.
+// PC32dn is 0 and QC32up is all-ones, i.e. independent of j and r.
+#define PC32up(j, r)   ((uint32_t)((j) + (r)))
+#define PC32dn(j, r)   0
+#define QC32up(j, r)   0xFFFFFFFF
+#define QC32dn(j, r)   (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
+
+// Extract byte 0 (least significant) .. byte 3 (most significant) of a 32-bit word.
+#define B32_0(x)    ((x) & 0xFF)
+#define B32_1(x)    (((x) >> 8) & 0xFF)
+#define B32_2(x)    (((x) >> 16) & 0xFF)
+#define B32_3(x)    ((x) >> 24)
+
+// SPH_C32 turns a literal into an unsigned 32-bit constant;
+// C32e additionally reverses its byte order.
+#define SPH_C32(x)	((uint32_t)(x ## U))
+#define C32e(x)     ((SPH_C32(x) >> 24) \
+                    | ((SPH_C32(x) >>  8) & SPH_C32(0x0000FF00)) \
+                    | ((SPH_C32(x) <<  8) & SPH_C32(0x00FF0000)) \
+                    | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000)))
+
+// Lookup of the eight 256-entry Groestl tables.
+// With USE_SHARED the tables are read from one contiguous uint32_t array reachable
+// through a pointer named `mixtabs` that must be in scope at the point of use
+// (table i starts at word offset i * 256). Otherwise each lookup is a texture fetch.
+#if USE_SHARED
+#define T0up(x) (*((uint32_t*)mixtabs + (    (x))))
+#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
+#define T1up(x) (*((uint32_t*)mixtabs + (512+(x))))
+#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
+#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x))))
+#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
+#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
+#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x))))
+#else
+#define T0up(x) tex1Dfetch(t0up1, x)
+#define T0dn(x) tex1Dfetch(t0dn1, x)
+#define T1up(x) tex1Dfetch(t1up1, x)
+#define T1dn(x) tex1Dfetch(t1dn1, x)
+#define T2up(x) tex1Dfetch(t2up1, x)
+#define T2dn(x) tex1Dfetch(t2dn1, x)
+#define T3up(x) tex1Dfetch(t3up1, x)
+#define T3dn(x) tex1Dfetch(t3dn1, x)
+#endif
+texture<unsigned int, 1, cudaReadModeElementType> t0up1;
+texture<unsigned int, 1, cudaReadModeElementType> t0dn1;
+texture<unsigned int, 1, cudaReadModeElementType> t1up1;
+texture<unsigned int, 1, cudaReadModeElementType> t1dn1;
+texture<unsigned int, 1, cudaReadModeElementType> t2up1;
+texture<unsigned int, 1, cudaReadModeElementType> t2dn1;
+texture<unsigned int, 1, cudaReadModeElementType> t3up1;
+texture<unsigned int, 1, cudaReadModeElementType> t3dn1;
+
+extern uint32_t T0up_cpu[];
+extern uint32_t T0dn_cpu[];
+extern uint32_t T1up_cpu[];
+extern uint32_t T1dn_cpu[];
+extern uint32_t T2up_cpu[];
+extern uint32_t T2dn_cpu[];
+extern uint32_t T3up_cpu[];
+extern uint32_t T3dn_cpu[];
+extern uint32_t sha256_cpu_hashTable[];
+extern uint32_t sha256_cpu_constantTable[];
+
+#define S(x, n)			(((x) >> (n)) | ((x) << (32 - (n))))
+#define R(x, n)			((x) >> (n))
+#define Ch(x, y, z)		((x & (y ^ z)) ^ z)
+#define Maj(x, y, z)	((x & (y | z)) | (y & z))
+#define S0(x)			(S(x, 2) ^ S(x, 13) ^ S(x, 22))
+#define S1(x)			(S(x, 6) ^ S(x, 11) ^ S(x, 25))
+#define s0(x)			(S(x, 7) ^ S(x, 18) ^ R(x, 3))
+#define s1(x)			(S(x, 17) ^ S(x, 19) ^ R(x, 10))
+
+#define SWAB32(x)		( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
+
+
+// Applies the 14-round P permutation in place to the 32-word state `a`.
+//   a       - state of 32 uint32_t words (overwritten with the result)
+//   mixtabs - base of the lookup tables used by the T*up/T*dn macros when
+//             USE_SHARED is set; not referenced by the macros otherwise
+__device__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
+{
+	// scratch copy of the state for the current round
+	uint32_t t[32];
+
+//#pragma unroll 14
+	for(int r=0;r<14;r++)
+	{
+		// add the round constant; only the even words change because PC32dn is 0
+#pragma unroll 16
+		for(int k=0;k<16;k++)
+		{
+			a[(k*2)+0] ^= PC32up(k * 0x10, r);
+			//a[(k<<1)+1] ^= PC32dn(k * 0x10, r);
+		}
+
+		// RBTT: build every output column (word pair k, k+1) from table lookups on
+		// one byte of each of eight state words; indices wrap modulo 32 (& 0x1f)
+#pragma unroll 16
+		for(int k=0;k<32;k+=2)
+		{
+			t[k + 0] =	T0up( B32_0(a[k & 0x1f]) ) ^ 
+						T1up( B32_1(a[(k + 2) & 0x1f]) ) ^ 
+						T2up( B32_2(a[(k + 4) & 0x1f]) ) ^ 
+						T3up( B32_3(a[(k + 6) & 0x1f]) ) ^ 
+						T0dn( B32_0(a[(k + 9) & 0x1f]) ) ^ 
+						T1dn( B32_1(a[(k + 11) & 0x1f]) ) ^ 
+						T2dn( B32_2(a[(k + 13) & 0x1f]) ) ^ 
+						T3dn( B32_3(a[(k + 23) & 0x1f]) );
+
+			t[k + 1] =	T0dn( B32_0(a[k & 0x1f]) ) ^ 
+						T1dn( B32_1(a[(k + 2) & 0x1f]) ) ^ 
+						T2dn( B32_2(a[(k + 4) & 0x1f]) ) ^ 
+						T3dn( B32_3(a[(k + 6) & 0x1f]) ) ^ 
+						T0up( B32_0(a[(k + 9) & 0x1f]) ) ^ 
+						T1up( B32_1(a[(k + 11) & 0x1f]) ) ^ 
+						T2up( B32_2(a[(k + 13) & 0x1f]) ) ^ 
+						T3up( B32_3(a[(k + 23) & 0x1f]) );
+		}
+		// the round result becomes the input of the next round
+#pragma unroll 32
+		for(int k=0;k<32;k++)
+			a[k] = t[k];
+	}
+}
+
+// Applies the 14-round Q permutation in place to the 32-word state `a`.
+//   a       - state of 32 uint32_t words (overwritten with the result)
+//   mixtabs - base of the lookup tables used by the T*up/T*dn macros when
+//             USE_SHARED is set; not referenced by the macros otherwise
+__device__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
+{	
+//#pragma unroll 14
+	for(int r=0;r<14;r++)
+	{
+		// scratch copy of the state for the current round
+		uint32_t t[32];
+
+		// add the round constant to both words of every column
+#pragma unroll 16
+		for(int k=0;k<16;k++)
+		{
+			a[(k*2)+0] ^= QC32up(k * 0x10, r);
+			a[(k*2)+1] ^= QC32dn(k * 0x10, r);
+		}
+
+		// RBTT: same table-lookup construction as in P, but with Q's word offsets;
+		// indices wrap modulo 32 (& 0x1f)
+#pragma unroll 16
+		for(int k=0;k<32;k+=2)
+		{
+			t[k + 0] =	T0up( B32_0(a[(k + 2) & 0x1f]) ) ^ 
+						T1up( B32_1(a[(k + 6) & 0x1f]) ) ^ 
+						T2up( B32_2(a[(k + 10) & 0x1f]) ) ^ 
+						T3up( B32_3(a[(k + 22) & 0x1f]) ) ^ 
+						T0dn( B32_0(a[(k + 1) & 0x1f]) ) ^ 
+						T1dn( B32_1(a[(k + 5) & 0x1f]) ) ^ 
+						T2dn( B32_2(a[(k + 9) & 0x1f]) ) ^ 
+						T3dn( B32_3(a[(k + 13) & 0x1f]) );
+
+			t[k + 1] =	T0dn( B32_0(a[(k + 2) & 0x1f]) ) ^ 
+						T1dn( B32_1(a[(k + 6) & 0x1f]) ) ^ 
+						T2dn( B32_2(a[(k + 10) & 0x1f]) ) ^ 
+						T3dn( B32_3(a[(k + 22) & 0x1f]) ) ^ 
+						T0up( B32_0(a[(k + 1) & 0x1f]) ) ^ 
+						T1up( B32_1(a[(k + 5) & 0x1f]) ) ^ 
+						T2up( B32_2(a[(k + 9) & 0x1f]) ) ^ 
+						T3up( B32_3(a[(k + 13) & 0x1f]) );
+		}
+		// the round result becomes the input of the next round
+#pragma unroll 32
+		for(int k=0;k<32;k++)
+			a[k] = t[k];
+	}
+}
+// Groestlcoin search kernel.
+//
+// Every thread tests one nonce (startNounce + global thread index): it runs the
+// Groestl compression on the prepared message block with that nonce inserted,
+// runs it a second time on words 16..31 of the first result, and compares
+// words 16..23 of the second result against pTarget. The smallest nonce whose hash
+// does not exceed the target is left in resNounce[0].
+//
+//   threads     - number of nonces to test; surplus threads of the last block do nothing
+//   startNounce - nonce tested by global thread 0
+//   outputHash  - currently unused (only needed for debug output of the hashes)
+//   resNounce   - device pointer to one uint32_t; the host sets it to 0xFFFFFFFF
+//                 before the launch so that any found nonce is smaller
+//
+// With USE_SHARED the block must be launched with exactly 256 threads and
+// 8 * 256 * sizeof(uint32_t) bytes of dynamic shared memory: each thread copies
+// one entry of each of the eight lookup tables into shared memory.
+#if USE_SHARED
+__global__ void __launch_bounds__(256)
+#else
+__global__ void
+#endif
+groestlcoin_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
+{
+#if USE_SHARED
+	extern __shared__ char mixtabs[];
+
+	// stage the lookup tables in shared memory, one entry per table and thread
+	*((uint32_t*)mixtabs + (    threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x);
+	*((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x);
+	*((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x);
+	*((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x);
+	*((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x);
+	*((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x);
+	*((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x);
+	*((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x);
+
+	// all tables must be complete before any thread starts reading them
+	__syncthreads();
+#else
+	// the table macros use texture fetches in this configuration; the pointer is
+	// only passed along and never dereferenced
+	char *mixtabs = NULL;
+#endif
+
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		// per-thread working arrays
+		uint32_t message[32];
+		uint32_t state[32];
+		uint32_t g[32];
+
+		////
+		//// first Groestl pass: the padded 80-byte block header
+		////
+#pragma unroll 32
+		for(int k=0;k<32;k++)
+		{
+			state[k] = groestlcoin_gpu_state[k];
+			message[k] = groestlcoin_gpu_msg[k];
+		}
+
+		// insert this thread's nonce (byte-swapped) into word 19 of the message
+		uint32_t nounce = startNounce + thread;
+		message[19] = SWAB32(nounce);
+
+#pragma unroll 32
+		for(int u=0;u<32;u++)
+			g[u] = message[u] ^ state[u];
+
+		// compression: state ^= P(state ^ message) ^ Q(message)
+		groestlcoin_perm_P(g, mixtabs);
+		groestlcoin_perm_Q(message, mixtabs);
+
+#pragma unroll 32
+		for(int u=0;u<32;u++)
+		{
+			state[u] ^= g[u] ^ message[u];
+			g[u] = state[u];
+		}
+
+		// output transformation: state ^= P(state)
+		groestlcoin_perm_P(g, mixtabs);
+
+#pragma unroll 32
+		for(int u=0;u<32;u++)
+			state[u] ^= g[u];
+
+		////
+		//// second Groestl pass: message = words 16..31 of the first result
+		////
+#pragma unroll 16
+		for(int k=0;k<16;k++)
+			message[k] = state[k + 16];
+
+#pragma unroll 32
+		for(int k=0;k<32;k++)
+			state[k] = groestlcoin_gpu_state[k];
+
+#pragma unroll 16
+		for(int k=0;k<16;k++)
+			message[k+16] = 0;
+
+		// padding for a 64-byte message
+		message[16] = 0x80;
+		message[31] = 0x01000000;
+
+#pragma unroll 32
+		for(int u=0;u<32;u++)
+			g[u] = message[u] ^ state[u];
+
+		groestlcoin_perm_P(g, mixtabs);
+		groestlcoin_perm_Q(message, mixtabs);
+
+#pragma unroll 32
+		for(int u=0;u<32;u++)
+		{
+			state[u] ^= g[u] ^ message[u];
+			g[u] = state[u];
+		}
+
+		groestlcoin_perm_P(g, mixtabs);
+
+#pragma unroll 32
+		for(int u=0;u<32;u++)
+			state[u] ^= g[u];
+
+		// compare hash words 16..23 with the target, most significant word (index 7)
+		// first; a hash equal to the target counts as a hit
+		bool rc = true;
+		for (int i = 7; i >= 0; i--) {
+			if (state[i+16] > pTarget[i]) {
+				rc = false;
+				break;
+			}
+			if (state[i+16] < pTarget[i])
+				break;
+		}
+
+		// Many threads may find a hit in the same launch. A plain compare-and-store
+		// on resNounce[0] is a data race, so keep the smallest nonce atomically.
+		if (rc)
+			atomicMin(resNounce, nounce);
+	}
+}
+
+// texDef(texname, texmem, texsource, texsize):
+// declares a device pointer named `texmem`, allocates `texsize` bytes of device
+// memory for it, copies `texsource` (host memory) into it and binds the texture
+// reference `texname` to that memory (unnormalized coordinates, point filtering,
+// clamped addressing, unsigned int channel format).
+// Because the macro declares a variable, each use in one scope needs a distinct `texmem` name.
+// None of the CUDA return codes are checked.
+#define texDef(texname, texmem, texsource, texsize) \
+	unsigned int *texmem; \
+	cudaMalloc(&texmem, texsize); \
+	cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
+	texname.normalized = 0; \
+	texname.filterMode = cudaFilterModePoint; \
+	texname.addressMode[0] = cudaAddressModeClamp; \
+	{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
+	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
+
+// Setup functions
+
+// One-time initialisation of the GPU used by mining thread `thr_id`.
+//   thr_id  - mining thread index; also used directly as the CUDA device ordinal
+//             and as index into the per-GPU pointer arrays
+//   threads - number of hashes per launch; only used to size the debug output buffer
+// None of the CUDA return codes are checked.
+__host__ void groestlcoin_cpu_init(int thr_id, int threads)
+{
+	cudaSetDevice(thr_id);
+	cudaDeviceSetCacheConfig( cudaFuncCachePreferShared );
+// Initialise the textures with the texDef macro (uploads the eight 256-entry tables)
+	texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
+	texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
+	texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
+	texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
+	texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
+	texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
+	texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
+	texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
+
+	// Copy the SHA-256 constant table into GPU constant memory
+	// NOTE(review): groestlcoin_gpu_hash in this file does not read the sha256coin_* symbols - confirm they are still needed
+	cudaMemcpyToSymbol(	sha256coin_gpu_constantTable,
+						sha256_cpu_constantTable,
+						sizeof(uint32_t) * 64 );
+
+	// SHA-256 start vector
+	cudaMemcpyToSymbol(	sha256coin_gpu_register,
+						sha256_cpu_hashTable,
+						sizeof(uint32_t) * 8 );
+
+	// Build the initial Groestl state: all zero except the last word
+	uint32_t groestl_state_init[32];
+	memset(groestl_state_init, 0, sizeof(uint32_t) * 32);
+	groestl_state_init[31] = 0x20000;
+
+	// Store the initial state (32 words = 128 bytes) in constant memory
+	cudaMemcpyToSymbol(	groestlcoin_gpu_state,
+						groestl_state_init,
+						128);
+
+	// One uint32_t on the device that receives the found nonce
+	cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); 
+
+	// Allocate memory for all results (debug only)
+	cudaMalloc(&d_hashGROESTLCOINoutput[thr_id], 8 * sizeof(uint32_t) * threads);
+}
+
+// Uploads the work item for the next kernel launches on the GPU of thread `thr_id`.
+//   thr_id    - mining thread index (selects the result-nonce buffer)
+//   data      - 80-byte block header, already in the byte order the kernel expects
+//   pTargetIn - 8 x uint32_t target the hashes are compared against
+// The nonce word of the header is replaced per thread inside the kernel.
+__host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
+{
+	// 128-byte Groestl message block: header, then padding
+	uint32_t paddedHeader[32] = { 0 };
+	memcpy(paddedHeader, data, 80);
+
+	// padding for an 80-byte message: marker right behind the data,
+	// block counter in the last word
+	paddedHeader[20] = 0x80;
+	paddedHeader[31] = 0x01000000;
+
+	// no CPU-side hashing is needed, the single block is processed on the GPU
+	cudaMemcpyToSymbol(groestlcoin_gpu_msg, paddedHeader, sizeof(paddedHeader));
+
+	// 0xFFFFFFFF in the result slot means "nothing found yet"
+	cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
+
+	cudaMemcpyToSymbol(pTarget, pTargetIn, 8 * sizeof(uint32_t));
+}
+
+// Launches the search kernel for `threads` consecutive nonces and fetches the result.
+//   thr_id       - mining thread index (selects the device buffers)
+//   threads      - number of nonces to test, starting at startNounce
+//   startNounce  - first nonce of the range
+//   outputHashes - unused; only needed when the debug copy of the hashes is enabled
+//   nounce       - receives the found nonce, or 0xFFFFFFFF if none was found
+__host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
+{
+#if USE_SHARED
+	// must equal the lookup-table size (256 entries) - DO NOT CHANGE
+	const int blockSize = 256;
+	// dynamic shared memory: eight tables of 256 words
+	const size_t dynSharedBytes = 8 * 256 * sizeof(uint32_t);
+#else
+	// free to tune
+	const int blockSize = 512;
+	const size_t dynSharedBytes = 0;
+#endif
+
+	// enough blocks to cover all threads (rounded up)
+	const int blockCount = (threads + blockSize - 1) / blockSize;
+	dim3 grid(blockCount);
+	dim3 block(blockSize);
+
+	// reset the result slot to "nothing found"
+	uint32_t *d_result = d_resultNonce[thr_id];
+	cudaMemset(d_result, 0xFF, sizeof(uint32_t));
+
+	groestlcoin_gpu_hash<<<grid, block, dynSharedBytes>>>(threads, startNounce, d_hashGROESTLCOINoutput[thr_id], d_result);
+
+	// wait for the kernel; per the original note this helper sleeps strategically
+	// to lower the CPU load (it is defined elsewhere)
+	MyStreamSynchronize(NULL, 0, thr_id);
+
+	cudaMemcpy(nounce, d_result, sizeof(uint32_t), cudaMemcpyDeviceToHost);
+}
--- a/cuda_groestlcoin.h
+++ b/cuda_groestlcoin.h
@ -0,0 +1,8 @@
+#ifndef _CUDA_GROESTLCOIN_H
+#define _CUDA_GROESTLCOIN_H
+
+// uint32_t is used in the prototypes below; include it here so the header
+// does not depend on the include order of its users
+#include <stdint.h>
+
+// One-time GPU setup for mining thread thr_id; `threads` is the number of hashes per launch.
+void groestlcoin_cpu_init(int thr_id, int threads);
+// Uploads the 80-byte block header `data` and the 8-word target `pTargetIn`.
+void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn);
+// Tests `threads` nonces starting at startNounce; *nounce receives the found nonce or 0xFFFFFFFF.
+void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce);
+
+#endif
--- a/cuda_hefty1.cu
+++ b/cuda_hefty1.cu
@ -1,4 +1,3 @@
-/* Diese Funktion ist auf 84-Byte große Eingabedaten ausgerichtet (Heavycoin) */
 #include <cuda.h>
 #include "cuda_runtime.h"
 #include "device_launch_parameters.h"
--- a/cuda_keccak512.cu
+++ b/cuda_keccak512.cu
@ -1,4 +1,3 @@
-/* Diese Funktion ist auf 84+32-Byte große Eingabedaten ausgerichtet (Heavycoin) */
 #include <cuda.h>
 #include "cuda_runtime.h"
 #include "device_launch_parameters.h"
--- a/cuda_sha256.cu
+++ b/cuda_sha256.cu
@ -1,4 +1,3 @@
-/* Diese Funktion ist auf 84+32 Byte große Eingabedaten ausgerichtet (Heavycoin) */
 #include <cuda.h>
 #include "cuda_runtime.h"
 #include "device_launch_parameters.h"
--- a/groestl.c
+++ b/groestl.c
@ -29,7 +29,7 @@
 *
 * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
 */
-
+#include <stdio.h>
 #include <stddef.h>
 #include <string.h>

@ -2986,6 +2986,7 @@ groestl_big_close(sph_groestl_big_context *sc,
 #endif
 	}
 	memset(pad + 1, 0, pad_len - 9);
+	//fprintf(stderr, "%x\n", pad_len);
 #if SPH_64
 	sph_enc64be(pad + pad_len - 8, count);
 #else
--- a/groestlcoin.cpp
+++ b/groestlcoin.cpp
@ -0,0 +1,175 @@
+#include "uint256.h"
+#include "sph_groestl.h"
+
+#include "cpuminer-config.h"
+#include "miner.h"
+
+#include <string.h>
+#include <stdint.h>
+#include "cuda_groestlcoin.h"
+#include <openssl/sha.h>
+
+// Reverses the byte order of a 32-bit value.
+#define SWAP32(x) \
+    ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u)   | \
+      (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
+
+// Computes a single SHA-256 over `len` bytes of `data` and writes the 32-byte
+// digest to `hash` (big-endian words).
+//   hash - output buffer, at least 32 bytes
+//   data - input message
+//   len  - message length in bytes
+void sha256func(unsigned char *hash, const unsigned char *data, int len)
+{
+	// S: running hash state, T: current 64-byte message block
+	uint32_t S[16], T[16];
+	int i, r;
+
+	sha256_init(S);
+	// r = bytes still to process; the loop runs one extra block when the
+	// padding (0x80 marker + 8 length bytes) does not fit into the last one
+	for (r = len; r > -9; r -= 64) {
+		if (r < 64)
+			memset(T, 0, 64);
+		memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r));
+		if (r >= 0 && r < 64)
+			((unsigned char *)T)[r] = 0x80;
+		for (i = 0; i < 16; i++)
+			T[i] = be32dec(T + i);
+		if (r < 56)
+			T[15] = 8 * len;	// message length in bits
+		sha256_transform(S, T, 0);
+	}
+
+	// The digest is the final state S. T only holds the last padded message
+	// block; it was the right source only while a second hashing pass
+	// (sha256d) wrote its result there.
+	for (i = 0; i < 8; i++)
+		be32enc((uint32_t *)hash + i, S[i]);
+}
+
+// CPU reference of the Groestlcoin hash, used to verify nonces found by the GPU:
+// Groestl-512 over the 80-byte header, Groestl-512 over that 64-byte digest,
+// and the first 32 bytes of the second digest as result.
+//   state - output buffer, receives 32 bytes
+//   input - 80-byte block header
+static void groestlhash(void *state, const void *input)
+{
+	sph_groestl512_context ctx_first, ctx_second;
+
+	// the 512-bit digests are handled as arrays of 16 uint32_t
+	uint32_t hashA[16], hashB[16];
+
+	sph_groestl512_init(&ctx_first);
+	sph_groestl512(&ctx_first, input, 80);
+	sph_groestl512_close(&ctx_first, hashA);
+
+	sph_groestl512_init(&ctx_second);
+	sph_groestl512(&ctx_second, hashA, 64);
+	sph_groestl512_close(&ctx_second, hashB);
+
+	memcpy(state, hashB, 32);
+}
+
+
+
+// Scans a nonce range on the GPU for a Groestlcoin hash that meets the target.
+//   thr_id      - mining thread index (0..7), also selects the GPU
+//   pdata       - block header as 32-bit words; pdata[19] is the nonce and is
+//                 updated to the found nonce or to the end of the scanned range
+//   ptarget     - 8-word target
+//   max_nonce   - scanning stops once pdata[19] reaches this value
+//   hashes_done - receives the number of nonces covered by this call
+// Returns 1 if a nonce was found and confirmed on the CPU, 0 otherwise.
+extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
+	uint32_t max_nonce, unsigned long *hashes_done)
+{	
+	uint32_t start_nonce = pdata[19]++;
+	// number of nonces tested per kernel launch
+	const uint32_t throughPut = 4096 * 128;
+
+	// one-time GPU setup per mining thread
+	static bool init[8] = { false, false, false, false, false, false, false, false };
+	if(!init[thr_id])
+	{
+		groestlcoin_cpu_init(thr_id, throughPut);
+		init[thr_id] = true;
+	}
+	
+	// The header has to be byte-swapped word by word. Only the 80-byte header
+	// (20 words) is consumed by setBlock and by the CPU check, so only that
+	// much is read from pdata.
+	uint32_t endiandata[20];
+	for (int kk=0; kk < 20; kk++)
+		be32enc(&endiandata[kk], pdata[kk]);
+
+	// Upload the swapped header and the target (the nonce is replaced on the GPU)
+	groestlcoin_cpu_setBlock(thr_id, endiandata, (void*)ptarget);
+	
+	do {
+		uint32_t foundNounce = 0xFFFFFFFF;
+
+		// The hash output buffer is only needed for debugging and is not touched by
+		// groestlcoin_cpu_hash, so no host buffer is allocated for it.
+		groestlcoin_cpu_hash(thr_id, throughPut, pdata[19], NULL, &foundNounce);
+
+		if(foundNounce < 0xffffffff)
+		{
+			// confirm the GPU result on the CPU before reporting it
+			uint32_t tmpHash[8];
+			endiandata[19] = SWAP32(foundNounce);
+			groestlhash(tmpHash, endiandata);
+			// NOTE(review): the mask additionally demands 24 leading zero bits,
+			// independent of ptarget - confirm this is intended for easy targets
+			if (((tmpHash[7]&0xFFFFFF00)==0) && 
+					fulltest(tmpHash, ptarget)) {
+				pdata[19] = foundNounce;
+				*hashes_done = foundNounce - start_nonce;
+				return 1;
+			}
+		}
+
+		// advance to the next range, saturating at max_nonce on 32-bit overflow
+		if (pdata[19] + throughPut < pdata[19])
+			pdata[19] = max_nonce;
+		else pdata[19] += throughPut;
+
+	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+	
+	*hashes_done = pdata[19] - start_nonce;
+	return 0;
+}
+
--- a/miner.h
+++ b/miner.h
@ -211,8 +211,13 @@ extern int scanhash_fugue256(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);

+extern int scanhash_groestlcoin(int thr_id, uint32_t *pdata,
+	const uint32_t *ptarget, uint32_t max_nonce,
+	unsigned long *hashes_done);
+
 extern void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
 extern void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
+extern void groestlcoin_hash(unsigned char* output, const unsigned char* input, int len);

 struct thr_info {
 	int		id;