Browse Source

bump to revision v0.9 (VC++ project files not updated yet)

master
Christian Buchner 10 years ago
parent
commit
e049f32fee
  1. 9
      Makefile.am
  2. 187
      Makefile.in
  3. 8
      README.txt
  4. 20
      configure
  5. 2
      configure.ac
  6. 22
      cpu-miner.c
  7. 4
      cpuminer-config.h
  8. 8
      miner.h
  9. 292
      quark/animecoin.cu
  10. 473
      quark/cuda_bmw512.cu
  11. 714
      quark/cuda_jh512.cu
  12. 916
      quark/cuda_quark_blake512.cu
  13. 363
      quark/cuda_quark_compactionTest.cu
  14. 182
      quark/cuda_quark_keccak512.cu
  15. 900
      quark/cuda_skein512.cu
  16. 274
      quark/quarkcoin.cu

9
Makefile.am

@ -16,8 +16,8 @@ bin_PROGRAMS = ccminer
ccminer_SOURCES = elist.h miner.h compat.h \ ccminer_SOURCES = elist.h miner.h compat.h \
compat/inttypes.h compat/stdbool.h compat/unistd.h \ compat/inttypes.h compat/stdbool.h compat/unistd.h \
compat/sys/time.h compat/getopt/getopt.h \ compat/sys/time.h compat/getopt/getopt.h \
cpu-miner.c util.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c hefty1.c scrypt.c sha2.c \ cpu-miner.c util.c sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c hefty1.c scrypt.c sha2.c \
sph/sph_blake.h sph/sph_groestl.h sph/sph_jh.h sph/sph_keccak.h sph/sph_skein.h sph/sph_types.h \ sph/bmw.h sph/sph_blake.h sph/sph_groestl.h sph/sph_jh.h sph/sph_keccak.h sph/sph_skein.h sph/sph_types.h \
heavy/heavy.cu \ heavy/heavy.cu \
heavy/cuda_blake512.cu heavy/cuda_blake512.h \ heavy/cuda_blake512.cu heavy/cuda_blake512.h \
heavy/cuda_combine.cu heavy/cuda_combine.h \ heavy/cuda_combine.cu heavy/cuda_combine.h \
@ -30,6 +30,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \
JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \ JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \
JHA/cuda_jha_compactionTest.cu quark/cuda_quark_checkhash.cu \ JHA/cuda_jha_compactionTest.cu quark/cuda_quark_checkhash.cu \
quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \ quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \
quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu quark/quarkcoin.cu quark/animecoin.cu \
quark/cuda_quark_compactionTest.cu \
myriadgroestl.cpp cuda_myriadgroestl.cu myriadgroestl.cpp cuda_myriadgroestl.cu
ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@ ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@
@ -42,5 +44,8 @@ ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -f
# ABI requiring code modules # ABI requiring code modules
# this module doesn't compile with Compute 2.0 unfortunately # this module doesn't compile with Compute 2.0 unfortunately
quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<

187
Makefile.in

@ -55,18 +55,26 @@ am__installdirs = "$(DESTDIR)$(bindir)"
PROGRAMS = $(bin_PROGRAMS) PROGRAMS = $(bin_PROGRAMS)
am__dirstamp = $(am__leading_dot)dirstamp am__dirstamp = $(am__leading_dot)dirstamp
am_ccminer_OBJECTS = ccminer-cpu-miner.$(OBJEXT) \ am_ccminer_OBJECTS = ccminer-cpu-miner.$(OBJEXT) \
ccminer-util.$(OBJEXT) ccminer-blake.$(OBJEXT) \ ccminer-util.$(OBJEXT) ccminer-bmw.$(OBJEXT) \
ccminer-groestl.$(OBJEXT) ccminer-keccak.$(OBJEXT) \ ccminer-blake.$(OBJEXT) ccminer-groestl.$(OBJEXT) \
ccminer-hefty1.$(OBJEXT) ccminer-scrypt.$(OBJEXT) \ ccminer-jh.$(OBJEXT) ccminer-keccak.$(OBJEXT) \
ccminer-sha2.$(OBJEXT) heavy.$(OBJEXT) cuda_blake512.$(OBJEXT) \ ccminer-skein.$(OBJEXT) ccminer-hefty1.$(OBJEXT) \
cuda_combine.$(OBJEXT) cuda_groestl512.$(OBJEXT) \ ccminer-scrypt.$(OBJEXT) ccminer-sha2.$(OBJEXT) \
cuda_hefty1.$(OBJEXT) cuda_keccak512.$(OBJEXT) \ heavy/heavy.$(OBJEXT) heavy/cuda_blake512.$(OBJEXT) \
cuda_sha256.$(OBJEXT) ccminer-fuguecoin.$(OBJEXT) \ heavy/cuda_combine.$(OBJEXT) heavy/cuda_groestl512.$(OBJEXT) \
heavy/cuda_hefty1.$(OBJEXT) heavy/cuda_keccak512.$(OBJEXT) \
heavy/cuda_sha256.$(OBJEXT) ccminer-fuguecoin.$(OBJEXT) \
cuda_fugue256.$(OBJEXT) ccminer-fugue.$(OBJEXT) \ cuda_fugue256.$(OBJEXT) ccminer-fugue.$(OBJEXT) \
ccminer-groestlcoin.$(OBJEXT) cuda_groestlcoin.$(OBJEXT) \ ccminer-groestlcoin.$(OBJEXT) cuda_groestlcoin.$(OBJEXT) \
JHA/jackpotcoin.$(OBJEXT) JHA/cuda_jha_keccak512.$(OBJEXT) \ JHA/jackpotcoin.$(OBJEXT) JHA/cuda_jha_keccak512.$(OBJEXT) \
ccminer-jh.$(OBJEXT) ccminer-skein.$(OBJEXT) \ JHA/cuda_jha_compactionTest.$(OBJEXT) \
quark/cuda_quark_checkhash.$(OBJEXT) \ quark/cuda_quark_checkhash.$(OBJEXT) \
quark/cuda_jh512.$(OBJEXT) quark/cuda_quark_blake512.$(OBJEXT) \
quark/cuda_quark_groestl512.$(OBJEXT) \
quark/cuda_skein512.$(OBJEXT) quark/cuda_bmw512.$(OBJEXT) \
quark/cuda_quark_keccak512.$(OBJEXT) quark/quarkcoin.$(OBJEXT) \
quark/animecoin.$(OBJEXT) \
quark/cuda_quark_compactionTest.$(OBJEXT) \
ccminer-myriadgroestl.$(OBJEXT) cuda_myriadgroestl.$(OBJEXT) ccminer-myriadgroestl.$(OBJEXT) cuda_myriadgroestl.$(OBJEXT)
ccminer_OBJECTS = $(am_ccminer_OBJECTS) ccminer_OBJECTS = $(am_ccminer_OBJECTS)
ccminer_DEPENDENCIES = ccminer_DEPENDENCIES =
@ -272,19 +280,22 @@ SUBDIRS = compat
ccminer_SOURCES = elist.h miner.h compat.h \ ccminer_SOURCES = elist.h miner.h compat.h \
compat/inttypes.h compat/stdbool.h compat/unistd.h \ compat/inttypes.h compat/stdbool.h compat/unistd.h \
compat/sys/time.h compat/getopt/getopt.h \ compat/sys/time.h compat/getopt/getopt.h \
cpu-miner.c util.c sph/blake.c sph/groestl.c sph/keccak.c hefty1.c scrypt.c sha2.c \ cpu-miner.c util.c sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c hefty1.c scrypt.c sha2.c \
sph/sph_blake.h sph/sph_groestl.h sph/sph_keccak.h sph/sph_types.h \ sph/bmw.h sph/sph_blake.h sph/sph_groestl.h sph/sph_jh.h sph/sph_keccak.h sph/sph_skein.h sph/sph_types.h \
heavy.cu \ heavy/heavy.cu \
cuda_blake512.cu cuda_blake512.h \ heavy/cuda_blake512.cu heavy/cuda_blake512.h \
cuda_combine.cu cuda_combine.h \ heavy/cuda_combine.cu heavy/cuda_combine.h \
cuda_groestl512.cu cuda_groestl512.h \ heavy/cuda_groestl512.cu heavy/cuda_groestl512.h \
cuda_hefty1.cu cuda_hefty1.h \ heavy/cuda_hefty1.cu heavy/cuda_hefty1.h \
cuda_keccak512.cu cuda_keccak512.h \ heavy/cuda_keccak512.cu heavy/cuda_keccak512.h \
cuda_sha256.cu cuda_sha256.h \ heavy/cuda_sha256.cu heavy/cuda_sha256.h \
fuguecoin.cpp cuda_fugue256.cu sph/fugue.c sph/sph_fugue.h uint256.h \ fuguecoin.cpp cuda_fugue256.cu sph/fugue.c sph/sph_fugue.h uint256.h \
groestlcoin.cpp cuda_groestlcoin.cu cuda_groestlcoin.h \ groestlcoin.cpp cuda_groestlcoin.cu cuda_groestlcoin.h \
JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu sph/jh.c sph/skein.c \ JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \
sph/sph_jh.h sph/sph_skein.h quark/cuda_quark_checkhash.cu \ JHA/cuda_jha_compactionTest.cu quark/cuda_quark_checkhash.cu \
quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \
quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu quark/quarkcoin.cu quark/animecoin.cu \
quark/cuda_quark_compactionTest.cu \
myriadgroestl.cpp cuda_myriadgroestl.cu myriadgroestl.cpp cuda_myriadgroestl.cu
ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@ ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@
@ -381,6 +392,26 @@ uninstall-binPROGRAMS:
clean-binPROGRAMS: clean-binPROGRAMS:
-test -z "$(bin_PROGRAMS)" || rm -f $(bin_PROGRAMS) -test -z "$(bin_PROGRAMS)" || rm -f $(bin_PROGRAMS)
heavy/$(am__dirstamp):
@$(MKDIR_P) heavy
@: > heavy/$(am__dirstamp)
heavy/$(DEPDIR)/$(am__dirstamp):
@$(MKDIR_P) heavy/$(DEPDIR)
@: > heavy/$(DEPDIR)/$(am__dirstamp)
heavy/heavy.$(OBJEXT): heavy/$(am__dirstamp) \
heavy/$(DEPDIR)/$(am__dirstamp)
heavy/cuda_blake512.$(OBJEXT): heavy/$(am__dirstamp) \
heavy/$(DEPDIR)/$(am__dirstamp)
heavy/cuda_combine.$(OBJEXT): heavy/$(am__dirstamp) \
heavy/$(DEPDIR)/$(am__dirstamp)
heavy/cuda_groestl512.$(OBJEXT): heavy/$(am__dirstamp) \
heavy/$(DEPDIR)/$(am__dirstamp)
heavy/cuda_hefty1.$(OBJEXT): heavy/$(am__dirstamp) \
heavy/$(DEPDIR)/$(am__dirstamp)
heavy/cuda_keccak512.$(OBJEXT): heavy/$(am__dirstamp) \
heavy/$(DEPDIR)/$(am__dirstamp)
heavy/cuda_sha256.$(OBJEXT): heavy/$(am__dirstamp) \
heavy/$(DEPDIR)/$(am__dirstamp)
JHA/$(am__dirstamp): JHA/$(am__dirstamp):
@$(MKDIR_P) JHA @$(MKDIR_P) JHA
@: > JHA/$(am__dirstamp) @: > JHA/$(am__dirstamp)
@ -391,6 +422,8 @@ JHA/jackpotcoin.$(OBJEXT): JHA/$(am__dirstamp) \
JHA/$(DEPDIR)/$(am__dirstamp) JHA/$(DEPDIR)/$(am__dirstamp)
JHA/cuda_jha_keccak512.$(OBJEXT): JHA/$(am__dirstamp) \ JHA/cuda_jha_keccak512.$(OBJEXT): JHA/$(am__dirstamp) \
JHA/$(DEPDIR)/$(am__dirstamp) JHA/$(DEPDIR)/$(am__dirstamp)
JHA/cuda_jha_compactionTest.$(OBJEXT): JHA/$(am__dirstamp) \
JHA/$(DEPDIR)/$(am__dirstamp)
quark/$(am__dirstamp): quark/$(am__dirstamp):
@$(MKDIR_P) quark @$(MKDIR_P) quark
@: > quark/$(am__dirstamp) @: > quark/$(am__dirstamp)
@ -399,20 +432,56 @@ quark/$(DEPDIR)/$(am__dirstamp):
@: > quark/$(DEPDIR)/$(am__dirstamp) @: > quark/$(DEPDIR)/$(am__dirstamp)
quark/cuda_quark_checkhash.$(OBJEXT): quark/$(am__dirstamp) \ quark/cuda_quark_checkhash.$(OBJEXT): quark/$(am__dirstamp) \
quark/$(DEPDIR)/$(am__dirstamp) quark/$(DEPDIR)/$(am__dirstamp)
quark/cuda_jh512.$(OBJEXT): quark/$(am__dirstamp) \
quark/$(DEPDIR)/$(am__dirstamp)
quark/cuda_quark_blake512.$(OBJEXT): quark/$(am__dirstamp) \
quark/$(DEPDIR)/$(am__dirstamp)
quark/cuda_quark_groestl512.$(OBJEXT): quark/$(am__dirstamp) \
quark/$(DEPDIR)/$(am__dirstamp)
quark/cuda_skein512.$(OBJEXT): quark/$(am__dirstamp) \
quark/$(DEPDIR)/$(am__dirstamp)
quark/cuda_bmw512.$(OBJEXT): quark/$(am__dirstamp) \
quark/$(DEPDIR)/$(am__dirstamp)
quark/cuda_quark_keccak512.$(OBJEXT): quark/$(am__dirstamp) \
quark/$(DEPDIR)/$(am__dirstamp)
quark/quarkcoin.$(OBJEXT): quark/$(am__dirstamp) \
quark/$(DEPDIR)/$(am__dirstamp)
quark/animecoin.$(OBJEXT): quark/$(am__dirstamp) \
quark/$(DEPDIR)/$(am__dirstamp)
quark/cuda_quark_compactionTest.$(OBJEXT): quark/$(am__dirstamp) \
quark/$(DEPDIR)/$(am__dirstamp)
ccminer$(EXEEXT): $(ccminer_OBJECTS) $(ccminer_DEPENDENCIES) $(EXTRA_ccminer_DEPENDENCIES) ccminer$(EXEEXT): $(ccminer_OBJECTS) $(ccminer_DEPENDENCIES) $(EXTRA_ccminer_DEPENDENCIES)
@rm -f ccminer$(EXEEXT) @rm -f ccminer$(EXEEXT)
$(ccminer_LINK) $(ccminer_OBJECTS) $(ccminer_LDADD) $(LIBS) $(ccminer_LINK) $(ccminer_OBJECTS) $(ccminer_LDADD) $(LIBS)
mostlyclean-compile: mostlyclean-compile:
-rm -f *.$(OBJEXT) -rm -f *.$(OBJEXT)
-rm -f JHA/cuda_jha_compactionTest.$(OBJEXT)
-rm -f JHA/cuda_jha_keccak512.$(OBJEXT) -rm -f JHA/cuda_jha_keccak512.$(OBJEXT)
-rm -f JHA/jackpotcoin.$(OBJEXT) -rm -f JHA/jackpotcoin.$(OBJEXT)
-rm -f heavy/cuda_blake512.$(OBJEXT)
-rm -f heavy/cuda_combine.$(OBJEXT)
-rm -f heavy/cuda_groestl512.$(OBJEXT)
-rm -f heavy/cuda_hefty1.$(OBJEXT)
-rm -f heavy/cuda_keccak512.$(OBJEXT)
-rm -f heavy/cuda_sha256.$(OBJEXT)
-rm -f heavy/heavy.$(OBJEXT)
-rm -f quark/animecoin.$(OBJEXT)
-rm -f quark/cuda_bmw512.$(OBJEXT)
-rm -f quark/cuda_jh512.$(OBJEXT)
-rm -f quark/cuda_quark_blake512.$(OBJEXT)
-rm -f quark/cuda_quark_checkhash.$(OBJEXT) -rm -f quark/cuda_quark_checkhash.$(OBJEXT)
-rm -f quark/cuda_quark_compactionTest.$(OBJEXT)
-rm -f quark/cuda_quark_groestl512.$(OBJEXT)
-rm -f quark/cuda_quark_keccak512.$(OBJEXT)
-rm -f quark/cuda_skein512.$(OBJEXT)
-rm -f quark/quarkcoin.$(OBJEXT)
distclean-compile: distclean-compile:
-rm -f *.tab.c -rm -f *.tab.c
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-blake.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-blake.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-bmw.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-cpu-miner.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-cpu-miner.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-fugue.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-fugue.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-fuguecoin.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccminer-fuguecoin.Po@am__quote@
@ -469,6 +538,20 @@ ccminer-util.obj: util.c
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-util.obj `if test -f 'util.c'; then $(CYGPATH_W) 'util.c'; else $(CYGPATH_W) '$(srcdir)/util.c'; fi` @am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-util.obj `if test -f 'util.c'; then $(CYGPATH_W) 'util.c'; else $(CYGPATH_W) '$(srcdir)/util.c'; fi`
ccminer-bmw.o: sph/bmw.c
@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-bmw.o -MD -MP -MF $(DEPDIR)/ccminer-bmw.Tpo -c -o ccminer-bmw.o `test -f 'sph/bmw.c' || echo '$(srcdir)/'`sph/bmw.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-bmw.Tpo $(DEPDIR)/ccminer-bmw.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/bmw.c' object='ccminer-bmw.o' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-bmw.o `test -f 'sph/bmw.c' || echo '$(srcdir)/'`sph/bmw.c
ccminer-bmw.obj: sph/bmw.c
@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-bmw.obj -MD -MP -MF $(DEPDIR)/ccminer-bmw.Tpo -c -o ccminer-bmw.obj `if test -f 'sph/bmw.c'; then $(CYGPATH_W) 'sph/bmw.c'; else $(CYGPATH_W) '$(srcdir)/sph/bmw.c'; fi`
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-bmw.Tpo $(DEPDIR)/ccminer-bmw.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/bmw.c' object='ccminer-bmw.obj' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-bmw.obj `if test -f 'sph/bmw.c'; then $(CYGPATH_W) 'sph/bmw.c'; else $(CYGPATH_W) '$(srcdir)/sph/bmw.c'; fi`
ccminer-blake.o: sph/blake.c ccminer-blake.o: sph/blake.c
@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-blake.o -MD -MP -MF $(DEPDIR)/ccminer-blake.Tpo -c -o ccminer-blake.o `test -f 'sph/blake.c' || echo '$(srcdir)/'`sph/blake.c @am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-blake.o -MD -MP -MF $(DEPDIR)/ccminer-blake.Tpo -c -o ccminer-blake.o `test -f 'sph/blake.c' || echo '$(srcdir)/'`sph/blake.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-blake.Tpo $(DEPDIR)/ccminer-blake.Po @am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-blake.Tpo $(DEPDIR)/ccminer-blake.Po
@ -497,6 +580,20 @@ ccminer-groestl.obj: sph/groestl.c
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-groestl.obj `if test -f 'sph/groestl.c'; then $(CYGPATH_W) 'sph/groestl.c'; else $(CYGPATH_W) '$(srcdir)/sph/groestl.c'; fi` @am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-groestl.obj `if test -f 'sph/groestl.c'; then $(CYGPATH_W) 'sph/groestl.c'; else $(CYGPATH_W) '$(srcdir)/sph/groestl.c'; fi`
ccminer-jh.o: sph/jh.c
@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-jh.o -MD -MP -MF $(DEPDIR)/ccminer-jh.Tpo -c -o ccminer-jh.o `test -f 'sph/jh.c' || echo '$(srcdir)/'`sph/jh.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-jh.Tpo $(DEPDIR)/ccminer-jh.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/jh.c' object='ccminer-jh.o' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-jh.o `test -f 'sph/jh.c' || echo '$(srcdir)/'`sph/jh.c
ccminer-jh.obj: sph/jh.c
@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-jh.obj -MD -MP -MF $(DEPDIR)/ccminer-jh.Tpo -c -o ccminer-jh.obj `if test -f 'sph/jh.c'; then $(CYGPATH_W) 'sph/jh.c'; else $(CYGPATH_W) '$(srcdir)/sph/jh.c'; fi`
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-jh.Tpo $(DEPDIR)/ccminer-jh.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/jh.c' object='ccminer-jh.obj' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-jh.obj `if test -f 'sph/jh.c'; then $(CYGPATH_W) 'sph/jh.c'; else $(CYGPATH_W) '$(srcdir)/sph/jh.c'; fi`
ccminer-keccak.o: sph/keccak.c ccminer-keccak.o: sph/keccak.c
@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-keccak.o -MD -MP -MF $(DEPDIR)/ccminer-keccak.Tpo -c -o ccminer-keccak.o `test -f 'sph/keccak.c' || echo '$(srcdir)/'`sph/keccak.c @am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-keccak.o -MD -MP -MF $(DEPDIR)/ccminer-keccak.Tpo -c -o ccminer-keccak.o `test -f 'sph/keccak.c' || echo '$(srcdir)/'`sph/keccak.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-keccak.Tpo $(DEPDIR)/ccminer-keccak.Po @am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-keccak.Tpo $(DEPDIR)/ccminer-keccak.Po
@ -511,6 +608,20 @@ ccminer-keccak.obj: sph/keccak.c
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-keccak.obj `if test -f 'sph/keccak.c'; then $(CYGPATH_W) 'sph/keccak.c'; else $(CYGPATH_W) '$(srcdir)/sph/keccak.c'; fi` @am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-keccak.obj `if test -f 'sph/keccak.c'; then $(CYGPATH_W) 'sph/keccak.c'; else $(CYGPATH_W) '$(srcdir)/sph/keccak.c'; fi`
ccminer-skein.o: sph/skein.c
@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-skein.o -MD -MP -MF $(DEPDIR)/ccminer-skein.Tpo -c -o ccminer-skein.o `test -f 'sph/skein.c' || echo '$(srcdir)/'`sph/skein.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-skein.Tpo $(DEPDIR)/ccminer-skein.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/skein.c' object='ccminer-skein.o' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-skein.o `test -f 'sph/skein.c' || echo '$(srcdir)/'`sph/skein.c
ccminer-skein.obj: sph/skein.c
@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-skein.obj -MD -MP -MF $(DEPDIR)/ccminer-skein.Tpo -c -o ccminer-skein.obj `if test -f 'sph/skein.c'; then $(CYGPATH_W) 'sph/skein.c'; else $(CYGPATH_W) '$(srcdir)/sph/skein.c'; fi`
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-skein.Tpo $(DEPDIR)/ccminer-skein.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/skein.c' object='ccminer-skein.obj' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-skein.obj `if test -f 'sph/skein.c'; then $(CYGPATH_W) 'sph/skein.c'; else $(CYGPATH_W) '$(srcdir)/sph/skein.c'; fi`
ccminer-hefty1.o: hefty1.c ccminer-hefty1.o: hefty1.c
@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-hefty1.o -MD -MP -MF $(DEPDIR)/ccminer-hefty1.Tpo -c -o ccminer-hefty1.o `test -f 'hefty1.c' || echo '$(srcdir)/'`hefty1.c @am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-hefty1.o -MD -MP -MF $(DEPDIR)/ccminer-hefty1.Tpo -c -o ccminer-hefty1.o `test -f 'hefty1.c' || echo '$(srcdir)/'`hefty1.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-hefty1.Tpo $(DEPDIR)/ccminer-hefty1.Po @am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-hefty1.Tpo $(DEPDIR)/ccminer-hefty1.Po
@ -567,34 +678,6 @@ ccminer-fugue.obj: sph/fugue.c
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-fugue.obj `if test -f 'sph/fugue.c'; then $(CYGPATH_W) 'sph/fugue.c'; else $(CYGPATH_W) '$(srcdir)/sph/fugue.c'; fi` @am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-fugue.obj `if test -f 'sph/fugue.c'; then $(CYGPATH_W) 'sph/fugue.c'; else $(CYGPATH_W) '$(srcdir)/sph/fugue.c'; fi`
ccminer-jh.o: sph/jh.c
@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-jh.o -MD -MP -MF $(DEPDIR)/ccminer-jh.Tpo -c -o ccminer-jh.o `test -f 'sph/jh.c' || echo '$(srcdir)/'`sph/jh.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-jh.Tpo $(DEPDIR)/ccminer-jh.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/jh.c' object='ccminer-jh.o' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-jh.o `test -f 'sph/jh.c' || echo '$(srcdir)/'`sph/jh.c
ccminer-jh.obj: sph/jh.c
@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-jh.obj -MD -MP -MF $(DEPDIR)/ccminer-jh.Tpo -c -o ccminer-jh.obj `if test -f 'sph/jh.c'; then $(CYGPATH_W) 'sph/jh.c'; else $(CYGPATH_W) '$(srcdir)/sph/jh.c'; fi`
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-jh.Tpo $(DEPDIR)/ccminer-jh.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/jh.c' object='ccminer-jh.obj' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-jh.obj `if test -f 'sph/jh.c'; then $(CYGPATH_W) 'sph/jh.c'; else $(CYGPATH_W) '$(srcdir)/sph/jh.c'; fi`
ccminer-skein.o: sph/skein.c
@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-skein.o -MD -MP -MF $(DEPDIR)/ccminer-skein.Tpo -c -o ccminer-skein.o `test -f 'sph/skein.c' || echo '$(srcdir)/'`sph/skein.c
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-skein.Tpo $(DEPDIR)/ccminer-skein.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/skein.c' object='ccminer-skein.o' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-skein.o `test -f 'sph/skein.c' || echo '$(srcdir)/'`sph/skein.c
ccminer-skein.obj: sph/skein.c
@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT ccminer-skein.obj -MD -MP -MF $(DEPDIR)/ccminer-skein.Tpo -c -o ccminer-skein.obj `if test -f 'sph/skein.c'; then $(CYGPATH_W) 'sph/skein.c'; else $(CYGPATH_W) '$(srcdir)/sph/skein.c'; fi`
@am__fastdepCC_TRUE@ $(am__mv) $(DEPDIR)/ccminer-skein.Tpo $(DEPDIR)/ccminer-skein.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='sph/skein.c' object='ccminer-skein.obj' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(ccminer_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o ccminer-skein.obj `if test -f 'sph/skein.c'; then $(CYGPATH_W) 'sph/skein.c'; else $(CYGPATH_W) '$(srcdir)/sph/skein.c'; fi`
.cpp.o: .cpp.o:
@am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< @am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po @am__fastdepCXX_TRUE@ $(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
@ -1012,6 +1095,8 @@ distclean-generic:
-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
-rm -f JHA/$(DEPDIR)/$(am__dirstamp) -rm -f JHA/$(DEPDIR)/$(am__dirstamp)
-rm -f JHA/$(am__dirstamp) -rm -f JHA/$(am__dirstamp)
-rm -f heavy/$(DEPDIR)/$(am__dirstamp)
-rm -f heavy/$(am__dirstamp)
-rm -f quark/$(DEPDIR)/$(am__dirstamp) -rm -f quark/$(DEPDIR)/$(am__dirstamp)
-rm -f quark/$(am__dirstamp) -rm -f quark/$(am__dirstamp)
@ -1115,6 +1200,14 @@ uninstall-am: uninstall-binPROGRAMS
.cu.o: .cu.o:
$(NVCC) @CFLAGS@ -I . -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< $(NVCC) @CFLAGS@ -I . -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
# ABI requiring code modules
# this module doesn't compile with Compute 2.0 unfortunately
quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
# Tell versions [3.59,3.63) of GNU make to not export all variables. # Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded. # Otherwise a system limit (for SysV at least) may be exceeded.
.NOEXPORT: .NOEXPORT:

8
README.txt

@ -1,5 +1,5 @@
ccMiner release 0.7 (May 01th 2014) - "Jackpot II" ccMiner release 0.9 (May 06th 2014) - "Say Hi to Quark, Anime"
------------------------------------------------------------- -------------------------------------------------------------
*************************************************************** ***************************************************************
@ -14,6 +14,8 @@ If you find this tool useful and like to support its continued
DOGE donation address: DT9ghsGmez6ojVdEZgvaZbT2Z3TruXG6yP DOGE donation address: DT9ghsGmez6ojVdEZgvaZbT2Z3TruXG6yP
HVC donation address: HNN3PyyTMkDo4RkEjkWSGMwqia1yD8mwJN HVC donation address: HNN3PyyTMkDo4RkEjkWSGMwqia1yD8mwJN
GRS donation address: FmJKJAhvyHWPeEVeLQHefr2naqgWc9ABTM GRS donation address: FmJKJAhvyHWPeEVeLQHefr2naqgWc9ABTM
JPC donation address: JYFBypVDkk583yKWY4M46TG5vXG8hfgD2U
MNC donation address: MShgNUSYwybEbXLvJUtdNg1a7rUeiNgooK
*************************************************************** ***************************************************************
>>> Introduction <<< >>> Introduction <<<
@ -39,6 +41,8 @@ its command line interface and options.
groestl use to mine Groestlcoin groestl use to mine Groestlcoin
myr-gr use to mine Myriad-Groestl myr-gr use to mine Myriad-Groestl
jackpot use to mine Jackpotcoin jackpot use to mine Jackpotcoin
quark use to mine Quarkcoin
anime use to mine Animecoin
-d, --devices gives a comma separated list of CUDA device IDs -d, --devices gives a comma separated list of CUDA device IDs
to operate on. Device IDs start counting from 0! to operate on. Device IDs start counting from 0!
@ -117,6 +121,8 @@ from your old clunkers.
>>> RELEASE HISTORY <<< >>> RELEASE HISTORY <<<
May 6th 2014 this adds the quark and animecoin algorithms.
May 3rd 2014 add the MjollnirCoin hash algorithm for the upcomin May 3rd 2014 add the MjollnirCoin hash algorithm for the upcomin
MjollnirCoin relaunch. MjollnirCoin relaunch.

20
configure vendored

@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.68 for ccminer 2014.04.27. # Generated by GNU Autoconf 2.68 for ccminer 2014.05.03.
# #
# #
# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@ -557,8 +557,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='ccminer' PACKAGE_NAME='ccminer'
PACKAGE_TARNAME='ccminer' PACKAGE_TARNAME='ccminer'
PACKAGE_VERSION='2014.04.27' PACKAGE_VERSION='2014.05.03'
PACKAGE_STRING='ccminer 2014.04.27' PACKAGE_STRING='ccminer 2014.05.03'
PACKAGE_BUGREPORT='' PACKAGE_BUGREPORT=''
PACKAGE_URL='' PACKAGE_URL=''
@ -1297,7 +1297,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
\`configure' configures ccminer 2014.04.27 to adapt to many kinds of systems. \`configure' configures ccminer 2014.05.03 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@ -1368,7 +1368,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of ccminer 2014.04.27:";; short | recursive ) echo "Configuration of ccminer 2014.05.03:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@ -1469,7 +1469,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
ccminer configure 2014.04.27 ccminer configure 2014.05.03
generated by GNU Autoconf 2.68 generated by GNU Autoconf 2.68
Copyright (C) 2010 Free Software Foundation, Inc. Copyright (C) 2010 Free Software Foundation, Inc.
@ -1972,7 +1972,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by ccminer $as_me 2014.04.27, which was It was created by ccminer $as_me 2014.05.03, which was
generated by GNU Autoconf 2.68. Invocation command line was generated by GNU Autoconf 2.68. Invocation command line was
$ $0 $@ $ $0 $@
@ -2901,7 +2901,7 @@ fi
# Define the identity of the package. # Define the identity of the package.
PACKAGE='ccminer' PACKAGE='ccminer'
VERSION='2014.04.27' VERSION='2014.05.03'
cat >>confdefs.h <<_ACEOF cat >>confdefs.h <<_ACEOF
@ -7118,7 +7118,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by ccminer $as_me 2014.04.27, which was This file was extended by ccminer $as_me 2014.05.03, which was
generated by GNU Autoconf 2.68. Invocation command line was generated by GNU Autoconf 2.68. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@ -7184,7 +7184,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\ ac_cs_version="\\
ccminer config.status 2014.04.27 ccminer config.status 2014.05.03
configured by $0, generated by GNU Autoconf 2.68, configured by $0, generated by GNU Autoconf 2.68,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"

2
configure.ac

@ -1,4 +1,4 @@
AC_INIT([ccminer], [2014.05.03]) AC_INIT([ccminer], [2014.05.06])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM

22
cpu-miner.c

@ -126,7 +126,9 @@ typedef enum {
ALGO_FUGUE256, /* Fugue256 */ ALGO_FUGUE256, /* Fugue256 */
ALGO_GROESTL, ALGO_GROESTL,
ALGO_MYR_GR, ALGO_MYR_GR,
ALGO_JACKPOT ALGO_JACKPOT,
ALGO_QUARK,
ALGO_ANIME
} sha256_algos; } sha256_algos;
static const char *algo_names[] = { static const char *algo_names[] = {
@ -135,7 +137,9 @@ static const char *algo_names[] = {
"fugue256", "fugue256",
"groestl", "groestl",
"myr-gr", "myr-gr",
"jackpot" "jackpot",
"quark",
"anime"
}; };
bool opt_debug = false; bool opt_debug = false;
@ -203,6 +207,8 @@ Options:\n\
groestl Groestlcoin hash\n\ groestl Groestlcoin hash\n\
myr-gr Myriad-Groestl hash\n\ myr-gr Myriad-Groestl hash\n\
jackpot Jackpot hash\n\ jackpot Jackpot hash\n\
quark Quark hash\n\
anime Animecoin hash\n\
-d, --devices takes a comma separated list of CUDA devices to use.\n\ -d, --devices takes a comma separated list of CUDA devices to use.\n\
Device IDs start counting from 0! Alternatively takes\n\ Device IDs start counting from 0! Alternatively takes\n\
string names of your cards like gtx780ti or gt640#2\n\ string names of your cards like gtx780ti or gt640#2\n\
@ -877,6 +883,16 @@ static void *miner_thread(void *userdata)
max_nonce, &hashes_done); max_nonce, &hashes_done);
break; break;
case ALGO_QUARK:
rc = scanhash_quark(thr_id, work.data, work.target,
max_nonce, &hashes_done);
break;
case ALGO_ANIME:
rc = scanhash_anime(thr_id, work.data, work.target,
max_nonce, &hashes_done);
break;
default: default:
/* should never happen */ /* should never happen */
goto out; goto out;
@ -1429,7 +1445,7 @@ static void signal_handler(int sig)
} }
#endif #endif
#define PROGRAM_VERSION "0.8" #define PROGRAM_VERSION "0.9"
int main(int argc, char *argv[]) int main(int argc, char *argv[])
{ {
struct thr_info *thr; struct thr_info *thr;

4
cpuminer-config.h

@ -152,7 +152,7 @@
#define PACKAGE_NAME "ccminer" #define PACKAGE_NAME "ccminer"
/* Define to the full name and version of this package. */ /* Define to the full name and version of this package. */
#define PACKAGE_STRING "ccminer 2014.05.03" #define PACKAGE_STRING "ccminer 2014.05.06"
/* Define to the one symbol short name of this package. */ /* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME #undef PACKAGE_TARNAME
@ -161,7 +161,7 @@
#undef PACKAGE_URL #undef PACKAGE_URL
/* Define to the version of this package. */ /* Define to the version of this package. */
#define PACKAGE_VERSION "2014.05.03" #define PACKAGE_VERSION "2014.05.06"
/* If using the C implementation of alloca, define if you know the /* If using the C implementation of alloca, define if you know the
direction of stack growth for your system; otherwise it will be direction of stack growth for your system; otherwise it will be

8
miner.h

@ -223,6 +223,14 @@ extern int scanhash_jackpot(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done); unsigned long *hashes_done);
extern int scanhash_quark(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
extern int scanhash_anime(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
extern void fugue256_hash(unsigned char* output, const unsigned char* input, int len); extern void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
extern void heavycoin_hash(unsigned char* output, const unsigned char* input, int len); extern void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
extern void groestlcoin_hash(unsigned char* output, const unsigned char* input, int len); extern void groestlcoin_hash(unsigned char* output, const unsigned char* input, int len);

292
quark/animecoin.cu

@ -0,0 +1,292 @@
extern "C"
{
#include "sph/sph_blake.h"
#include "sph/sph_bmw.h"
#include "sph/sph_groestl.h"
#include "sph/sph_skein.h"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
#include "miner.h"
}
#include <stdint.h>
// aus cpu-miner.c
extern int device_map[8];
// Speicher für Input/Output der verketteten Hashfunktionen
static uint32_t *d_hash[8];
// Speicher zur Generierung der Noncevektoren für die bedingten Hashes
static uint32_t *d_animeNonces[8];
static uint32_t *d_branch1Nonces[8];
static uint32_t *d_branch2Nonces[8];
static uint32_t *d_branch3Nonces[8];
extern void quark_blake512_cpu_init(int thr_id, int threads);
extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_bmw512_cpu_init(int thr_id, int threads);
extern void quark_bmw512_cpu_setBlock_80(void *pdata);
extern void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order);
extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order);
extern void quark_groestl512_cpu_init(int thr_id, int threads);
extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_skein512_cpu_init(int thr_id, int threads);
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_keccak512_cpu_init(int thr_id, int threads);
extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_jh512_cpu_init(int thr_id, int threads);
extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, size_t *nrm1,
uint32_t *d_nonces2, size_t *nrm2,
int order);
extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, size_t *nrm1,
int order);
// Original Quarkhash Funktion aus einem miner Quelltext
inline void animehash(void *state, const void *input)
{
sph_blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
sph_groestl512_context ctx_groestl;
sph_jh512_context ctx_jh;
sph_keccak512_context ctx_keccak;
sph_skein512_context ctx_skein;
unsigned char hash[64];
sph_bmw512_init(&ctx_bmw);
// ZBMW;
sph_bmw512 (&ctx_bmw, (const void*) input, 80);
sph_bmw512_close(&ctx_bmw, (void*) hash);
sph_blake512_init(&ctx_blake);
// ZBLAKE;
sph_blake512 (&ctx_blake, hash, 64);
sph_blake512_close(&ctx_blake, (void*) hash);
if (hash[0] & 0x8)
{
sph_groestl512_init(&ctx_groestl);
// ZGROESTL;
sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
sph_groestl512_close(&ctx_groestl, (void*) hash);
}
else
{
sph_skein512_init(&ctx_skein);
// ZSKEIN;
sph_skein512 (&ctx_skein, (const void*) hash, 64);
sph_skein512_close(&ctx_skein, (void*) hash);
}
sph_groestl512_init(&ctx_groestl);
// ZGROESTL;
sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
sph_groestl512_close(&ctx_groestl, (void*) hash);
sph_jh512_init(&ctx_jh);
// ZJH;
sph_jh512 (&ctx_jh, (const void*) hash, 64);
sph_jh512_close(&ctx_jh, (void*) hash);
if (hash[0] & 0x8)
{
sph_blake512_init(&ctx_blake);
// ZBLAKE;
sph_blake512 (&ctx_blake, (const void*) hash, 64);
sph_blake512_close(&ctx_blake, (void*) hash);
}
else
{
sph_bmw512_init(&ctx_bmw);
// ZBMW;
sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
sph_bmw512_close(&ctx_bmw, (void*) hash);
}
sph_keccak512_init(&ctx_keccak);
// ZKECCAK;
sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
sph_keccak512_close(&ctx_keccak, (void*) hash);
sph_skein512_init(&ctx_skein);
// SKEIN;
sph_skein512 (&ctx_skein, (const void*) hash, 64);
sph_skein512_close(&ctx_skein, (void*) hash);
if (hash[0] & 0x8)
{
sph_keccak512_init(&ctx_keccak);
// ZKECCAK;
sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
sph_keccak512_close(&ctx_keccak, (void*) hash);
}
else
{
sph_jh512_init(&ctx_jh);
// ZJH;
sph_jh512 (&ctx_jh, (const void*) hash, 64);
sph_jh512_close(&ctx_jh, (void*) hash);
}
memcpy(state, hash, 32);
}
struct HashPredicate
{
HashPredicate(uint32_t *hashes, uint32_t startNonce) :
m_hashes(hashes),
m_startNonce(startNonce)
{ }
__device__
bool operator()(const uint32_t x)
{
uint32_t *hash = &m_hashes[(x - m_startNonce)*16];
return hash[0] & 0x8;
}
uint32_t *m_hashes;
uint32_t m_startNonce;
};
extern bool opt_benchmark;
extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done)
{
const uint32_t first_nonce = pdata[19];
// TODO: entfernen für eine Release! Ist nur zum Testen!
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x00000f;
const uint32_t Htarg = ptarget[7];
const int throughput = 256*2048; // 100;
static bool init[8] = {0,0,0,0,0,0,0,0};
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
// Konstanten kopieren, Speicher belegen
cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
quark_blake512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
quark_bmw512_cpu_init(thr_id, throughput);
quark_keccak512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput);
quark_compactTest_cpu_init(thr_id, throughput);
cudaMalloc(&d_animeNonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
init[thr_id] = true;
}
uint32_t endiandata[20];
for (int k=0; k < 20; k++)
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_bmw512_cpu_setBlock_80((void*)endiandata);
quark_check_cpu_setTarget(ptarget);
do {
int order = 0;
size_t nrm1=0, nrm2=0, nrm3=0;
// erstes BMW512 Hash mit CUDA
quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
// das ist der unbedingte Branch für Blake512
quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL,
d_branch3Nonces[thr_id], &nrm3,
order++);
// nur den Skein Branch weiterverfolgen
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
// das ist der unbedingte Branch für Groestl512
quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
// das ist der unbedingte Branch für JH512
quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
// quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8)
quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
d_branch1Nonces[thr_id], &nrm1,
d_branch2Nonces[thr_id], &nrm2,
order++);
// das ist der bedingte Branch für Blake512
quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
// das ist der bedingte Branch für Bmw512
quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
// das ist der unbedingte Branch für Keccak512
quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
// das ist der unbedingte Branch für Skein512
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
// quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8)
quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
d_branch1Nonces[thr_id], &nrm1,
d_branch2Nonces[thr_id], &nrm2,
order++);
// das ist der bedingte Branch für Keccak512
quark_keccak512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
// das ist der bedingte Branch für JH512
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
// Scan nach Gewinner Hashes auf der GPU
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];
be32enc(&endiandata[19], foundNonce);
animehash(vhash64, endiandata);
if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {
pdata[19] = foundNonce;
*hashes_done = (foundNonce - first_nonce + 1)/2;
return 1;
} else {
applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce);
}
}
pdata[19] += throughput;
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
*hashes_done = (pdata[19] - first_nonce + 1)/2;
return 0;
}

473
quark/cuda_bmw512.cu

@ -0,0 +1,473 @@
#if 1
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
// Endian Drehung für 32 Bit Typen
/*
static __device__ uint32_t cuda_swab32(uint32_t x)
{
return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u)
| ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu));
}
*/
static __device__ uint32_t cuda_swab32(uint32_t x)
{
return __byte_perm(x, 0, 0x0123);
}
// Endian Drehung für 64 Bit Typen
static __device__ unsigned long long cuda_swab64(unsigned long long x) {
uint32_t h = (x >> 32);
uint32_t l = (x & 0xFFFFFFFFULL);
return (((unsigned long long)cuda_swab32(l)) << 32) | ((unsigned long long)cuda_swab32(h));
}
// das Hi Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t HIWORD(const unsigned long long &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2hiint(__longlong_as_double(x));
#else
return (uint32_t)(x >> 32);
#endif
}
// das Hi Word in einem 64 Bit Typen ersetzen
static __device__ unsigned long long REPLACE_HIWORD(const unsigned long long &x, const uint32_t &y) {
return (x & 0xFFFFFFFFULL) | (((unsigned long long)y) << 32ULL);
}
// das Lo Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t LOWORD(const unsigned long long &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2loint(__longlong_as_double(x));
#else
return (uint32_t)(x & 0xFFFFFFFFULL);
#endif
}
static __device__ unsigned long long MAKE_ULONGLONG(uint32_t LO, uint32_t HI)
{
#if __CUDA_ARCH__ >= 130
return __double_as_longlong(__hiloint2double(HI, LO));
#else
return (unsigned long long)LO | (((unsigned long long)HI) << 32ULL);
#endif
}
// das Lo Word in einem 64 Bit Typen ersetzen
static __device__ unsigned long long REPLACE_LOWORD(const unsigned long long &x, const uint32_t &y) {
return (x & 0xFFFFFFFF00000000ULL) | ((unsigned long long)y);
}
// der Versuch, einen Wrapper für einen aus 32 Bit Registern zusammengesetzten uin64_t Typen zu entferfen...
#if 1
typedef unsigned long long uint64_t;
#else
typedef class uint64
{
public:
__device__ uint64()
{
}
__device__ uint64(unsigned long long init)
{
val = make_uint2( LOWORD(init), HIWORD(init) );
}
__device__ uint64(uint32_t lo, uint32_t hi)
{
val = make_uint2( lo, hi );
}
__device__ const uint64 operator^(uint64 const& rhs) const
{
return uint64(val.x ^ rhs.val.x, val.y ^ rhs.val.y);
}
__device__ const uint64 operator|(uint64 const& rhs) const
{
return uint64(val.x | rhs.val.x, val.y | rhs.val.y);
}
__device__ const uint64 operator+(unsigned long long const& rhs) const
{
return *this+uint64(rhs);
}
__device__ const uint64 operator+(uint64 const& rhs) const
{
uint64 res;
asm ("add.cc.u32 %0, %2, %4;\n\t"
"addc.cc.u32 %1, %3, %5;\n\t"
: "=r"(res.val.x), "=r"(res.val.y)
: "r"( val.x), "r"( val.y),
"r"(rhs.val.x), "r"(rhs.val.y));
return res;
}
__device__ const uint64 operator-(uint64 const& rhs) const
{
uint64 res;
asm ("sub.cc.u32 %0, %2, %4;\n\t"
"subc.cc.u32 %1, %3, %5;\n\t"
: "=r"(res.val.x), "=r"(res.val.y)
: "r"( val.x), "r"( val.y),
"r"(rhs.val.x), "r"(rhs.val.y));
return res;
}
__device__ const uint64 operator<<(int n) const
{
return uint64(unsigned long long(*this)<<n);
}
__device__ const uint64 operator>>(int n) const
{
return uint64(unsigned long long(*this)>>n);
}
__device__ operator unsigned long long() const
{
return MAKE_ULONGLONG(val.x, val.y);
}
uint2 val;
} uint64_t;
#endif
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// die Message it Padding zur Berechnung auf der GPU
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
#define SPH_C64(x) ((uint64_t)(x ## ULL))
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
#if __CUDA_ARCH__ >= 350
__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
uint2 result;
if(offset >= 32) {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
} else {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
}
return __double_as_longlong(__hiloint2double(result.y, result.x));
}
#else
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
#endif
#define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n))
#define CONST_EXP2 q[i+0] + ROTL64(q[i+1], 5) + q[i+2] + ROTL64(q[i+3], 11) + \
q[i+4] + ROTL64(q[i+5], 27) + q[i+6] + ROTL64(q[i+7], 32) + \
q[i+8] + ROTL64(q[i+9], 37) + q[i+10] + ROTL64(q[i+11], 43) + \
q[i+12] + ROTL64(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15])
__device__ void Compression512(uint64_t *msg, uint64_t *hash)
{
// Compression ref. implementation
uint64_t tmp;
uint64_t q[32];
tmp = (msg[ 5] ^ hash[ 5]) - (msg[ 7] ^ hash[ 7]) + (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]) + (msg[14] ^ hash[14]);
q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[1];
tmp = (msg[ 6] ^ hash[ 6]) - (msg[ 8] ^ hash[ 8]) + (msg[11] ^ hash[11]) + (msg[14] ^ hash[14]) - (msg[15] ^ hash[15]);
q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[2];
tmp = (msg[ 0] ^ hash[ 0]) + (msg[ 7] ^ hash[ 7]) + (msg[ 9] ^ hash[ 9]) - (msg[12] ^ hash[12]) + (msg[15] ^ hash[15]);
q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[3];
tmp = (msg[ 0] ^ hash[ 0]) - (msg[ 1] ^ hash[ 1]) + (msg[ 8] ^ hash[ 8]) - (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]);
q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[4];
tmp = (msg[ 1] ^ hash[ 1]) + (msg[ 2] ^ hash[ 2]) + (msg[ 9] ^ hash[ 9]) - (msg[11] ^ hash[11]) - (msg[14] ^ hash[14]);
q[4] = (SHR(tmp, 1) ^ tmp) + hash[5];
tmp = (msg[ 3] ^ hash[ 3]) - (msg[ 2] ^ hash[ 2]) + (msg[10] ^ hash[10]) - (msg[12] ^ hash[12]) + (msg[15] ^ hash[15]);
q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[6];
tmp = (msg[ 4] ^ hash[ 4]) - (msg[ 0] ^ hash[ 0]) - (msg[ 3] ^ hash[ 3]) - (msg[11] ^ hash[11]) + (msg[13] ^ hash[13]);
q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[7];
tmp = (msg[ 1] ^ hash[ 1]) - (msg[ 4] ^ hash[ 4]) - (msg[ 5] ^ hash[ 5]) - (msg[12] ^ hash[12]) - (msg[14] ^ hash[14]);
q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[8];
tmp = (msg[ 2] ^ hash[ 2]) - (msg[ 5] ^ hash[ 5]) - (msg[ 6] ^ hash[ 6]) + (msg[13] ^ hash[13]) - (msg[15] ^ hash[15]);
q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[9];
tmp = (msg[ 0] ^ hash[ 0]) - (msg[ 3] ^ hash[ 3]) + (msg[ 6] ^ hash[ 6]) - (msg[ 7] ^ hash[ 7]) + (msg[14] ^ hash[14]);
q[9] = (SHR(tmp, 1) ^ tmp) + hash[10];
tmp = (msg[ 8] ^ hash[ 8]) - (msg[ 1] ^ hash[ 1]) - (msg[ 4] ^ hash[ 4]) - (msg[ 7] ^ hash[ 7]) + (msg[15] ^ hash[15]);
q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[11];
tmp = (msg[ 8] ^ hash[ 8]) - (msg[ 0] ^ hash[ 0]) - (msg[ 2] ^ hash[ 2]) - (msg[ 5] ^ hash[ 5]) + (msg[ 9] ^ hash[ 9]);
q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROTL64(tmp, 13) ^ ROTL64(tmp, 43)) + hash[12];
tmp = (msg[ 1] ^ hash[ 1]) + (msg[ 3] ^ hash[ 3]) - (msg[ 6] ^ hash[ 6]) - (msg[ 9] ^ hash[ 9]) + (msg[10] ^ hash[10]);
q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROTL64(tmp, 19) ^ ROTL64(tmp, 53)) + hash[13];
tmp = (msg[ 2] ^ hash[ 2]) + (msg[ 4] ^ hash[ 4]) + (msg[ 7] ^ hash[ 7]) + (msg[10] ^ hash[10]) + (msg[11] ^ hash[11]);
q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROTL64(tmp, 28) ^ ROTL64(tmp, 59)) + hash[14];
tmp = (msg[ 3] ^ hash[ 3]) - (msg[ 5] ^ hash[ 5]) + (msg[ 8] ^ hash[ 8]) - (msg[11] ^ hash[11]) - (msg[12] ^ hash[12]);
q[14] = (SHR(tmp, 1) ^ tmp) + hash[15];
tmp = (msg[12] ^ hash[12]) - (msg[ 4] ^ hash[ 4]) - (msg[ 6] ^ hash[ 6]) - (msg[ 9] ^ hash[ 9]) + (msg[13] ^ hash[13]);
q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROTL64(tmp, 4) ^ ROTL64(tmp, 37)) + hash[0];
// Expand 1
#pragma unroll 2
for(int i=0;i<2;i++)
{
q[i+16] =
(SHR(q[i], 1) ^ SHL(q[i], 2) ^ ROTL64(q[i], 13) ^ ROTL64(q[i], 43)) +
(SHR(q[i+1], 2) ^ SHL(q[i+1], 1) ^ ROTL64(q[i+1], 19) ^ ROTL64(q[i+1], 53)) +
(SHR(q[i+2], 2) ^ SHL(q[i+2], 2) ^ ROTL64(q[i+2], 28) ^ ROTL64(q[i+2], 59)) +
(SHR(q[i+3], 1) ^ SHL(q[i+3], 3) ^ ROTL64(q[i+3], 4) ^ ROTL64(q[i+3], 37)) +
(SHR(q[i+4], 1) ^ SHL(q[i+4], 2) ^ ROTL64(q[i+4], 13) ^ ROTL64(q[i+4], 43)) +
(SHR(q[i+5], 2) ^ SHL(q[i+5], 1) ^ ROTL64(q[i+5], 19) ^ ROTL64(q[i+5], 53)) +
(SHR(q[i+6], 2) ^ SHL(q[i+6], 2) ^ ROTL64(q[i+6], 28) ^ ROTL64(q[i+6], 59)) +
(SHR(q[i+7], 1) ^ SHL(q[i+7], 3) ^ ROTL64(q[i+7], 4) ^ ROTL64(q[i+7], 37)) +
(SHR(q[i+8], 1) ^ SHL(q[i+8], 2) ^ ROTL64(q[i+8], 13) ^ ROTL64(q[i+8], 43)) +
(SHR(q[i+9], 2) ^ SHL(q[i+9], 1) ^ ROTL64(q[i+9], 19) ^ ROTL64(q[i+9], 53)) +
(SHR(q[i+10], 2) ^ SHL(q[i+10], 2) ^ ROTL64(q[i+10], 28) ^ ROTL64(q[i+10], 59)) +
(SHR(q[i+11], 1) ^ SHL(q[i+11], 3) ^ ROTL64(q[i+11], 4) ^ ROTL64(q[i+11], 37)) +
(SHR(q[i+12], 1) ^ SHL(q[i+12], 2) ^ ROTL64(q[i+12], 13) ^ ROTL64(q[i+12], 43)) +
(SHR(q[i+13], 2) ^ SHL(q[i+13], 1) ^ ROTL64(q[i+13], 19) ^ ROTL64(q[i+13], 53)) +
(SHR(q[i+14], 2) ^ SHL(q[i+14], 2) ^ ROTL64(q[i+14], 28) ^ ROTL64(q[i+14], 59)) +
(SHR(q[i+15], 1) ^ SHL(q[i+15], 3) ^ ROTL64(q[i+15], 4) ^ ROTL64(q[i+15], 37)) +
(( ((i+16)*(0x0555555555555555ull)) + ROTL64(msg[i], i+1) +
ROTL64(msg[i+3], i+4) - ROTL64(msg[i+10], i+11) ) ^ hash[i+7]);
}
#pragma unroll 4
for(int i=2;i<6;i++) {
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + ROTL64(msg[i], i+1) +
ROTL64(msg[i+3], i+4) - ROTL64(msg[i+10], i+11) ) ^ hash[i+7]);
}
#pragma unroll 3
for(int i=6;i<9;i++) {
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + ROTL64(msg[i], i+1) +
ROTL64(msg[i+3], i+4) - ROTL64(msg[i-6], (i-6)+1) ) ^ hash[i+7]);
}
#pragma unroll 4
for(int i=9;i<13;i++) {
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + ROTL64(msg[i], i+1) +
ROTL64(msg[i+3], i+4) - ROTL64(msg[i-6], (i-6)+1) ) ^ hash[i-9]);
}
#pragma unroll 3
for(int i=13;i<16;i++) {
q[i+16] = CONST_EXP2 +
(( ((i+16)*(0x0555555555555555ull)) + ROTL64(msg[i], i+1) +
ROTL64(msg[i-13], (i-13)+1) - ROTL64(msg[i-6], (i-6)+1) ) ^ hash[i-9]);
}
uint64_t XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23];
uint64_t XH64 = XL64^q[24]^q[25]^q[26]^q[27]^q[28]^q[29]^q[30]^q[31];
hash[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ msg[ 0]) + ( XL64 ^ q[24] ^ q[ 0]);
hash[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ msg[ 1]) + ( XL64 ^ q[25] ^ q[ 1]);
hash[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ msg[ 2]) + ( XL64 ^ q[26] ^ q[ 2]);
hash[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ msg[ 3]) + ( XL64 ^ q[27] ^ q[ 3]);
hash[4] = (SHR(XH64, 3) ^ q[20] ^ msg[ 4]) + ( XL64 ^ q[28] ^ q[ 4]);
hash[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ msg[ 5]) + ( XL64 ^ q[29] ^ q[ 5]);
hash[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ msg[ 6]) + ( XL64 ^ q[30] ^ q[ 6]);
hash[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ msg[ 7]) + ( XL64 ^ q[31] ^ q[ 7]);
hash[ 8] = ROTL64(hash[4], 9) + ( XH64 ^ q[24] ^ msg[ 8]) + (SHL(XL64,8) ^ q[23] ^ q[ 8]);
hash[ 9] = ROTL64(hash[5],10) + ( XH64 ^ q[25] ^ msg[ 9]) + (SHR(XL64,6) ^ q[16] ^ q[ 9]);
hash[10] = ROTL64(hash[6],11) + ( XH64 ^ q[26] ^ msg[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]);
hash[11] = ROTL64(hash[7],12) + ( XH64 ^ q[27] ^ msg[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]);
hash[12] = ROTL64(hash[0],13) + ( XH64 ^ q[28] ^ msg[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]);
hash[13] = ROTL64(hash[1],14) + ( XH64 ^ q[29] ^ msg[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]);
hash[14] = ROTL64(hash[2],15) + ( XH64 ^ q[30] ^ msg[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]);
hash[15] = ROTL64(hash[3],16) + ( XH64 ^ q[31] ^ msg[15]) + (SHR(XL64,2) ^ q[22] ^ q[15]);
}
static __constant__ uint64_t d_constMem[16];
static uint64_t h_constMem[16] = {
SPH_C64(0x8081828384858687),
SPH_C64(0x88898A8B8C8D8E8F),
SPH_C64(0x9091929394959697),
SPH_C64(0x98999A9B9C9D9E9F),
SPH_C64(0xA0A1A2A3A4A5A6A7),
SPH_C64(0xA8A9AAABACADAEAF),
SPH_C64(0xB0B1B2B3B4B5B6B7),
SPH_C64(0xB8B9BABBBCBDBEBF),
SPH_C64(0xC0C1C2C3C4C5C6C7),
SPH_C64(0xC8C9CACBCCCDCECF),
SPH_C64(0xD0D1D2D3D4D5D6D7),
SPH_C64(0xD8D9DADBDCDDDEDF),
SPH_C64(0xE0E1E2E3E4E5E6E7),
SPH_C64(0xE8E9EAEBECEDEEEF),
SPH_C64(0xF0F1F2F3F4F5F6F7),
SPH_C64(0xF8F9FAFBFCFDFEFF)
};
__global__ void quark_bmw512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce;
uint64_t *inpHash = &g_hash[8 * hashPosition];
// Init
uint64_t h[16];
/*
h[ 0] = SPH_C64(0x8081828384858687);
h[ 1] = SPH_C64(0x88898A8B8C8D8E8F);
h[ 2] = SPH_C64(0x9091929394959697);
h[ 3] = SPH_C64(0x98999A9B9C9D9E9F);
h[ 4] = SPH_C64(0xA0A1A2A3A4A5A6A7);
h[ 5] = SPH_C64(0xA8A9AAABACADAEAF);
h[ 6] = SPH_C64(0xB0B1B2B3B4B5B6B7);
h[ 7] = SPH_C64(0xB8B9BABBBCBDBEBF);
h[ 8] = SPH_C64(0xC0C1C2C3C4C5C6C7);
h[ 9] = SPH_C64(0xC8C9CACBCCCDCECF);
h[10] = SPH_C64(0xD0D1D2D3D4D5D6D7);
h[11] = SPH_C64(0xD8D9DADBDCDDDEDF);
h[12] = SPH_C64(0xE0E1E2E3E4E5E6E7);
h[13] = SPH_C64(0xE8E9EAEBECEDEEEF);
h[14] = SPH_C64(0xF0F1F2F3F4F5F6F7);
h[15] = SPH_C64(0xF8F9FAFBFCFDFEFF);
*/
#pragma unroll 16
for(int i=0;i<16;i++)
h[i] = d_constMem[i];
// Nachricht kopieren (Achtung, die Nachricht hat 64 Byte,
// BMW arbeitet mit 128 Byte!!!
uint64_t message[16];
#pragma unroll 8
for(int i=0;i<8;i++)
message[i] = inpHash[i];
#pragma unroll 6
for(int i=9;i<15;i++)
message[i] = 0;
// Padding einfügen (Byteorder?!?)
message[8] = SPH_C64(0x80);
// Länge (in Bits, d.h. 64 Byte * 8 = 512 Bits
message[15] = SPH_C64(512);
// Compression 1
Compression512(message, h);
// Final
#pragma unroll 16
for(int i=0;i<16;i++)
message[i] = 0xaaaaaaaaaaaaaaa0ull + (uint64_t)i;
Compression512(h, message);
// fertig
uint64_t *outpHash = &g_hash[8 * hashPosition];
#pragma unroll 8
for(int i=0;i<8;i++)
outpHash[i] = message[i+8];
}
}
__global__ void quark_bmw512_gpu_hash_80(int threads, uint32_t startNounce, uint64_t *g_hash)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = startNounce + thread;
// Init
uint64_t h[16];
#pragma unroll 16
for(int i=0;i<16;i++)
h[i] = d_constMem[i];
// Nachricht kopieren (Achtung, die Nachricht hat 64 Byte,
// BMW arbeitet mit 128 Byte!!!
uint64_t message[16];
#pragma unroll 16
for(int i=0;i<16;i++)
message[i] = c_PaddedMessage80[i];
// die Nounce durch die thread-spezifische ersetzen
message[9] = REPLACE_HIWORD(message[9], cuda_swab32(nounce));
// Compression 1
Compression512(message, h);
// Final
#pragma unroll 16
for(int i=0;i<16;i++)
message[i] = 0xaaaaaaaaaaaaaaa0ull + (uint64_t)i;
Compression512(h, message);
// fertig
uint64_t *outpHash = &g_hash[8 * thread];
#pragma unroll 8
for(int i=0;i<8;i++)
outpHash[i] = message[i+8];
}
}
// Setup-Funktionen
__host__ void quark_bmw512_cpu_init(int thr_id, int threads)
{
// nix zu tun ;-)
// jetzt schon :D
cudaMemcpyToSymbol( d_constMem,
h_constMem,
sizeof(h_constMem),
0, cudaMemcpyHostToDevice);
}
// Bmw512 für 80 Byte grosse Eingangsdaten
__host__ void quark_bmw512_cpu_setBlock_80(void *pdata)
{
// Message mit Padding bereitstellen
// lediglich die korrekte Nonce ist noch ab Byte 76 einzusetzen.
unsigned char PaddedMessage[128];
memcpy(PaddedMessage, pdata, 80);
memset(PaddedMessage+80, 0, 48);
uint64_t *message = (uint64_t*)PaddedMessage;
// Padding einfügen (Byteorder?!?)
message[10] = SPH_C64(0x80);
// Länge (in Bits, d.h. 80 Byte * 8 = 640 Bits
message[15] = SPH_C64(640);
// die Message zur Berechnung auf der GPU
cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
}
__host__ void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
const int threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
quark_bmw512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id);
}
__host__ void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order)
{
const int threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
quark_bmw512_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash);
MyStreamSynchronize(NULL, order, thr_id);
}
#endif

714
quark/cuda_jh512.cu

@ -1,356 +1,358 @@
#include <stdint.h> #include <stdint.h>
// aus heavy.cu // aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
typedef struct { typedef struct {
uint32_t x[8][4]; /*the 1024-bit state, ( x[i][0] || x[i][1] || x[i][2] || x[i][3] ) is the ith row of the state in the pseudocode*/ uint32_t x[8][4]; /*the 1024-bit state, ( x[i][0] || x[i][1] || x[i][2] || x[i][3] ) is the ith row of the state in the pseudocode*/
uint32_t buffer[16]; /*the 512-bit message block to be hashed;*/ uint32_t buffer[16]; /*the 512-bit message block to be hashed;*/
} hashState; } hashState;
/*42 round constants, each round constant is 32-byte (256-bit)*/ /*42 round constants, each round constant is 32-byte (256-bit)*/
__constant__ uint32_t c_INIT_bitslice[8][4]; __constant__ uint32_t c_INIT_bitslice[8][4];
__constant__ unsigned char c_E8_bitslice_roundconstant[42][32]; __constant__ unsigned char c_E8_bitslice_roundconstant[42][32];
const uint32_t h_INIT_bitslice[8][4] = { const uint32_t h_INIT_bitslice[8][4] = {
{ 0x964bd16f, 0x17aa003e, 0x052e6a63, 0x43d5157a}, { 0x964bd16f, 0x17aa003e, 0x052e6a63, 0x43d5157a},
{ 0x8d5e228a, 0x0bef970c, 0x591234e9, 0x61c3b3f2}, { 0x8d5e228a, 0x0bef970c, 0x591234e9, 0x61c3b3f2},
{ 0xc1a01d89, 0x1e806f53, 0x6b05a92a, 0x806d2bea}, { 0xc1a01d89, 0x1e806f53, 0x6b05a92a, 0x806d2bea},
{ 0xdbcc8e58, 0xa6ba7520, 0x763a0fa9, 0xf73bf8ba}, { 0xdbcc8e58, 0xa6ba7520, 0x763a0fa9, 0xf73bf8ba},
{ 0x05e66901, 0x694ae341, 0x8e8ab546, 0x5ae66f2e}, { 0x05e66901, 0x694ae341, 0x8e8ab546, 0x5ae66f2e},
{ 0xd0a74710, 0x243c84c1, 0xb1716e3b, 0x99c15a2d}, { 0xd0a74710, 0x243c84c1, 0xb1716e3b, 0x99c15a2d},
{ 0xecf657cf, 0x56f8b19d, 0x7c8806a7, 0x56b11657}, { 0xecf657cf, 0x56f8b19d, 0x7c8806a7, 0x56b11657},
{ 0xdffcc2e3, 0xfb1785e6, 0x78465a54, 0x4bdd8ccc} }; { 0xdffcc2e3, 0xfb1785e6, 0x78465a54, 0x4bdd8ccc} };
const unsigned char h_E8_bitslice_roundconstant[42][32]={ const unsigned char h_E8_bitslice_roundconstant[42][32]={
{0x72,0xd5,0xde,0xa2,0xdf,0x15,0xf8,0x67,0x7b,0x84,0x15,0xa,0xb7,0x23,0x15,0x57,0x81,0xab,0xd6,0x90,0x4d,0x5a,0x87,0xf6,0x4e,0x9f,0x4f,0xc5,0xc3,0xd1,0x2b,0x40}, {0x72,0xd5,0xde,0xa2,0xdf,0x15,0xf8,0x67,0x7b,0x84,0x15,0xa,0xb7,0x23,0x15,0x57,0x81,0xab,0xd6,0x90,0x4d,0x5a,0x87,0xf6,0x4e,0x9f,0x4f,0xc5,0xc3,0xd1,0x2b,0x40},
{0xea,0x98,0x3a,0xe0,0x5c,0x45,0xfa,0x9c,0x3,0xc5,0xd2,0x99,0x66,0xb2,0x99,0x9a,0x66,0x2,0x96,0xb4,0xf2,0xbb,0x53,0x8a,0xb5,0x56,0x14,0x1a,0x88,0xdb,0xa2,0x31}, {0xea,0x98,0x3a,0xe0,0x5c,0x45,0xfa,0x9c,0x3,0xc5,0xd2,0x99,0x66,0xb2,0x99,0x9a,0x66,0x2,0x96,0xb4,0xf2,0xbb,0x53,0x8a,0xb5,0x56,0x14,0x1a,0x88,0xdb,0xa2,0x31},
{0x3,0xa3,0x5a,0x5c,0x9a,0x19,0xe,0xdb,0x40,0x3f,0xb2,0xa,0x87,0xc1,0x44,0x10,0x1c,0x5,0x19,0x80,0x84,0x9e,0x95,0x1d,0x6f,0x33,0xeb,0xad,0x5e,0xe7,0xcd,0xdc}, {0x3,0xa3,0x5a,0x5c,0x9a,0x19,0xe,0xdb,0x40,0x3f,0xb2,0xa,0x87,0xc1,0x44,0x10,0x1c,0x5,0x19,0x80,0x84,0x9e,0x95,0x1d,0x6f,0x33,0xeb,0xad,0x5e,0xe7,0xcd,0xdc},
{0x10,0xba,0x13,0x92,0x2,0xbf,0x6b,0x41,0xdc,0x78,0x65,0x15,0xf7,0xbb,0x27,0xd0,0xa,0x2c,0x81,0x39,0x37,0xaa,0x78,0x50,0x3f,0x1a,0xbf,0xd2,0x41,0x0,0x91,0xd3}, {0x10,0xba,0x13,0x92,0x2,0xbf,0x6b,0x41,0xdc,0x78,0x65,0x15,0xf7,0xbb,0x27,0xd0,0xa,0x2c,0x81,0x39,0x37,0xaa,0x78,0x50,0x3f,0x1a,0xbf,0xd2,0x41,0x0,0x91,0xd3},
{0x42,0x2d,0x5a,0xd,0xf6,0xcc,0x7e,0x90,0xdd,0x62,0x9f,0x9c,0x92,0xc0,0x97,0xce,0x18,0x5c,0xa7,0xb,0xc7,0x2b,0x44,0xac,0xd1,0xdf,0x65,0xd6,0x63,0xc6,0xfc,0x23}, {0x42,0x2d,0x5a,0xd,0xf6,0xcc,0x7e,0x90,0xdd,0x62,0x9f,0x9c,0x92,0xc0,0x97,0xce,0x18,0x5c,0xa7,0xb,0xc7,0x2b,0x44,0xac,0xd1,0xdf,0x65,0xd6,0x63,0xc6,0xfc,0x23},
{0x97,0x6e,0x6c,0x3,0x9e,0xe0,0xb8,0x1a,0x21,0x5,0x45,0x7e,0x44,0x6c,0xec,0xa8,0xee,0xf1,0x3,0xbb,0x5d,0x8e,0x61,0xfa,0xfd,0x96,0x97,0xb2,0x94,0x83,0x81,0x97}, {0x97,0x6e,0x6c,0x3,0x9e,0xe0,0xb8,0x1a,0x21,0x5,0x45,0x7e,0x44,0x6c,0xec,0xa8,0xee,0xf1,0x3,0xbb,0x5d,0x8e,0x61,0xfa,0xfd,0x96,0x97,0xb2,0x94,0x83,0x81,0x97},
{0x4a,0x8e,0x85,0x37,0xdb,0x3,0x30,0x2f,0x2a,0x67,0x8d,0x2d,0xfb,0x9f,0x6a,0x95,0x8a,0xfe,0x73,0x81,0xf8,0xb8,0x69,0x6c,0x8a,0xc7,0x72,0x46,0xc0,0x7f,0x42,0x14}, {0x4a,0x8e,0x85,0x37,0xdb,0x3,0x30,0x2f,0x2a,0x67,0x8d,0x2d,0xfb,0x9f,0x6a,0x95,0x8a,0xfe,0x73,0x81,0xf8,0xb8,0x69,0x6c,0x8a,0xc7,0x72,0x46,0xc0,0x7f,0x42,0x14},
{0xc5,0xf4,0x15,0x8f,0xbd,0xc7,0x5e,0xc4,0x75,0x44,0x6f,0xa7,0x8f,0x11,0xbb,0x80,0x52,0xde,0x75,0xb7,0xae,0xe4,0x88,0xbc,0x82,0xb8,0x0,0x1e,0x98,0xa6,0xa3,0xf4}, {0xc5,0xf4,0x15,0x8f,0xbd,0xc7,0x5e,0xc4,0x75,0x44,0x6f,0xa7,0x8f,0x11,0xbb,0x80,0x52,0xde,0x75,0xb7,0xae,0xe4,0x88,0xbc,0x82,0xb8,0x0,0x1e,0x98,0xa6,0xa3,0xf4},
{0x8e,0xf4,0x8f,0x33,0xa9,0xa3,0x63,0x15,0xaa,0x5f,0x56,0x24,0xd5,0xb7,0xf9,0x89,0xb6,0xf1,0xed,0x20,0x7c,0x5a,0xe0,0xfd,0x36,0xca,0xe9,0x5a,0x6,0x42,0x2c,0x36}, {0x8e,0xf4,0x8f,0x33,0xa9,0xa3,0x63,0x15,0xaa,0x5f,0x56,0x24,0xd5,0xb7,0xf9,0x89,0xb6,0xf1,0xed,0x20,0x7c,0x5a,0xe0,0xfd,0x36,0xca,0xe9,0x5a,0x6,0x42,0x2c,0x36},
{0xce,0x29,0x35,0x43,0x4e,0xfe,0x98,0x3d,0x53,0x3a,0xf9,0x74,0x73,0x9a,0x4b,0xa7,0xd0,0xf5,0x1f,0x59,0x6f,0x4e,0x81,0x86,0xe,0x9d,0xad,0x81,0xaf,0xd8,0x5a,0x9f}, {0xce,0x29,0x35,0x43,0x4e,0xfe,0x98,0x3d,0x53,0x3a,0xf9,0x74,0x73,0x9a,0x4b,0xa7,0xd0,0xf5,0x1f,0x59,0x6f,0x4e,0x81,0x86,0xe,0x9d,0xad,0x81,0xaf,0xd8,0x5a,0x9f},
{0xa7,0x5,0x6,0x67,0xee,0x34,0x62,0x6a,0x8b,0xb,0x28,0xbe,0x6e,0xb9,0x17,0x27,0x47,0x74,0x7,0x26,0xc6,0x80,0x10,0x3f,0xe0,0xa0,0x7e,0x6f,0xc6,0x7e,0x48,0x7b}, {0xa7,0x5,0x6,0x67,0xee,0x34,0x62,0x6a,0x8b,0xb,0x28,0xbe,0x6e,0xb9,0x17,0x27,0x47,0x74,0x7,0x26,0xc6,0x80,0x10,0x3f,0xe0,0xa0,0x7e,0x6f,0xc6,0x7e,0x48,0x7b},
{0xd,0x55,0xa,0xa5,0x4a,0xf8,0xa4,0xc0,0x91,0xe3,0xe7,0x9f,0x97,0x8e,0xf1,0x9e,0x86,0x76,0x72,0x81,0x50,0x60,0x8d,0xd4,0x7e,0x9e,0x5a,0x41,0xf3,0xe5,0xb0,0x62}, {0xd,0x55,0xa,0xa5,0x4a,0xf8,0xa4,0xc0,0x91,0xe3,0xe7,0x9f,0x97,0x8e,0xf1,0x9e,0x86,0x76,0x72,0x81,0x50,0x60,0x8d,0xd4,0x7e,0x9e,0x5a,0x41,0xf3,0xe5,0xb0,0x62},
{0xfc,0x9f,0x1f,0xec,0x40,0x54,0x20,0x7a,0xe3,0xe4,0x1a,0x0,0xce,0xf4,0xc9,0x84,0x4f,0xd7,0x94,0xf5,0x9d,0xfa,0x95,0xd8,0x55,0x2e,0x7e,0x11,0x24,0xc3,0x54,0xa5}, {0xfc,0x9f,0x1f,0xec,0x40,0x54,0x20,0x7a,0xe3,0xe4,0x1a,0x0,0xce,0xf4,0xc9,0x84,0x4f,0xd7,0x94,0xf5,0x9d,0xfa,0x95,0xd8,0x55,0x2e,0x7e,0x11,0x24,0xc3,0x54,0xa5},
{0x5b,0xdf,0x72,0x28,0xbd,0xfe,0x6e,0x28,0x78,0xf5,0x7f,0xe2,0xf,0xa5,0xc4,0xb2,0x5,0x89,0x7c,0xef,0xee,0x49,0xd3,0x2e,0x44,0x7e,0x93,0x85,0xeb,0x28,0x59,0x7f}, {0x5b,0xdf,0x72,0x28,0xbd,0xfe,0x6e,0x28,0x78,0xf5,0x7f,0xe2,0xf,0xa5,0xc4,0xb2,0x5,0x89,0x7c,0xef,0xee,0x49,0xd3,0x2e,0x44,0x7e,0x93,0x85,0xeb,0x28,0x59,0x7f},
{0x70,0x5f,0x69,0x37,0xb3,0x24,0x31,0x4a,0x5e,0x86,0x28,0xf1,0x1d,0xd6,0xe4,0x65,0xc7,0x1b,0x77,0x4,0x51,0xb9,0x20,0xe7,0x74,0xfe,0x43,0xe8,0x23,0xd4,0x87,0x8a}, {0x70,0x5f,0x69,0x37,0xb3,0x24,0x31,0x4a,0x5e,0x86,0x28,0xf1,0x1d,0xd6,0xe4,0x65,0xc7,0x1b,0x77,0x4,0x51,0xb9,0x20,0xe7,0x74,0xfe,0x43,0xe8,0x23,0xd4,0x87,0x8a},
{0x7d,0x29,0xe8,0xa3,0x92,0x76,0x94,0xf2,0xdd,0xcb,0x7a,0x9,0x9b,0x30,0xd9,0xc1,0x1d,0x1b,0x30,0xfb,0x5b,0xdc,0x1b,0xe0,0xda,0x24,0x49,0x4f,0xf2,0x9c,0x82,0xbf}, {0x7d,0x29,0xe8,0xa3,0x92,0x76,0x94,0xf2,0xdd,0xcb,0x7a,0x9,0x9b,0x30,0xd9,0xc1,0x1d,0x1b,0x30,0xfb,0x5b,0xdc,0x1b,0xe0,0xda,0x24,0x49,0x4f,0xf2,0x9c,0x82,0xbf},
{0xa4,0xe7,0xba,0x31,0xb4,0x70,0xbf,0xff,0xd,0x32,0x44,0x5,0xde,0xf8,0xbc,0x48,0x3b,0xae,0xfc,0x32,0x53,0xbb,0xd3,0x39,0x45,0x9f,0xc3,0xc1,0xe0,0x29,0x8b,0xa0}, {0xa4,0xe7,0xba,0x31,0xb4,0x70,0xbf,0xff,0xd,0x32,0x44,0x5,0xde,0xf8,0xbc,0x48,0x3b,0xae,0xfc,0x32,0x53,0xbb,0xd3,0x39,0x45,0x9f,0xc3,0xc1,0xe0,0x29,0x8b,0xa0},
{0xe5,0xc9,0x5,0xfd,0xf7,0xae,0x9,0xf,0x94,0x70,0x34,0x12,0x42,0x90,0xf1,0x34,0xa2,0x71,0xb7,0x1,0xe3,0x44,0xed,0x95,0xe9,0x3b,0x8e,0x36,0x4f,0x2f,0x98,0x4a}, {0xe5,0xc9,0x5,0xfd,0xf7,0xae,0x9,0xf,0x94,0x70,0x34,0x12,0x42,0x90,0xf1,0x34,0xa2,0x71,0xb7,0x1,0xe3,0x44,0xed,0x95,0xe9,0x3b,0x8e,0x36,0x4f,0x2f,0x98,0x4a},
{0x88,0x40,0x1d,0x63,0xa0,0x6c,0xf6,0x15,0x47,0xc1,0x44,0x4b,0x87,0x52,0xaf,0xff,0x7e,0xbb,0x4a,0xf1,0xe2,0xa,0xc6,0x30,0x46,0x70,0xb6,0xc5,0xcc,0x6e,0x8c,0xe6}, {0x88,0x40,0x1d,0x63,0xa0,0x6c,0xf6,0x15,0x47,0xc1,0x44,0x4b,0x87,0x52,0xaf,0xff,0x7e,0xbb,0x4a,0xf1,0xe2,0xa,0xc6,0x30,0x46,0x70,0xb6,0xc5,0xcc,0x6e,0x8c,0xe6},
{0xa4,0xd5,0xa4,0x56,0xbd,0x4f,0xca,0x0,0xda,0x9d,0x84,0x4b,0xc8,0x3e,0x18,0xae,0x73,0x57,0xce,0x45,0x30,0x64,0xd1,0xad,0xe8,0xa6,0xce,0x68,0x14,0x5c,0x25,0x67}, {0xa4,0xd5,0xa4,0x56,0xbd,0x4f,0xca,0x0,0xda,0x9d,0x84,0x4b,0xc8,0x3e,0x18,0xae,0x73,0x57,0xce,0x45,0x30,0x64,0xd1,0xad,0xe8,0xa6,0xce,0x68,0x14,0x5c,0x25,0x67},
{0xa3,0xda,0x8c,0xf2,0xcb,0xe,0xe1,0x16,0x33,0xe9,0x6,0x58,0x9a,0x94,0x99,0x9a,0x1f,0x60,0xb2,0x20,0xc2,0x6f,0x84,0x7b,0xd1,0xce,0xac,0x7f,0xa0,0xd1,0x85,0x18}, {0xa3,0xda,0x8c,0xf2,0xcb,0xe,0xe1,0x16,0x33,0xe9,0x6,0x58,0x9a,0x94,0x99,0x9a,0x1f,0x60,0xb2,0x20,0xc2,0x6f,0x84,0x7b,0xd1,0xce,0xac,0x7f,0xa0,0xd1,0x85,0x18},
{0x32,0x59,0x5b,0xa1,0x8d,0xdd,0x19,0xd3,0x50,0x9a,0x1c,0xc0,0xaa,0xa5,0xb4,0x46,0x9f,0x3d,0x63,0x67,0xe4,0x4,0x6b,0xba,0xf6,0xca,0x19,0xab,0xb,0x56,0xee,0x7e}, {0x32,0x59,0x5b,0xa1,0x8d,0xdd,0x19,0xd3,0x50,0x9a,0x1c,0xc0,0xaa,0xa5,0xb4,0x46,0x9f,0x3d,0x63,0x67,0xe4,0x4,0x6b,0xba,0xf6,0xca,0x19,0xab,0xb,0x56,0xee,0x7e},
{0x1f,0xb1,0x79,0xea,0xa9,0x28,0x21,0x74,0xe9,0xbd,0xf7,0x35,0x3b,0x36,0x51,0xee,0x1d,0x57,0xac,0x5a,0x75,0x50,0xd3,0x76,0x3a,0x46,0xc2,0xfe,0xa3,0x7d,0x70,0x1}, {0x1f,0xb1,0x79,0xea,0xa9,0x28,0x21,0x74,0xe9,0xbd,0xf7,0x35,0x3b,0x36,0x51,0xee,0x1d,0x57,0xac,0x5a,0x75,0x50,0xd3,0x76,0x3a,0x46,0xc2,0xfe,0xa3,0x7d,0x70,0x1},
{0xf7,0x35,0xc1,0xaf,0x98,0xa4,0xd8,0x42,0x78,0xed,0xec,0x20,0x9e,0x6b,0x67,0x79,0x41,0x83,0x63,0x15,0xea,0x3a,0xdb,0xa8,0xfa,0xc3,0x3b,0x4d,0x32,0x83,0x2c,0x83}, {0xf7,0x35,0xc1,0xaf,0x98,0xa4,0xd8,0x42,0x78,0xed,0xec,0x20,0x9e,0x6b,0x67,0x79,0x41,0x83,0x63,0x15,0xea,0x3a,0xdb,0xa8,0xfa,0xc3,0x3b,0x4d,0x32,0x83,0x2c,0x83},
{0xa7,0x40,0x3b,0x1f,0x1c,0x27,0x47,0xf3,0x59,0x40,0xf0,0x34,0xb7,0x2d,0x76,0x9a,0xe7,0x3e,0x4e,0x6c,0xd2,0x21,0x4f,0xfd,0xb8,0xfd,0x8d,0x39,0xdc,0x57,0x59,0xef}, {0xa7,0x40,0x3b,0x1f,0x1c,0x27,0x47,0xf3,0x59,0x40,0xf0,0x34,0xb7,0x2d,0x76,0x9a,0xe7,0x3e,0x4e,0x6c,0xd2,0x21,0x4f,0xfd,0xb8,0xfd,0x8d,0x39,0xdc,0x57,0x59,0xef},
{0x8d,0x9b,0xc,0x49,0x2b,0x49,0xeb,0xda,0x5b,0xa2,0xd7,0x49,0x68,0xf3,0x70,0xd,0x7d,0x3b,0xae,0xd0,0x7a,0x8d,0x55,0x84,0xf5,0xa5,0xe9,0xf0,0xe4,0xf8,0x8e,0x65}, {0x8d,0x9b,0xc,0x49,0x2b,0x49,0xeb,0xda,0x5b,0xa2,0xd7,0x49,0x68,0xf3,0x70,0xd,0x7d,0x3b,0xae,0xd0,0x7a,0x8d,0x55,0x84,0xf5,0xa5,0xe9,0xf0,0xe4,0xf8,0x8e,0x65},
{0xa0,0xb8,0xa2,0xf4,0x36,0x10,0x3b,0x53,0xc,0xa8,0x7,0x9e,0x75,0x3e,0xec,0x5a,0x91,0x68,0x94,0x92,0x56,0xe8,0x88,0x4f,0x5b,0xb0,0x5c,0x55,0xf8,0xba,0xbc,0x4c}, {0xa0,0xb8,0xa2,0xf4,0x36,0x10,0x3b,0x53,0xc,0xa8,0x7,0x9e,0x75,0x3e,0xec,0x5a,0x91,0x68,0x94,0x92,0x56,0xe8,0x88,0x4f,0x5b,0xb0,0x5c,0x55,0xf8,0xba,0xbc,0x4c},
{0xe3,0xbb,0x3b,0x99,0xf3,0x87,0x94,0x7b,0x75,0xda,0xf4,0xd6,0x72,0x6b,0x1c,0x5d,0x64,0xae,0xac,0x28,0xdc,0x34,0xb3,0x6d,0x6c,0x34,0xa5,0x50,0xb8,0x28,0xdb,0x71}, {0xe3,0xbb,0x3b,0x99,0xf3,0x87,0x94,0x7b,0x75,0xda,0xf4,0xd6,0x72,0x6b,0x1c,0x5d,0x64,0xae,0xac,0x28,0xdc,0x34,0xb3,0x6d,0x6c,0x34,0xa5,0x50,0xb8,0x28,0xdb,0x71},
{0xf8,0x61,0xe2,0xf2,0x10,0x8d,0x51,0x2a,0xe3,0xdb,0x64,0x33,0x59,0xdd,0x75,0xfc,0x1c,0xac,0xbc,0xf1,0x43,0xce,0x3f,0xa2,0x67,0xbb,0xd1,0x3c,0x2,0xe8,0x43,0xb0}, {0xf8,0x61,0xe2,0xf2,0x10,0x8d,0x51,0x2a,0xe3,0xdb,0x64,0x33,0x59,0xdd,0x75,0xfc,0x1c,0xac,0xbc,0xf1,0x43,0xce,0x3f,0xa2,0x67,0xbb,0xd1,0x3c,0x2,0xe8,0x43,0xb0},
{0x33,0xa,0x5b,0xca,0x88,0x29,0xa1,0x75,0x7f,0x34,0x19,0x4d,0xb4,0x16,0x53,0x5c,0x92,0x3b,0x94,0xc3,0xe,0x79,0x4d,0x1e,0x79,0x74,0x75,0xd7,0xb6,0xee,0xaf,0x3f}, {0x33,0xa,0x5b,0xca,0x88,0x29,0xa1,0x75,0x7f,0x34,0x19,0x4d,0xb4,0x16,0x53,0x5c,0x92,0x3b,0x94,0xc3,0xe,0x79,0x4d,0x1e,0x79,0x74,0x75,0xd7,0xb6,0xee,0xaf,0x3f},
{0xea,0xa8,0xd4,0xf7,0xbe,0x1a,0x39,0x21,0x5c,0xf4,0x7e,0x9,0x4c,0x23,0x27,0x51,0x26,0xa3,0x24,0x53,0xba,0x32,0x3c,0xd2,0x44,0xa3,0x17,0x4a,0x6d,0xa6,0xd5,0xad}, {0xea,0xa8,0xd4,0xf7,0xbe,0x1a,0x39,0x21,0x5c,0xf4,0x7e,0x9,0x4c,0x23,0x27,0x51,0x26,0xa3,0x24,0x53,0xba,0x32,0x3c,0xd2,0x44,0xa3,0x17,0x4a,0x6d,0xa6,0xd5,0xad},
{0xb5,0x1d,0x3e,0xa6,0xaf,0xf2,0xc9,0x8,0x83,0x59,0x3d,0x98,0x91,0x6b,0x3c,0x56,0x4c,0xf8,0x7c,0xa1,0x72,0x86,0x60,0x4d,0x46,0xe2,0x3e,0xcc,0x8,0x6e,0xc7,0xf6}, {0xb5,0x1d,0x3e,0xa6,0xaf,0xf2,0xc9,0x8,0x83,0x59,0x3d,0x98,0x91,0x6b,0x3c,0x56,0x4c,0xf8,0x7c,0xa1,0x72,0x86,0x60,0x4d,0x46,0xe2,0x3e,0xcc,0x8,0x6e,0xc7,0xf6},
{0x2f,0x98,0x33,0xb3,0xb1,0xbc,0x76,0x5e,0x2b,0xd6,0x66,0xa5,0xef,0xc4,0xe6,0x2a,0x6,0xf4,0xb6,0xe8,0xbe,0xc1,0xd4,0x36,0x74,0xee,0x82,0x15,0xbc,0xef,0x21,0x63}, {0x2f,0x98,0x33,0xb3,0xb1,0xbc,0x76,0x5e,0x2b,0xd6,0x66,0xa5,0xef,0xc4,0xe6,0x2a,0x6,0xf4,0xb6,0xe8,0xbe,0xc1,0xd4,0x36,0x74,0xee,0x82,0x15,0xbc,0xef,0x21,0x63},
{0xfd,0xc1,0x4e,0xd,0xf4,0x53,0xc9,0x69,0xa7,0x7d,0x5a,0xc4,0x6,0x58,0x58,0x26,0x7e,0xc1,0x14,0x16,0x6,0xe0,0xfa,0x16,0x7e,0x90,0xaf,0x3d,0x28,0x63,0x9d,0x3f}, {0xfd,0xc1,0x4e,0xd,0xf4,0x53,0xc9,0x69,0xa7,0x7d,0x5a,0xc4,0x6,0x58,0x58,0x26,0x7e,0xc1,0x14,0x16,0x6,0xe0,0xfa,0x16,0x7e,0x90,0xaf,0x3d,0x28,0x63,0x9d,0x3f},
{0xd2,0xc9,0xf2,0xe3,0x0,0x9b,0xd2,0xc,0x5f,0xaa,0xce,0x30,0xb7,0xd4,0xc,0x30,0x74,0x2a,0x51,0x16,0xf2,0xe0,0x32,0x98,0xd,0xeb,0x30,0xd8,0xe3,0xce,0xf8,0x9a}, {0xd2,0xc9,0xf2,0xe3,0x0,0x9b,0xd2,0xc,0x5f,0xaa,0xce,0x30,0xb7,0xd4,0xc,0x30,0x74,0x2a,0x51,0x16,0xf2,0xe0,0x32,0x98,0xd,0xeb,0x30,0xd8,0xe3,0xce,0xf8,0x9a},
{0x4b,0xc5,0x9e,0x7b,0xb5,0xf1,0x79,0x92,0xff,0x51,0xe6,0x6e,0x4,0x86,0x68,0xd3,0x9b,0x23,0x4d,0x57,0xe6,0x96,0x67,0x31,0xcc,0xe6,0xa6,0xf3,0x17,0xa,0x75,0x5}, {0x4b,0xc5,0x9e,0x7b,0xb5,0xf1,0x79,0x92,0xff,0x51,0xe6,0x6e,0x4,0x86,0x68,0xd3,0x9b,0x23,0x4d,0x57,0xe6,0x96,0x67,0x31,0xcc,0xe6,0xa6,0xf3,0x17,0xa,0x75,0x5},
{0xb1,0x76,0x81,0xd9,0x13,0x32,0x6c,0xce,0x3c,0x17,0x52,0x84,0xf8,0x5,0xa2,0x62,0xf4,0x2b,0xcb,0xb3,0x78,0x47,0x15,0x47,0xff,0x46,0x54,0x82,0x23,0x93,0x6a,0x48}, {0xb1,0x76,0x81,0xd9,0x13,0x32,0x6c,0xce,0x3c,0x17,0x52,0x84,0xf8,0x5,0xa2,0x62,0xf4,0x2b,0xcb,0xb3,0x78,0x47,0x15,0x47,0xff,0x46,0x54,0x82,0x23,0x93,0x6a,0x48},
{0x38,0xdf,0x58,0x7,0x4e,0x5e,0x65,0x65,0xf2,0xfc,0x7c,0x89,0xfc,0x86,0x50,0x8e,0x31,0x70,0x2e,0x44,0xd0,0xb,0xca,0x86,0xf0,0x40,0x9,0xa2,0x30,0x78,0x47,0x4e}, {0x38,0xdf,0x58,0x7,0x4e,0x5e,0x65,0x65,0xf2,0xfc,0x7c,0x89,0xfc,0x86,0x50,0x8e,0x31,0x70,0x2e,0x44,0xd0,0xb,0xca,0x86,0xf0,0x40,0x9,0xa2,0x30,0x78,0x47,0x4e},
{0x65,0xa0,0xee,0x39,0xd1,0xf7,0x38,0x83,0xf7,0x5e,0xe9,0x37,0xe4,0x2c,0x3a,0xbd,0x21,0x97,0xb2,0x26,0x1,0x13,0xf8,0x6f,0xa3,0x44,0xed,0xd1,0xef,0x9f,0xde,0xe7}, {0x65,0xa0,0xee,0x39,0xd1,0xf7,0x38,0x83,0xf7,0x5e,0xe9,0x37,0xe4,0x2c,0x3a,0xbd,0x21,0x97,0xb2,0x26,0x1,0x13,0xf8,0x6f,0xa3,0x44,0xed,0xd1,0xef,0x9f,0xde,0xe7},
{0x8b,0xa0,0xdf,0x15,0x76,0x25,0x92,0xd9,0x3c,0x85,0xf7,0xf6,0x12,0xdc,0x42,0xbe,0xd8,0xa7,0xec,0x7c,0xab,0x27,0xb0,0x7e,0x53,0x8d,0x7d,0xda,0xaa,0x3e,0xa8,0xde}, {0x8b,0xa0,0xdf,0x15,0x76,0x25,0x92,0xd9,0x3c,0x85,0xf7,0xf6,0x12,0xdc,0x42,0xbe,0xd8,0xa7,0xec,0x7c,0xab,0x27,0xb0,0x7e,0x53,0x8d,0x7d,0xda,0xaa,0x3e,0xa8,0xde},
{0xaa,0x25,0xce,0x93,0xbd,0x2,0x69,0xd8,0x5a,0xf6,0x43,0xfd,0x1a,0x73,0x8,0xf9,0xc0,0x5f,0xef,0xda,0x17,0x4a,0x19,0xa5,0x97,0x4d,0x66,0x33,0x4c,0xfd,0x21,0x6a}, {0xaa,0x25,0xce,0x93,0xbd,0x2,0x69,0xd8,0x5a,0xf6,0x43,0xfd,0x1a,0x73,0x8,0xf9,0xc0,0x5f,0xef,0xda,0x17,0x4a,0x19,0xa5,0x97,0x4d,0x66,0x33,0x4c,0xfd,0x21,0x6a},
{0x35,0xb4,0x98,0x31,0xdb,0x41,0x15,0x70,0xea,0x1e,0xf,0xbb,0xed,0xcd,0x54,0x9b,0x9a,0xd0,0x63,0xa1,0x51,0x97,0x40,0x72,0xf6,0x75,0x9d,0xbf,0x91,0x47,0x6f,0xe2}}; {0x35,0xb4,0x98,0x31,0xdb,0x41,0x15,0x70,0xea,0x1e,0xf,0xbb,0xed,0xcd,0x54,0x9b,0x9a,0xd0,0x63,0xa1,0x51,0x97,0x40,0x72,0xf6,0x75,0x9d,0xbf,0x91,0x47,0x6f,0xe2}};
/*swapping bit 2i with bit 2i+1 of 32-bit x*/ /*swapping bit 2i with bit 2i+1 of 32-bit x*/
#define SWAP1(x) (x) = ((((x) & 0x55555555UL) << 1) | (((x) & 0xaaaaaaaaUL) >> 1)); #define SWAP1(x) (x) = ((((x) & 0x55555555UL) << 1) | (((x) & 0xaaaaaaaaUL) >> 1));
/*swapping bits 4i||4i+1 with bits 4i+2||4i+3 of 32-bit x*/ /*swapping bits 4i||4i+1 with bits 4i+2||4i+3 of 32-bit x*/
#define SWAP2(x) (x) = ((((x) & 0x33333333UL) << 2) | (((x) & 0xccccccccUL) >> 2)); #define SWAP2(x) (x) = ((((x) & 0x33333333UL) << 2) | (((x) & 0xccccccccUL) >> 2));
/*swapping bits 8i||8i+1||8i+2||8i+3 with bits 8i+4||8i+5||8i+6||8i+7 of 32-bit x*/ /*swapping bits 8i||8i+1||8i+2||8i+3 with bits 8i+4||8i+5||8i+6||8i+7 of 32-bit x*/
#define SWAP4(x) (x) = ((((x) & 0x0f0f0f0fUL) << 4) | (((x) & 0xf0f0f0f0UL) >> 4)); #define SWAP4(x) (x) = ((((x) & 0x0f0f0f0fUL) << 4) | (((x) & 0xf0f0f0f0UL) >> 4));
/*swapping bits 16i||16i+1||......||16i+7 with bits 16i+8||16i+9||......||16i+15 of 32-bit x*/ /*swapping bits 16i||16i+1||......||16i+7 with bits 16i+8||16i+9||......||16i+15 of 32-bit x*/
#define SWAP8(x) (x) = ((((x) & 0x00ff00ffUL) << 8) | (((x) & 0xff00ff00UL) >> 8)); //#define SWAP8(x) (x) = ((((x) & 0x00ff00ffUL) << 8) | (((x) & 0xff00ff00UL) >> 8));
/*swapping bits 32i||32i+1||......||32i+15 with bits 32i+16||32i+17||......||32i+31 of 32-bit x*/ #define SWAP8(x) (x) = __byte_perm(x, x, 0x2301);
#define SWAP16(x) (x) = ((((x) & 0x0000ffffUL) << 16) | (((x) & 0xffff0000UL) >> 16)); /*swapping bits 32i||32i+1||......||32i+15 with bits 32i+16||32i+17||......||32i+31 of 32-bit x*/
//#define SWAP16(x) (x) = ((((x) & 0x0000ffffUL) << 16) | (((x) & 0xffff0000UL) >> 16));
/*The MDS transform*/ #define SWAP16(x) (x) = __byte_perm(x, x, 0x1032);
#define L(m0,m1,m2,m3,m4,m5,m6,m7) \
(m4) ^= (m1); \ /*The MDS transform*/
(m5) ^= (m2); \ #define L(m0,m1,m2,m3,m4,m5,m6,m7) \
(m6) ^= (m0) ^ (m3); \ (m4) ^= (m1); \
(m7) ^= (m0); \ (m5) ^= (m2); \
(m0) ^= (m5); \ (m6) ^= (m0) ^ (m3); \
(m1) ^= (m6); \ (m7) ^= (m0); \
(m2) ^= (m4) ^ (m7); \ (m0) ^= (m5); \
(m3) ^= (m4); (m1) ^= (m6); \
(m2) ^= (m4) ^ (m7); \
/*The Sbox*/ (m3) ^= (m4);
#define Sbox(m0,m1,m2,m3,cc) \
m3 = ~(m3); \ /*The Sbox*/
m0 ^= ((~(m2)) & (cc)); \ #define Sbox(m0,m1,m2,m3,cc) \
temp0 = (cc) ^ ((m0) & (m1));\ m3 = ~(m3); \
m0 ^= ((m2) & (m3)); \ m0 ^= ((~(m2)) & (cc)); \
m3 ^= ((~(m1)) & (m2)); \ temp0 = (cc) ^ ((m0) & (m1));\
m1 ^= ((m0) & (m2)); \ m0 ^= ((m2) & (m3)); \
m2 ^= ((m0) & (~(m3))); \ m3 ^= ((~(m1)) & (m2)); \
m0 ^= ((m1) | (m3)); \ m1 ^= ((m0) & (m2)); \
m3 ^= ((m1) & (m2)); \ m2 ^= ((m0) & (~(m3))); \
m1 ^= (temp0 & (m0)); \ m0 ^= ((m1) | (m3)); \
m2 ^= temp0; m3 ^= ((m1) & (m2)); \
m1 ^= (temp0 & (m0)); \
__device__ __forceinline__ void Sbox_and_MDS_layer(hashState* state, uint32_t roundnumber) m2 ^= temp0;
{
uint32_t temp0; __device__ __forceinline__ void Sbox_and_MDS_layer(hashState* state, uint32_t roundnumber)
uint32_t cc0, cc1; {
//Sbox and MDS layer uint32_t temp0;
#pragma unroll 4 uint32_t cc0, cc1;
for (int i = 0; i < 4; i++) { //Sbox and MDS layer
cc0 = ((uint32_t*)c_E8_bitslice_roundconstant[roundnumber])[i]; #pragma unroll 4
cc1 = ((uint32_t*)c_E8_bitslice_roundconstant[roundnumber])[i+4]; for (int i = 0; i < 4; i++) {
Sbox(state->x[0][i],state->x[2][i], state->x[4][i], state->x[6][i], cc0); cc0 = ((uint32_t*)c_E8_bitslice_roundconstant[roundnumber])[i];
Sbox(state->x[1][i],state->x[3][i], state->x[5][i], state->x[7][i], cc1); cc1 = ((uint32_t*)c_E8_bitslice_roundconstant[roundnumber])[i+4];
L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]); Sbox(state->x[0][i],state->x[2][i], state->x[4][i], state->x[6][i], cc0);
} Sbox(state->x[1][i],state->x[3][i], state->x[5][i], state->x[7][i], cc1);
} L(state->x[0][i],state->x[2][i],state->x[4][i],state->x[6][i],state->x[1][i],state->x[3][i],state->x[5][i],state->x[7][i]);
}
__device__ __forceinline__ void RoundFunction0(hashState* state, uint32_t roundnumber) }
{
Sbox_and_MDS_layer(state, roundnumber); __device__ __forceinline__ void RoundFunction0(hashState* state, uint32_t roundnumber)
{
#pragma unroll 4 Sbox_and_MDS_layer(state, roundnumber);
for (int j = 1; j < 8; j = j+2)
{ #pragma unroll 4
#pragma unroll 4 for (int j = 1; j < 8; j = j+2)
for (int i = 0; i < 4; i++) SWAP1(state->x[j][i]); {
} #pragma unroll 4
} for (int i = 0; i < 4; i++) SWAP1(state->x[j][i]);
}
__device__ __forceinline__ void RoundFunction1(hashState* state, uint32_t roundnumber) }
{
Sbox_and_MDS_layer(state, roundnumber); __device__ __forceinline__ void RoundFunction1(hashState* state, uint32_t roundnumber)
{
#pragma unroll 4 Sbox_and_MDS_layer(state, roundnumber);
for (int j = 1; j < 8; j = j+2)
{ #pragma unroll 4
#pragma unroll 4 for (int j = 1; j < 8; j = j+2)
for (int i = 0; i < 4; i++) SWAP2(state->x[j][i]); {
} #pragma unroll 4
} for (int i = 0; i < 4; i++) SWAP2(state->x[j][i]);
}
__device__ __forceinline__ void RoundFunction2(hashState* state, uint32_t roundnumber) }
{
Sbox_and_MDS_layer(state, roundnumber); __device__ __forceinline__ void RoundFunction2(hashState* state, uint32_t roundnumber)
{
#pragma unroll 4 Sbox_and_MDS_layer(state, roundnumber);
for (int j = 1; j < 8; j = j+2)
{ #pragma unroll 4
#pragma unroll 4 for (int j = 1; j < 8; j = j+2)
for (int i = 0; i < 4; i++) SWAP4(state->x[j][i]); {
} #pragma unroll 4
} for (int i = 0; i < 4; i++) SWAP4(state->x[j][i]);
}
__device__ __forceinline__ void RoundFunction3(hashState* state, uint32_t roundnumber) }
{
Sbox_and_MDS_layer(state, roundnumber); __device__ __forceinline__ void RoundFunction3(hashState* state, uint32_t roundnumber)
{
#pragma unroll 4 Sbox_and_MDS_layer(state, roundnumber);
for (int j = 1; j < 8; j = j+2)
{ #pragma unroll 4
#pragma unroll 4 for (int j = 1; j < 8; j = j+2)
for (int i = 0; i < 4; i++) SWAP8(state->x[j][i]); {
} #pragma unroll 4
} for (int i = 0; i < 4; i++) SWAP8(state->x[j][i]);
}
__device__ __forceinline__ void RoundFunction4(hashState* state, uint32_t roundnumber) }
{
Sbox_and_MDS_layer(state, roundnumber); __device__ __forceinline__ void RoundFunction4(hashState* state, uint32_t roundnumber)
{
#pragma unroll 4 Sbox_and_MDS_layer(state, roundnumber);
for (int j = 1; j < 8; j = j+2)
{ #pragma unroll 4
#pragma unroll 4 for (int j = 1; j < 8; j = j+2)
for (int i = 0; i < 4; i++) SWAP16(state->x[j][i]); {
} #pragma unroll 4
} for (int i = 0; i < 4; i++) SWAP16(state->x[j][i]);
}
__device__ __forceinline__ void RoundFunction5(hashState* state, uint32_t roundnumber) }
{
uint32_t temp0; __device__ __forceinline__ void RoundFunction5(hashState* state, uint32_t roundnumber)
{
Sbox_and_MDS_layer(state, roundnumber); uint32_t temp0;
#pragma unroll 4 Sbox_and_MDS_layer(state, roundnumber);
for (int j = 1; j < 8; j = j+2)
{ #pragma unroll 4
#pragma unroll 2 for (int j = 1; j < 8; j = j+2)
for (int i = 0; i < 4; i = i+2) { {
temp0 = state->x[j][i]; state->x[j][i] = state->x[j][i+1]; state->x[j][i+1] = temp0; #pragma unroll 2
} for (int i = 0; i < 4; i = i+2) {
} temp0 = state->x[j][i]; state->x[j][i] = state->x[j][i+1]; state->x[j][i+1] = temp0;
} }
}
__device__ __forceinline__ void RoundFunction6(hashState* state, uint32_t roundnumber) }
{
uint32_t temp0; __device__ __forceinline__ void RoundFunction6(hashState* state, uint32_t roundnumber)
{
Sbox_and_MDS_layer(state, roundnumber); uint32_t temp0;
#pragma unroll 4 Sbox_and_MDS_layer(state, roundnumber);
for (int j = 1; j < 8; j = j+2)
{ #pragma unroll 4
#pragma unroll 2 for (int j = 1; j < 8; j = j+2)
for (int i = 0; i < 2; i++) { {
temp0 = state->x[j][i]; state->x[j][i] = state->x[j][i+2]; state->x[j][i+2] = temp0; #pragma unroll 2
} for (int i = 0; i < 2; i++) {
} temp0 = state->x[j][i]; state->x[j][i] = state->x[j][i+2]; state->x[j][i+2] = temp0;
} }
}
/*The bijective function E8, in bitslice form */ }
__device__ __forceinline__ void E8(hashState *state)
{ /*The bijective function E8, in bitslice form */
/*perform 6 rounds*/ __device__ __forceinline__ void E8(hashState *state)
//#pragma unroll 6 {
for (int i = 0; i < 42; i+=7) /*perform 6 rounds*/
{ //#pragma unroll 6
RoundFunction0(state, i); for (int i = 0; i < 42; i+=7)
RoundFunction1(state, i+1); {
RoundFunction2(state, i+2); RoundFunction0(state, i);
RoundFunction3(state, i+3); RoundFunction1(state, i+1);
RoundFunction4(state, i+4); RoundFunction2(state, i+2);
RoundFunction5(state, i+5); RoundFunction3(state, i+3);
RoundFunction6(state, i+6); RoundFunction4(state, i+4);
} RoundFunction5(state, i+5);
} RoundFunction6(state, i+6);
}
/*The compression function F8 */ }
__device__ __forceinline__ void F8(hashState *state)
{ /*The compression function F8 */
/*xor the 512-bit message with the fist half of the 1024-bit hash state*/ __device__ __forceinline__ void F8(hashState *state)
#pragma unroll 16 {
for (int i = 0; i < 16; i++) state->x[i >> 2][i & 3] ^= ((uint32_t*)state->buffer)[i]; /*xor the 512-bit message with the fist half of the 1024-bit hash state*/
#pragma unroll 16
/*the bijective function E8 */ for (int i = 0; i < 16; i++) state->x[i >> 2][i & 3] ^= ((uint32_t*)state->buffer)[i];
E8(state);
/*the bijective function E8 */
/*xor the 512-bit message with the second half of the 1024-bit hash state*/ E8(state);
#pragma unroll 16
for (int i = 0; i < 16; i++) state->x[(16+i) >> 2][(16+i) & 3] ^= ((uint32_t*)state->buffer)[i]; /*xor the 512-bit message with the second half of the 1024-bit hash state*/
} #pragma unroll 16
for (int i = 0; i < 16; i++) state->x[(16+i) >> 2][(16+i) & 3] ^= ((uint32_t*)state->buffer)[i];
}
__device__ __forceinline__ void JHHash(const uint32_t *data, uint32_t *hashval)
{
hashState state; __device__ __forceinline__ void JHHash(const uint32_t *data, uint32_t *hashval)
{
/*load the intital hash value H0 into state*/ hashState state;
/*
#define INIT(a,b,c,d) ((a) | ((b)<<8) | ((c)<<16) | ((d)<<24)) /*load the intital hash value H0 into state*/
state.x[0][0] = INIT(0x6f,0xd1,0x4b,0x96); /*
state.x[0][1] = INIT(0x3e,0x00,0xaa,0x17); #define INIT(a,b,c,d) ((a) | ((b)<<8) | ((c)<<16) | ((d)<<24))
state.x[0][2] = INIT(0x63,0x6a,0x2e,0x05); state.x[0][0] = INIT(0x6f,0xd1,0x4b,0x96);
state.x[0][3] = INIT(0x7a,0x15,0xd5,0x43); state.x[0][1] = INIT(0x3e,0x00,0xaa,0x17);
state.x[1][0] = INIT(0x8a,0x22,0x5e,0x8d); state.x[0][2] = INIT(0x63,0x6a,0x2e,0x05);
state.x[1][1] = INIT(0x0c,0x97,0xef,0x0b); state.x[0][3] = INIT(0x7a,0x15,0xd5,0x43);
state.x[1][2] = INIT(0xe9,0x34,0x12,0x59); state.x[1][0] = INIT(0x8a,0x22,0x5e,0x8d);
state.x[1][3] = INIT(0xf2,0xb3,0xc3,0x61); state.x[1][1] = INIT(0x0c,0x97,0xef,0x0b);
state.x[2][0] = INIT(0x89,0x1d,0xa0,0xc1); state.x[1][2] = INIT(0xe9,0x34,0x12,0x59);
state.x[2][1] = INIT(0x53,0x6f,0x80,0x1e); state.x[1][3] = INIT(0xf2,0xb3,0xc3,0x61);
state.x[2][2] = INIT(0x2a,0xa9,0x05,0x6b); state.x[2][0] = INIT(0x89,0x1d,0xa0,0xc1);
state.x[2][3] = INIT(0xea,0x2b,0x6d,0x80); state.x[2][1] = INIT(0x53,0x6f,0x80,0x1e);
state.x[3][0] = INIT(0x58,0x8e,0xcc,0xdb); state.x[2][2] = INIT(0x2a,0xa9,0x05,0x6b);
state.x[3][1] = INIT(0x20,0x75,0xba,0xa6); state.x[2][3] = INIT(0xea,0x2b,0x6d,0x80);
state.x[3][2] = INIT(0xa9,0x0f,0x3a,0x76); state.x[3][0] = INIT(0x58,0x8e,0xcc,0xdb);
state.x[3][3] = INIT(0xba,0xf8,0x3b,0xf7); state.x[3][1] = INIT(0x20,0x75,0xba,0xa6);
state.x[4][0] = INIT(0x01,0x69,0xe6,0x05); state.x[3][2] = INIT(0xa9,0x0f,0x3a,0x76);
state.x[4][1] = INIT(0x41,0xe3,0x4a,0x69); state.x[3][3] = INIT(0xba,0xf8,0x3b,0xf7);
state.x[4][2] = INIT(0x46,0xb5,0x8a,0x8e); state.x[4][0] = INIT(0x01,0x69,0xe6,0x05);
state.x[4][3] = INIT(0x2e,0x6f,0xe6,0x5a); state.x[4][1] = INIT(0x41,0xe3,0x4a,0x69);
state.x[5][0] = INIT(0x10,0x47,0xa7,0xd0); state.x[4][2] = INIT(0x46,0xb5,0x8a,0x8e);
state.x[5][1] = INIT(0xc1,0x84,0x3c,0x24); state.x[4][3] = INIT(0x2e,0x6f,0xe6,0x5a);
state.x[5][2] = INIT(0x3b,0x6e,0x71,0xb1); state.x[5][0] = INIT(0x10,0x47,0xa7,0xd0);
state.x[5][3] = INIT(0x2d,0x5a,0xc1,0x99); state.x[5][1] = INIT(0xc1,0x84,0x3c,0x24);
state.x[6][0] = INIT(0xcf,0x57,0xf6,0xec); state.x[5][2] = INIT(0x3b,0x6e,0x71,0xb1);
state.x[6][1] = INIT(0x9d,0xb1,0xf8,0x56); state.x[5][3] = INIT(0x2d,0x5a,0xc1,0x99);
state.x[6][2] = INIT(0xa7,0x06,0x88,0x7c); state.x[6][0] = INIT(0xcf,0x57,0xf6,0xec);
state.x[6][3] = INIT(0x57,0x16,0xb1,0x56); state.x[6][1] = INIT(0x9d,0xb1,0xf8,0x56);
state.x[7][0] = INIT(0xe3,0xc2,0xfc,0xdf); state.x[6][2] = INIT(0xa7,0x06,0x88,0x7c);
state.x[7][1] = INIT(0xe6,0x85,0x17,0xfb); state.x[6][3] = INIT(0x57,0x16,0xb1,0x56);
state.x[7][2] = INIT(0x54,0x5a,0x46,0x78); state.x[7][0] = INIT(0xe3,0xc2,0xfc,0xdf);
state.x[7][3] = INIT(0xcc,0x8c,0xdd,0x4b); state.x[7][1] = INIT(0xe6,0x85,0x17,0xfb);
*/ state.x[7][2] = INIT(0x54,0x5a,0x46,0x78);
#pragma unroll 8 state.x[7][3] = INIT(0xcc,0x8c,0xdd,0x4b);
for(int j=0;j<8;j++) */
{ #pragma unroll 8
#pragma unroll 4 for(int j=0;j<8;j++)
for(int i=0;i<4;i++) {
state.x[j][i] = c_INIT_bitslice[j][i]; #pragma unroll 4
} for(int i=0;i<4;i++)
state.x[j][i] = c_INIT_bitslice[j][i];
#pragma unroll 16 }
for (int i=0; i < 16; ++i) state.buffer[i] = data[i];
F8(&state); #pragma unroll 16
for (int i=0; i < 16; ++i) state.buffer[i] = data[i];
/*pad the message when databitlen is multiple of 512 bits, then process the padded block*/ F8(&state);
state.buffer[0] = 0x80;
#pragma unroll 14 /*pad the message when databitlen is multiple of 512 bits, then process the padded block*/
for (int i=1; i < 15; i++) state.buffer[i] = 0; state.buffer[0] = 0x80;
state.buffer[15] = 0x00020000; #pragma unroll 14
F8(&state); for (int i=1; i < 15; i++) state.buffer[i] = 0;
state.buffer[15] = 0x00020000;
/*truncating the final hash value to generate the message digest*/ F8(&state);
#pragma unroll 16
for (int i=0; i < 16; ++i) hashval[i] = state.x[4][i]; /*truncating the final hash value to generate the message digest*/
} #pragma unroll 16
for (int i=0; i < 16; ++i) hashval[i] = state.x[4][i];
// Die Hash-Funktion }
__global__ void quark_jh512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{ // Die Hash-Funktion
int thread = (blockDim.x * blockIdx.x + threadIdx.x); __global__ void quark_jh512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
if (thread < threads) {
{ int thread = (blockDim.x * blockIdx.x + threadIdx.x);
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); if (thread < threads)
{
int hashPosition = nounce - startNounce; uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
int hashPosition = nounce - startNounce;
JHHash(Hash, Hash); uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
}
} JHHash(Hash, Hash);
}
}
// Setup-Funktionen
__host__ void quark_jh512_cpu_init(int thr_id, int threads)
{ // Setup-Funktionen
__host__ void quark_jh512_cpu_init(int thr_id, int threads)
cudaMemcpyToSymbol( c_E8_bitslice_roundconstant, {
h_E8_bitslice_roundconstant,
sizeof(h_E8_bitslice_roundconstant), cudaMemcpyToSymbol( c_E8_bitslice_roundconstant,
0, cudaMemcpyHostToDevice); h_E8_bitslice_roundconstant,
sizeof(h_E8_bitslice_roundconstant),
cudaMemcpyToSymbol( c_INIT_bitslice, 0, cudaMemcpyHostToDevice);
h_INIT_bitslice,
sizeof(h_INIT_bitslice), cudaMemcpyToSymbol( c_INIT_bitslice,
0, cudaMemcpyHostToDevice); h_INIT_bitslice,
} sizeof(h_INIT_bitslice),
0, cudaMemcpyHostToDevice);
__host__ void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) }
{
const int threadsperblock = 256; __host__ void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
// berechne wie viele Thread Blocks wir brauchen const int threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
// Größe des dynamischen Shared Memory Bereichs dim3 block(threadsperblock);
size_t shared_size = 0;
// Größe des dynamischen Shared Memory Bereichs
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); size_t shared_size = 0;
quark_jh512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
MyStreamSynchronize(NULL, order, thr_id);
} quark_jh512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id);
}

916
quark/cuda_quark_blake512.cu

@ -1,482 +1,434 @@
#include <cuda.h> #include <cuda.h>
#include "cuda_runtime.h" #include "cuda_runtime.h"
#include "device_launch_parameters.h" #include "device_launch_parameters.h"
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
#define USE_SHUFFLE 0 #define USE_SHUFFLE 0
// Folgende Definitionen später durch header ersetzen // Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t; typedef unsigned char uint8_t;
typedef unsigned int uint32_t; typedef unsigned int uint32_t;
typedef unsigned long long uint64_t; typedef unsigned long long uint64_t;
// aus heavy.cu // aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// die Message it Padding zur Berechnung auf der GPU // die Message it Padding zur Berechnung auf der GPU
__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
// ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------ // ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------
__constant__ uint8_t c_sigma[16][16]; __constant__ uint8_t c_sigma[16][16];
const uint8_t host_sigma[16][16] = const uint8_t host_sigma[16][16] =
{ {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 }, {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
}; };
// das Hi Word aus einem 64 Bit Typen extrahieren // das Hi Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t HIWORD(const uint64_t &x) { static __device__ uint32_t HIWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130 #if __CUDA_ARCH__ >= 130
return (uint32_t)__double2hiint(__longlong_as_double(x)); return (uint32_t)__double2hiint(__longlong_as_double(x));
#else #else
return (uint32_t)(x >> 32); return (uint32_t)(x >> 32);
#endif #endif
} }
// das Hi Word in einem 64 Bit Typen ersetzen // das Hi Word in einem 64 Bit Typen ersetzen
static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) { static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL); return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL);
} }
// das Lo Word aus einem 64 Bit Typen extrahieren // das Lo Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t LOWORD(const uint64_t &x) { static __device__ uint32_t LOWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130 #if __CUDA_ARCH__ >= 130
return (uint32_t)__double2loint(__longlong_as_double(x)); return (uint32_t)__double2loint(__longlong_as_double(x));
#else #else
return (uint32_t)(x & 0xFFFFFFFFULL); return (uint32_t)(x & 0xFFFFFFFFULL);
#endif #endif
} }
// das Lo Word in einem 64 Bit Typen ersetzen // das Lo Word in einem 64 Bit Typen ersetzen
static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) { static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y); return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
} }
/* __device__ __forceinline__ uint64_t SWAP64(uint64_t x)
#define SWAP32(x) \ {
((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ // Input: 77665544 33221100
(((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) // Output: 00112233 44556677
uint64_t temp[2];
#define SWAP64(x) \ temp[0] = __byte_perm(HIWORD(x), 0, 0x0123);
((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \ temp[1] = __byte_perm(LOWORD(x), 0, 0x0123);
(((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \
(((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \ return temp[0] | (temp[1]<<32);
(((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \ }
(((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \
(((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \ __constant__ uint64_t c_u512[16];
(((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \
(((uint64_t)(x) & 0x00000000000000ffULL) << 56))) const uint64_t host_u512[16] =
*/ {
0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
/* 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
__device__ __forceinline__ void SWAP32(uint32_t *x) 0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL,
{ 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL,
// Input: 33221100 0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL,
// Output: 00112233 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL,
x[0] = __byte_perm(x[0], 0, 0x0123); 0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL,
} 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
*/ };
__device__ __forceinline__ uint64_t SWAP64(uint64_t x)
{
// Input: 77665544 33221100 // diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt
// Output: 00112233 44556677 #if __CUDA_ARCH__ >= 350
uint64_t temp[2]; __forceinline__ __device__ uint64_t ROTR(const uint64_t value, const int offset) {
temp[0] = __byte_perm(HIWORD(x), 0, 0x0123); uint2 result;
temp[1] = __byte_perm(LOWORD(x), 0, 0x0123); if(offset < 32) {
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
return temp[0] | (temp[1]<<32); asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
} } else {
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
__constant__ uint64_t c_u512[16]; asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
}
const uint64_t host_u512[16] = return __double_as_longlong(__hiloint2double(result.y, result.x));
{ }
0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, #else
0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL, #define ROTR(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, #endif
0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL,
0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, #define G(a,b,c,d,e) \
0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL, v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\
0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, v[d] = ROTR( v[d] ^ v[a],32); \
0x0801f2e2858efc16ULL, 0x636920d871574e69ULL v[c] += v[d]; \
}; v[b] = ROTR( v[b] ^ v[c],25); \
v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b]; \
v[d] = ROTR( v[d] ^ v[a],16); \
// diese 64 Bit Rotates werden unter Compute 3.5 (und besser) mit dem Funnel Shifter beschleunigt v[c] += v[d]; \
#if __CUDA_ARCH__ >= 350 v[b] = ROTR( v[b] ^ v[c],11);
__forceinline__ __device__ uint64_t ROTR(const uint64_t value, const int offset) {
uint2 result;
if(offset < 32) { __device__ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits )
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); {
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); uint64_t v[16], m[16], i;
} else {
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); #pragma unroll 16
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); for( i = 0; i < 16; ++i )
} {
return __double_as_longlong(__hiloint2double(result.y, result.x)); m[i] = SWAP64(block[i]);
} }
#else
#define ROTR(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) #pragma unroll 8
#endif for( i = 0; i < 8; ++i ) v[i] = h[i];
#define G(a,b,c,d,e) \ v[ 8] = u512[0];
v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\ v[ 9] = u512[1];
v[d] = ROTR( v[d] ^ v[a],32); \ v[10] = u512[2];
v[c] += v[d]; \ v[11] = u512[3];
v[b] = ROTR( v[b] ^ v[c],25); \ v[12] = u512[4];
v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b]; \ v[13] = u512[5];
v[d] = ROTR( v[d] ^ v[a],16); \ v[14] = u512[6];
v[c] += v[d]; \ v[15] = u512[7];
v[b] = ROTR( v[b] ^ v[c],11);
v[12] ^= bits;
v[13] ^= bits;
__device__ void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits )
{ //#pragma unroll 16
uint64_t v[16], m[16], i; for( i = 0; i < 16; ++i )
{
#pragma unroll 16 /* column step */
for( i = 0; i < 16; ++i ) G( 0, 4, 8, 12, 0 );
{ G( 1, 5, 9, 13, 2 );
m[i] = SWAP64(block[i]); G( 2, 6, 10, 14, 4 );
} G( 3, 7, 11, 15, 6 );
/* diagonal step */
#pragma unroll 8 G( 0, 5, 10, 15, 8 );
for( i = 0; i < 8; ++i ) v[i] = h[i]; G( 1, 6, 11, 12, 10 );
G( 2, 7, 8, 13, 12 );
v[ 8] = u512[0]; G( 3, 4, 9, 14, 14 );
v[ 9] = u512[1]; }
v[10] = u512[2];
v[11] = u512[3]; #pragma unroll 16
v[12] = u512[4]; for( i = 0; i < 16; ++i ) h[i % 8] ^= v[i];
v[13] = u512[5]; }
v[14] = u512[6];
v[15] = u512[7]; // Endian Drehung für 32 Bit Typen
v[12] ^= bits; static __device__ uint32_t cuda_swab32(uint32_t x)
v[13] ^= bits; {
return __byte_perm(x, 0, 0x0123);
//#pragma unroll 16 }
for( i = 0; i < 16; ++i )
{ /*
/* column step */ // Endian Drehung für 64 Bit Typen
G( 0, 4, 8, 12, 0 ); static __device__ uint64_t cuda_swab64(uint64_t x) {
G( 1, 5, 9, 13, 2 ); uint32_t h = (x >> 32);
G( 2, 6, 10, 14, 4 ); uint32_t l = (x & 0xFFFFFFFFULL);
G( 3, 7, 11, 15, 6 ); return (((uint64_t)cuda_swab32(l)) << 32) | ((uint64_t)cuda_swab32(h));
/* diagonal step */ }
G( 0, 5, 10, 15, 8 ); */
G( 1, 6, 11, 12, 10 );
G( 2, 7, 8, 13, 12 ); static __constant__ uint64_t d_constMem[8];
G( 3, 4, 9, 14, 14 ); static const uint64_t h_constMem[8] = {
} 0x6a09e667f3bcc908ULL,
0xbb67ae8584caa73bULL,
#pragma unroll 16 0x3c6ef372fe94f82bULL,
for( i = 0; i < 16; ++i ) h[i % 8] ^= v[i]; 0xa54ff53a5f1d36f1ULL,
} 0x510e527fade682d1ULL,
0x9b05688c2b3e6c1fULL,
// Endian Drehung für 32 Bit Typen 0x1f83d9abfb41bd6bULL,
0x5be0cd19137e2179ULL };
static __device__ uint32_t cuda_swab32(uint32_t x)
{ // Hash-Padding
return __byte_perm(x, 0, 0x0123); static __constant__ uint64_t d_constHashPadding[8];
/* static const uint64_t h_constHashPadding[8] = {
return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u) 0x0000000000000080ull,
| ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu)); 0,
*/ 0,
} 0,
/* 0,
// Endian Drehung für 64 Bit Typen 0x0100000000000000ull,
static __device__ uint64_t cuda_swab64(uint64_t x) { 0,
uint32_t h = (x >> 32); 0x0002000000000000ull };
uint32_t l = (x & 0xFFFFFFFFULL);
return (((uint64_t)cuda_swab32(l)) << 32) | ((uint64_t)cuda_swab32(h)); __global__ __launch_bounds__(256, 2) void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash)
} {
*/ int thread = (blockDim.x * blockIdx.x + threadIdx.x);
static __constant__ uint64_t d_constMem[8]; #if USE_SHUFFLE
static const uint64_t h_constMem[8] = { const int warpID = threadIdx.x & 0x0F; // 16 warps
0x6a09e667f3bcc908ULL, const int warpBlockID = (thread + 15)>>4; // aufrunden auf volle Warp-Blöcke
0xbb67ae8584caa73bULL, const int maxHashPosition = thread<<3;
0x3c6ef372fe94f82bULL, #endif
0xa54ff53a5f1d36f1ULL,
0x510e527fade682d1ULL, #if USE_SHUFFLE
0x9b05688c2b3e6c1fULL, if (warpBlockID < ( (threads+15)>>4 ))
0x1f83d9abfb41bd6bULL, #else
0x5be0cd19137e2179ULL }; if (thread < threads)
#endif
// Hash-Padding {
static __constant__ uint64_t d_constHashPadding[8]; // bestimme den aktuellen Zähler
static const uint64_t h_constHashPadding[8] = { uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
0x0000000000000080ull,
0, int hashPosition = nounce - startNounce;
0, //uint64_t *inpHash = &g_hash[8 * hashPosition];
0, uint64_t *inpHash = &g_hash[hashPosition<<3];
0,
0x0100000000000000ull, // State vorbereiten
0, uint64_t h[8];
0x0002000000000000ull }; /*
h[0] = 0x6a09e667f3bcc908ULL;
__global__ void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash) h[1] = 0xbb67ae8584caa73bULL;
{ h[2] = 0x3c6ef372fe94f82bULL;
int thread = (blockDim.x * blockIdx.x + threadIdx.x); h[3] = 0xa54ff53a5f1d36f1ULL;
h[4] = 0x510e527fade682d1ULL;
#if USE_SHUFFLE h[5] = 0x9b05688c2b3e6c1fULL;
const int warpID = threadIdx.x & 0x0F; // 16 warps h[6] = 0x1f83d9abfb41bd6bULL;
const int warpBlockID = (thread + 15)>>4; // aufrunden auf volle Warp-Blöcke h[7] = 0x5be0cd19137e2179ULL;
const int maxHashPosition = thread<<3; */
#endif #pragma unroll 8
for(int i=0;i<8;i++)
#if USE_SHUFFLE h[i] = d_constMem[i];
if (warpBlockID < ( (threads+15)>>4 ))
#else // 128 Byte für die Message
if (thread < threads) uint64_t buf[16];
#endif
{ // Message für die erste Runde in Register holen
// bestimme den aktuellen Zähler #pragma unroll 8
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); for (int i=0; i < 8; ++i) buf[i] = inpHash[i];
int hashPosition = nounce - startNounce; /*
//uint64_t *inpHash = &g_hash[8 * hashPosition]; buf[ 8] = 0x0000000000000080ull;
uint64_t *inpHash = &g_hash[hashPosition<<3]; buf[ 9] = 0;
buf[10] = 0;
// State vorbereiten buf[11] = 0;
uint64_t h[8]; buf[12] = 0;
/* buf[13] = 0x0100000000000000ull;
h[0] = 0x6a09e667f3bcc908ULL; buf[14] = 0;
h[1] = 0xbb67ae8584caa73bULL; buf[15] = 0x0002000000000000ull;
h[2] = 0x3c6ef372fe94f82bULL; */
h[3] = 0xa54ff53a5f1d36f1ULL; #pragma unroll 8
h[4] = 0x510e527fade682d1ULL; for(int i=0;i<8;i++)
h[5] = 0x9b05688c2b3e6c1fULL; buf[i+8] = d_constHashPadding[i];
h[6] = 0x1f83d9abfb41bd6bULL;
h[7] = 0x5be0cd19137e2179ULL; // die einzige Hashing-Runde
*/ quark_blake512_compress( h, buf, c_sigma, c_u512, 512 );
#pragma unroll 8
for(int i=0;i<8;i++) // Hash rauslassen
h[i] = d_constMem[i]; #if __CUDA_ARCH__ >= 130
// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind
// 128 Byte für die Message uint32_t *outHash = (uint32_t*)&g_hash[8 * hashPosition];
uint64_t buf[16]; #pragma unroll 8
for (int i=0; i < 8; ++i) {
// Message für die erste Runde in Register holen outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) );
#pragma unroll 8 outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) );
for (int i=0; i < 8; ++i) buf[i] = inpHash[i]; }
#else
/* // in dieser Version passieren auch ein paar 64 Bit Shifts
buf[ 8] = 0x0000000000000080ull; uint64_t *outHash = &g_hash[8 * hashPosition];
buf[ 9] = 0; #pragma unroll 8
buf[10] = 0; for (int i=0; i < 8; ++i)
buf[11] = 0; {
buf[12] = 0; //outHash[i] = cuda_swab64( h[i] );
buf[13] = 0x0100000000000000ull; outHash[i] = SWAP64(h[i]);
buf[14] = 0; }
buf[15] = 0x0002000000000000ull; #endif
*/ }
#pragma unroll 8 }
for(int i=0;i<8;i++)
buf[i+8] = d_constHashPadding[i]; __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
{
// die einzige Hashing-Runde int thread = (blockDim.x * blockIdx.x + threadIdx.x);
quark_blake512_compress( h, buf, c_sigma, c_u512, 512 ); if (thread < threads)
{
// Hash rauslassen // bestimme den aktuellen Zähler
#if __CUDA_ARCH__ >= 130 uint32_t nounce = startNounce + thread;
// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind
uint32_t *outHash = (uint32_t*)&g_hash[8 * hashPosition]; // State vorbereiten
#pragma unroll 8 uint64_t h[8];
for (int i=0; i < 8; ++i) { /*
outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) ); h[0] = 0x6a09e667f3bcc908ULL;
outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) ); h[1] = 0xbb67ae8584caa73bULL;
} h[2] = 0x3c6ef372fe94f82bULL;
#else h[3] = 0xa54ff53a5f1d36f1ULL;
// in dieser Version passieren auch ein paar 64 Bit Shifts h[4] = 0x510e527fade682d1ULL;
uint64_t *outHash = &g_hash[8 * hashPosition]; h[5] = 0x9b05688c2b3e6c1fULL;
#pragma unroll 8 h[6] = 0x1f83d9abfb41bd6bULL;
for (int i=0; i < 8; ++i) h[7] = 0x5be0cd19137e2179ULL;
{ */
//outHash[i] = cuda_swab64( h[i] ); #pragma unroll 8
outHash[i] = SWAP64(h[i]); for(int i=0;i<8;i++)
} h[i] = d_constMem[i];
#endif // 128 Byte für die Message
} uint64_t buf[16];
}
// Message für die erste Runde in Register holen
__global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash) #pragma unroll 16
{ for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i];
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) // die Nounce durch die thread-spezifische ersetzen
{ buf[9] = REPLACE_HIWORD(buf[9], cuda_swab32(nounce));
// bestimme den aktuellen Zähler
uint32_t nounce = startNounce + thread; // die einzige Hashing-Runde
quark_blake512_compress( h, buf, c_sigma, c_u512, 640 );
// State vorbereiten
uint64_t h[8]; // Hash rauslassen
/* #if __CUDA_ARCH__ >= 130
h[0] = 0x6a09e667f3bcc908ULL; // ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind
h[1] = 0xbb67ae8584caa73bULL; uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
h[2] = 0x3c6ef372fe94f82bULL; #pragma unroll 8
h[3] = 0xa54ff53a5f1d36f1ULL; for (int i=0; i < 8; ++i) {
h[4] = 0x510e527fade682d1ULL; outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) );
h[5] = 0x9b05688c2b3e6c1fULL; outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) );
h[6] = 0x1f83d9abfb41bd6bULL; }
h[7] = 0x5be0cd19137e2179ULL; #else
*/ // in dieser Version passieren auch ein paar 64 Bit Shifts
#pragma unroll 8 uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
for(int i=0;i<8;i++) #pragma unroll 8
h[i] = d_constMem[i]; for (int i=0; i < 8; ++i)
// 128 Byte für die Message {
uint64_t buf[16]; //outHash[i] = cuda_swab64( h[i] );
outHash[i] = SWAP64(h[i]);
// Message für die erste Runde in Register holen }
#pragma unroll 16 #endif
for (int i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i]; }
}
// die Nounce durch die thread-spezifische ersetzen
buf[9] = REPLACE_HIWORD(buf[9], cuda_swab32(nounce));
// ---------------------------- END CUDA quark_blake512 functions ------------------------------------
// die einzige Hashing-Runde
quark_blake512_compress( h, buf, c_sigma, c_u512, 640 ); // Setup-Funktionen
__host__ void quark_blake512_cpu_init(int thr_id, int threads)
// Hash rauslassen {
#if __CUDA_ARCH__ >= 130 // Kopiere die Hash-Tabellen in den GPU-Speicher
// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind cudaMemcpyToSymbol( c_sigma,
uint32_t *outHash = (uint32_t *)outputHash + 16 * thread; host_sigma,
#pragma unroll 8 sizeof(host_sigma),
for (int i=0; i < 8; ++i) { 0, cudaMemcpyHostToDevice);
outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) );
outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) ); cudaMemcpyToSymbol( c_u512,
} host_u512,
#else sizeof(host_u512),
// in dieser Version passieren auch ein paar 64 Bit Shifts 0, cudaMemcpyHostToDevice);
uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
#pragma unroll 8 cudaMemcpyToSymbol( d_constMem,
for (int i=0; i < 8; ++i) h_constMem,
{ sizeof(h_constMem),
//outHash[i] = cuda_swab64( h[i] ); 0, cudaMemcpyHostToDevice);
outHash[i] = SWAP64(h[i]);
} cudaMemcpyToSymbol( d_constHashPadding,
#endif h_constHashPadding,
} sizeof(h_constHashPadding),
} 0, cudaMemcpyHostToDevice);
}
// ---------------------------- END CUDA quark_blake512 functions ------------------------------------ // Blake512 für 80 Byte grosse Eingangsdaten
__host__ void quark_blake512_cpu_setBlock_80(void *pdata)
// Setup-Funktionen {
__host__ void quark_blake512_cpu_init(int thr_id, int threads) // Message mit Padding bereitstellen
{ // lediglich die korrekte Nonce ist noch ab Byte 76 einzusetzen.
// Kopiere die Hash-Tabellen in den GPU-Speicher unsigned char PaddedMessage[128];
cudaMemcpyToSymbol( c_sigma, memcpy(PaddedMessage, pdata, 80);
host_sigma, memset(PaddedMessage+80, 0, 48);
sizeof(host_sigma), PaddedMessage[80] = 0x80;
0, cudaMemcpyHostToDevice); PaddedMessage[111] = 1;
PaddedMessage[126] = 0x02;
cudaMemcpyToSymbol( c_u512, PaddedMessage[127] = 0x80;
host_u512,
sizeof(host_u512), // die Message zur Berechnung auf der GPU
0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
}
cudaMemcpyToSymbol( d_constMem,
h_constMem, __host__ void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order)
sizeof(h_constMem), {
0, cudaMemcpyHostToDevice); const int threadsperblock = 256;
cudaMemcpyToSymbol( d_constHashPadding, // berechne wie viele Thread Blocks wir brauchen
h_constHashPadding, dim3 grid((threads + threadsperblock-1)/threadsperblock);
sizeof(h_constHashPadding), dim3 block(threadsperblock);
0, cudaMemcpyHostToDevice);
} // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// Blake512 für 80 Byte grosse Eingangsdaten
__host__ void quark_blake512_cpu_setBlock_80(void *pdata) // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
{
// Message mit Padding bereitstellen quark_blake512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash);
// lediglich die korrekte Nonce ist noch ab Byte 76 einzusetzen.
unsigned char PaddedMessage[128]; // Strategisches Sleep Kommando zur Senkung der CPU Last
memcpy(PaddedMessage, pdata, 80); MyStreamSynchronize(NULL, order, thr_id);
memset(PaddedMessage+80, 0, 48); }
PaddedMessage[80] = 0x80;
PaddedMessage[111] = 1; __host__ void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
PaddedMessage[126] = 0x02; {
PaddedMessage[127] = 0x80; const int threadsperblock = 256;
// die Message zur Berechnung auf der GPU // berechne wie viele Thread Blocks wir brauchen
cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); dim3 grid((threads + threadsperblock-1)/threadsperblock);
} dim3 block(threadsperblock);
#if 0 // Größe des dynamischen Shared Memory Bereichs
// Blake512 für 64 Byte grosse Eingangsdaten size_t shared_size = 0;
// evtl. macht es gar keinen Sinn, das alles ins Constant Memory to schicken. Es sind hier sowieso
// nur die letzten 64 Bytes des Blocks konstant, und die meisten Bytes davon sind 0. Das kann mnan // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
// auch im Kernel initialisieren.
__host__ void quark_blake512_cpu_setBlock_64(void *pdata) quark_blake512_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
{
// Message mit Padding bereitstellen // Strategisches Sleep Kommando zur Senkung der CPU Last
unsigned char PaddedMessage[128]; MyStreamSynchronize(NULL, order, thr_id);
memcpy(PaddedMessage, pdata, 64); // Hinweis: diese 64 Bytes sind nonce-spezifisch und ändern sich KOMPLETT für jede Nonce! }
memset(PaddedMessage+64, 0, 64);
PaddedMessage[64] = 0x80;
PaddedMessage[111] = 1;
PaddedMessage[126] = 0x02;
PaddedMessage[127] = 0x00;
// die Message zur Berechnung auf der GPU
cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
}
#endif
__host__ void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order)
{
const int threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
quark_blake512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash);
// Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, order, thr_id);
}
__host__ void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
{
const int threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
quark_blake512_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
// Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, order, thr_id);
}

363
quark/cuda_quark_compactionTest.cu

@ -0,0 +1,363 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "sm_30_intrinsics.h"
#include <stdio.h>
#include <memory.h>
#include <stdint.h>
// aus cpu-miner.c
extern "C" int device_map[8];
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8];
static uint32_t *d_tempBranch1Nonces[8];
static uint32_t *d_numValid[8];
static uint32_t *h_numValid[8];
static uint32_t *d_partSum[2][8]; // für bis zu vier partielle Summen
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// True/False tester
typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash);
__device__ uint32_t QuarkTrueTest(uint32_t *inpHash)
{
return ((inpHash[0] & 0x08) == 0x08);
}
__device__ uint32_t QuarkFalseTest(uint32_t *inpHash)
{
return ((inpHash[0] & 0x08) == 0);
}
__device__ cuda_compactTestFunction_t d_QuarkTrueFunction = QuarkTrueTest, d_QuarkFalseFunction = QuarkFalseTest;
cuda_compactTestFunction_t h_QuarkTrueFunction[8], h_QuarkFalseFunction[8];
// Setup-Funktionen
__host__ void quark_compactTest_cpu_init(int thr_id, int threads)
{
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t));
cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t));
// wir brauchen auch Speicherplatz auf dem Device
cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2);
cudaMalloc(&d_numValid[thr_id], 2*sizeof(uint32_t));
cudaMallocHost(&h_numValid[thr_id], 2*sizeof(uint32_t));
uint32_t s1;
s1 = (threads / 256) * 2;
cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
}
// Die Summenfunktion (vom NVIDIA SDK)
__global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
{
extern __shared__ uint32_t sums[];
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
//int lane_id = id % warpSize;
int lane_id = id % width;
// determine a warp_id within a block
//int warp_id = threadIdx.x / warpSize;
int warp_id = threadIdx.x / width;
sums[lane_id] = 0;
// Below is the basic structure of using a shfl instruction
// for a scan.
// Record "value" as a variable - we accumulate it along the way
uint32_t value;
if(testFunc != NULL)
{
if (id < threads)
{
uint32_t *inpHash;
if(d_validNonceTable == NULL)
{
// keine Nonce-Liste
inpHash = &inpHashes[id<<4];
}else
{
// Nonce-Liste verfügbar
int nonce = d_validNonceTable[id] - startNounce;
inpHash = &inpHashes[nonce<<4];
}
value = (*testFunc)(inpHash);
}else
{
value = 0;
}
}else
{
value = data[id];
}
__syncthreads();
// Now accumulate in log steps up the chain
// compute sums, with another thread's value who is
// distance delta away (i). Note
// those threads where the thread 'i' away would have
// been out of bounds of the warp are unaffected. This
// creates the scan sum.
#pragma unroll
for (int i=1; i<=width; i*=2)
{
uint32_t n = __shfl_up((int)value, i, width);
if (lane_id >= i) value += n;
}
// value now holds the scan value for the individual thread
// next sum the largest values for each warp
// write the sum of the warp to smem
//if (threadIdx.x % warpSize == warpSize-1)
if (threadIdx.x % width == width-1)
{
sums[warp_id] = value;
}
__syncthreads();
//
// scan sum the warp sums
// the same shfl scan operation, but performed on warp sums
//
if (warp_id == 0)
{
uint32_t warp_sum = sums[lane_id];
for (int i=1; i<=width; i*=2)
{
uint32_t n = __shfl_up((int)warp_sum, i, width);
if (lane_id >= i) warp_sum += n;
}
sums[lane_id] = warp_sum;
}
__syncthreads();
// perform a uniform add across warps in the block
// read neighbouring warp's sum and add it to threads value
uint32_t blockSum = 0;
if (warp_id > 0)
{
blockSum = sums[warp_id-1];
}
value += blockSum;
// Now write out our result
data[id] = value;
// last thread has sum, write write out the block's sum
if (partial_sums != NULL && threadIdx.x == blockDim.x-1)
{
partial_sums[blockIdx.x] = value;
}
}
// Uniform add: add partial sums array
__global__ void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len)
{
__shared__ uint32_t buf;
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
if (id > len) return;
if (threadIdx.x == 0)
{
buf = partial_sums[blockIdx.x];
}
__syncthreads();
data[id] += buf;
}
// Der Scatter
__global__ void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
{
int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
uint32_t actNounce = id;
uint32_t value;
if (id < threads)
{
// uint32_t nounce = startNounce + id;
uint32_t *inpHash;
if(d_validNonceTable == NULL)
{
// keine Nonce-Liste
inpHash = &inpHashes[id<<4];
}else
{
// Nonce-Liste verfügbar
int nonce = d_validNonceTable[id] - startNounce;
actNounce = nonce;
inpHash = &inpHashes[nonce<<4];
}
value = (*testFunc)(inpHash);
}else
{
value = 0;
}
if( value )
{
int idx = sum[id];
if(idx > 0)
outp[idx-1] = startNounce + actNounce;
}
}
__host__ static uint32_t quark_compactTest_roundUpExp(uint32_t val)
{
if(val == 0)
return 0;
uint32_t mask = 0x80000000;
while( (val & mask) == 0 ) mask = mask >> 1;
if( (val & (~mask)) != 0 )
return mask << 1;
return mask;
}
__host__ void quark_compactTest_cpu_singleCompaction(int thr_id, int threads, uint32_t *nrm,
uint32_t *d_nonces1, cuda_compactTestFunction_t function,
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
{
int orgThreads = threads;
threads = (int)quark_compactTest_roundUpExp((uint32_t)threads);
// threadsPerBlock ausrechnen
int blockSize = 256;
int nSummen = threads / blockSize;
int thr1 = (threads+blockSize-1) / blockSize;
int thr2 = threads / (blockSize*blockSize);
int blockSize2 = (nSummen < blockSize) ? nSummen : blockSize;
int thr3 = (nSummen + blockSize2-1) / blockSize2;
bool callThrid = (thr2 > 0) ? true : false;
// Erster Initialscan
quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(
d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable);
// weitere Scans
if(callThrid)
{
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], 32, d_partSum[1][thr_id]);
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum[1][thr_id], (thr2>32) ? 32 : thr2);
}else
{
quark_compactTest_gpu_SCAN<<<thr3,blockSize2, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2);
}
// Sync + Anzahl merken
cudaStreamSynchronize(NULL);
if(callThrid)
cudaMemcpy(nrm, &(d_partSum[1][thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
else
cudaMemcpy(nrm, &(d_partSum[0][thr_id])[nSummen-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
// Addieren
if(callThrid)
{
quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2);
}
quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads);
// Scatter
quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1,
function, orgThreads, startNounce, inpHashes, d_validNonceTable);
// Sync
cudaStreamSynchronize(NULL);
}
////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048)
__host__ void quark_compactTest_cpu_dualCompaction(int thr_id, int threads, uint32_t *nrm,
uint32_t *d_nonces1, uint32_t *d_nonces2,
uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
{
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
/*
// threadsPerBlock ausrechnen
int blockSize = 256;
int thr1 = threads / blockSize;
int thr2 = threads / (blockSize*blockSize);
// 1
quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes);
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
cudaStreamSynchronize(NULL);
cudaMemcpy(&nrm[0], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);
// 2
quark_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id], h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes);
quark_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
quark_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
cudaStreamSynchronize(NULL);
cudaMemcpy(&nrm[1], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
quark_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
quark_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch2Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);
// Hier ist noch eine Besonderheit: in d_tempBranch1Nonces sind die element von 1...nrm1 die Interessanten
// Schritt 3: Scatter
quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1, h_QuarkTrueFunction[thr_id], threads, startNounce, inpHashes);
quark_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch2Nonces[thr_id], d_nonces2, h_QuarkFalseFunction[thr_id], threads, startNounce, inpHashes);
cudaStreamSynchronize(NULL);
*/
}
__host__ void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, size_t *nrm1,
uint32_t *d_nonces2, size_t *nrm2,
int order)
{
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen!
quark_compactTest_cpu_dualCompaction(thr_id, threads,
h_numValid[thr_id], d_nonces1, d_nonces2,
startNounce, inpHashes, d_validNonceTable);
cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser
*nrm1 = (size_t)h_numValid[thr_id][0];
*nrm2 = (size_t)h_numValid[thr_id][1];
}
__host__ void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, size_t *nrm1,
int order)
{
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen!
quark_compactTest_cpu_singleCompaction(thr_id, threads, h_numValid[thr_id], d_nonces1, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser
*nrm1 = (size_t)h_numValid[thr_id][0];
}

182
quark/cuda_quark_keccak512.cu

@ -0,0 +1,182 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
#include "cuda_helper.h"
#define U32TO64_LE(p) \
(((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))
#define U64TO32_LE(p, v) \
*p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32);
static const uint64_t host_keccak_round_constants[24] = {
0x0000000000000001ull, 0x0000000000008082ull,
0x800000000000808aull, 0x8000000080008000ull,
0x000000000000808bull, 0x0000000080000001ull,
0x8000000080008081ull, 0x8000000000008009ull,
0x000000000000008aull, 0x0000000000000088ull,
0x0000000080008009ull, 0x000000008000000aull,
0x000000008000808bull, 0x800000000000008bull,
0x8000000000008089ull, 0x8000000000008003ull,
0x8000000000008002ull, 0x8000000000000080ull,
0x000000000000800aull, 0x800000008000000aull,
0x8000000080008081ull, 0x8000000000008080ull,
0x0000000080000001ull, 0x8000000080008008ull
};
__constant__ uint64_t c_keccak_round_constants[24];
static __device__ __forceinline__ void
keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) {
size_t i;
uint64_t t[5], u[5], v, w;
/* absorb input */
#pragma unroll 9
for (i = 0; i < 72 / 8; i++, in += 2)
s[i] ^= U32TO64_LE(in);
for (i = 0; i < 24; i++) {
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
u[0] = t[4] ^ ROTL64(t[1], 1);
u[1] = t[0] ^ ROTL64(t[2], 1);
u[2] = t[1] ^ ROTL64(t[3], 1);
u[3] = t[2] ^ ROTL64(t[4], 1);
u[4] = t[3] ^ ROTL64(t[0], 1);
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];
/* rho pi: b[..] = rotl(a[..], ..) */
v = s[ 1];
s[ 1] = ROTL64(s[ 6], 44);
s[ 6] = ROTL64(s[ 9], 20);
s[ 9] = ROTL64(s[22], 61);
s[22] = ROTL64(s[14], 39);
s[14] = ROTL64(s[20], 18);
s[20] = ROTL64(s[ 2], 62);
s[ 2] = ROTL64(s[12], 43);
s[12] = ROTL64(s[13], 25);
s[13] = ROTL64(s[19], 8);
s[19] = ROTL64(s[23], 56);
s[23] = ROTL64(s[15], 41);
s[15] = ROTL64(s[ 4], 27);
s[ 4] = ROTL64(s[24], 14);
s[24] = ROTL64(s[21], 2);
s[21] = ROTL64(s[ 8], 55);
s[ 8] = ROTL64(s[16], 45);
s[16] = ROTL64(s[ 5], 36);
s[ 5] = ROTL64(s[ 3], 28);
s[ 3] = ROTL64(s[18], 21);
s[18] = ROTL64(s[17], 15);
s[17] = ROTL64(s[11], 10);
s[11] = ROTL64(s[ 7], 6);
s[ 7] = ROTL64(s[10], 3);
s[10] = ROTL64( v, 1);
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w;
v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w;
v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w;
v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w;
v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w;
/* iota: a[0,0] ^= round constant */
s[0] ^= keccak_round_constants[i];
}
}
__global__ void quark_keccak512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce;
uint32_t *inpHash = (uint32_t*)&g_hash[8 * hashPosition];
// Nachricht kopieren
uint32_t message[18];
#pragma unroll 16
for(int i=0;i<16;i++)
message[i] = inpHash[i];
message[16] = 0x01;
message[17] = 0x80000000;
// State initialisieren
uint64_t keccak_gpu_state[25];
#pragma unroll 25
for (int i=0; i<25; i++)
keccak_gpu_state[i] = 0;
// den Block einmal gut durchschütteln
keccak_block(keccak_gpu_state, message, c_keccak_round_constants);
// das Hash erzeugen
uint32_t hash[16];
#pragma unroll 8
for (size_t i = 0; i < 64; i += 8) {
U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]);
}
// fertig
uint32_t *outpHash = (uint32_t*)&g_hash[8 * hashPosition];
#pragma unroll 16
for(int i=0;i<16;i++)
outpHash[i] = hash[i];
}
}
// Setup-Funktionen
__host__ void quark_keccak512_cpu_init(int thr_id, int threads)
{
// Kopiere die Hash-Tabellen in den GPU-Speicher
cudaMemcpyToSymbol( c_keccak_round_constants,
host_keccak_round_constants,
sizeof(host_keccak_round_constants),
0, cudaMemcpyHostToDevice);
}
__host__ void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
const int threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
quark_keccak512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id);
}

900
quark/cuda_skein512.cu

@ -1,450 +1,450 @@
#include <cuda.h> #include <cuda.h>
#include "cuda_runtime.h" #include "cuda_runtime.h"
#include "device_launch_parameters.h" #include "device_launch_parameters.h"
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
// Folgende Definitionen später durch header ersetzen // Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t; typedef unsigned char uint8_t;
typedef unsigned int uint32_t; typedef unsigned int uint32_t;
typedef unsigned long long uint64_t; typedef unsigned long long uint64_t;
#define SPH_C64(x) ((uint64_t)(x ## ULL)) #define SPH_C64(x) ((uint64_t)(x ## ULL))
// aus cpu-miner.c // aus cpu-miner.c
extern "C" extern int device_map[8]; extern "C" extern int device_map[8];
// aus heavy.cu // aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Take a look at: https://www.schneier.com/skein1.3.pdf // Take a look at: https://www.schneier.com/skein1.3.pdf
#if __CUDA_ARCH__ >= 350 #if __CUDA_ARCH__ >= 350
__forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) { __forceinline__ __device__ uint64_t ROTL64(const uint64_t value, const int offset) {
uint2 result; uint2 result;
if(offset >= 32) { if(offset >= 32) {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
} else { } else {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset));
} }
return __double_as_longlong(__hiloint2double(result.y, result.x)); return __double_as_longlong(__hiloint2double(result.y, result.x));
} }
#else #else
#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) #define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
#endif #endif
#define SHL(x, n) ((x) << (n)) #define SHL(x, n) ((x) << (n))
#define SHR(x, n) ((x) >> (n)) #define SHR(x, n) ((x) >> (n))
// Zum testen Hostcode... // Zum testen Hostcode...
/* Hier erstmal die Tabelle mit den Konstanten für die Mix-Funktion. Kann später vll. /* Hier erstmal die Tabelle mit den Konstanten für die Mix-Funktion. Kann später vll.
mal direkt in den Code eingesetzt werden mal direkt in den Code eingesetzt werden
*/ */
/* /*
* M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7). * M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7).
*/ */
#define M9_0_0 0 #define M9_0_0 0
#define M9_0_1 1 #define M9_0_1 1
#define M9_0_2 2 #define M9_0_2 2
#define M9_0_3 3 #define M9_0_3 3
#define M9_0_4 4 #define M9_0_4 4
#define M9_0_5 5 #define M9_0_5 5
#define M9_0_6 6 #define M9_0_6 6
#define M9_0_7 7 #define M9_0_7 7
#define M9_1_0 1 #define M9_1_0 1
#define M9_1_1 2 #define M9_1_1 2
#define M9_1_2 3 #define M9_1_2 3
#define M9_1_3 4 #define M9_1_3 4
#define M9_1_4 5 #define M9_1_4 5
#define M9_1_5 6 #define M9_1_5 6
#define M9_1_6 7 #define M9_1_6 7
#define M9_1_7 8 #define M9_1_7 8
#define M9_2_0 2 #define M9_2_0 2
#define M9_2_1 3 #define M9_2_1 3
#define M9_2_2 4 #define M9_2_2 4
#define M9_2_3 5 #define M9_2_3 5
#define M9_2_4 6 #define M9_2_4 6
#define M9_2_5 7 #define M9_2_5 7
#define M9_2_6 8 #define M9_2_6 8
#define M9_2_7 0 #define M9_2_7 0
#define M9_3_0 3 #define M9_3_0 3
#define M9_3_1 4 #define M9_3_1 4
#define M9_3_2 5 #define M9_3_2 5
#define M9_3_3 6 #define M9_3_3 6
#define M9_3_4 7 #define M9_3_4 7
#define M9_3_5 8 #define M9_3_5 8
#define M9_3_6 0 #define M9_3_6 0
#define M9_3_7 1 #define M9_3_7 1
#define M9_4_0 4 #define M9_4_0 4
#define M9_4_1 5 #define M9_4_1 5
#define M9_4_2 6 #define M9_4_2 6
#define M9_4_3 7 #define M9_4_3 7
#define M9_4_4 8 #define M9_4_4 8
#define M9_4_5 0 #define M9_4_5 0
#define M9_4_6 1 #define M9_4_6 1
#define M9_4_7 2 #define M9_4_7 2
#define M9_5_0 5 #define M9_5_0 5
#define M9_5_1 6 #define M9_5_1 6
#define M9_5_2 7 #define M9_5_2 7
#define M9_5_3 8 #define M9_5_3 8
#define M9_5_4 0 #define M9_5_4 0
#define M9_5_5 1 #define M9_5_5 1
#define M9_5_6 2 #define M9_5_6 2
#define M9_5_7 3 #define M9_5_7 3
#define M9_6_0 6 #define M9_6_0 6
#define M9_6_1 7 #define M9_6_1 7
#define M9_6_2 8 #define M9_6_2 8
#define M9_6_3 0 #define M9_6_3 0
#define M9_6_4 1 #define M9_6_4 1
#define M9_6_5 2 #define M9_6_5 2
#define M9_6_6 3 #define M9_6_6 3
#define M9_6_7 4 #define M9_6_7 4
#define M9_7_0 7 #define M9_7_0 7
#define M9_7_1 8 #define M9_7_1 8
#define M9_7_2 0 #define M9_7_2 0
#define M9_7_3 1 #define M9_7_3 1
#define M9_7_4 2 #define M9_7_4 2
#define M9_7_5 3 #define M9_7_5 3
#define M9_7_6 4 #define M9_7_6 4
#define M9_7_7 5 #define M9_7_7 5
#define M9_8_0 8 #define M9_8_0 8
#define M9_8_1 0 #define M9_8_1 0
#define M9_8_2 1 #define M9_8_2 1
#define M9_8_3 2 #define M9_8_3 2
#define M9_8_4 3 #define M9_8_4 3
#define M9_8_5 4 #define M9_8_5 4
#define M9_8_6 5 #define M9_8_6 5
#define M9_8_7 6 #define M9_8_7 6
#define M9_9_0 0 #define M9_9_0 0
#define M9_9_1 1 #define M9_9_1 1
#define M9_9_2 2 #define M9_9_2 2
#define M9_9_3 3 #define M9_9_3 3
#define M9_9_4 4 #define M9_9_4 4
#define M9_9_5 5 #define M9_9_5 5
#define M9_9_6 6 #define M9_9_6 6
#define M9_9_7 7 #define M9_9_7 7
#define M9_10_0 1 #define M9_10_0 1
#define M9_10_1 2 #define M9_10_1 2
#define M9_10_2 3 #define M9_10_2 3
#define M9_10_3 4 #define M9_10_3 4
#define M9_10_4 5 #define M9_10_4 5
#define M9_10_5 6 #define M9_10_5 6
#define M9_10_6 7 #define M9_10_6 7
#define M9_10_7 8 #define M9_10_7 8
#define M9_11_0 2 #define M9_11_0 2
#define M9_11_1 3 #define M9_11_1 3
#define M9_11_2 4 #define M9_11_2 4
#define M9_11_3 5 #define M9_11_3 5
#define M9_11_4 6 #define M9_11_4 6
#define M9_11_5 7 #define M9_11_5 7
#define M9_11_6 8 #define M9_11_6 8
#define M9_11_7 0 #define M9_11_7 0
#define M9_12_0 3 #define M9_12_0 3
#define M9_12_1 4 #define M9_12_1 4
#define M9_12_2 5 #define M9_12_2 5
#define M9_12_3 6 #define M9_12_3 6
#define M9_12_4 7 #define M9_12_4 7
#define M9_12_5 8 #define M9_12_5 8
#define M9_12_6 0 #define M9_12_6 0
#define M9_12_7 1 #define M9_12_7 1
#define M9_13_0 4 #define M9_13_0 4
#define M9_13_1 5 #define M9_13_1 5
#define M9_13_2 6 #define M9_13_2 6
#define M9_13_3 7 #define M9_13_3 7
#define M9_13_4 8 #define M9_13_4 8
#define M9_13_5 0 #define M9_13_5 0
#define M9_13_6 1 #define M9_13_6 1
#define M9_13_7 2 #define M9_13_7 2
#define M9_14_0 5 #define M9_14_0 5
#define M9_14_1 6 #define M9_14_1 6
#define M9_14_2 7 #define M9_14_2 7
#define M9_14_3 8 #define M9_14_3 8
#define M9_14_4 0 #define M9_14_4 0
#define M9_14_5 1 #define M9_14_5 1
#define M9_14_6 2 #define M9_14_6 2
#define M9_14_7 3 #define M9_14_7 3
#define M9_15_0 6 #define M9_15_0 6
#define M9_15_1 7 #define M9_15_1 7
#define M9_15_2 8 #define M9_15_2 8
#define M9_15_3 0 #define M9_15_3 0
#define M9_15_4 1 #define M9_15_4 1
#define M9_15_5 2 #define M9_15_5 2
#define M9_15_6 3 #define M9_15_6 3
#define M9_15_7 4 #define M9_15_7 4
#define M9_16_0 7 #define M9_16_0 7
#define M9_16_1 8 #define M9_16_1 8
#define M9_16_2 0 #define M9_16_2 0
#define M9_16_3 1 #define M9_16_3 1
#define M9_16_4 2 #define M9_16_4 2
#define M9_16_5 3 #define M9_16_5 3
#define M9_16_6 4 #define M9_16_6 4
#define M9_16_7 5 #define M9_16_7 5
#define M9_17_0 8 #define M9_17_0 8
#define M9_17_1 0 #define M9_17_1 0
#define M9_17_2 1 #define M9_17_2 1
#define M9_17_3 2 #define M9_17_3 2
#define M9_17_4 3 #define M9_17_4 3
#define M9_17_5 4 #define M9_17_5 4
#define M9_17_6 5 #define M9_17_6 5
#define M9_17_7 6 #define M9_17_7 6
#define M9_18_0 0 #define M9_18_0 0
#define M9_18_1 1 #define M9_18_1 1
#define M9_18_2 2 #define M9_18_2 2
#define M9_18_3 3 #define M9_18_3 3
#define M9_18_4 4 #define M9_18_4 4
#define M9_18_5 5 #define M9_18_5 5
#define M9_18_6 6 #define M9_18_6 6
#define M9_18_7 7 #define M9_18_7 7
/* /*
* M3_ ## s ## _ ## i evaluates to s+i mod 3 (0 <= s <= 18, 0 <= i <= 1). * M3_ ## s ## _ ## i evaluates to s+i mod 3 (0 <= s <= 18, 0 <= i <= 1).
*/ */
#define M3_0_0 0 #define M3_0_0 0
#define M3_0_1 1 #define M3_0_1 1
#define M3_1_0 1 #define M3_1_0 1
#define M3_1_1 2 #define M3_1_1 2
#define M3_2_0 2 #define M3_2_0 2
#define M3_2_1 0 #define M3_2_1 0
#define M3_3_0 0 #define M3_3_0 0
#define M3_3_1 1 #define M3_3_1 1
#define M3_4_0 1 #define M3_4_0 1
#define M3_4_1 2 #define M3_4_1 2
#define M3_5_0 2 #define M3_5_0 2
#define M3_5_1 0 #define M3_5_1 0
#define M3_6_0 0 #define M3_6_0 0
#define M3_6_1 1 #define M3_6_1 1
#define M3_7_0 1 #define M3_7_0 1
#define M3_7_1 2 #define M3_7_1 2
#define M3_8_0 2 #define M3_8_0 2
#define M3_8_1 0 #define M3_8_1 0
#define M3_9_0 0 #define M3_9_0 0
#define M3_9_1 1 #define M3_9_1 1
#define M3_10_0 1 #define M3_10_0 1
#define M3_10_1 2 #define M3_10_1 2
#define M3_11_0 2 #define M3_11_0 2
#define M3_11_1 0 #define M3_11_1 0
#define M3_12_0 0 #define M3_12_0 0
#define M3_12_1 1 #define M3_12_1 1
#define M3_13_0 1 #define M3_13_0 1
#define M3_13_1 2 #define M3_13_1 2
#define M3_14_0 2 #define M3_14_0 2
#define M3_14_1 0 #define M3_14_1 0
#define M3_15_0 0 #define M3_15_0 0
#define M3_15_1 1 #define M3_15_1 1
#define M3_16_0 1 #define M3_16_0 1
#define M3_16_1 2 #define M3_16_1 2
#define M3_17_0 2 #define M3_17_0 2
#define M3_17_1 0 #define M3_17_1 0
#define M3_18_0 0 #define M3_18_0 0
#define M3_18_1 1 #define M3_18_1 1
#define XCAT(x, y) XCAT_(x, y) #define XCAT(x, y) XCAT_(x, y)
#define XCAT_(x, y) x ## y #define XCAT_(x, y) x ## y
#define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i)) #define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i))
#define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v)) #define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v))
#define TFBIG_KINIT(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2) { \ #define TFBIG_KINIT(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2) { \
k8 = ((k0 ^ k1) ^ (k2 ^ k3)) ^ ((k4 ^ k5) ^ (k6 ^ k7)) \ k8 = ((k0 ^ k1) ^ (k2 ^ k3)) ^ ((k4 ^ k5) ^ (k6 ^ k7)) \
^ SPH_C64(0x1BD11BDAA9FC1A22); \ ^ SPH_C64(0x1BD11BDAA9FC1A22); \
t2 = t0 ^ t1; \ t2 = t0 ^ t1; \
} }
#define TFBIG_ADDKEY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \ #define TFBIG_ADDKEY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \
w0 = (w0 + SKBI(k, s, 0)); \ w0 = (w0 + SKBI(k, s, 0)); \
w1 = (w1 + SKBI(k, s, 1)); \ w1 = (w1 + SKBI(k, s, 1)); \
w2 = (w2 + SKBI(k, s, 2)); \ w2 = (w2 + SKBI(k, s, 2)); \
w3 = (w3 + SKBI(k, s, 3)); \ w3 = (w3 + SKBI(k, s, 3)); \
w4 = (w4 + SKBI(k, s, 4)); \ w4 = (w4 + SKBI(k, s, 4)); \
w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \
w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \
w7 = (w7 + SKBI(k, s, 7) + (uint64_t)s); \ w7 = (w7 + SKBI(k, s, 7) + (uint64_t)s); \
} }
#define TFBIG_MIX(x0, x1, rc) { \ #define TFBIG_MIX(x0, x1, rc) { \
x0 = x0 + x1; \ x0 = x0 + x1; \
x1 = ROTL64(x1, rc) ^ x0; \ x1 = ROTL64(x1, rc) ^ x0; \
} }
#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \ #define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \
TFBIG_MIX(w0, w1, rc0); \ TFBIG_MIX(w0, w1, rc0); \
TFBIG_MIX(w2, w3, rc1); \ TFBIG_MIX(w2, w3, rc1); \
TFBIG_MIX(w4, w5, rc2); \ TFBIG_MIX(w4, w5, rc2); \
TFBIG_MIX(w6, w7, rc3); \ TFBIG_MIX(w6, w7, rc3); \
} }
#define TFBIG_4e(s) { \ #define TFBIG_4e(s) { \
TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \
TFBIG_MIX8(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ TFBIG_MIX8(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \
TFBIG_MIX8(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ TFBIG_MIX8(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \
TFBIG_MIX8(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ TFBIG_MIX8(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \
TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \
} }
#define TFBIG_4o(s) { \ #define TFBIG_4o(s) { \
TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \
TFBIG_MIX8(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ TFBIG_MIX8(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \
TFBIG_MIX8(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ TFBIG_MIX8(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \
TFBIG_MIX8(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ TFBIG_MIX8(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \
TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \
} }
static __constant__ uint64_t d_constMem[8]; static __constant__ uint64_t d_constMem[8];
static uint64_t h_constMem[8] = { static uint64_t h_constMem[8] = {
SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x4903ADFF749C51CE),
SPH_C64(0x0D95DE399746DF03), SPH_C64(0x0D95DE399746DF03),
SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x8FD1934127C79BCE),
SPH_C64(0x9A255629FF352CB1), SPH_C64(0x9A255629FF352CB1),
SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0x5DB62599DF6CA7B0),
SPH_C64(0xEABE394CA9D5C3F4), SPH_C64(0xEABE394CA9D5C3F4),
SPH_C64(0x991112C71A75B523), SPH_C64(0x991112C71A75B523),
SPH_C64(0xAE18A40B660FCC33) }; SPH_C64(0xAE18A40B660FCC33) };
__global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) __global__ void quark_skein512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
// Skein // Skein
uint64_t p[8]; uint64_t p[8];
uint64_t h0, h1, h2, h3, h4, h5, h6, h7, h8; uint64_t h0, h1, h2, h3, h4, h5, h6, h7, h8;
uint64_t t0, t1, t2; uint64_t t0, t1, t2;
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce; int hashPosition = nounce - startNounce;
uint64_t *inpHash = &g_hash[8 * hashPosition]; uint64_t *inpHash = &g_hash[8 * hashPosition];
// Initialisierung // Initialisierung
h0 = d_constMem[0]; h0 = d_constMem[0];
h1 = d_constMem[1]; h1 = d_constMem[1];
h2 = d_constMem[2]; h2 = d_constMem[2];
h3 = d_constMem[3]; h3 = d_constMem[3];
h4 = d_constMem[4]; h4 = d_constMem[4];
h5 = d_constMem[5]; h5 = d_constMem[5];
h6 = d_constMem[6]; h6 = d_constMem[6];
h7 = d_constMem[7]; h7 = d_constMem[7];
// 1. Runde -> etype = 480, ptr = 64, bcount = 0, data = msg // 1. Runde -> etype = 480, ptr = 64, bcount = 0, data = msg
#pragma unroll 8 #pragma unroll 8
for(int i=0;i<8;i++) for(int i=0;i<8;i++)
p[i] = inpHash[i]; p[i] = inpHash[i];
t0 = 64; // ptr t0 = 64; // ptr
t1 = 480ull << 55; // etype t1 = 480ull << 55; // etype
TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
TFBIG_4e(0); TFBIG_4e(0);
TFBIG_4o(1); TFBIG_4o(1);
TFBIG_4e(2); TFBIG_4e(2);
TFBIG_4o(3); TFBIG_4o(3);
TFBIG_4e(4); TFBIG_4e(4);
TFBIG_4o(5); TFBIG_4o(5);
TFBIG_4e(6); TFBIG_4e(6);
TFBIG_4o(7); TFBIG_4o(7);
TFBIG_4e(8); TFBIG_4e(8);
TFBIG_4o(9); TFBIG_4o(9);
TFBIG_4e(10); TFBIG_4e(10);
TFBIG_4o(11); TFBIG_4o(11);
TFBIG_4e(12); TFBIG_4e(12);
TFBIG_4o(13); TFBIG_4o(13);
TFBIG_4e(14); TFBIG_4e(14);
TFBIG_4o(15); TFBIG_4o(15);
TFBIG_4e(16); TFBIG_4e(16);
TFBIG_4o(17); TFBIG_4o(17);
TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18);
h0 = inpHash[0] ^ p[0]; h0 = inpHash[0] ^ p[0];
h1 = inpHash[1] ^ p[1]; h1 = inpHash[1] ^ p[1];
h2 = inpHash[2] ^ p[2]; h2 = inpHash[2] ^ p[2];
h3 = inpHash[3] ^ p[3]; h3 = inpHash[3] ^ p[3];
h4 = inpHash[4] ^ p[4]; h4 = inpHash[4] ^ p[4];
h5 = inpHash[5] ^ p[5]; h5 = inpHash[5] ^ p[5];
h6 = inpHash[6] ^ p[6]; h6 = inpHash[6] ^ p[6];
h7 = inpHash[7] ^ p[7]; h7 = inpHash[7] ^ p[7];
// 2. Runde -> etype = 510, ptr = 8, bcount = 0, data = 0 // 2. Runde -> etype = 510, ptr = 8, bcount = 0, data = 0
#pragma unroll 8 #pragma unroll 8
for(int i=0;i<8;i++) for(int i=0;i<8;i++)
p[i] = 0; p[i] = 0;
t0 = 8; // ptr t0 = 8; // ptr
t1 = 510ull << 55; // etype t1 = 510ull << 55; // etype
TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2);
TFBIG_4e(0); TFBIG_4e(0);
TFBIG_4o(1); TFBIG_4o(1);
TFBIG_4e(2); TFBIG_4e(2);
TFBIG_4o(3); TFBIG_4o(3);
TFBIG_4e(4); TFBIG_4e(4);
TFBIG_4o(5); TFBIG_4o(5);
TFBIG_4e(6); TFBIG_4e(6);
TFBIG_4o(7); TFBIG_4o(7);
TFBIG_4e(8); TFBIG_4e(8);
TFBIG_4o(9); TFBIG_4o(9);
TFBIG_4e(10); TFBIG_4e(10);
TFBIG_4o(11); TFBIG_4o(11);
TFBIG_4e(12); TFBIG_4e(12);
TFBIG_4o(13); TFBIG_4o(13);
TFBIG_4e(14); TFBIG_4e(14);
TFBIG_4o(15); TFBIG_4o(15);
TFBIG_4e(16); TFBIG_4e(16);
TFBIG_4o(17); TFBIG_4o(17);
TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18);
// fertig // fertig
uint64_t *outpHash = &g_hash[8 * hashPosition]; uint64_t *outpHash = &g_hash[8 * hashPosition];
#pragma unroll 8 #pragma unroll 8
for(int i=0;i<8;i++) for(int i=0;i<8;i++)
outpHash[i] = p[i]; outpHash[i] = p[i];
} }
} }
// Setup-Funktionen // Setup-Funktionen
__host__ void quark_skein512_cpu_init(int thr_id, int threads) __host__ void quark_skein512_cpu_init(int thr_id, int threads)
{ {
// nix zu tun ;-) // nix zu tun ;-)
cudaMemcpyToSymbol( d_constMem, cudaMemcpyToSymbol( d_constMem,
h_constMem, h_constMem,
sizeof(h_constMem), sizeof(h_constMem),
0, cudaMemcpyHostToDevice); 0, cudaMemcpyHostToDevice);
} }
__host__ void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) __host__ void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
const int threadsperblock = 256; const int threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
quark_skein512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); quark_skein512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
// Strategisches Sleep Kommando zur Senkung der CPU Last // Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
} }

274
quark/quarkcoin.cu

@ -0,0 +1,274 @@
extern "C"
{
#include "sph/sph_blake.h"
#include "sph/sph_bmw.h"
#include "sph/sph_groestl.h"
#include "sph/sph_skein.h"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
#include "miner.h"
}
#include <stdint.h>
// aus cpu-miner.c
extern int device_map[8];
// Speicher für Input/Output der verketteten Hashfunktionen
static uint32_t *d_hash[8];
// Speicher zur Generierung der Noncevektoren für die bedingten Hashes
static uint32_t *d_quarkNonces[8];
static uint32_t *d_branch1Nonces[8];
static uint32_t *d_branch2Nonces[8];
static uint32_t *d_branch3Nonces[8];
extern void quark_blake512_cpu_init(int thr_id, int threads);
extern void quark_blake512_cpu_setBlock_80(void *pdata);
extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_bmw512_cpu_init(int thr_id, int threads);
extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_groestl512_cpu_init(int thr_id, int threads);
extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_skein512_cpu_init(int thr_id, int threads);
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_keccak512_cpu_init(int thr_id, int threads);
extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_jh512_cpu_init(int thr_id, int threads);
extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, size_t *nrm1,
uint32_t *d_nonces2, size_t *nrm2,
int order);
extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, size_t *nrm1,
int order);
// Original Quarkhash Funktion aus einem miner Quelltext
inline void quarkhash(void *state, const void *input)
{
sph_blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
sph_groestl512_context ctx_groestl;
sph_jh512_context ctx_jh;
sph_keccak512_context ctx_keccak;
sph_skein512_context ctx_skein;
unsigned char hash[64];
sph_blake512_init(&ctx_blake);
// ZBLAKE;
sph_blake512 (&ctx_blake, input, 80);
sph_blake512_close(&ctx_blake, (void*) hash);
sph_bmw512_init(&ctx_bmw);
// ZBMW;
sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
sph_bmw512_close(&ctx_bmw, (void*) hash);
if (hash[0] & 0x8)
{
sph_groestl512_init(&ctx_groestl);
// ZGROESTL;
sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
sph_groestl512_close(&ctx_groestl, (void*) hash);
}
else
{
sph_skein512_init(&ctx_skein);
// ZSKEIN;
sph_skein512 (&ctx_skein, (const void*) hash, 64);
sph_skein512_close(&ctx_skein, (void*) hash);
}
sph_groestl512_init(&ctx_groestl);
// ZGROESTL;
sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
sph_groestl512_close(&ctx_groestl, (void*) hash);
sph_jh512_init(&ctx_jh);
// ZJH;
sph_jh512 (&ctx_jh, (const void*) hash, 64);
sph_jh512_close(&ctx_jh, (void*) hash);
if (hash[0] & 0x8)
{
sph_blake512_init(&ctx_blake);
// ZBLAKE;
sph_blake512 (&ctx_blake, (const void*) hash, 64);
sph_blake512_close(&ctx_blake, (void*) hash);
}
else
{
sph_bmw512_init(&ctx_bmw);
// ZBMW;
sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
sph_bmw512_close(&ctx_bmw, (void*) hash);
}
sph_keccak512_init(&ctx_keccak);
// ZKECCAK;
sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
sph_keccak512_close(&ctx_keccak, (void*) hash);
sph_skein512_init(&ctx_skein);
// SKEIN;
sph_skein512 (&ctx_skein, (const void*) hash, 64);
sph_skein512_close(&ctx_skein, (void*) hash);
if (hash[0] & 0x8)
{
sph_keccak512_init(&ctx_keccak);
// ZKECCAK;
sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
sph_keccak512_close(&ctx_keccak, (void*) hash);
}
else
{
sph_jh512_init(&ctx_jh);
// ZJH;
sph_jh512 (&ctx_jh, (const void*) hash, 64);
sph_jh512_close(&ctx_jh, (void*) hash);
}
memcpy(state, hash, 32);
}
extern bool opt_benchmark;
extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done)
{
const uint32_t first_nonce = pdata[19];
// TODO: entfernen für eine Release! Ist nur zum Testen!
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;
const uint32_t Htarg = ptarget[7];
const int throughput = 256*4096; // 100;
static bool init[8] = {0,0,0,0,0,0,0,0};
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
// Konstanten kopieren, Speicher belegen
cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
quark_blake512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
quark_bmw512_cpu_init(thr_id, throughput);
quark_keccak512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput);
quark_compactTest_cpu_init(thr_id, throughput);
cudaMalloc(&d_quarkNonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
init[thr_id] = true;
}
uint32_t endiandata[20];
for (int k=0; k < 20; k++)
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_blake512_cpu_setBlock_80((void*)endiandata);
quark_check_cpu_setTarget(ptarget);
do {
int order = 0;
size_t nrm1=0, nrm2=0, nrm3=0;
// erstes Blake512 Hash mit CUDA
quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
// das ist der unbedingte Branch für BMW512
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL,
d_branch3Nonces[thr_id], &nrm3,
order++);
// nur den Skein Branch weiterverfolgen
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
// das ist der unbedingte Branch für Groestl512
quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
// das ist der unbedingte Branch für JH512
quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
// quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8)
quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
d_branch1Nonces[thr_id], &nrm1,
d_branch2Nonces[thr_id], &nrm2,
order++);
// das ist der bedingte Branch für Blake512
quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
// das ist der bedingte Branch für Bmw512
quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
// das ist der unbedingte Branch für Keccak512
quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
// das ist der unbedingte Branch für Skein512
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
// quarkNonces in branch1 und branch2 aufsplitten gemäss if (hash[0] & 0x8)
quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
d_branch1Nonces[thr_id], &nrm1,
d_branch2Nonces[thr_id], &nrm2,
order++);
// das ist der bedingte Branch für Keccak512
quark_keccak512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
// das ist der bedingte Branch für JH512
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
// Scan nach Gewinner Hashes auf der GPU
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];
be32enc(&endiandata[19], foundNonce);
quarkhash(vhash64, endiandata);
if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {
pdata[19] = foundNonce;
*hashes_done = (foundNonce - first_nonce + 1)/2;
return 1;
} else {
applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce);
}
}
pdata[19] += throughput;
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
*hashes_done = (pdata[19] - first_nonce + 1)/2;
return 0;
}
Loading…
Cancel
Save