Browse Source

v1.0 - Yo, I heard y'all like X11

master
Christian Buchner 11 years ago
parent
commit
af07302b4b
  1. 12
      Makefile.am
  2. 7
      README.txt
  3. 47
      ccminer.vcxproj
  4. 24
      ccminer.vcxproj.filters
  5. 2
      configure.ac
  6. 14
      cpu-miner.c
  7. 4
      cpuminer-config.h
  8. 2
      cuda_myriadgroestl.cu
  9. 4
      miner.h
  10. 402
      x11/cuda_x11_aes.cu
  11. 315
      x11/cuda_x11_cubehash512.cu
  12. 232
      x11/cuda_x11_echo.cu
  13. 384
      x11/cuda_x11_luffa512.cu
  14. 1380
      x11/cuda_x11_shavite512.cu
  15. 765
      x11/cuda_x11_simd512.cu
  16. 262
      x11/x11.cu

12
Makefile.am

@ -33,7 +33,10 @@ ccminer_SOURCES = elist.h miner.h compat.h \ @@ -33,7 +33,10 @@ ccminer_SOURCES = elist.h miner.h compat.h \
quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \
quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu quark/quarkcoin.cu quark/animecoin.cu \
quark/cuda_quark_compactionTest.cu \
cuda_nist5.cu
cuda_nist5.cu \
sph/cubehash.c sph/echo.c sph/luffa.c sph/shavite.c sph/simd.c \
x11/x11.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@
ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@
@ -41,10 +44,13 @@ ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -f @@ -41,10 +44,13 @@ ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -f
# we're now targeting all major compute architectures within one binary.
.cu.o:
$(NVCC) @CFLAGS@ -I . -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
$(NVCC) @CFLAGS@ -I . -Xptxas "-abi=no -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
# Shavite compiles faster with 128 regs
x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=128 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
# ABI requiring code modules
# this module doesn't compile with Compute 2.0 unfortunately
quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<

7
README.txt

@ -1,5 +1,5 @@ @@ -1,5 +1,5 @@
ccMiner release 0.9 (May 06th 2014) - "Say Hi to Quark, Anime"
ccMiner release 1.0 (May 10th 2014) - "Did anyone say X11?"
-------------------------------------------------------------
***************************************************************
@ -59,6 +59,8 @@ its command line interface and options. @@ -59,6 +59,8 @@ its command line interface and options.
jackpot use to mine Jackpotcoin
quark use to mine Quarkcoin
anime use to mine Animecoin
nist5 use to mine TalkCoin
x11 use to mine DarkCoin
-d, --devices gives a comma separated list of CUDA device IDs
to operate on. Device IDs start counting from 0!
@ -138,6 +140,9 @@ features. @@ -138,6 +140,9 @@ features.
>>> RELEASE HISTORY <<<
May 10th 2014 added X11, but without the bells & whistles
(no killer Groestl, SIMD hash quite slow still)
May 6th 2014 this adds the quark and animecoin algorithms.
May 3rd 2014 add the MjollnirCoin hash algorithm for the upcomin

47
ccminer.vcxproj

@ -431,6 +431,53 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command> @@ -431,6 +431,53 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
</CudaCompile>
<CudaCompile Include="x11\cuda_x11_aes.cu">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</CudaCompile>
<CudaCompile Include="x11\cuda_x11_cubehash512.cu">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
</CudaCompile>
<CudaCompile Include="x11\cuda_x11_echo.cu">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
</CudaCompile>
<CudaCompile Include="x11\cuda_x11_luffa512.cu">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
</CudaCompile>
<CudaCompile Include="x11\cuda_x11_shavite512.cu">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">128</MaxRegCount>
</CudaCompile>
<CudaCompile Include="x11\cuda_x11_simd512.cu">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
</CudaCompile>
<CudaCompile Include="x11\x11.cu">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
</CudaCompile>
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">

24
ccminer.vcxproj.filters

@ -52,6 +52,9 @@ @@ -52,6 +52,9 @@
<Filter Include="Header Files\CUDA\heavy">
<UniqueIdentifier>{3281db48-f394-49ea-a1ef-6ebd09828d50}</UniqueIdentifier>
</Filter>
<Filter Include="Source Files\CUDA\x11">
<UniqueIdentifier>{dd751f2d-bfd6-42c1-8f9b-cbe94e539353}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="compat\jansson\dump.c">
@ -313,5 +316,26 @@ @@ -313,5 +316,26 @@
<CudaCompile Include="cuda_nist5.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="x11\cuda_x11_aes.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
<CudaCompile Include="x11\cuda_x11_cubehash512.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
<CudaCompile Include="x11\cuda_x11_echo.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
<CudaCompile Include="x11\cuda_x11_luffa512.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
<CudaCompile Include="x11\cuda_x11_shavite512.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
<CudaCompile Include="x11\cuda_x11_simd512.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
<CudaCompile Include="x11\x11.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
</ItemGroup>
</Project>

2
configure.ac

@ -1,4 +1,4 @@ @@ -1,4 +1,4 @@
AC_INIT([ccminer], [2014.05.06])
AC_INIT([ccminer], [2014.05.10])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

14
cpu-miner.c

@ -129,7 +129,8 @@ typedef enum { @@ -129,7 +129,8 @@ typedef enum {
ALGO_JACKPOT,
ALGO_QUARK,
ALGO_ANIME,
ALGO_NIST5
ALGO_NIST5,
ALGO_X11
} sha256_algos;
static const char *algo_names[] = {
@ -141,7 +142,8 @@ static const char *algo_names[] = { @@ -141,7 +142,8 @@ static const char *algo_names[] = {
"jackpot",
"quark",
"anime",
"nist5"
"nist5",
"x11"
};
bool opt_debug = false;
@ -212,6 +214,7 @@ Options:\n\ @@ -212,6 +214,7 @@ Options:\n\
quark Quark hash\n\
anime Animecoin hash\n\
nist5 NIST5 (TalkCoin) hash\n\
x11 X11 (DarkCoin) hash\n\
-d, --devices takes a comma separated list of CUDA devices to use.\n\
Device IDs start counting from 0! Alternatively takes\n\
string names of your cards like gtx780ti or gt640#2\n\
@ -901,6 +904,11 @@ static void *miner_thread(void *userdata) @@ -901,6 +904,11 @@ static void *miner_thread(void *userdata)
max_nonce, &hashes_done);
break;
case ALGO_X11:
rc = scanhash_x11(thr_id, work.data, work.target,
max_nonce, &hashes_done);
break;
default:
/* should never happen */
goto out;
@ -1453,7 +1461,7 @@ static void signal_handler(int sig) @@ -1453,7 +1461,7 @@ static void signal_handler(int sig)
}
#endif
#define PROGRAM_VERSION "0.9"
#define PROGRAM_VERSION "1.0"
int main(int argc, char *argv[])
{
struct thr_info *thr;

4
cpuminer-config.h

@ -152,7 +152,7 @@ @@ -152,7 +152,7 @@
#define PACKAGE_NAME "ccminer"
/* Define to the full name and version of this package. */
#define PACKAGE_STRING "ccminer 2014.05.06"
#define PACKAGE_STRING "ccminer 2014.05.10"
/* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME
@ -161,7 +161,7 @@ @@ -161,7 +161,7 @@
#undef PACKAGE_URL
/* Define to the version of this package. */
#define PACKAGE_VERSION "2014.05.06"
#define PACKAGE_VERSION "2014.05.10"
/* If using the C implementation of alloca, define if you know the
direction of stack growth for your system; otherwise it will be

2
cuda_myriadgroestl.cu

@ -1,4 +1,4 @@ @@ -1,4 +1,4 @@
// Auf Myriadcoin spezialisierte Version von Groestl inkl. Bitslice
// Auf Myriadcoin spezialisierte Version von Groestl
#include <cuda.h>
#include "cuda_runtime.h"

4
miner.h

@ -235,6 +235,10 @@ extern int scanhash_nist5(int thr_id, uint32_t *pdata, @@ -235,6 +235,10 @@ extern int scanhash_nist5(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
extern int scanhash_x11(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
extern void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
extern void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
extern void groestlcoin_hash(unsigned char* output, const unsigned char* input, int len);

402
x11/cuda_x11_aes.cu

@ -0,0 +1,402 @@ @@ -0,0 +1,402 @@
/* AES Helper for inline-usage from SPH */
#define AESx(x) SPH_C32(x)
static const uint32_t h_AES0[256] = {
AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6),
AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591),
AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56),
AESx(0x19FEFEE7), AESx(0x62D7D7B5), AESx(0xE6ABAB4D), AESx(0x9A7676EC),
AESx(0x45CACA8F), AESx(0x9D82821F), AESx(0x40C9C989), AESx(0x877D7DFA),
AESx(0x15FAFAEF), AESx(0xEB5959B2), AESx(0xC947478E), AESx(0x0BF0F0FB),
AESx(0xECADAD41), AESx(0x67D4D4B3), AESx(0xFDA2A25F), AESx(0xEAAFAF45),
AESx(0xBF9C9C23), AESx(0xF7A4A453), AESx(0x967272E4), AESx(0x5BC0C09B),
AESx(0xC2B7B775), AESx(0x1CFDFDE1), AESx(0xAE93933D), AESx(0x6A26264C),
AESx(0x5A36366C), AESx(0x413F3F7E), AESx(0x02F7F7F5), AESx(0x4FCCCC83),
AESx(0x5C343468), AESx(0xF4A5A551), AESx(0x34E5E5D1), AESx(0x08F1F1F9),
AESx(0x937171E2), AESx(0x73D8D8AB), AESx(0x53313162), AESx(0x3F15152A),
AESx(0x0C040408), AESx(0x52C7C795), AESx(0x65232346), AESx(0x5EC3C39D),
AESx(0x28181830), AESx(0xA1969637), AESx(0x0F05050A), AESx(0xB59A9A2F),
AESx(0x0907070E), AESx(0x36121224), AESx(0x9B80801B), AESx(0x3DE2E2DF),
AESx(0x26EBEBCD), AESx(0x6927274E), AESx(0xCDB2B27F), AESx(0x9F7575EA),
AESx(0x1B090912), AESx(0x9E83831D), AESx(0x742C2C58), AESx(0x2E1A1A34),
AESx(0x2D1B1B36), AESx(0xB26E6EDC), AESx(0xEE5A5AB4), AESx(0xFBA0A05B),
AESx(0xF65252A4), AESx(0x4D3B3B76), AESx(0x61D6D6B7), AESx(0xCEB3B37D),
AESx(0x7B292952), AESx(0x3EE3E3DD), AESx(0x712F2F5E), AESx(0x97848413),
AESx(0xF55353A6), AESx(0x68D1D1B9), AESx(0x00000000), AESx(0x2CEDEDC1),
AESx(0x60202040), AESx(0x1FFCFCE3), AESx(0xC8B1B179), AESx(0xED5B5BB6),
AESx(0xBE6A6AD4), AESx(0x46CBCB8D), AESx(0xD9BEBE67), AESx(0x4B393972),
AESx(0xDE4A4A94), AESx(0xD44C4C98), AESx(0xE85858B0), AESx(0x4ACFCF85),
AESx(0x6BD0D0BB), AESx(0x2AEFEFC5), AESx(0xE5AAAA4F), AESx(0x16FBFBED),
AESx(0xC5434386), AESx(0xD74D4D9A), AESx(0x55333366), AESx(0x94858511),
AESx(0xCF45458A), AESx(0x10F9F9E9), AESx(0x06020204), AESx(0x817F7FFE),
AESx(0xF05050A0), AESx(0x443C3C78), AESx(0xBA9F9F25), AESx(0xE3A8A84B),
AESx(0xF35151A2), AESx(0xFEA3A35D), AESx(0xC0404080), AESx(0x8A8F8F05),
AESx(0xAD92923F), AESx(0xBC9D9D21), AESx(0x48383870), AESx(0x04F5F5F1),
AESx(0xDFBCBC63), AESx(0xC1B6B677), AESx(0x75DADAAF), AESx(0x63212142),
AESx(0x30101020), AESx(0x1AFFFFE5), AESx(0x0EF3F3FD), AESx(0x6DD2D2BF),
AESx(0x4CCDCD81), AESx(0x140C0C18), AESx(0x35131326), AESx(0x2FECECC3),
AESx(0xE15F5FBE), AESx(0xA2979735), AESx(0xCC444488), AESx(0x3917172E),
AESx(0x57C4C493), AESx(0xF2A7A755), AESx(0x827E7EFC), AESx(0x473D3D7A),
AESx(0xAC6464C8), AESx(0xE75D5DBA), AESx(0x2B191932), AESx(0x957373E6),
AESx(0xA06060C0), AESx(0x98818119), AESx(0xD14F4F9E), AESx(0x7FDCDCA3),
AESx(0x66222244), AESx(0x7E2A2A54), AESx(0xAB90903B), AESx(0x8388880B),
AESx(0xCA46468C), AESx(0x29EEEEC7), AESx(0xD3B8B86B), AESx(0x3C141428),
AESx(0x79DEDEA7), AESx(0xE25E5EBC), AESx(0x1D0B0B16), AESx(0x76DBDBAD),
AESx(0x3BE0E0DB), AESx(0x56323264), AESx(0x4E3A3A74), AESx(0x1E0A0A14),
AESx(0xDB494992), AESx(0x0A06060C), AESx(0x6C242448), AESx(0xE45C5CB8),
AESx(0x5DC2C29F), AESx(0x6ED3D3BD), AESx(0xEFACAC43), AESx(0xA66262C4),
AESx(0xA8919139), AESx(0xA4959531), AESx(0x37E4E4D3), AESx(0x8B7979F2),
AESx(0x32E7E7D5), AESx(0x43C8C88B), AESx(0x5937376E), AESx(0xB76D6DDA),
AESx(0x8C8D8D01), AESx(0x64D5D5B1), AESx(0xD24E4E9C), AESx(0xE0A9A949),
AESx(0xB46C6CD8), AESx(0xFA5656AC), AESx(0x07F4F4F3), AESx(0x25EAEACF),
AESx(0xAF6565CA), AESx(0x8E7A7AF4), AESx(0xE9AEAE47), AESx(0x18080810),
AESx(0xD5BABA6F), AESx(0x887878F0), AESx(0x6F25254A), AESx(0x722E2E5C),
AESx(0x241C1C38), AESx(0xF1A6A657), AESx(0xC7B4B473), AESx(0x51C6C697),
AESx(0x23E8E8CB), AESx(0x7CDDDDA1), AESx(0x9C7474E8), AESx(0x211F1F3E),
AESx(0xDD4B4B96), AESx(0xDCBDBD61), AESx(0x868B8B0D), AESx(0x858A8A0F),
AESx(0x907070E0), AESx(0x423E3E7C), AESx(0xC4B5B571), AESx(0xAA6666CC),
AESx(0xD8484890), AESx(0x05030306), AESx(0x01F6F6F7), AESx(0x120E0E1C),
AESx(0xA36161C2), AESx(0x5F35356A), AESx(0xF95757AE), AESx(0xD0B9B969),
AESx(0x91868617), AESx(0x58C1C199), AESx(0x271D1D3A), AESx(0xB99E9E27),
AESx(0x38E1E1D9), AESx(0x13F8F8EB), AESx(0xB398982B), AESx(0x33111122),
AESx(0xBB6969D2), AESx(0x70D9D9A9), AESx(0x898E8E07), AESx(0xA7949433),
AESx(0xB69B9B2D), AESx(0x221E1E3C), AESx(0x92878715), AESx(0x20E9E9C9),
AESx(0x49CECE87), AESx(0xFF5555AA), AESx(0x78282850), AESx(0x7ADFDFA5),
AESx(0x8F8C8C03), AESx(0xF8A1A159), AESx(0x80898909), AESx(0x170D0D1A),
AESx(0xDABFBF65), AESx(0x31E6E6D7), AESx(0xC6424284), AESx(0xB86868D0),
AESx(0xC3414182), AESx(0xB0999929), AESx(0x772D2D5A), AESx(0x110F0F1E),
AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C)
};
static const uint32_t h_AES1[256] = {
AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D),
AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154),
AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D),
AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A),
AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87),
AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B),
AESx(0xADAD41EC), AESx(0xD4D4B367), AESx(0xA2A25FFD), AESx(0xAFAF45EA),
AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B),
AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A),
AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F),
AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908),
AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F),
AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E),
AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5),
AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D),
AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F),
AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E),
AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB),
AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE),
AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397),
AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), AESx(0xEDEDC12C),
AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED),
AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B),
AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A),
AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16),
AESx(0x434386C5), AESx(0x4D4D9AD7), AESx(0x33336655), AESx(0x85851194),
AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81),
AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3),
AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A),
AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104),
AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), AESx(0x21214263),
AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D),
AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F),
AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39),
AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47),
AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695),
AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F),
AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83),
AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C),
AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76),
AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E),
AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4),
AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6),
AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B),
AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7),
AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0),
AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25),
AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018),
AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72),
AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751),
AESx(0xE8E8CB23), AESx(0xDDDDA17C), AESx(0x7474E89C), AESx(0x1F1F3E21),
AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85),
AESx(0x7070E090), AESx(0x3E3E7C42), AESx(0xB5B571C4), AESx(0x6666CCAA),
AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12),
AESx(0x6161C2A3), AESx(0x35356A5F), AESx(0x5757AEF9), AESx(0xB9B969D0),
AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9),
AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233),
AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7),
AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920),
AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A),
AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17),
AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8),
AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11),
AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A)
};
static const uint32_t h_AES2[256] = {
AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B),
AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5),
AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B),
AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76),
AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D),
AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0),
AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF),
AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0),
AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26),
AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC),
AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1),
AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15),
AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3),
AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A),
AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), AESx(0xE2DF3DE2),
AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75),
AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A),
AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0),
AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3),
AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784),
AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED),
AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B),
AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39),
AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF),
AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB),
AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485),
AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F),
AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8),
AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F),
AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5),
AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321),
AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2),
AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC),
AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917),
AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D),
AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), AESx(0x73E69573),
AESx(0x60C0A060), AESx(0x81199881), AESx(0x4F9ED14F), AESx(0xDCA37FDC),
AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388),
AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14),
AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB),
AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A),
AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), AESx(0x5CB8E45C),
AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662),
AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79),
AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D),
AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9),
AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA),
AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808),
AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E),
AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6),
AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), AESx(0x1F3E211F),
AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A),
AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66),
AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E),
AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9),
AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E),
AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311),
AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794),
AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9),
AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF),
AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D),
AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868),
AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F),
AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16)
};
static const uint32_t h_AES3[256] = {
AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B),
AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5),
AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B),
AESx(0xE719FEFE), AESx(0xB562D7D7), AESx(0x4DE6ABAB), AESx(0xEC9A7676),
AESx(0x8F45CACA), AESx(0x1F9D8282), AESx(0x8940C9C9), AESx(0xFA877D7D),
AESx(0xEF15FAFA), AESx(0xB2EB5959), AESx(0x8EC94747), AESx(0xFB0BF0F0),
AESx(0x41ECADAD), AESx(0xB367D4D4), AESx(0x5FFDA2A2), AESx(0x45EAAFAF),
AESx(0x23BF9C9C), AESx(0x53F7A4A4), AESx(0xE4967272), AESx(0x9B5BC0C0),
AESx(0x75C2B7B7), AESx(0xE11CFDFD), AESx(0x3DAE9393), AESx(0x4C6A2626),
AESx(0x6C5A3636), AESx(0x7E413F3F), AESx(0xF502F7F7), AESx(0x834FCCCC),
AESx(0x685C3434), AESx(0x51F4A5A5), AESx(0xD134E5E5), AESx(0xF908F1F1),
AESx(0xE2937171), AESx(0xAB73D8D8), AESx(0x62533131), AESx(0x2A3F1515),
AESx(0x080C0404), AESx(0x9552C7C7), AESx(0x46652323), AESx(0x9D5EC3C3),
AESx(0x30281818), AESx(0x37A19696), AESx(0x0A0F0505), AESx(0x2FB59A9A),
AESx(0x0E090707), AESx(0x24361212), AESx(0x1B9B8080), AESx(0xDF3DE2E2),
AESx(0xCD26EBEB), AESx(0x4E692727), AESx(0x7FCDB2B2), AESx(0xEA9F7575),
AESx(0x121B0909), AESx(0x1D9E8383), AESx(0x58742C2C), AESx(0x342E1A1A),
AESx(0x362D1B1B), AESx(0xDCB26E6E), AESx(0xB4EE5A5A), AESx(0x5BFBA0A0),
AESx(0xA4F65252), AESx(0x764D3B3B), AESx(0xB761D6D6), AESx(0x7DCEB3B3),
AESx(0x527B2929), AESx(0xDD3EE3E3), AESx(0x5E712F2F), AESx(0x13978484),
AESx(0xA6F55353), AESx(0xB968D1D1), AESx(0x00000000), AESx(0xC12CEDED),
AESx(0x40602020), AESx(0xE31FFCFC), AESx(0x79C8B1B1), AESx(0xB6ED5B5B),
AESx(0xD4BE6A6A), AESx(0x8D46CBCB), AESx(0x67D9BEBE), AESx(0x724B3939),
AESx(0x94DE4A4A), AESx(0x98D44C4C), AESx(0xB0E85858), AESx(0x854ACFCF),
AESx(0xBB6BD0D0), AESx(0xC52AEFEF), AESx(0x4FE5AAAA), AESx(0xED16FBFB),
AESx(0x86C54343), AESx(0x9AD74D4D), AESx(0x66553333), AESx(0x11948585),
AESx(0x8ACF4545), AESx(0xE910F9F9), AESx(0x04060202), AESx(0xFE817F7F),
AESx(0xA0F05050), AESx(0x78443C3C), AESx(0x25BA9F9F), AESx(0x4BE3A8A8),
AESx(0xA2F35151), AESx(0x5DFEA3A3), AESx(0x80C04040), AESx(0x058A8F8F),
AESx(0x3FAD9292), AESx(0x21BC9D9D), AESx(0x70483838), AESx(0xF104F5F5),
AESx(0x63DFBCBC), AESx(0x77C1B6B6), AESx(0xAF75DADA), AESx(0x42632121),
AESx(0x20301010), AESx(0xE51AFFFF), AESx(0xFD0EF3F3), AESx(0xBF6DD2D2),
AESx(0x814CCDCD), AESx(0x18140C0C), AESx(0x26351313), AESx(0xC32FECEC),
AESx(0xBEE15F5F), AESx(0x35A29797), AESx(0x88CC4444), AESx(0x2E391717),
AESx(0x9357C4C4), AESx(0x55F2A7A7), AESx(0xFC827E7E), AESx(0x7A473D3D),
AESx(0xC8AC6464), AESx(0xBAE75D5D), AESx(0x322B1919), AESx(0xE6957373),
AESx(0xC0A06060), AESx(0x19988181), AESx(0x9ED14F4F), AESx(0xA37FDCDC),
AESx(0x44662222), AESx(0x547E2A2A), AESx(0x3BAB9090), AESx(0x0B838888),
AESx(0x8CCA4646), AESx(0xC729EEEE), AESx(0x6BD3B8B8), AESx(0x283C1414),
AESx(0xA779DEDE), AESx(0xBCE25E5E), AESx(0x161D0B0B), AESx(0xAD76DBDB),
AESx(0xDB3BE0E0), AESx(0x64563232), AESx(0x744E3A3A), AESx(0x141E0A0A),
AESx(0x92DB4949), AESx(0x0C0A0606), AESx(0x486C2424), AESx(0xB8E45C5C),
AESx(0x9F5DC2C2), AESx(0xBD6ED3D3), AESx(0x43EFACAC), AESx(0xC4A66262),
AESx(0x39A89191), AESx(0x31A49595), AESx(0xD337E4E4), AESx(0xF28B7979),
AESx(0xD532E7E7), AESx(0x8B43C8C8), AESx(0x6E593737), AESx(0xDAB76D6D),
AESx(0x018C8D8D), AESx(0xB164D5D5), AESx(0x9CD24E4E), AESx(0x49E0A9A9),
AESx(0xD8B46C6C), AESx(0xACFA5656), AESx(0xF307F4F4), AESx(0xCF25EAEA),
AESx(0xCAAF6565), AESx(0xF48E7A7A), AESx(0x47E9AEAE), AESx(0x10180808),
AESx(0x6FD5BABA), AESx(0xF0887878), AESx(0x4A6F2525), AESx(0x5C722E2E),
AESx(0x38241C1C), AESx(0x57F1A6A6), AESx(0x73C7B4B4), AESx(0x9751C6C6),
AESx(0xCB23E8E8), AESx(0xA17CDDDD), AESx(0xE89C7474), AESx(0x3E211F1F),
AESx(0x96DD4B4B), AESx(0x61DCBDBD), AESx(0x0D868B8B), AESx(0x0F858A8A),
AESx(0xE0907070), AESx(0x7C423E3E), AESx(0x71C4B5B5), AESx(0xCCAA6666),
AESx(0x90D84848), AESx(0x06050303), AESx(0xF701F6F6), AESx(0x1C120E0E),
AESx(0xC2A36161), AESx(0x6A5F3535), AESx(0xAEF95757), AESx(0x69D0B9B9),
AESx(0x17918686), AESx(0x9958C1C1), AESx(0x3A271D1D), AESx(0x27B99E9E),
AESx(0xD938E1E1), AESx(0xEB13F8F8), AESx(0x2BB39898), AESx(0x22331111),
AESx(0xD2BB6969), AESx(0xA970D9D9), AESx(0x07898E8E), AESx(0x33A79494),
AESx(0x2DB69B9B), AESx(0x3C221E1E), AESx(0x15928787), AESx(0xC920E9E9),
AESx(0x8749CECE), AESx(0xAAFF5555), AESx(0x50782828), AESx(0xA57ADFDF),
AESx(0x038F8C8C), AESx(0x59F8A1A1), AESx(0x09808989), AESx(0x1A170D0D),
AESx(0x65DABFBF), AESx(0xD731E6E6), AESx(0x84C64242), AESx(0xD0B86868),
AESx(0x82C34141), AESx(0x29B09999), AESx(0x5A772D2D), AESx(0x1E110F0F),
AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616)
};
static __constant__ uint32_t d_AES0[256];
static __constant__ uint32_t d_AES1[256];
static __constant__ uint32_t d_AES2[256];
static __constant__ uint32_t d_AES3[256];
static void aes_cpu_init()
{
cudaMemcpyToSymbol( d_AES0,
h_AES0,
sizeof(h_AES0),
0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( d_AES1,
h_AES1,
sizeof(h_AES1),
0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( d_AES2,
h_AES2,
sizeof(h_AES2),
0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( d_AES3,
h_AES3,
sizeof(h_AES3),
0, cudaMemcpyHostToDevice);
}
static __device__ __forceinline__ void aes_gpu_init(uint32_t *sharedMemory)
{
if(threadIdx.x < 256)
{
sharedMemory[threadIdx.x] = d_AES0[threadIdx.x];
sharedMemory[threadIdx.x+256] = d_AES1[threadIdx.x];
sharedMemory[threadIdx.x+512] = d_AES2[threadIdx.x];
sharedMemory[threadIdx.x+768] = d_AES3[threadIdx.x];
}
}
static __device__ __forceinline__ void aes_round(
const uint32_t *sharedMemory,
uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
uint32_t k0,
uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
{
uint32_t idx0, idx1, idx2, idx3;
idx0 = __byte_perm(x0, 0, 0x4440);
idx1 = __byte_perm(x1, 0, 0x4441) + 256;
idx2 = __byte_perm(x2, 0, 0x4442) + 512;
idx3 = __byte_perm(x3, 0, 0x4443) + 768;
y0 =sharedMemory[idx0] ^
sharedMemory[idx1] ^
sharedMemory[idx2] ^
sharedMemory[idx3] ^
k0;
idx0 = __byte_perm(x1, 0, 0x4440);
idx1 = __byte_perm(x2, 0, 0x4441) + 256;
idx2 = __byte_perm(x3, 0, 0x4442) + 512;
idx3 = __byte_perm(x0, 0, 0x4443) + 768;
y1 =sharedMemory[idx0] ^
sharedMemory[idx1] ^
sharedMemory[idx2] ^
sharedMemory[idx3]; // ^k3
idx0 = __byte_perm(x2, 0, 0x4440);
idx1 = __byte_perm(x3, 0, 0x4441) + 256;
idx2 = __byte_perm(x0, 0, 0x4442) + 512;
idx3 = __byte_perm(x1, 0, 0x4443) + 768;
y2 =sharedMemory[idx0] ^
sharedMemory[idx1] ^
sharedMemory[idx2] ^
sharedMemory[idx3]; // ^k2
idx0 = __byte_perm(x3, 0, 0x4440);
idx1 = __byte_perm(x0, 0, 0x4441) + 256;
idx2 = __byte_perm(x1, 0, 0x4442) + 512;
idx3 = __byte_perm(x2, 0, 0x4443) + 768;
y3 =sharedMemory[idx0] ^
sharedMemory[idx1] ^
sharedMemory[idx2] ^
sharedMemory[idx3]; // ^k3
}
static __device__ __forceinline__ void aes_round(
const uint32_t *sharedMemory,
uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
{
uint32_t idx0, idx1, idx2, idx3;
idx0 = __byte_perm(x0, 0, 0x4440);
idx1 = __byte_perm(x1, 0, 0x4441) + 256;
idx2 = __byte_perm(x2, 0, 0x4442) + 512;
idx3 = __byte_perm(x3, 0, 0x4443) + 768;
y0 =sharedMemory[idx0] ^
sharedMemory[idx1] ^
sharedMemory[idx2] ^
sharedMemory[idx3];
idx0 = __byte_perm(x1, 0, 0x4440);
idx1 = __byte_perm(x2, 0, 0x4441) + 256;
idx2 = __byte_perm(x3, 0, 0x4442) + 512;
idx3 = __byte_perm(x0, 0, 0x4443) + 768;
y1 =sharedMemory[idx0] ^
sharedMemory[idx1] ^
sharedMemory[idx2] ^
sharedMemory[idx3]; // ^k3
idx0 = __byte_perm(x2, 0, 0x4440);
idx1 = __byte_perm(x3, 0, 0x4441) + 256;
idx2 = __byte_perm(x0, 0, 0x4442) + 512;
idx3 = __byte_perm(x1, 0, 0x4443) + 768;
y2 =sharedMemory[idx0] ^
sharedMemory[idx1] ^
sharedMemory[idx2] ^
sharedMemory[idx3]; // ^k2
idx0 = __byte_perm(x3, 0, 0x4440);
idx1 = __byte_perm(x0, 0, 0x4441) + 256;
idx2 = __byte_perm(x1, 0, 0x4442) + 512;
idx3 = __byte_perm(x2, 0, 0x4443) + 768;
y3 =sharedMemory[idx0] ^
sharedMemory[idx1] ^
sharedMemory[idx2] ^
sharedMemory[idx3]; // ^k3
}

315
x11/cuda_x11_cubehash512.cu

@ -0,0 +1,315 @@ @@ -0,0 +1,315 @@
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
typedef unsigned char BitSequence;
typedef unsigned long long DataLength;
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
static __device__ uint32_t cuda_swab32(uint32_t x)
{
return __byte_perm(x, 0, 0x0123);
}
typedef unsigned char BitSequence;
typedef unsigned long long DataLength;
#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */
#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */
typedef unsigned int uint32_t; /* must be exactly 32 bits */
#define ROTATEUPWARDS7(a) (((a) << 7) | ((a) >> 25))
#define ROTATEUPWARDS11(a) (((a) << 11) | ((a) >> 21))
#define SWAP(a,b) { uint32_t u = a; a = b; b = u; }
__constant__ uint32_t c_IV_512[32];
static const uint32_t h_IV_512[32] = {
0x2AEA2A61, 0x50F494D4, 0x2D538B8B,
0x4167D83E, 0x3FEE2313, 0xC701CF8C,
0xCC39968E, 0x50AC5695, 0x4D42C787,
0xA647A8B3, 0x97CF0BEF, 0x825B4537,
0xEEF864D2, 0xF22090C4, 0xD0E5CD33,
0xA23911AE, 0xFCD398D9, 0x148FE485,
0x1B017BEF, 0xB6444532, 0x6A536159,
0x2FF5781C, 0x91FA7934, 0x0DBADEA9,
0xD65C8A2B, 0xA5A70E75, 0xB1C62456,
0xBC796576, 0x1921C8F7, 0xE7989AF1,
0x7795D246, 0xD43E3B44
};
static __device__ void rrounds(uint32_t x[2][2][2][2][2])
{
int r;
int j;
int k;
int l;
int m;
//#pragma unroll 16
for (r = 0;r < CUBEHASH_ROUNDS;++r) {
/* "add x_0jklm into x_1jklmn modulo 2^32" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[1][j][k][l][m] += x[0][j][k][l][m];
/* "rotate x_0jklm upwards by 7 bits" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]);
/* "swap x_00klm with x_01klm" */
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
SWAP(x[0][0][k][l][m],x[0][1][k][l][m])
/* "xor x_1jklm into x_0jklm" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[0][j][k][l][m] ^= x[1][j][k][l][m];
/* "swap x_1jk0m with x_1jk1m" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (m = 0;m < 2;++m)
SWAP(x[1][j][k][0][m],x[1][j][k][1][m])
/* "add x_0jklm into x_1jklm modulo 2^32" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[1][j][k][l][m] += x[0][j][k][l][m];
/* "rotate x_0jklm upwards by 11 bits" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]);
/* "swap x_0j0lm with x_0j1lm" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
SWAP(x[0][j][0][l][m],x[0][j][1][l][m])
/* "xor x_1jklm into x_0jklm" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[0][j][k][l][m] ^= x[1][j][k][l][m];
/* "swap x_1jkl0 with x_1jkl1" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
SWAP(x[1][j][k][l][0],x[1][j][k][l][1])
}
}
static __device__ void block_tox(uint32_t block[16], uint32_t x[2][2][2][2][2])
{
int k;
int l;
int m;
uint32_t *in = block;
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[0][0][k][l][m] ^= *in++;
}
static __device__ void hash_fromx(uint32_t hash[16], uint32_t x[2][2][2][2][2])
{
int j;
int k;
int l;
int m;
uint32_t *out = hash;
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
*out++ = x[0][j][k][l][m];
}
void __device__ Init(uint32_t x[2][2][2][2][2])
{
int i,j,k,l,m;
#if 0
/* "the first three state words x_00000, x_00001, x_00010" */
/* "are set to the integers h/8, b, r respectively." */
/* "the remaining state words are set to 0." */
#pragma unroll 2
for (i = 0;i < 2;++i)
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[i][j][k][l][m] = 0;
x[0][0][0][0][0] = 512/8;
x[0][0][0][0][1] = CUBEHASH_BLOCKBYTES;
x[0][0][0][1][0] = CUBEHASH_ROUNDS;
/* "the state is then transformed invertibly through 10r identical rounds */
for (i = 0;i < 10;++i) rrounds(x);
#else
uint32_t *iv = c_IV_512;
#pragma unroll 2
for (i = 0;i < 2;++i)
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[i][j][k][l][m] = *iv++;
#endif
}
void __device__ Update32(uint32_t x[2][2][2][2][2], const BitSequence *data)
{
/* "xor the block into the first b bytes of the state" */
/* "and then transform the state invertibly through r identical rounds" */
block_tox((uint32_t*)data, x);
rrounds(x);
}
void __device__ Final(uint32_t x[2][2][2][2][2], BitSequence *hashval)
{
int i;
/* "the integer 1 is xored into the last state word x_11111" */
x[1][1][1][1][1] ^= 1;
/* "the state is then transformed invertibly through 10r identical rounds" */
#pragma unroll 10
for (i = 0;i < 10;++i) rrounds(x);
/* "output the first h/8 bytes of the state" */
hash_fromx((uint32_t*)hashval, x);
}
/***************************************************/
// Die Hash-Funktion
__global__ void x11_cubehash512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce;
uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
uint32_t x[2][2][2][2][2];
Init(x);
// erste Hälfte des Hashes (32 bytes)
Update32(x, (const BitSequence*)Hash);
// zweite Hälfte des Hashes (32 bytes)
Update32(x, (const BitSequence*)(Hash+8));
// Padding Block
uint32_t last[8];
last[0] = 0x80;
#pragma unroll 7
for (int i=1; i < 8; i++) last[i] = 0;
Update32(x, (const BitSequence*)last);
Final(x, (BitSequence*)Hash);
}
}
// Setup-Funktionen
__host__ void x11_cubehash512_cpu_init(int thr_id, int threads)
{
cudaMemcpyToSymbol( c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice);
}
__host__ void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
const int threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
x11_cubehash512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id);
}

232
x11/cuda_x11_echo.cu

@ -0,0 +1,232 @@ @@ -0,0 +1,232 @@
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <memory.h>
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
// das Hi Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t HIWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2hiint(__longlong_as_double(x));
#else
return (uint32_t)(x >> 32);
#endif
}
// das Lo Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t LOWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2loint(__longlong_as_double(x));
#else
return (uint32_t)(x & 0xFFFFFFFFULL);
#endif
}
#define SPH_C64(x) ((uint64_t)(x ## ULL))
#define SPH_C32(x) ((uint32_t)(x ## U))
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
#include "cuda_x11_aes.cu"
__device__ __forceinline__ void AES_2ROUND(
const uint32_t* __restrict__ sharedMemory,
uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3,
uint32_t &k0, uint32_t &k1, uint32_t &k2, uint32_t &k3)
{
uint32_t y0, y1, y2, y3;
aes_round(sharedMemory,
x0, x1, x2, x3,
k0,
y0, y1, y2, y3);
aes_round(sharedMemory,
y0, y1, y2, y3,
x0, x1, x2, x3);
// hier werden wir ein carry brauchen (oder auch nicht)
k0++;
}
__device__ __forceinline__ void cuda_echo_round(
const uint32_t *sharedMemory,
uint32_t &k0, uint32_t &k1, uint32_t &k2, uint32_t &k3,
uint32_t *W, int round)
{
// W hat 16*4 als Abmaße
// Big Sub Words
#pragma unroll 16
for(int i=0;i<16;i++)
{
int idx = i<<2; // *4
AES_2ROUND(sharedMemory,
W[idx+0], W[idx+1], W[idx+2], W[idx+3],
k0, k1, k2, k3);
}
// Shift Rows
#pragma unroll 4
for(int i=0;i<4;i++)
{
uint32_t t;
/// 1, 5, 9, 13
t = W[4 + i];
W[4 + i] = W[20 + i];
W[20 + i] = W[36 + i];
W[36 + i] = W[52 + i];
W[52 + i] = t;
// 2, 6, 10, 14
t = W[8 + i];
W[8 + i] = W[40 + i];
W[40 + i] = t;
t = W[24 + i];
W[24 + i] = W[56 + i];
W[56 + i] = t;
// 15, 11, 7, 3
t = W[60 + i];
W[60 + i] = W[44 + i];
W[44 + i] = W[28 + i];
W[28 + i] = W[12 + i];
W[12 + i] = t;
}
// Mix Columns
#pragma unroll 4
for(int i=0;i<4;i++) // Schleife über je 2*uint32_t
{
#pragma unroll 4
for(int j=0;j<4;j++) // Schleife über die elemnte
{
int idx = j<<2; // j*4
uint32_t a = W[ ((idx + 0)<<2) + i];
uint32_t b = W[ ((idx + 1)<<2) + i];
uint32_t c = W[ ((idx + 2)<<2) + i];
uint32_t d = W[ ((idx + 3)<<2) + i];
uint32_t ab = a ^ b;
uint32_t bc = b ^ c;
uint32_t cd = c ^ d;
uint32_t t;
t = ((ab & 0x80808080) >> 7);
uint32_t abx = t<<4 ^ t<<3 ^ t<<1 ^ t;
t = ((bc & 0x80808080) >> 7);
uint32_t bcx = t<<4 ^ t<<3 ^ t<<1 ^ t;
t = ((cd & 0x80808080) >> 7);
uint32_t cdx = t<<4 ^ t<<3 ^ t<<1 ^ t;
abx ^= ((ab & 0x7F7F7F7F) << 1);
bcx ^= ((bc & 0x7F7F7F7F) << 1);
cdx ^= ((cd & 0x7F7F7F7F) << 1);
W[ ((idx + 0)<<2) + i] = abx ^ bc ^ d;
W[ ((idx + 1)<<2) + i] = bcx ^ a ^ cd;
W[ ((idx + 2)<<2) + i] = cdx ^ ab ^ d;
W[ ((idx + 3)<<2) + i] = abx ^ bcx ^ cdx ^ ab ^ c;
}
}
}
__global__ void x11_echo512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{
__shared__ uint32_t sharedMemory[1024];
aes_gpu_init(sharedMemory);
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce;
uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
uint32_t W[64];
uint32_t k0 = 512, k1 = 0, k2 = 0, k3 = 0; // K0 = bitlen
/* Initialisierung */
#pragma unroll 8
for(int i=0;i<32;i+=4)
{
W[i + 0] = 512;
W[i + 1] = 0;
W[i + 2] = 0;
W[i + 3] = 0;
}
// kopiere 32-byte großen hash
#pragma unroll 16
for(int i=0;i<16;i++)
W[i+32] = Hash[i];
W[48] = 0x80; // fest
#pragma unroll 10
for(int i=49;i<59;i++)
W[i] = 0;
W[59] = 0x02000000; // fest
W[60] = k0; // bitlen
W[61] = k1;
W[62] = k2;
W[63] = k3;
for(int i=0;i<10;i++)
{
cuda_echo_round(sharedMemory, k0, k1, k2, k3, W, i);
}
#pragma unroll 8
for(int i=0;i<32;i+=4)
{
W[i ] ^= W[32 + i ] ^ 512;
W[i+1] ^= W[32 + i + 1];
W[i+2] ^= W[32 + i + 2];
W[i+3] ^= W[32 + i + 3];
}
#pragma unroll 16
for(int i=0;i<16;i++)
W[i] ^= Hash[i];
W[8] ^= 0x10;
W[27] ^= 0x02000000;
W[28] ^= k0;
#pragma unroll 16
for(int i=0;i<16;i++)
Hash[i] = W[i];
}
}
// Setup-Funktionen
__host__ void x11_echo512_cpu_init(int thr_id, int threads)
{
aes_cpu_init();
}
__host__ void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
const int threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
x11_echo512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id);
}

384
x11/cuda_x11_luffa512.cu

@ -0,0 +1,384 @@ @@ -0,0 +1,384 @@
/*
* luffa_for_32.c
* Version 2.0 (Sep 15th 2009)
*
* Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
*
* Hitachi, Ltd. is the owner of this software and hereby grant
* the U.S. Government and any interested party the right to use
* this software for the purposes of the SHA-3 evaluation process,
* notwithstanding that this software is copyrighted.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
typedef unsigned char BitSequence;
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
typedef struct {
uint32_t buffer[8]; /* Buffer to be hashed */
uint32_t chainv[40]; /* Chaining values */
} hashState;
static __device__ __forceinline__ uint32_t BYTES_SWAP32(uint32_t x)
{
return __byte_perm(x, x, 0x0123);
}
#define MULT2(a,j)\
tmp = a[7+(8*j)];\
a[7+(8*j)] = a[6+(8*j)];\
a[6+(8*j)] = a[5+(8*j)];\
a[5+(8*j)] = a[4+(8*j)];\
a[4+(8*j)] = a[3+(8*j)] ^ tmp;\
a[3+(8*j)] = a[2+(8*j)] ^ tmp;\
a[2+(8*j)] = a[1+(8*j)];\
a[1+(8*j)] = a[0+(8*j)] ^ tmp;\
a[0+(8*j)] = tmp;
#define TWEAK(a0,a1,a2,a3,j)\
a0 = (a0<<(j))|(a0>>(32-j));\
a1 = (a1<<(j))|(a1>>(32-j));\
a2 = (a2<<(j))|(a2>>(32-j));\
a3 = (a3<<(j))|(a3>>(32-j));
#define STEP(c0,c1)\
SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp);\
SUBCRUMB(chainv[5],chainv[6],chainv[7],chainv[4],tmp);\
MIXWORD(chainv[0],chainv[4]);\
MIXWORD(chainv[1],chainv[5]);\
MIXWORD(chainv[2],chainv[6]);\
MIXWORD(chainv[3],chainv[7]);\
ADD_CONSTANT(chainv[0],chainv[4],c0,c1);
#define SUBCRUMB(a0,a1,a2,a3,a4)\
a4 = a0;\
a0 |= a1;\
a2 ^= a3;\
a1 = ~a1;\
a0 ^= a3;\
a3 &= a4;\
a1 ^= a3;\
a3 ^= a2;\
a2 &= a0;\
a0 = ~a0;\
a2 ^= a1;\
a1 |= a3;\
a4 ^= a1;\
a3 ^= a2;\
a2 &= a1;\
a1 ^= a0;\
a0 = a4;
#define MIXWORD(a0,a4)\
a4 ^= a0;\
a0 = (a0<<2) | (a0>>(30));\
a0 ^= a4;\
a4 = (a4<<14) | (a4>>(18));\
a4 ^= a0;\
a0 = (a0<<10) | (a0>>(22));\
a0 ^= a4;\
a4 = (a4<<1) | (a4>>(31));
#define ADD_CONSTANT(a0,b0,c0,c1)\
a0 ^= c0;\
b0 ^= c1;
/* initial values of chaining variables */
__constant__ uint32_t c_IV[40];
const uint32_t h_IV[40] = {
0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465,
0x6e292011,0x90152df4,0xee058139,0xdef610bb,
0xc3b44b95,0xd9d2f256,0x70eee9a0,0xde099fa3,
0x5d9b0557,0x8fc944b3,0xcf1ccf0e,0x746cd581,
0xf7efc89d,0x5dba5781,0x04016ce5,0xad659c05,
0x0306194f,0x666d1836,0x24aa230a,0x8b264ae7,
0x858075d5,0x36d79cce,0xe571f7d7,0x204b1f67,
0x35870c6a,0x57e9e923,0x14bcb808,0x7cde72ce,
0x6c68e9be,0x5ec41e22,0xc825b7c7,0xaffb4363,
0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea};
__constant__ uint32_t c_CNS[80];
uint32_t h_CNS[80] = {
0x303994a6,0xe0337818,0xc0e65299,0x441ba90d,
0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f,
0x1e00108f,0xe5a8bce6,0x7800423d,0x5274baf4,
0x8f5b7882,0x26889ba7,0x96e1db12,0x9a226e9d,
0xb6de10ed,0x01685f3d,0x70f47aae,0x05a17cf4,
0x0707a3d4,0xbd09caca,0x1c1e8f51,0xf4272b28,
0x707a3d45,0x144ae5cc,0xaeb28562,0xfaa7ae2b,
0xbaca1589,0x2e48f1c1,0x40a46f3e,0xb923c704,
0xfc20d9d2,0xe25e72c1,0x34552e25,0xe623bb72,
0x7ad8818f,0x5c58a4a4,0x8438764a,0x1e38e2e7,
0xbb6de032,0x78e38b9d,0xedb780c8,0x27586719,
0xd9847356,0x36eda57f,0xa2c78434,0x703aace7,
0xb213afa5,0xe028c9bf,0xc84ebe95,0x44756f91,
0x4e608a22,0x7e8fce32,0x56d858fe,0x956548be,
0x343b138f,0xfe191be2,0xd0ec4e3d,0x3cb226e5,
0x2ceb4882,0x5944a28e,0xb3ad2208,0xa1c4c355,
0xf0d2e9e3,0x5090d577,0xac11d7fa,0x2d1925ab,
0x1bcb66f2,0xb46496ac,0x6f2d9bc9,0xd1925ab0,
0x78602649,0x29131ab6,0x8edae952,0x0fc053c3,
0x3b6ba548,0x3f014f0c,0xedae9520,0xfc053c31};
/***************************************************/
__device__ __forceinline__ void rnd512(hashState *state)
{
int i,j;
uint32_t t[40];
uint32_t chainv[8];
uint32_t tmp;
#pragma unroll 8
for(i=0;i<8;i++) {
t[i]=0;
#pragma unroll 5
for(j=0;j<5;j++) {
t[i] ^= state->chainv[i+8*j];
}
}
MULT2(t, 0);
#pragma unroll 5
for(j=0;j<5;j++) {
#pragma unroll 8
for(i=0;i<8;i++) {
state->chainv[i+8*j] ^= t[i];
}
}
#pragma unroll 5
for(j=0;j<5;j++) {
#pragma unroll 8
for(i=0;i<8;i++) {
t[i+8*j] = state->chainv[i+8*j];
}
}
#pragma unroll 5
for(j=0;j<5;j++) {
MULT2(state->chainv, j);
}
#pragma unroll 5
for(j=0;j<5;j++) {
#pragma unroll 8
for(i=0;i<8;i++) {
state->chainv[8*j+i] ^= t[8*((j+1)%5)+i];
}
}
#pragma unroll 5
for(j=0;j<5;j++) {
#pragma unroll 8
for(i=0;i<8;i++) {
t[i+8*j] = state->chainv[i+8*j];
}
}
#pragma unroll 5
for(j=0;j<5;j++) {
MULT2(state->chainv, j);
}
#pragma unroll 5
for(j=0;j<5;j++) {
#pragma unroll 8
for(i=0;i<8;i++) {
state->chainv[8*j+i] ^= t[8*((j+4)%5)+i];
}
}
#pragma unroll 5
for(j=0;j<5;j++) {
#pragma unroll 8
for(i=0;i<8;i++) {
state->chainv[i+8*j] ^= state->buffer[i];
}
MULT2(state->buffer, 0);
}
#pragma unroll 8
for(i=0;i<8;i++) {
chainv[i] = state->chainv[i];
}
#pragma unroll 8
for(i=0;i<8;i++) {
STEP(c_CNS[(2*i)],c_CNS[(2*i)+1]);
}
#pragma unroll 8
for(i=0;i<8;i++) {
state->chainv[i] = chainv[i];
chainv[i] = state->chainv[i+8];
}
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],1);
#pragma unroll 8
for(i=0;i<8;i++) {
STEP(c_CNS[(2*i)+16],c_CNS[(2*i)+16+1]);
}
#pragma unroll 8
for(i=0;i<8;i++) {
state->chainv[i+8] = chainv[i];
chainv[i] = state->chainv[i+16];
}
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],2);
#pragma unroll 8
for(i=0;i<8;i++) {
STEP(c_CNS[(2*i)+32],c_CNS[(2*i)+32+1]);
}
#pragma unroll 8
for(i=0;i<8;i++) {
state->chainv[i+16] = chainv[i];
chainv[i] = state->chainv[i+24];
}
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],3);
#pragma unroll 8
for(i=0;i<8;i++) {
STEP(c_CNS[(2*i)+48],c_CNS[(2*i)+48+1]);
}
#pragma unroll 8
for(i=0;i<8;i++) {
state->chainv[i+24] = chainv[i];
chainv[i] = state->chainv[i+32];
}
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],4);
#pragma unroll 8
for(i=0;i<8;i++) {
STEP(c_CNS[(2*i)+64],c_CNS[(2*i)+64+1]);
}
#pragma unroll 8
for(i=0;i<8;i++) {
state->chainv[i+32] = chainv[i];
}
}
__device__ __forceinline__ void Update512(hashState *state, const BitSequence *data)
{
#pragma unroll 8
for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)data)[i]);
rnd512(state);
#pragma unroll 8
for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+32))[i]);
rnd512(state);
}
/***************************************************/
__device__ __forceinline__ void finalization512(hashState *state, uint32_t *b)
{
int i,j;
state->buffer[0] = 0x80000000;
#pragma unroll 7
for(int i=1;i<8;i++) state->buffer[i] = 0;
rnd512(state);
/*---- blank round with m=0 ----*/
#pragma unroll 8
for(i=0;i<8;i++) state->buffer[i] =0;
rnd512(state);
#pragma unroll 8
for(i=0;i<8;i++) {
b[i] = 0;
#pragma unroll 5
for(j=0;j<5;j++) {
b[i] ^= state->chainv[i+8*j];
}
b[i] = BYTES_SWAP32((b[i]));
}
#pragma unroll 8
for(i=0;i<8;i++) state->buffer[i]=0;
rnd512(state);
#pragma unroll 8
for(i=0;i<8;i++) {
b[8+i] = 0;
#pragma unroll 5
for(j=0;j<5;j++) {
b[8+i] ^= state->chainv[i+8*j];
}
b[8+i] = BYTES_SWAP32((b[8+i]));
}
}
/***************************************************/
// Die Hash-Funktion
__global__ void x11_luffa512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce;
uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
hashState state;
#pragma unroll 40
for(int i=0;i<40;i++) state.chainv[i] = c_IV[i];
#pragma unroll 8
for(int i=0;i<8;i++) state.buffer[i] = 0;
Update512(&state, (BitSequence*)Hash);
finalization512(&state, (uint32_t*)Hash);
}
}
// Setup-Funktionen
__host__ void x11_luffa512_cpu_init(int thr_id, int threads)
{
cudaMemcpyToSymbol( c_IV, h_IV, sizeof(h_IV), 0, cudaMemcpyHostToDevice );
cudaMemcpyToSymbol( c_CNS, h_CNS, sizeof(h_CNS), 0, cudaMemcpyHostToDevice );
}
__host__ void x11_luffa512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
const int threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
x11_luffa512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id);
}

1380
x11/cuda_x11_shavite512.cu

File diff suppressed because it is too large Load Diff

765
x11/cuda_x11_simd512.cu

@ -0,0 +1,765 @@ @@ -0,0 +1,765 @@
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#define C32(x) ((uint32_t)(x ## U))
#define T32(x) ((x) & C32(0xFFFFFFFF))
#if __CUDA_ARCH__ < 350
// Kepler (Compute 3.0)
#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n))))
#else
// Kepler (Compute 3.5)
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
#endif
__constant__ uint32_t c_IV_512[32];
const uint32_t h_IV_512[32] = {
0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e,
0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22
};
__constant__ int c_FFT[256];
const int h_FFT[256] =
{
// this is the FFT result in revbin permuted order
4, -4, 32, -32, -60, 60, 60, -60, 101, -101, 58, -58, 112, -112, -11, 11, -92, 92,
-119, 119, 42, -42, -82, 82, 32, -32, 32, -32, 121, -121, 17, -17, -47, 47, 63,
-63, 107, -107, -76, 76, -119, 119, -83, 83, 126, -126, 94, -94, -23, 23, -76,
76, -47, 47, 92, -92, -117, 117, 73, -73, -53, 53, 88, -88, -80, 80, -47, 47,
5, -5, 67, -67, 34, -34, 4, -4, 87, -87, -28, 28, -70, 70, -110, 110, -18, 18, 93,
-93, 51, -51, 36, -36, 118, -118, -106, 106, 45, -45, -108, 108, -44, 44, 117,
-117, -121, 121, -37, 37, 65, -65, 37, -37, 40, -40, -42, 42, 91, -91, -128, 128,
-21, 21, 94, -94, -98, 98, -47, 47, 28, -28, 115, -115, 16, -16, -20, 20, 122,
-122, 115, -115, 46, -46, 84, -84, -127, 127, 57, -57, 127, -127, -80, 80, 24,
-24, 15, -15, 29, -29, -78, 78, -126, 126, 16, -16, 52, -52, 55, -55, 110, -110,
-51, 51, -120, 120, -124, 124, -24, 24, -76, 76, 26, -26, -21, 21, -64, 64, -99,
99, 85, -85, -15, 15, -120, 120, -116, 116, 85, -85, 12, -12, -24, 24, 4, -4,
79, -79, 76, -76, 23, -23, 4, -4, -108, 108, -20, 20, 73, -73, -42, 42, -7, 7,
-29, 29, -123, 123, 49, -49, -96, 96, -68, 68, -112, 112, 116, -116, -24, 24, 93,
-93, -125, 125, -86, 86, 117, -117, -91, 91, 42, -42, 87, -87, -117, 117, 102, -102
};
__constant__ int c_P4[32][4];
static const int h_P4[32][4] = {
{ 2, 34, 18, 50 },
{ 6, 38, 22, 54 },
{ 0, 32, 16, 48 },
{ 4, 36, 20, 52 },
{ 14, 46, 30, 62 },
{ 10, 42, 26, 58 },
{ 12, 44, 28, 60 },
{ 8, 40, 24, 56 },
{ 15, 47, 31, 63 },
{ 13, 45, 29, 61 },
{ 3, 35, 19, 51 },
{ 1, 33, 17, 49 },
{ 9, 41, 25, 57 },
{ 11, 43, 27, 59 },
{ 5, 37, 21, 53 },
{ 7, 39, 23, 55 },
{ 8, 40, 24, 56 },
{ 4, 36, 20, 52 },
{ 14, 46, 30, 62 },
{ 2, 34, 18, 50 },
{ 6, 38, 22, 54 },
{ 10, 42, 26, 58 },
{ 0, 32, 16, 48 },
{ 12, 44, 28, 60 },
{ 70, 102, 86, 118 },
{ 64, 96, 80, 112 },
{ 72, 104, 88, 120 },
{ 78, 110, 94, 126 },
{ 76, 108, 92, 124 },
{ 74, 106, 90, 122 },
{ 66, 98, 82, 114 },
{ 68, 100, 84, 116 }
};
__constant__ int c_Q4[32][4];
static const int h_Q4[32][4] = {
{ 66, 98, 82, 114 },
{ 70, 102, 86, 118 },
{ 64, 96, 80, 112 },
{ 68, 100, 84, 116 },
{ 78, 110, 94, 126 },
{ 74, 106, 90, 122 },
{ 76, 108, 92, 124 },
{ 72, 104, 88, 120 },
{ 79, 111, 95, 127 },
{ 77, 109, 93, 125 },
{ 67, 99, 83, 115 },
{ 65, 97, 81, 113 },
{ 73, 105, 89, 121 },
{ 75, 107, 91, 123 },
{ 69, 101, 85, 117 },
{ 71, 103, 87, 119 },
{ 9, 41, 25, 57 },
{ 5, 37, 21, 53 },
{ 15, 47, 31, 63 },
{ 3, 35, 19, 51 },
{ 7, 39, 23, 55 },
{ 11, 43, 27, 59 },
{ 1, 33, 17, 49 },
{ 13, 45, 29, 61 },
{ 71, 103, 87, 119 },
{ 65, 97, 81, 113 },
{ 73, 105, 89, 121 },
{ 79, 111, 95, 127 },
{ 77, 109, 93, 125 },
{ 75, 107, 91, 123 },
{ 67, 99, 83, 115 },
{ 69, 101, 85, 117 }
};
__constant__ int c_P8[32][8];
static const int h_P8[32][8] = {
{ 2, 66, 34, 98, 18, 82, 50, 114 },
{ 6, 70, 38, 102, 22, 86, 54, 118 },
{ 0, 64, 32, 96, 16, 80, 48, 112 },
{ 4, 68, 36, 100, 20, 84, 52, 116 },
{ 14, 78, 46, 110, 30, 94, 62, 126 },
{ 10, 74, 42, 106, 26, 90, 58, 122 },
{ 12, 76, 44, 108, 28, 92, 60, 124 },
{ 8, 72, 40, 104, 24, 88, 56, 120 },
{ 15, 79, 47, 111, 31, 95, 63, 127 },
{ 13, 77, 45, 109, 29, 93, 61, 125 },
{ 3, 67, 35, 99, 19, 83, 51, 115 },
{ 1, 65, 33, 97, 17, 81, 49, 113 },
{ 9, 73, 41, 105, 25, 89, 57, 121 },
{ 11, 75, 43, 107, 27, 91, 59, 123 },
{ 5, 69, 37, 101, 21, 85, 53, 117 },
{ 7, 71, 39, 103, 23, 87, 55, 119 },
{ 8, 72, 40, 104, 24, 88, 56, 120 },
{ 4, 68, 36, 100, 20, 84, 52, 116 },
{ 14, 78, 46, 110, 30, 94, 62, 126 },
{ 2, 66, 34, 98, 18, 82, 50, 114 },
{ 6, 70, 38, 102, 22, 86, 54, 118 },
{ 10, 74, 42, 106, 26, 90, 58, 122 },
{ 0, 64, 32, 96, 16, 80, 48, 112 },
{ 12, 76, 44, 108, 28, 92, 60, 124 },
{ 134, 198, 166, 230, 150, 214, 182, 246 },
{ 128, 192, 160, 224, 144, 208, 176, 240 },
{ 136, 200, 168, 232, 152, 216, 184, 248 },
{ 142, 206, 174, 238, 158, 222, 190, 254 },
{ 140, 204, 172, 236, 156, 220, 188, 252 },
{ 138, 202, 170, 234, 154, 218, 186, 250 },
{ 130, 194, 162, 226, 146, 210, 178, 242 },
{ 132, 196, 164, 228, 148, 212, 180, 244 },
};
__constant__ int c_Q8[32][8];
static const int h_Q8[32][8] = {
{ 130, 194, 162, 226, 146, 210, 178, 242 },
{ 134, 198, 166, 230, 150, 214, 182, 246 },
{ 128, 192, 160, 224, 144, 208, 176, 240 },
{ 132, 196, 164, 228, 148, 212, 180, 244 },
{ 142, 206, 174, 238, 158, 222, 190, 254 },
{ 138, 202, 170, 234, 154, 218, 186, 250 },
{ 140, 204, 172, 236, 156, 220, 188, 252 },
{ 136, 200, 168, 232, 152, 216, 184, 248 },
{ 143, 207, 175, 239, 159, 223, 191, 255 },
{ 141, 205, 173, 237, 157, 221, 189, 253 },
{ 131, 195, 163, 227, 147, 211, 179, 243 },
{ 129, 193, 161, 225, 145, 209, 177, 241 },
{ 137, 201, 169, 233, 153, 217, 185, 249 },
{ 139, 203, 171, 235, 155, 219, 187, 251 },
{ 133, 197, 165, 229, 149, 213, 181, 245 },
{ 135, 199, 167, 231, 151, 215, 183, 247 },
{ 9, 73, 41, 105, 25, 89, 57, 121 },
{ 5, 69, 37, 101, 21, 85, 53, 117 },
{ 15, 79, 47, 111, 31, 95, 63, 127 },
{ 3, 67, 35, 99, 19, 83, 51, 115 },
{ 7, 71, 39, 103, 23, 87, 55, 119 },
{ 11, 75, 43, 107, 27, 91, 59, 123 },
{ 1, 65, 33, 97, 17, 81, 49, 113 },
{ 13, 77, 45, 109, 29, 93, 61, 125 },
{ 135, 199, 167, 231, 151, 215, 183, 247 },
{ 129, 193, 161, 225, 145, 209, 177, 241 },
{ 137, 201, 169, 233, 153, 217, 185, 249 },
{ 143, 207, 175, 239, 159, 223, 191, 255 },
{ 141, 205, 173, 237, 157, 221, 189, 253 },
{ 139, 203, 171, 235, 155, 219, 187, 251 },
{ 131, 195, 163, 227, 147, 211, 179, 243 },
{ 133, 197, 165, 229, 149, 213, 181, 245 },
};
__constant__ int c_FFT64_8_8_Twiddle[64];
static const int h_FFT64_8_8_Twiddle[64] = {
1, 1, 1, 1, 1, 1, 1, 1,
1, 2, 4, 8, 16, 32, 64, 128,
1, 60, 2, 120, 4, -17, 8, -34,
1, 120, 8, -68, 64, -30, -2, 17,
1, 46, 60, -67, 2, 92, 120, 123,
1, 92, -17, -22, 32, 117, -30, 67,
1, -67, 120, -73, 8, -22, -68, -70,
1, 123, -34, -70, 128, 67, 17, 35,
};
__constant__ int c_FFT128_2_64_Twiddle[64];
static const int h_FFT128_2_64_Twiddle[64] = {
1, -118, 46, -31, 60, 116, -67, -61,
2, 21, 92, -62, 120, -25, 123, -122,
4, 42, -73, -124, -17, -50, -11, 13,
8, 84, 111, 9, -34, -100, -22, 26,
16, -89, -35, 18, -68, 57, -44, 52,
32, 79, -70, 36, 121, 114, -88, 104,
64, -99, 117, 72, -15, -29, 81, -49,
128, 59, -23, -113, -30, -58, -95, -98
};
__constant__ int c_FFT128_16_8_Twiddle[128];
static const int h_FFT128_16_8_Twiddle[128] = {
1, 1, 1, 1, 1, 1, 1, 1,
1, 2, 4, 8, 16, 32, 64, 128,
1, 60, 2, 120, 4, -17, 8, -34,
1, 120, 8, -68, 64, -30, -2, 17,
1, 46, 60, -67, 2, 92, 120, 123,
1, 92, -17, -22, 32, 117, -30, 67,
1, -67, 120, -73, 8, -22, -68, -70,
1, 123, -34, -70, 128, 67, 17, 35,
1, -118, 46, -31, 60, 116, -67, -61,
1, 21, -73, 9, -68, 114, 81, -98,
1, 116, 92, -122, -17, 84, -22, 18,
1, -25, 111, 52, -15, 118, -123, -9,
1, -31, -67, 21, 120, -122, -73, -50,
1, -62, -11, -89, 121, -49, -46, 25,
1, -61, 123, -50, -34, 18, -70, -99,
1, -122, -22, 114, -30, 62, -111, -79 };
__constant__ int c_FFT128_8_16_Twiddle[128];
static const int h_FFT128_8_16_Twiddle[128] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30,
1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22,
1, -67, 120, -73, 8, -22, -68, -70, 64, 81, -30, -46, -2, -123, 17, -111,
1, -118, 46, -31, 60, 116, -67, -61, 2, 21, 92, -62, 120, -25, 123, -122,
1, 116, 92, -122, -17, 84, -22, 18, 32, 114, 117, -49, -30, 118, 67, 62,
1, -31, -67, 21, 120, -122, -73, -50, 8, 9, -22, -89, -68, 52, -70, 114,
1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79};
__constant__ int c_FFT256_2_128_Twiddle[128];
static const int h_FFT256_2_128_Twiddle[128] = {
1, 41, -118, 45, 46, 87, -31, 14,
60, -110, 116, -127, -67, 80, -61, 69,
2, 82, 21, 90, 92, -83, -62, 28,
120, 37, -25, 3, 123, -97, -122, -119,
4, -93, 42, -77, -73, 91, -124, 56,
-17, 74, -50, 6, -11, 63, 13, 19,
8, 71, 84, 103, 111, -75, 9, 112,
-34, -109, -100, 12, -22, 126, 26, 38,
16, -115, -89, -51, -35, 107, 18, -33,
-68, 39, 57, 24, -44, -5, 52, 76,
32, 27, 79, -102, -70, -43, 36, -66,
121, 78, 114, 48, -88, -10, 104, -105,
64, 54, -99, 53, 117, -86, 72, 125,
-15, -101, -29, 96, 81, -20, -49, 47,
128, 108, 59, 106, -23, 85, -113, -7,
-30, 55, -58, -65, -95, -40, -98, 94};
#define p8_xor(x) ( ((x)%7) == 0 ? 1 : \
((x)%7) == 1 ? 6 : \
((x)%7) == 2 ? 2 : \
((x)%7) == 3 ? 3 : \
((x)%7) == 4 ? 5 : \
((x)%7) == 5 ? 7 : \
4 )
/************* the round function ****************/
#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
#define MAJ(x, y, z) (((z) & (y)) | (((z) | (y)) & (x)))
__device__ __forceinline__ void STEP8_IF(const uint32_t *w, const int i, const int r, const int s, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D)
{
int j;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
#pragma unroll 8
for(j=0; j<8; j++) {
D[j] = D[j] + w[j] + IF(A[j], B[j], C[j]);
D[j] = T32(ROTL32(T32(D[j]), s) + R[j^p8_xor(i)]);
A[j] = R[j];
}
}
__device__ __forceinline__ void STEP8_MAJ(const uint32_t *w, const int i, const int r, const int s, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D)
{
int j;
uint32_t R[8];
#pragma unroll 8
for(j=0; j<8; j++) {
R[j] = ROTL32(A[j], r);
}
#pragma unroll 8
for(j=0; j<8; j++) {
D[j] = D[j] + w[j] + MAJ(A[j], B[j], C[j]);
D[j] = T32(ROTL32(T32(D[j]), s) + R[j^p8_xor(i)]);
A[j] = R[j];
}
}
__device__ __forceinline__ void Round8(uint32_t A[128], const int y[128], int i,
int r, int s, int t, int u) {
int code = i<2? 185: 233;
uint32_t w[8][8];
int a, b;
/*
* The FFT output y is in revbin permuted order,
* but this is included in the tables P and Q
*/
#pragma unroll 8
for(a=0; a<8; a++)
#pragma unroll 8
for(b=0; b<8; b++)
w[a][b] = __byte_perm( (y[c_P8[8*i+a][b]] * code), (y[c_Q8[8*i+a][b]] * code), 0x5410);
STEP8_IF(w[0], 8*i+0, r, s, A, &A[8], &A[16], &A[24]);
STEP8_IF(w[1], 8*i+1, s, t, &A[24], A, &A[8], &A[16]);
STEP8_IF(w[2], 8*i+2, t, u, &A[16], &A[24], A, &A[8]);
STEP8_IF(w[3], 8*i+3, u, r, &A[8], &A[16], &A[24], A);
STEP8_MAJ(w[4], 8*i+4, r, s, A, &A[8], &A[16], &A[24]);
STEP8_MAJ(w[5], 8*i+5, s, t, &A[24], A, &A[8], &A[16]);
STEP8_MAJ(w[6], 8*i+6, t, u, &A[16], &A[24], A, &A[8]);
STEP8_MAJ(w[7], 8*i+7, u, r, &A[8], &A[16], &A[24], A);
}
/********************* Message expansion ************************/
/*
* Reduce modulo 257; result is in [-127; 383]
* REDUCE(x) := (x&255) - (x>>8)
*/
#define REDUCE(x) (((x)&255) - ((x)>>8))
/*
* Reduce from [-127; 383] to [-128; 128]
* EXTRA_REDUCE_S(x) := x<=128 ? x : x-257
*/
#define EXTRA_REDUCE_S(x) \
((x)<=128 ? (x) : (x)-257)
/*
* Reduce modulo 257; result is in [-128; 128]
*/
#define REDUCE_FULL_S(x) \
EXTRA_REDUCE_S(REDUCE(x))
__device__ __forceinline__ void FFT_8(int *y, int stripe) {
/*
* FFT_8 using w=4 as 8th root of unity
* Unrolled decimation in frequency (DIF) radix-2 NTT.
* Output data is in revbin_permuted order.
*/
#define X(i) y[stripe*i]
#define DO_REDUCE(i) \
X(i) = REDUCE(X(i))
#define DO_REDUCE_FULL_S(i) \
do { \
X(i) = REDUCE(X(i)); \
X(i) = EXTRA_REDUCE_S(X(i)); \
} while(0)
#define BUTTERFLY(i,j,n) \
do { \
int u= X(i); \
int v= X(j); \
X(i) = u+v; \
X(j) = (u-v) << (2*n); \
} while(0)
BUTTERFLY(0, 4, 0);
BUTTERFLY(1, 5, 1);
BUTTERFLY(2, 6, 2);
BUTTERFLY(3, 7, 3);
DO_REDUCE(6);
DO_REDUCE(7);
BUTTERFLY(0, 2, 0);
BUTTERFLY(4, 6, 0);
BUTTERFLY(1, 3, 2);
BUTTERFLY(5, 7, 2);
DO_REDUCE(7);
BUTTERFLY(0, 1, 0);
BUTTERFLY(2, 3, 0);
BUTTERFLY(4, 5, 0);
BUTTERFLY(6, 7, 0);
DO_REDUCE_FULL_S(0);
DO_REDUCE_FULL_S(1);
DO_REDUCE_FULL_S(2);
DO_REDUCE_FULL_S(3);
DO_REDUCE_FULL_S(4);
DO_REDUCE_FULL_S(5);
DO_REDUCE_FULL_S(6);
DO_REDUCE_FULL_S(7);
#undef X
#undef DO_REDUCE
#undef DO_REDUCE_FULL_S
#undef BUTTERFLY
}
__device__ __forceinline__ void FFT_16(int *y, int stripe) {
/*
* FFT_16 using w=2 as 16th root of unity
* Unrolled decimation in frequency (DIF) radix-2 NTT.
* Output data is in revbin_permuted order.
*/
#define X(i) y[stripe*i]
#define DO_REDUCE(i) \
X(i) = REDUCE(X(i))
#define DO_REDUCE_FULL_S(i) \
do { \
X(i) = REDUCE(X(i)); \
X(i) = EXTRA_REDUCE_S(X(i)); \
} while(0)
#define BUTTERFLY(i,j,n) \
do { \
int u= X(i); \
int v= X(j); \
X(i) = u+v; \
X(j) = (u-v) << n; \
} while(0)
BUTTERFLY(0, 8, 0);
BUTTERFLY(1, 9, 1);
BUTTERFLY(2, 10, 2);
BUTTERFLY(3, 11, 3);
BUTTERFLY(4, 12, 4);
BUTTERFLY(5, 13, 5);
BUTTERFLY(6, 14, 6);
BUTTERFLY(7, 15, 7);
DO_REDUCE(11);
DO_REDUCE(12);
DO_REDUCE(13);
DO_REDUCE(14);
DO_REDUCE(15);
BUTTERFLY( 0, 4, 0);
BUTTERFLY( 8, 12, 0);
BUTTERFLY( 1, 5, 2);
BUTTERFLY( 9, 13, 2);
BUTTERFLY( 2, 6, 4);
BUTTERFLY(10, 14, 4);
BUTTERFLY( 3, 7, 6);
BUTTERFLY(11, 15, 6);
DO_REDUCE(5);
DO_REDUCE(7);
DO_REDUCE(13);
DO_REDUCE(15);
BUTTERFLY( 0, 2, 0);
BUTTERFLY( 4, 6, 0);
BUTTERFLY( 8, 10, 0);
BUTTERFLY(12, 14, 0);
BUTTERFLY( 1, 3, 4);
BUTTERFLY( 5, 7, 4);
BUTTERFLY( 9, 11, 4);
BUTTERFLY(13, 15, 4);
BUTTERFLY( 0, 1, 0);
BUTTERFLY( 2, 3, 0);
BUTTERFLY( 4, 5, 0);
BUTTERFLY( 6, 7, 0);
BUTTERFLY( 8, 9, 0);
BUTTERFLY(10, 11, 0);
BUTTERFLY(12, 13, 0);
BUTTERFLY(14, 15, 0);
DO_REDUCE_FULL_S( 0);
DO_REDUCE_FULL_S( 1);
DO_REDUCE_FULL_S( 2);
DO_REDUCE_FULL_S( 3);
DO_REDUCE_FULL_S( 4);
DO_REDUCE_FULL_S( 5);
DO_REDUCE_FULL_S( 6);
DO_REDUCE_FULL_S( 7);
DO_REDUCE_FULL_S( 8);
DO_REDUCE_FULL_S( 9);
DO_REDUCE_FULL_S(10);
DO_REDUCE_FULL_S(11);
DO_REDUCE_FULL_S(12);
DO_REDUCE_FULL_S(13);
DO_REDUCE_FULL_S(14);
DO_REDUCE_FULL_S(15);
#undef X
#undef DO_REDUCE
#undef DO_REDUCE_FULL_S
#undef BUTTERFLY
}
__device__ __forceinline__ void FFT_64(int *y) {
/*
* FFT_64 using w=46 as 64th root of unity
* decimation in frequency (DIF) radix-8 NTT.
* Output data is in revbin_permuted order.
*/
int i;
/*
* Begin with 8 parallels DIF FFT_8.
*/
#pragma unroll 8
for (i=0; i<8; i++) {
FFT_8(y+i,8);
}
/*
* Multiply by twiddle factors
*/
#pragma unroll 56
for (i=8; i<64; i++)
if (i & 7) y[i] = REDUCE(y[i]*c_FFT64_8_8_Twiddle[i]);
/*
* Finish with 8 paralles DIF FFT_8.
*/
#pragma unroll 8
for (i=0; i<8; i++) {
FFT_8(y+8*i,1);
}
}
__device__ __forceinline__ void FFT_128_halfzero(int *y) {
/*
* FFT_128 using w=139 as 128th root of unity.
* Decimation in frequency (DIF) NTT.
* Output data is in revbin_permuted order.
* In place.
*/
const int tmp = y[63];
int i;
#pragma unroll 63
for (i=0; i<63; i++)
y[64+i] = REDUCE(y[i] * c_FFT128_2_64_Twiddle[i]);
/* handle X^127 */
y[63] = REDUCE(tmp + 1);
y[127] = REDUCE((tmp - 1) * c_FFT128_2_64_Twiddle[63]);
FFT_64(y);
FFT_64(y+64);
}
__device__ __forceinline__ void FFT_128_full(int *y) {
int i;
#pragma unroll 16
for (i=0; i<16; i++) {
FFT_8(y+i,16);
}
#pragma unroll 128
for (i=0; i<128; i++)
/*if (i & 7)*/ y[i] = REDUCE(y[i]*c_FFT128_8_16_Twiddle[i]);
#pragma unroll 8
for (i=0; i<8; i++) {
FFT_16(y+16*i,1);
}
}
__device__ __forceinline__ void FFT_256_halfzero(int *y) {
int i;
/*
* FFT_256 using w=41 as 256th root of unity.
* Decimation in frequency (DIF) NTT.
* Output data is in revbin_permuted order.
* In place.
*/
const int tmp = y[127];
#pragma unroll 127
for (i=0; i<127; i++)
y[128+i] = REDUCE(y[i] * c_FFT256_2_128_Twiddle[i]);
/* handle X^255 with an additionnal butterfly */
y[127] = REDUCE(tmp + 1);
y[255] = REDUCE((tmp - 1) * c_FFT256_2_128_Twiddle[127]);
FFT_128_full(y);
FFT_128_full(y+128);
}
__device__ __forceinline__ void SIMD_Compress(uint32_t A[128], const int *expanded, const uint32_t *M) {
uint32_t IV[4][8];
int i;
/* Save the chaining value for the feed-forward */
#pragma unroll 8
for(i=0; i<8; i++) {
IV[0][i] = A[i];
IV[1][i] = (&A[8])[i];
IV[2][i] = (&A[16])[i];
IV[3][i] = (&A[24])[i];
}
/* XOR the message to the chaining value */
/* we can XOR word-by-word */
{
#pragma unroll 8
for(i=0; i<8; i++) {
A[i] ^= M[i];
(&A[8])[i] ^= M[8+i];
}
}
/* Run the feistel ladders with the expanded message */
{
Round8(A, expanded, 0, 3, 23, 17, 27);
Round8(A, expanded, 1, 28, 19, 22, 7);
Round8(A, expanded, 2, 29, 9, 15, 5);
Round8(A, expanded, 3, 4, 13, 10, 25);
STEP8_IF(IV[0], 32, 4, 13, A, &A[8], &A[16], &A[24]);
STEP8_IF(IV[1], 33, 13, 10, &A[24], A, &A[8], &A[16]);
STEP8_IF(IV[2], 34, 10, 25, &A[16], &A[24], A, &A[8]);
STEP8_IF(IV[3], 35, 25, 4, &A[8], &A[16], &A[24], A);
}
}
/***************************************************/
__device__ __forceinline__ void SIMDHash(const uint32_t *data, uint32_t *hashval) {
uint32_t A[128];
int i;
uint32_t buffer[16];
#pragma unroll 32
for (i=0; i < 32; i++) A[i] = c_IV_512[i];
#pragma unroll 16
for (i=0; i < 16; i++) buffer[i] = data[i];
/* Message Expansion using Number Theoretical Transform similar to FFT */
int expanded[256];
{
#pragma unroll 16
for(i=0; i<64; i+=4) {
expanded[i+0] = __byte_perm(buffer[i/4],0,0x4440);
expanded[i+1] = __byte_perm(buffer[i/4],0,0x4441);
expanded[i+2] = __byte_perm(buffer[i/4],0,0x4442);
expanded[i+3] = __byte_perm(buffer[i/4],0,0x4443);
}
#pragma unroll 16
for(i=64; i<128; i+=4) {
expanded[i+0] = 0;
expanded[i+1] = 0;
expanded[i+2] = 0;
expanded[i+3] = 0;
}
FFT_256_halfzero(expanded);
}
/* Compression Function */
SIMD_Compress(A, expanded, buffer);
/* Padding Round with known input (hence the FFT can be precomputed) */
buffer[0] = 512;
#pragma unroll 15
for (i=1; i < 16; i++) buffer[i] = 0;
SIMD_Compress(A, c_FFT, buffer);
#pragma unroll 16
for (i=0; i < 16; i++)
hashval[i] = A[i];
}
/***************************************************/
// Die Hash-Funktion
__global__ void x11_simd512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce;
uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
SIMDHash(Hash, Hash);
}
}
// Setup-Funktionen
__host__ void x11_simd512_cpu_init(int thr_id, int threads)
{
cudaMemcpyToSymbol( c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( c_FFT, h_FFT, sizeof(h_FFT), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( c_P4, h_P4, sizeof(h_P4), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( c_Q4, h_Q4, sizeof(h_Q4), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( c_P8, h_P8, sizeof(h_P8), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( c_Q8, h_Q8, sizeof(h_Q8), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( c_FFT64_8_8_Twiddle, h_FFT64_8_8_Twiddle, sizeof(h_FFT64_8_8_Twiddle), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( c_FFT128_2_64_Twiddle, h_FFT128_2_64_Twiddle, sizeof(h_FFT128_2_64_Twiddle), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( c_FFT128_16_8_Twiddle, h_FFT128_16_8_Twiddle, sizeof(h_FFT128_16_8_Twiddle), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol( c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice);
}
__host__ void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
const int threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
x11_simd512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id);
}

262
x11/x11.cu

@ -0,0 +1,262 @@ @@ -0,0 +1,262 @@
extern "C"
{
#include "sph/sph_blake.h"
#include "sph/sph_bmw.h"
#include "sph/sph_groestl.h"
#include "sph/sph_skein.h"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
#include "sph/sph_luffa.h"
#include "sph/sph_cubehash.h"
#include "sph/sph_shavite.h"
#include "sph/sph_simd.h"
#include "sph/sph_echo.h"
#include "miner.h"
}
#include <stdint.h>
// aus cpu-miner.c
extern int device_map[8];
// Speicher für Input/Output der verketteten Hashfunktionen
static uint32_t *d_hash[8];
extern void quark_blake512_cpu_init(int thr_id, int threads);
extern void quark_blake512_cpu_setBlock_80(void *pdata);
extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_bmw512_cpu_init(int thr_id, int threads);
extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_groestl512_cpu_init(int thr_id, int threads);
extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_skein512_cpu_init(int thr_id, int threads);
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_keccak512_cpu_init(int thr_id, int threads);
extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_jh512_cpu_init(int thr_id, int threads);
extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_luffa512_cpu_init(int thr_id, int threads);
extern void x11_luffa512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_cubehash512_cpu_init(int thr_id, int threads);
extern void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_shavite512_cpu_init(int thr_id, int threads);
extern void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_simd512_cpu_init(int thr_id, int threads);
extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x11_echo512_cpu_init(int thr_id, int threads);
extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse,
int order);
// X11 Hashfunktion
inline void x11hash(void *state, const void *input)
{
// blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11
sph_blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
sph_groestl512_context ctx_groestl;
sph_jh512_context ctx_jh;
sph_keccak512_context ctx_keccak;
sph_skein512_context ctx_skein;
sph_luffa512_context ctx_luffa;
sph_cubehash512_context ctx_cubehash;
sph_shavite512_context ctx_shavite;
sph_simd512_context ctx_simd;
sph_echo512_context ctx_echo;
unsigned char hash[64];
sph_blake512_init(&ctx_blake);
// ZBLAKE;
sph_blake512 (&ctx_blake, input, 80);
sph_blake512_close(&ctx_blake, (void*) hash);
sph_bmw512_init(&ctx_bmw);
// ZBMW;
sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
sph_bmw512_close(&ctx_bmw, (void*) hash);
sph_groestl512_init(&ctx_groestl);
// ZGROESTL;
sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
sph_groestl512_close(&ctx_groestl, (void*) hash);
sph_skein512_init(&ctx_skein);
// ZSKEIN;
sph_skein512 (&ctx_skein, (const void*) hash, 64);
sph_skein512_close(&ctx_skein, (void*) hash);
sph_jh512_init(&ctx_jh);
// ZJH;
sph_jh512 (&ctx_jh, (const void*) hash, 64);
sph_jh512_close(&ctx_jh, (void*) hash);
sph_keccak512_init(&ctx_keccak);
// ZKECCAK;
sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
sph_keccak512_close(&ctx_keccak, (void*) hash);
sph_luffa512_init(&ctx_luffa);
// ZLUFFA;
sph_luffa512 (&ctx_luffa, (const void*) hash, 64);
sph_luffa512_close (&ctx_luffa, (void*) hash);
#if 1
sph_cubehash512_init(&ctx_cubehash);
// ZCUBEHASH;
sph_cubehash512 (&ctx_cubehash, (const void*) hash, 64);
sph_cubehash512_close(&ctx_cubehash, (void*) hash);
#endif
#if 1
sph_shavite512_init(&ctx_shavite);
// ZSHAVITE;
sph_shavite512 (&ctx_shavite, (const void*) hash, 64);
sph_shavite512_close(&ctx_shavite, (void*) hash);
#endif
sph_simd512_init(&ctx_simd);
// ZSIMD
sph_simd512 (&ctx_simd, (const void*) hash, 64);
sph_simd512_close(&ctx_simd, (void*) hash);
#if 1
sph_echo512_init(&ctx_echo);
// ZECHO
sph_echo512 (&ctx_echo, (const void*) hash, 64);
sph_echo512_close(&ctx_echo, (void*) hash);
#endif
memcpy(state, hash, 32);
}
extern bool opt_benchmark;
extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done)
{
const uint32_t first_nonce = pdata[19];
// TODO: entfernen für eine Release! Ist nur zum Testen!
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;
const uint32_t Htarg = ptarget[7];
const int throughput = 256*256; // 100;
static bool init[8] = {0,0,0,0,0,0,0,0};
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
// Konstanten kopieren, Speicher belegen
cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
quark_blake512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
quark_bmw512_cpu_init(thr_id, throughput);
quark_keccak512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput);
x11_luffa512_cpu_init(thr_id, throughput);
x11_cubehash512_cpu_init(thr_id, throughput);
x11_shavite512_cpu_init(thr_id, throughput);
x11_simd512_cpu_init(thr_id, throughput);
x11_echo512_cpu_init(thr_id, throughput);
quark_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
uint32_t endiandata[20];
for (int k=0; k < 20; k++)
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
quark_blake512_cpu_setBlock_80((void*)endiandata);
quark_check_cpu_setTarget(ptarget);
do {
int order = 0;
// erstes Blake512 Hash mit CUDA
quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
// das ist der unbedingte Branch für BMW512
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// das ist der unbedingte Branch für Groestl512
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// das ist der unbedingte Branch für Skein512
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// das ist der unbedingte Branch für JH512
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// das ist der unbedingte Branch für Keccak512
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// das ist der unbedingte Branch für Luffa512
x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// das ist der unbedingte Branch für Cubehash512
x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// das ist der unbedingte Branch für Shavite512
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// das ist der unbedingte Branch für SIMD512
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// das ist der unbedingte Branch für ECHO512
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
// Scan nach Gewinner Hashes auf der GPU
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];
be32enc(&endiandata[19], foundNonce);
x11hash(vhash64, endiandata);
if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {
pdata[19] = foundNonce;
*hashes_done = foundNonce - first_nonce + 1;
return 1;
} else {
applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce);
}
}
pdata[19] += throughput;
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce + 1;
return 0;
}
Loading…
Cancel
Save