mirror of
https://github.com/GOSTSec/ccminer
synced 2025-01-15 01:00:19 +00:00
v1.0 - Yo, I heard y'all like X11
This commit is contained in:
parent
e2bfb4b14f
commit
af07302b4b
12
Makefile.am
12
Makefile.am
@ -33,7 +33,10 @@ ccminer_SOURCES = elist.h miner.h compat.h \
|
|||||||
quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \
|
quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \
|
||||||
quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu quark/quarkcoin.cu quark/animecoin.cu \
|
quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu quark/quarkcoin.cu quark/animecoin.cu \
|
||||||
quark/cuda_quark_compactionTest.cu \
|
quark/cuda_quark_compactionTest.cu \
|
||||||
cuda_nist5.cu
|
cuda_nist5.cu \
|
||||||
|
sph/cubehash.c sph/echo.c sph/luffa.c sph/shavite.c sph/simd.c \
|
||||||
|
x11/x11.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
|
||||||
|
x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
|
||||||
|
|
||||||
ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@
|
ccminer_LDFLAGS = $(PTHREAD_FLAGS) @CUDA_LDFLAGS@
|
||||||
ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@
|
ccminer_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ @CUDA_LIBS@ @OPENMP_CFLAGS@ @LIBS@
|
||||||
@ -41,10 +44,13 @@ ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -f
|
|||||||
|
|
||||||
# we're now targeting all major compute architectures within one binary.
|
# we're now targeting all major compute architectures within one binary.
|
||||||
.cu.o:
|
.cu.o:
|
||||||
$(NVCC) @CFLAGS@ -I . -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
$(NVCC) @CFLAGS@ -I . -Xptxas "-abi=no -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
||||||
|
|
||||||
|
# Shavite compiles faster with 128 regs
|
||||||
|
x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu
|
||||||
|
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=128 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
||||||
|
|
||||||
# ABI requiring code modules
|
# ABI requiring code modules
|
||||||
# this module doesn't compile with Compute 2.0 unfortunately
|
|
||||||
quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
|
quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
|
||||||
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
|
|
||||||
ccMiner release 0.9 (May 06th 2014) - "Say Hi to Quark, Anime"
|
ccMiner release 1.0 (May 10th 2014) - "Did anyone say X11?"
|
||||||
-------------------------------------------------------------
|
-------------------------------------------------------------
|
||||||
|
|
||||||
***************************************************************
|
***************************************************************
|
||||||
@ -59,6 +59,8 @@ its command line interface and options.
|
|||||||
jackpot use to mine Jackpotcoin
|
jackpot use to mine Jackpotcoin
|
||||||
quark use to mine Quarkcoin
|
quark use to mine Quarkcoin
|
||||||
anime use to mine Animecoin
|
anime use to mine Animecoin
|
||||||
|
nist5 use to mine TalkCoin
|
||||||
|
x11 use to mine DarkCoin
|
||||||
|
|
||||||
-d, --devices gives a comma separated list of CUDA device IDs
|
-d, --devices gives a comma separated list of CUDA device IDs
|
||||||
to operate on. Device IDs start counting from 0!
|
to operate on. Device IDs start counting from 0!
|
||||||
@ -138,6 +140,9 @@ features.
|
|||||||
|
|
||||||
>>> RELEASE HISTORY <<<
|
>>> RELEASE HISTORY <<<
|
||||||
|
|
||||||
|
May 10th 2014 added X11, but without the bells & whistles
|
||||||
|
(no killer Groestl, SIMD hash quite slow still)
|
||||||
|
|
||||||
May 6th 2014 this adds the quark and animecoin algorithms.
|
May 6th 2014 this adds the quark and animecoin algorithms.
|
||||||
|
|
||||||
May 3rd 2014 add the MjollnirCoin hash algorithm for the upcomin
|
May 3rd 2014 add the MjollnirCoin hash algorithm for the upcomin
|
||||||
|
@ -431,6 +431,53 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
|
|||||||
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
|
<CudaCompile Include="x11\cuda_x11_aes.cu">
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
|
||||||
|
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
|
||||||
|
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
|
||||||
|
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
|
||||||
|
</CudaCompile>
|
||||||
|
<CudaCompile Include="x11\cuda_x11_cubehash512.cu">
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
</CudaCompile>
|
||||||
|
<CudaCompile Include="x11\cuda_x11_echo.cu">
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
</CudaCompile>
|
||||||
|
<CudaCompile Include="x11\cuda_x11_luffa512.cu">
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
</CudaCompile>
|
||||||
|
<CudaCompile Include="x11\cuda_x11_shavite512.cu">
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">128</MaxRegCount>
|
||||||
|
</CudaCompile>
|
||||||
|
<CudaCompile Include="x11\cuda_x11_simd512.cu">
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
</CudaCompile>
|
||||||
|
<CudaCompile Include="x11\x11.cu">
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
|
</CudaCompile>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
|
@ -52,6 +52,9 @@
|
|||||||
<Filter Include="Header Files\CUDA\heavy">
|
<Filter Include="Header Files\CUDA\heavy">
|
||||||
<UniqueIdentifier>{3281db48-f394-49ea-a1ef-6ebd09828d50}</UniqueIdentifier>
|
<UniqueIdentifier>{3281db48-f394-49ea-a1ef-6ebd09828d50}</UniqueIdentifier>
|
||||||
</Filter>
|
</Filter>
|
||||||
|
<Filter Include="Source Files\CUDA\x11">
|
||||||
|
<UniqueIdentifier>{dd751f2d-bfd6-42c1-8f9b-cbe94e539353}</UniqueIdentifier>
|
||||||
|
</Filter>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<ClCompile Include="compat\jansson\dump.c">
|
<ClCompile Include="compat\jansson\dump.c">
|
||||||
@ -313,5 +316,26 @@
|
|||||||
<CudaCompile Include="cuda_nist5.cu">
|
<CudaCompile Include="cuda_nist5.cu">
|
||||||
<Filter>Source Files\CUDA</Filter>
|
<Filter>Source Files\CUDA</Filter>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
|
<CudaCompile Include="x11\cuda_x11_aes.cu">
|
||||||
|
<Filter>Source Files\CUDA\x11</Filter>
|
||||||
|
</CudaCompile>
|
||||||
|
<CudaCompile Include="x11\cuda_x11_cubehash512.cu">
|
||||||
|
<Filter>Source Files\CUDA\x11</Filter>
|
||||||
|
</CudaCompile>
|
||||||
|
<CudaCompile Include="x11\cuda_x11_echo.cu">
|
||||||
|
<Filter>Source Files\CUDA\x11</Filter>
|
||||||
|
</CudaCompile>
|
||||||
|
<CudaCompile Include="x11\cuda_x11_luffa512.cu">
|
||||||
|
<Filter>Source Files\CUDA\x11</Filter>
|
||||||
|
</CudaCompile>
|
||||||
|
<CudaCompile Include="x11\cuda_x11_shavite512.cu">
|
||||||
|
<Filter>Source Files\CUDA\x11</Filter>
|
||||||
|
</CudaCompile>
|
||||||
|
<CudaCompile Include="x11\cuda_x11_simd512.cu">
|
||||||
|
<Filter>Source Files\CUDA\x11</Filter>
|
||||||
|
</CudaCompile>
|
||||||
|
<CudaCompile Include="x11\x11.cu">
|
||||||
|
<Filter>Source Files\CUDA\x11</Filter>
|
||||||
|
</CudaCompile>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
</Project>
|
</Project>
|
@ -1,4 +1,4 @@
|
|||||||
AC_INIT([ccminer], [2014.05.06])
|
AC_INIT([ccminer], [2014.05.10])
|
||||||
|
|
||||||
AC_PREREQ([2.59c])
|
AC_PREREQ([2.59c])
|
||||||
AC_CANONICAL_SYSTEM
|
AC_CANONICAL_SYSTEM
|
||||||
|
14
cpu-miner.c
14
cpu-miner.c
@ -129,7 +129,8 @@ typedef enum {
|
|||||||
ALGO_JACKPOT,
|
ALGO_JACKPOT,
|
||||||
ALGO_QUARK,
|
ALGO_QUARK,
|
||||||
ALGO_ANIME,
|
ALGO_ANIME,
|
||||||
ALGO_NIST5
|
ALGO_NIST5,
|
||||||
|
ALGO_X11
|
||||||
} sha256_algos;
|
} sha256_algos;
|
||||||
|
|
||||||
static const char *algo_names[] = {
|
static const char *algo_names[] = {
|
||||||
@ -141,7 +142,8 @@ static const char *algo_names[] = {
|
|||||||
"jackpot",
|
"jackpot",
|
||||||
"quark",
|
"quark",
|
||||||
"anime",
|
"anime",
|
||||||
"nist5"
|
"nist5",
|
||||||
|
"x11"
|
||||||
};
|
};
|
||||||
|
|
||||||
bool opt_debug = false;
|
bool opt_debug = false;
|
||||||
@ -212,6 +214,7 @@ Options:\n\
|
|||||||
quark Quark hash\n\
|
quark Quark hash\n\
|
||||||
anime Animecoin hash\n\
|
anime Animecoin hash\n\
|
||||||
nist5 NIST5 (TalkCoin) hash\n\
|
nist5 NIST5 (TalkCoin) hash\n\
|
||||||
|
x11 X11 (DarkCoin) hash\n\
|
||||||
-d, --devices takes a comma separated list of CUDA devices to use.\n\
|
-d, --devices takes a comma separated list of CUDA devices to use.\n\
|
||||||
Device IDs start counting from 0! Alternatively takes\n\
|
Device IDs start counting from 0! Alternatively takes\n\
|
||||||
string names of your cards like gtx780ti or gt640#2\n\
|
string names of your cards like gtx780ti or gt640#2\n\
|
||||||
@ -901,6 +904,11 @@ static void *miner_thread(void *userdata)
|
|||||||
max_nonce, &hashes_done);
|
max_nonce, &hashes_done);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case ALGO_X11:
|
||||||
|
rc = scanhash_x11(thr_id, work.data, work.target,
|
||||||
|
max_nonce, &hashes_done);
|
||||||
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
/* should never happen */
|
/* should never happen */
|
||||||
goto out;
|
goto out;
|
||||||
@ -1453,7 +1461,7 @@ static void signal_handler(int sig)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define PROGRAM_VERSION "0.9"
|
#define PROGRAM_VERSION "1.0"
|
||||||
int main(int argc, char *argv[])
|
int main(int argc, char *argv[])
|
||||||
{
|
{
|
||||||
struct thr_info *thr;
|
struct thr_info *thr;
|
||||||
|
@ -152,7 +152,7 @@
|
|||||||
#define PACKAGE_NAME "ccminer"
|
#define PACKAGE_NAME "ccminer"
|
||||||
|
|
||||||
/* Define to the full name and version of this package. */
|
/* Define to the full name and version of this package. */
|
||||||
#define PACKAGE_STRING "ccminer 2014.05.06"
|
#define PACKAGE_STRING "ccminer 2014.05.10"
|
||||||
|
|
||||||
/* Define to the one symbol short name of this package. */
|
/* Define to the one symbol short name of this package. */
|
||||||
#undef PACKAGE_TARNAME
|
#undef PACKAGE_TARNAME
|
||||||
@ -161,7 +161,7 @@
|
|||||||
#undef PACKAGE_URL
|
#undef PACKAGE_URL
|
||||||
|
|
||||||
/* Define to the version of this package. */
|
/* Define to the version of this package. */
|
||||||
#define PACKAGE_VERSION "2014.05.06"
|
#define PACKAGE_VERSION "2014.05.10"
|
||||||
|
|
||||||
/* If using the C implementation of alloca, define if you know the
|
/* If using the C implementation of alloca, define if you know the
|
||||||
direction of stack growth for your system; otherwise it will be
|
direction of stack growth for your system; otherwise it will be
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
// Auf Myriadcoin spezialisierte Version von Groestl inkl. Bitslice
|
// Auf Myriadcoin spezialisierte Version von Groestl
|
||||||
|
|
||||||
#include <cuda.h>
|
#include <cuda.h>
|
||||||
#include "cuda_runtime.h"
|
#include "cuda_runtime.h"
|
||||||
|
4
miner.h
4
miner.h
@ -235,6 +235,10 @@ extern int scanhash_nist5(int thr_id, uint32_t *pdata,
|
|||||||
const uint32_t *ptarget, uint32_t max_nonce,
|
const uint32_t *ptarget, uint32_t max_nonce,
|
||||||
unsigned long *hashes_done);
|
unsigned long *hashes_done);
|
||||||
|
|
||||||
|
extern int scanhash_x11(int thr_id, uint32_t *pdata,
|
||||||
|
const uint32_t *ptarget, uint32_t max_nonce,
|
||||||
|
unsigned long *hashes_done);
|
||||||
|
|
||||||
extern void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
|
extern void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
|
||||||
extern void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
|
extern void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
|
||||||
extern void groestlcoin_hash(unsigned char* output, const unsigned char* input, int len);
|
extern void groestlcoin_hash(unsigned char* output, const unsigned char* input, int len);
|
||||||
|
402
x11/cuda_x11_aes.cu
Normal file
402
x11/cuda_x11_aes.cu
Normal file
@ -0,0 +1,402 @@
|
|||||||
|
|
||||||
|
/* AES Helper for inline-usage from SPH */
|
||||||
|
#define AESx(x) SPH_C32(x)
|
||||||
|
|
||||||
|
static const uint32_t h_AES0[256] = {
|
||||||
|
AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6),
|
||||||
|
AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591),
|
||||||
|
AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56),
|
||||||
|
AESx(0x19FEFEE7), AESx(0x62D7D7B5), AESx(0xE6ABAB4D), AESx(0x9A7676EC),
|
||||||
|
AESx(0x45CACA8F), AESx(0x9D82821F), AESx(0x40C9C989), AESx(0x877D7DFA),
|
||||||
|
AESx(0x15FAFAEF), AESx(0xEB5959B2), AESx(0xC947478E), AESx(0x0BF0F0FB),
|
||||||
|
AESx(0xECADAD41), AESx(0x67D4D4B3), AESx(0xFDA2A25F), AESx(0xEAAFAF45),
|
||||||
|
AESx(0xBF9C9C23), AESx(0xF7A4A453), AESx(0x967272E4), AESx(0x5BC0C09B),
|
||||||
|
AESx(0xC2B7B775), AESx(0x1CFDFDE1), AESx(0xAE93933D), AESx(0x6A26264C),
|
||||||
|
AESx(0x5A36366C), AESx(0x413F3F7E), AESx(0x02F7F7F5), AESx(0x4FCCCC83),
|
||||||
|
AESx(0x5C343468), AESx(0xF4A5A551), AESx(0x34E5E5D1), AESx(0x08F1F1F9),
|
||||||
|
AESx(0x937171E2), AESx(0x73D8D8AB), AESx(0x53313162), AESx(0x3F15152A),
|
||||||
|
AESx(0x0C040408), AESx(0x52C7C795), AESx(0x65232346), AESx(0x5EC3C39D),
|
||||||
|
AESx(0x28181830), AESx(0xA1969637), AESx(0x0F05050A), AESx(0xB59A9A2F),
|
||||||
|
AESx(0x0907070E), AESx(0x36121224), AESx(0x9B80801B), AESx(0x3DE2E2DF),
|
||||||
|
AESx(0x26EBEBCD), AESx(0x6927274E), AESx(0xCDB2B27F), AESx(0x9F7575EA),
|
||||||
|
AESx(0x1B090912), AESx(0x9E83831D), AESx(0x742C2C58), AESx(0x2E1A1A34),
|
||||||
|
AESx(0x2D1B1B36), AESx(0xB26E6EDC), AESx(0xEE5A5AB4), AESx(0xFBA0A05B),
|
||||||
|
AESx(0xF65252A4), AESx(0x4D3B3B76), AESx(0x61D6D6B7), AESx(0xCEB3B37D),
|
||||||
|
AESx(0x7B292952), AESx(0x3EE3E3DD), AESx(0x712F2F5E), AESx(0x97848413),
|
||||||
|
AESx(0xF55353A6), AESx(0x68D1D1B9), AESx(0x00000000), AESx(0x2CEDEDC1),
|
||||||
|
AESx(0x60202040), AESx(0x1FFCFCE3), AESx(0xC8B1B179), AESx(0xED5B5BB6),
|
||||||
|
AESx(0xBE6A6AD4), AESx(0x46CBCB8D), AESx(0xD9BEBE67), AESx(0x4B393972),
|
||||||
|
AESx(0xDE4A4A94), AESx(0xD44C4C98), AESx(0xE85858B0), AESx(0x4ACFCF85),
|
||||||
|
AESx(0x6BD0D0BB), AESx(0x2AEFEFC5), AESx(0xE5AAAA4F), AESx(0x16FBFBED),
|
||||||
|
AESx(0xC5434386), AESx(0xD74D4D9A), AESx(0x55333366), AESx(0x94858511),
|
||||||
|
AESx(0xCF45458A), AESx(0x10F9F9E9), AESx(0x06020204), AESx(0x817F7FFE),
|
||||||
|
AESx(0xF05050A0), AESx(0x443C3C78), AESx(0xBA9F9F25), AESx(0xE3A8A84B),
|
||||||
|
AESx(0xF35151A2), AESx(0xFEA3A35D), AESx(0xC0404080), AESx(0x8A8F8F05),
|
||||||
|
AESx(0xAD92923F), AESx(0xBC9D9D21), AESx(0x48383870), AESx(0x04F5F5F1),
|
||||||
|
AESx(0xDFBCBC63), AESx(0xC1B6B677), AESx(0x75DADAAF), AESx(0x63212142),
|
||||||
|
AESx(0x30101020), AESx(0x1AFFFFE5), AESx(0x0EF3F3FD), AESx(0x6DD2D2BF),
|
||||||
|
AESx(0x4CCDCD81), AESx(0x140C0C18), AESx(0x35131326), AESx(0x2FECECC3),
|
||||||
|
AESx(0xE15F5FBE), AESx(0xA2979735), AESx(0xCC444488), AESx(0x3917172E),
|
||||||
|
AESx(0x57C4C493), AESx(0xF2A7A755), AESx(0x827E7EFC), AESx(0x473D3D7A),
|
||||||
|
AESx(0xAC6464C8), AESx(0xE75D5DBA), AESx(0x2B191932), AESx(0x957373E6),
|
||||||
|
AESx(0xA06060C0), AESx(0x98818119), AESx(0xD14F4F9E), AESx(0x7FDCDCA3),
|
||||||
|
AESx(0x66222244), AESx(0x7E2A2A54), AESx(0xAB90903B), AESx(0x8388880B),
|
||||||
|
AESx(0xCA46468C), AESx(0x29EEEEC7), AESx(0xD3B8B86B), AESx(0x3C141428),
|
||||||
|
AESx(0x79DEDEA7), AESx(0xE25E5EBC), AESx(0x1D0B0B16), AESx(0x76DBDBAD),
|
||||||
|
AESx(0x3BE0E0DB), AESx(0x56323264), AESx(0x4E3A3A74), AESx(0x1E0A0A14),
|
||||||
|
AESx(0xDB494992), AESx(0x0A06060C), AESx(0x6C242448), AESx(0xE45C5CB8),
|
||||||
|
AESx(0x5DC2C29F), AESx(0x6ED3D3BD), AESx(0xEFACAC43), AESx(0xA66262C4),
|
||||||
|
AESx(0xA8919139), AESx(0xA4959531), AESx(0x37E4E4D3), AESx(0x8B7979F2),
|
||||||
|
AESx(0x32E7E7D5), AESx(0x43C8C88B), AESx(0x5937376E), AESx(0xB76D6DDA),
|
||||||
|
AESx(0x8C8D8D01), AESx(0x64D5D5B1), AESx(0xD24E4E9C), AESx(0xE0A9A949),
|
||||||
|
AESx(0xB46C6CD8), AESx(0xFA5656AC), AESx(0x07F4F4F3), AESx(0x25EAEACF),
|
||||||
|
AESx(0xAF6565CA), AESx(0x8E7A7AF4), AESx(0xE9AEAE47), AESx(0x18080810),
|
||||||
|
AESx(0xD5BABA6F), AESx(0x887878F0), AESx(0x6F25254A), AESx(0x722E2E5C),
|
||||||
|
AESx(0x241C1C38), AESx(0xF1A6A657), AESx(0xC7B4B473), AESx(0x51C6C697),
|
||||||
|
AESx(0x23E8E8CB), AESx(0x7CDDDDA1), AESx(0x9C7474E8), AESx(0x211F1F3E),
|
||||||
|
AESx(0xDD4B4B96), AESx(0xDCBDBD61), AESx(0x868B8B0D), AESx(0x858A8A0F),
|
||||||
|
AESx(0x907070E0), AESx(0x423E3E7C), AESx(0xC4B5B571), AESx(0xAA6666CC),
|
||||||
|
AESx(0xD8484890), AESx(0x05030306), AESx(0x01F6F6F7), AESx(0x120E0E1C),
|
||||||
|
AESx(0xA36161C2), AESx(0x5F35356A), AESx(0xF95757AE), AESx(0xD0B9B969),
|
||||||
|
AESx(0x91868617), AESx(0x58C1C199), AESx(0x271D1D3A), AESx(0xB99E9E27),
|
||||||
|
AESx(0x38E1E1D9), AESx(0x13F8F8EB), AESx(0xB398982B), AESx(0x33111122),
|
||||||
|
AESx(0xBB6969D2), AESx(0x70D9D9A9), AESx(0x898E8E07), AESx(0xA7949433),
|
||||||
|
AESx(0xB69B9B2D), AESx(0x221E1E3C), AESx(0x92878715), AESx(0x20E9E9C9),
|
||||||
|
AESx(0x49CECE87), AESx(0xFF5555AA), AESx(0x78282850), AESx(0x7ADFDFA5),
|
||||||
|
AESx(0x8F8C8C03), AESx(0xF8A1A159), AESx(0x80898909), AESx(0x170D0D1A),
|
||||||
|
AESx(0xDABFBF65), AESx(0x31E6E6D7), AESx(0xC6424284), AESx(0xB86868D0),
|
||||||
|
AESx(0xC3414182), AESx(0xB0999929), AESx(0x772D2D5A), AESx(0x110F0F1E),
|
||||||
|
AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C)
|
||||||
|
};
|
||||||
|
|
||||||
|
static const uint32_t h_AES1[256] = {
|
||||||
|
AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D),
|
||||||
|
AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154),
|
||||||
|
AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D),
|
||||||
|
AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A),
|
||||||
|
AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87),
|
||||||
|
AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B),
|
||||||
|
AESx(0xADAD41EC), AESx(0xD4D4B367), AESx(0xA2A25FFD), AESx(0xAFAF45EA),
|
||||||
|
AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B),
|
||||||
|
AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A),
|
||||||
|
AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F),
|
||||||
|
AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908),
|
||||||
|
AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F),
|
||||||
|
AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E),
|
||||||
|
AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5),
|
||||||
|
AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D),
|
||||||
|
AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F),
|
||||||
|
AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E),
|
||||||
|
AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB),
|
||||||
|
AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE),
|
||||||
|
AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397),
|
||||||
|
AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), AESx(0xEDEDC12C),
|
||||||
|
AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED),
|
||||||
|
AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B),
|
||||||
|
AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A),
|
||||||
|
AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16),
|
||||||
|
AESx(0x434386C5), AESx(0x4D4D9AD7), AESx(0x33336655), AESx(0x85851194),
|
||||||
|
AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81),
|
||||||
|
AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3),
|
||||||
|
AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A),
|
||||||
|
AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104),
|
||||||
|
AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), AESx(0x21214263),
|
||||||
|
AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D),
|
||||||
|
AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F),
|
||||||
|
AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39),
|
||||||
|
AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47),
|
||||||
|
AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695),
|
||||||
|
AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F),
|
||||||
|
AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83),
|
||||||
|
AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C),
|
||||||
|
AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76),
|
||||||
|
AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E),
|
||||||
|
AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4),
|
||||||
|
AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6),
|
||||||
|
AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B),
|
||||||
|
AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7),
|
||||||
|
AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0),
|
||||||
|
AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25),
|
||||||
|
AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018),
|
||||||
|
AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72),
|
||||||
|
AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751),
|
||||||
|
AESx(0xE8E8CB23), AESx(0xDDDDA17C), AESx(0x7474E89C), AESx(0x1F1F3E21),
|
||||||
|
AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85),
|
||||||
|
AESx(0x7070E090), AESx(0x3E3E7C42), AESx(0xB5B571C4), AESx(0x6666CCAA),
|
||||||
|
AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12),
|
||||||
|
AESx(0x6161C2A3), AESx(0x35356A5F), AESx(0x5757AEF9), AESx(0xB9B969D0),
|
||||||
|
AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9),
|
||||||
|
AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233),
|
||||||
|
AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7),
|
||||||
|
AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920),
|
||||||
|
AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A),
|
||||||
|
AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17),
|
||||||
|
AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8),
|
||||||
|
AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11),
|
||||||
|
AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A)
|
||||||
|
};
|
||||||
|
|
||||||
|
static const uint32_t h_AES2[256] = {
|
||||||
|
AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B),
|
||||||
|
AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5),
|
||||||
|
AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B),
|
||||||
|
AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76),
|
||||||
|
AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D),
|
||||||
|
AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0),
|
||||||
|
AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF),
|
||||||
|
AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0),
|
||||||
|
AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26),
|
||||||
|
AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC),
|
||||||
|
AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1),
|
||||||
|
AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15),
|
||||||
|
AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3),
|
||||||
|
AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A),
|
||||||
|
AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), AESx(0xE2DF3DE2),
|
||||||
|
AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75),
|
||||||
|
AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A),
|
||||||
|
AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0),
|
||||||
|
AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3),
|
||||||
|
AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784),
|
||||||
|
AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED),
|
||||||
|
AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B),
|
||||||
|
AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39),
|
||||||
|
AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF),
|
||||||
|
AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB),
|
||||||
|
AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485),
|
||||||
|
AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F),
|
||||||
|
AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8),
|
||||||
|
AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F),
|
||||||
|
AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5),
|
||||||
|
AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321),
|
||||||
|
AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2),
|
||||||
|
AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC),
|
||||||
|
AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917),
|
||||||
|
AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D),
|
||||||
|
AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), AESx(0x73E69573),
|
||||||
|
AESx(0x60C0A060), AESx(0x81199881), AESx(0x4F9ED14F), AESx(0xDCA37FDC),
|
||||||
|
AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388),
|
||||||
|
AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14),
|
||||||
|
AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB),
|
||||||
|
AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A),
|
||||||
|
AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), AESx(0x5CB8E45C),
|
||||||
|
AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662),
|
||||||
|
AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79),
|
||||||
|
AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D),
|
||||||
|
AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9),
|
||||||
|
AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA),
|
||||||
|
AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808),
|
||||||
|
AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E),
|
||||||
|
AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6),
|
||||||
|
AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), AESx(0x1F3E211F),
|
||||||
|
AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A),
|
||||||
|
AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66),
|
||||||
|
AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E),
|
||||||
|
AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9),
|
||||||
|
AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E),
|
||||||
|
AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311),
|
||||||
|
AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794),
|
||||||
|
AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9),
|
||||||
|
AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF),
|
||||||
|
AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D),
|
||||||
|
AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868),
|
||||||
|
AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F),
|
||||||
|
AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16)
|
||||||
|
};
|
||||||
|
|
||||||
|
static const uint32_t h_AES3[256] = {
|
||||||
|
AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B),
|
||||||
|
AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5),
|
||||||
|
AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B),
|
||||||
|
AESx(0xE719FEFE), AESx(0xB562D7D7), AESx(0x4DE6ABAB), AESx(0xEC9A7676),
|
||||||
|
AESx(0x8F45CACA), AESx(0x1F9D8282), AESx(0x8940C9C9), AESx(0xFA877D7D),
|
||||||
|
AESx(0xEF15FAFA), AESx(0xB2EB5959), AESx(0x8EC94747), AESx(0xFB0BF0F0),
|
||||||
|
AESx(0x41ECADAD), AESx(0xB367D4D4), AESx(0x5FFDA2A2), AESx(0x45EAAFAF),
|
||||||
|
AESx(0x23BF9C9C), AESx(0x53F7A4A4), AESx(0xE4967272), AESx(0x9B5BC0C0),
|
||||||
|
AESx(0x75C2B7B7), AESx(0xE11CFDFD), AESx(0x3DAE9393), AESx(0x4C6A2626),
|
||||||
|
AESx(0x6C5A3636), AESx(0x7E413F3F), AESx(0xF502F7F7), AESx(0x834FCCCC),
|
||||||
|
AESx(0x685C3434), AESx(0x51F4A5A5), AESx(0xD134E5E5), AESx(0xF908F1F1),
|
||||||
|
AESx(0xE2937171), AESx(0xAB73D8D8), AESx(0x62533131), AESx(0x2A3F1515),
|
||||||
|
AESx(0x080C0404), AESx(0x9552C7C7), AESx(0x46652323), AESx(0x9D5EC3C3),
|
||||||
|
AESx(0x30281818), AESx(0x37A19696), AESx(0x0A0F0505), AESx(0x2FB59A9A),
|
||||||
|
AESx(0x0E090707), AESx(0x24361212), AESx(0x1B9B8080), AESx(0xDF3DE2E2),
|
||||||
|
AESx(0xCD26EBEB), AESx(0x4E692727), AESx(0x7FCDB2B2), AESx(0xEA9F7575),
|
||||||
|
AESx(0x121B0909), AESx(0x1D9E8383), AESx(0x58742C2C), AESx(0x342E1A1A),
|
||||||
|
AESx(0x362D1B1B), AESx(0xDCB26E6E), AESx(0xB4EE5A5A), AESx(0x5BFBA0A0),
|
||||||
|
AESx(0xA4F65252), AESx(0x764D3B3B), AESx(0xB761D6D6), AESx(0x7DCEB3B3),
|
||||||
|
AESx(0x527B2929), AESx(0xDD3EE3E3), AESx(0x5E712F2F), AESx(0x13978484),
|
||||||
|
AESx(0xA6F55353), AESx(0xB968D1D1), AESx(0x00000000), AESx(0xC12CEDED),
|
||||||
|
AESx(0x40602020), AESx(0xE31FFCFC), AESx(0x79C8B1B1), AESx(0xB6ED5B5B),
|
||||||
|
AESx(0xD4BE6A6A), AESx(0x8D46CBCB), AESx(0x67D9BEBE), AESx(0x724B3939),
|
||||||
|
AESx(0x94DE4A4A), AESx(0x98D44C4C), AESx(0xB0E85858), AESx(0x854ACFCF),
|
||||||
|
AESx(0xBB6BD0D0), AESx(0xC52AEFEF), AESx(0x4FE5AAAA), AESx(0xED16FBFB),
|
||||||
|
AESx(0x86C54343), AESx(0x9AD74D4D), AESx(0x66553333), AESx(0x11948585),
|
||||||
|
AESx(0x8ACF4545), AESx(0xE910F9F9), AESx(0x04060202), AESx(0xFE817F7F),
|
||||||
|
AESx(0xA0F05050), AESx(0x78443C3C), AESx(0x25BA9F9F), AESx(0x4BE3A8A8),
|
||||||
|
AESx(0xA2F35151), AESx(0x5DFEA3A3), AESx(0x80C04040), AESx(0x058A8F8F),
|
||||||
|
AESx(0x3FAD9292), AESx(0x21BC9D9D), AESx(0x70483838), AESx(0xF104F5F5),
|
||||||
|
AESx(0x63DFBCBC), AESx(0x77C1B6B6), AESx(0xAF75DADA), AESx(0x42632121),
|
||||||
|
AESx(0x20301010), AESx(0xE51AFFFF), AESx(0xFD0EF3F3), AESx(0xBF6DD2D2),
|
||||||
|
AESx(0x814CCDCD), AESx(0x18140C0C), AESx(0x26351313), AESx(0xC32FECEC),
|
||||||
|
AESx(0xBEE15F5F), AESx(0x35A29797), AESx(0x88CC4444), AESx(0x2E391717),
|
||||||
|
AESx(0x9357C4C4), AESx(0x55F2A7A7), AESx(0xFC827E7E), AESx(0x7A473D3D),
|
||||||
|
AESx(0xC8AC6464), AESx(0xBAE75D5D), AESx(0x322B1919), AESx(0xE6957373),
|
||||||
|
AESx(0xC0A06060), AESx(0x19988181), AESx(0x9ED14F4F), AESx(0xA37FDCDC),
|
||||||
|
AESx(0x44662222), AESx(0x547E2A2A), AESx(0x3BAB9090), AESx(0x0B838888),
|
||||||
|
AESx(0x8CCA4646), AESx(0xC729EEEE), AESx(0x6BD3B8B8), AESx(0x283C1414),
|
||||||
|
AESx(0xA779DEDE), AESx(0xBCE25E5E), AESx(0x161D0B0B), AESx(0xAD76DBDB),
|
||||||
|
AESx(0xDB3BE0E0), AESx(0x64563232), AESx(0x744E3A3A), AESx(0x141E0A0A),
|
||||||
|
AESx(0x92DB4949), AESx(0x0C0A0606), AESx(0x486C2424), AESx(0xB8E45C5C),
|
||||||
|
AESx(0x9F5DC2C2), AESx(0xBD6ED3D3), AESx(0x43EFACAC), AESx(0xC4A66262),
|
||||||
|
AESx(0x39A89191), AESx(0x31A49595), AESx(0xD337E4E4), AESx(0xF28B7979),
|
||||||
|
AESx(0xD532E7E7), AESx(0x8B43C8C8), AESx(0x6E593737), AESx(0xDAB76D6D),
|
||||||
|
AESx(0x018C8D8D), AESx(0xB164D5D5), AESx(0x9CD24E4E), AESx(0x49E0A9A9),
|
||||||
|
AESx(0xD8B46C6C), AESx(0xACFA5656), AESx(0xF307F4F4), AESx(0xCF25EAEA),
|
||||||
|
AESx(0xCAAF6565), AESx(0xF48E7A7A), AESx(0x47E9AEAE), AESx(0x10180808),
|
||||||
|
AESx(0x6FD5BABA), AESx(0xF0887878), AESx(0x4A6F2525), AESx(0x5C722E2E),
|
||||||
|
AESx(0x38241C1C), AESx(0x57F1A6A6), AESx(0x73C7B4B4), AESx(0x9751C6C6),
|
||||||
|
AESx(0xCB23E8E8), AESx(0xA17CDDDD), AESx(0xE89C7474), AESx(0x3E211F1F),
|
||||||
|
AESx(0x96DD4B4B), AESx(0x61DCBDBD), AESx(0x0D868B8B), AESx(0x0F858A8A),
|
||||||
|
AESx(0xE0907070), AESx(0x7C423E3E), AESx(0x71C4B5B5), AESx(0xCCAA6666),
|
||||||
|
AESx(0x90D84848), AESx(0x06050303), AESx(0xF701F6F6), AESx(0x1C120E0E),
|
||||||
|
AESx(0xC2A36161), AESx(0x6A5F3535), AESx(0xAEF95757), AESx(0x69D0B9B9),
|
||||||
|
AESx(0x17918686), AESx(0x9958C1C1), AESx(0x3A271D1D), AESx(0x27B99E9E),
|
||||||
|
AESx(0xD938E1E1), AESx(0xEB13F8F8), AESx(0x2BB39898), AESx(0x22331111),
|
||||||
|
AESx(0xD2BB6969), AESx(0xA970D9D9), AESx(0x07898E8E), AESx(0x33A79494),
|
||||||
|
AESx(0x2DB69B9B), AESx(0x3C221E1E), AESx(0x15928787), AESx(0xC920E9E9),
|
||||||
|
AESx(0x8749CECE), AESx(0xAAFF5555), AESx(0x50782828), AESx(0xA57ADFDF),
|
||||||
|
AESx(0x038F8C8C), AESx(0x59F8A1A1), AESx(0x09808989), AESx(0x1A170D0D),
|
||||||
|
AESx(0x65DABFBF), AESx(0xD731E6E6), AESx(0x84C64242), AESx(0xD0B86868),
|
||||||
|
AESx(0x82C34141), AESx(0x29B09999), AESx(0x5A772D2D), AESx(0x1E110F0F),
|
||||||
|
AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616)
|
||||||
|
};
|
||||||
|
|
||||||
|
static __constant__ uint32_t d_AES0[256];
|
||||||
|
static __constant__ uint32_t d_AES1[256];
|
||||||
|
static __constant__ uint32_t d_AES2[256];
|
||||||
|
static __constant__ uint32_t d_AES3[256];
|
||||||
|
|
||||||
|
static void aes_cpu_init()
|
||||||
|
{
|
||||||
|
cudaMemcpyToSymbol( d_AES0,
|
||||||
|
h_AES0,
|
||||||
|
sizeof(h_AES0),
|
||||||
|
0, cudaMemcpyHostToDevice);
|
||||||
|
|
||||||
|
cudaMemcpyToSymbol( d_AES1,
|
||||||
|
h_AES1,
|
||||||
|
sizeof(h_AES1),
|
||||||
|
0, cudaMemcpyHostToDevice);
|
||||||
|
|
||||||
|
cudaMemcpyToSymbol( d_AES2,
|
||||||
|
h_AES2,
|
||||||
|
sizeof(h_AES2),
|
||||||
|
0, cudaMemcpyHostToDevice);
|
||||||
|
|
||||||
|
cudaMemcpyToSymbol( d_AES3,
|
||||||
|
h_AES3,
|
||||||
|
sizeof(h_AES3),
|
||||||
|
0, cudaMemcpyHostToDevice);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __device__ __forceinline__ void aes_gpu_init(uint32_t *sharedMemory)
|
||||||
|
{
|
||||||
|
if(threadIdx.x < 256)
|
||||||
|
{
|
||||||
|
sharedMemory[threadIdx.x] = d_AES0[threadIdx.x];
|
||||||
|
sharedMemory[threadIdx.x+256] = d_AES1[threadIdx.x];
|
||||||
|
sharedMemory[threadIdx.x+512] = d_AES2[threadIdx.x];
|
||||||
|
sharedMemory[threadIdx.x+768] = d_AES3[threadIdx.x];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static __device__ __forceinline__ void aes_round(
|
||||||
|
const uint32_t *sharedMemory,
|
||||||
|
uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
|
||||||
|
uint32_t k0,
|
||||||
|
uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
|
||||||
|
{
|
||||||
|
uint32_t idx0, idx1, idx2, idx3;
|
||||||
|
|
||||||
|
idx0 = __byte_perm(x0, 0, 0x4440);
|
||||||
|
idx1 = __byte_perm(x1, 0, 0x4441) + 256;
|
||||||
|
idx2 = __byte_perm(x2, 0, 0x4442) + 512;
|
||||||
|
idx3 = __byte_perm(x3, 0, 0x4443) + 768;
|
||||||
|
y0 =sharedMemory[idx0] ^
|
||||||
|
sharedMemory[idx1] ^
|
||||||
|
sharedMemory[idx2] ^
|
||||||
|
sharedMemory[idx3] ^
|
||||||
|
k0;
|
||||||
|
|
||||||
|
idx0 = __byte_perm(x1, 0, 0x4440);
|
||||||
|
idx1 = __byte_perm(x2, 0, 0x4441) + 256;
|
||||||
|
idx2 = __byte_perm(x3, 0, 0x4442) + 512;
|
||||||
|
idx3 = __byte_perm(x0, 0, 0x4443) + 768;
|
||||||
|
|
||||||
|
y1 =sharedMemory[idx0] ^
|
||||||
|
sharedMemory[idx1] ^
|
||||||
|
sharedMemory[idx2] ^
|
||||||
|
sharedMemory[idx3]; // ^k3
|
||||||
|
|
||||||
|
idx0 = __byte_perm(x2, 0, 0x4440);
|
||||||
|
idx1 = __byte_perm(x3, 0, 0x4441) + 256;
|
||||||
|
idx2 = __byte_perm(x0, 0, 0x4442) + 512;
|
||||||
|
idx3 = __byte_perm(x1, 0, 0x4443) + 768;
|
||||||
|
y2 =sharedMemory[idx0] ^
|
||||||
|
sharedMemory[idx1] ^
|
||||||
|
sharedMemory[idx2] ^
|
||||||
|
sharedMemory[idx3]; // ^k2
|
||||||
|
|
||||||
|
idx0 = __byte_perm(x3, 0, 0x4440);
|
||||||
|
idx1 = __byte_perm(x0, 0, 0x4441) + 256;
|
||||||
|
idx2 = __byte_perm(x1, 0, 0x4442) + 512;
|
||||||
|
idx3 = __byte_perm(x2, 0, 0x4443) + 768;
|
||||||
|
y3 =sharedMemory[idx0] ^
|
||||||
|
sharedMemory[idx1] ^
|
||||||
|
sharedMemory[idx2] ^
|
||||||
|
sharedMemory[idx3]; // ^k3
|
||||||
|
}
|
||||||
|
|
||||||
|
static __device__ __forceinline__ void aes_round(
|
||||||
|
const uint32_t *sharedMemory,
|
||||||
|
uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
|
||||||
|
uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
|
||||||
|
{
|
||||||
|
uint32_t idx0, idx1, idx2, idx3;
|
||||||
|
|
||||||
|
idx0 = __byte_perm(x0, 0, 0x4440);
|
||||||
|
idx1 = __byte_perm(x1, 0, 0x4441) + 256;
|
||||||
|
idx2 = __byte_perm(x2, 0, 0x4442) + 512;
|
||||||
|
idx3 = __byte_perm(x3, 0, 0x4443) + 768;
|
||||||
|
y0 =sharedMemory[idx0] ^
|
||||||
|
sharedMemory[idx1] ^
|
||||||
|
sharedMemory[idx2] ^
|
||||||
|
sharedMemory[idx3];
|
||||||
|
|
||||||
|
idx0 = __byte_perm(x1, 0, 0x4440);
|
||||||
|
idx1 = __byte_perm(x2, 0, 0x4441) + 256;
|
||||||
|
idx2 = __byte_perm(x3, 0, 0x4442) + 512;
|
||||||
|
idx3 = __byte_perm(x0, 0, 0x4443) + 768;
|
||||||
|
|
||||||
|
y1 =sharedMemory[idx0] ^
|
||||||
|
sharedMemory[idx1] ^
|
||||||
|
sharedMemory[idx2] ^
|
||||||
|
sharedMemory[idx3]; // ^k3
|
||||||
|
|
||||||
|
idx0 = __byte_perm(x2, 0, 0x4440);
|
||||||
|
idx1 = __byte_perm(x3, 0, 0x4441) + 256;
|
||||||
|
idx2 = __byte_perm(x0, 0, 0x4442) + 512;
|
||||||
|
idx3 = __byte_perm(x1, 0, 0x4443) + 768;
|
||||||
|
y2 =sharedMemory[idx0] ^
|
||||||
|
sharedMemory[idx1] ^
|
||||||
|
sharedMemory[idx2] ^
|
||||||
|
sharedMemory[idx3]; // ^k2
|
||||||
|
|
||||||
|
idx0 = __byte_perm(x3, 0, 0x4440);
|
||||||
|
idx1 = __byte_perm(x0, 0, 0x4441) + 256;
|
||||||
|
idx2 = __byte_perm(x1, 0, 0x4442) + 512;
|
||||||
|
idx3 = __byte_perm(x2, 0, 0x4443) + 768;
|
||||||
|
y3 =sharedMemory[idx0] ^
|
||||||
|
sharedMemory[idx1] ^
|
||||||
|
sharedMemory[idx2] ^
|
||||||
|
sharedMemory[idx3]; // ^k3
|
||||||
|
}
|
315
x11/cuda_x11_cubehash512.cu
Normal file
315
x11/cuda_x11_cubehash512.cu
Normal file
@ -0,0 +1,315 @@
|
|||||||
|
// aus heavy.cu
|
||||||
|
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||||
|
|
||||||
|
typedef unsigned char BitSequence;
|
||||||
|
typedef unsigned long long DataLength;
|
||||||
|
|
||||||
|
typedef unsigned char uint8_t;
|
||||||
|
typedef unsigned int uint32_t;
|
||||||
|
typedef unsigned long long uint64_t;
|
||||||
|
|
||||||
|
static __device__ uint32_t cuda_swab32(uint32_t x)
|
||||||
|
{
|
||||||
|
return __byte_perm(x, 0, 0x0123);
|
||||||
|
}
|
||||||
|
|
||||||
|
typedef unsigned char BitSequence;
|
||||||
|
typedef unsigned long long DataLength;
|
||||||
|
|
||||||
|
#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */
|
||||||
|
#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */
|
||||||
|
|
||||||
|
typedef unsigned int uint32_t; /* must be exactly 32 bits */
|
||||||
|
|
||||||
|
#define ROTATEUPWARDS7(a) (((a) << 7) | ((a) >> 25))
|
||||||
|
#define ROTATEUPWARDS11(a) (((a) << 11) | ((a) >> 21))
|
||||||
|
#define SWAP(a,b) { uint32_t u = a; a = b; b = u; }
|
||||||
|
|
||||||
|
__constant__ uint32_t c_IV_512[32];
|
||||||
|
static const uint32_t h_IV_512[32] = {
|
||||||
|
0x2AEA2A61, 0x50F494D4, 0x2D538B8B,
|
||||||
|
0x4167D83E, 0x3FEE2313, 0xC701CF8C,
|
||||||
|
0xCC39968E, 0x50AC5695, 0x4D42C787,
|
||||||
|
0xA647A8B3, 0x97CF0BEF, 0x825B4537,
|
||||||
|
0xEEF864D2, 0xF22090C4, 0xD0E5CD33,
|
||||||
|
0xA23911AE, 0xFCD398D9, 0x148FE485,
|
||||||
|
0x1B017BEF, 0xB6444532, 0x6A536159,
|
||||||
|
0x2FF5781C, 0x91FA7934, 0x0DBADEA9,
|
||||||
|
0xD65C8A2B, 0xA5A70E75, 0xB1C62456,
|
||||||
|
0xBC796576, 0x1921C8F7, 0xE7989AF1,
|
||||||
|
0x7795D246, 0xD43E3B44
|
||||||
|
};
|
||||||
|
|
||||||
|
static __device__ void rrounds(uint32_t x[2][2][2][2][2])
|
||||||
|
{
|
||||||
|
int r;
|
||||||
|
int j;
|
||||||
|
int k;
|
||||||
|
int l;
|
||||||
|
int m;
|
||||||
|
|
||||||
|
//#pragma unroll 16
|
||||||
|
for (r = 0;r < CUBEHASH_ROUNDS;++r) {
|
||||||
|
|
||||||
|
/* "add x_0jklm into x_1jklmn modulo 2^32" */
|
||||||
|
#pragma unroll 2
|
||||||
|
for (j = 0;j < 2;++j)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (k = 0;k < 2;++k)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (l = 0;l < 2;++l)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (m = 0;m < 2;++m)
|
||||||
|
x[1][j][k][l][m] += x[0][j][k][l][m];
|
||||||
|
|
||||||
|
/* "rotate x_0jklm upwards by 7 bits" */
|
||||||
|
#pragma unroll 2
|
||||||
|
for (j = 0;j < 2;++j)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (k = 0;k < 2;++k)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (l = 0;l < 2;++l)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (m = 0;m < 2;++m)
|
||||||
|
x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]);
|
||||||
|
|
||||||
|
/* "swap x_00klm with x_01klm" */
|
||||||
|
#pragma unroll 2
|
||||||
|
for (k = 0;k < 2;++k)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (l = 0;l < 2;++l)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (m = 0;m < 2;++m)
|
||||||
|
SWAP(x[0][0][k][l][m],x[0][1][k][l][m])
|
||||||
|
|
||||||
|
/* "xor x_1jklm into x_0jklm" */
|
||||||
|
#pragma unroll 2
|
||||||
|
for (j = 0;j < 2;++j)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (k = 0;k < 2;++k)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (l = 0;l < 2;++l)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (m = 0;m < 2;++m)
|
||||||
|
x[0][j][k][l][m] ^= x[1][j][k][l][m];
|
||||||
|
|
||||||
|
/* "swap x_1jk0m with x_1jk1m" */
|
||||||
|
#pragma unroll 2
|
||||||
|
for (j = 0;j < 2;++j)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (k = 0;k < 2;++k)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (m = 0;m < 2;++m)
|
||||||
|
SWAP(x[1][j][k][0][m],x[1][j][k][1][m])
|
||||||
|
|
||||||
|
/* "add x_0jklm into x_1jklm modulo 2^32" */
|
||||||
|
#pragma unroll 2
|
||||||
|
for (j = 0;j < 2;++j)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (k = 0;k < 2;++k)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (l = 0;l < 2;++l)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (m = 0;m < 2;++m)
|
||||||
|
x[1][j][k][l][m] += x[0][j][k][l][m];
|
||||||
|
|
||||||
|
/* "rotate x_0jklm upwards by 11 bits" */
|
||||||
|
#pragma unroll 2
|
||||||
|
for (j = 0;j < 2;++j)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (k = 0;k < 2;++k)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (l = 0;l < 2;++l)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (m = 0;m < 2;++m)
|
||||||
|
x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]);
|
||||||
|
|
||||||
|
/* "swap x_0j0lm with x_0j1lm" */
|
||||||
|
#pragma unroll 2
|
||||||
|
for (j = 0;j < 2;++j)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (l = 0;l < 2;++l)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (m = 0;m < 2;++m)
|
||||||
|
SWAP(x[0][j][0][l][m],x[0][j][1][l][m])
|
||||||
|
|
||||||
|
/* "xor x_1jklm into x_0jklm" */
|
||||||
|
#pragma unroll 2
|
||||||
|
for (j = 0;j < 2;++j)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (k = 0;k < 2;++k)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (l = 0;l < 2;++l)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (m = 0;m < 2;++m)
|
||||||
|
x[0][j][k][l][m] ^= x[1][j][k][l][m];
|
||||||
|
|
||||||
|
/* "swap x_1jkl0 with x_1jkl1" */
|
||||||
|
#pragma unroll 2
|
||||||
|
for (j = 0;j < 2;++j)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (k = 0;k < 2;++k)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (l = 0;l < 2;++l)
|
||||||
|
SWAP(x[1][j][k][l][0],x[1][j][k][l][1])
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static __device__ void block_tox(uint32_t block[16], uint32_t x[2][2][2][2][2])
|
||||||
|
{
|
||||||
|
int k;
|
||||||
|
int l;
|
||||||
|
int m;
|
||||||
|
uint32_t *in = block;
|
||||||
|
|
||||||
|
#pragma unroll 2
|
||||||
|
for (k = 0;k < 2;++k)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (l = 0;l < 2;++l)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (m = 0;m < 2;++m)
|
||||||
|
x[0][0][k][l][m] ^= *in++;
|
||||||
|
}
|
||||||
|
|
||||||
|
static __device__ void hash_fromx(uint32_t hash[16], uint32_t x[2][2][2][2][2])
|
||||||
|
{
|
||||||
|
int j;
|
||||||
|
int k;
|
||||||
|
int l;
|
||||||
|
int m;
|
||||||
|
uint32_t *out = hash;
|
||||||
|
|
||||||
|
#pragma unroll 2
|
||||||
|
for (j = 0;j < 2;++j)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (k = 0;k < 2;++k)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (l = 0;l < 2;++l)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (m = 0;m < 2;++m)
|
||||||
|
*out++ = x[0][j][k][l][m];
|
||||||
|
}
|
||||||
|
|
||||||
|
void __device__ Init(uint32_t x[2][2][2][2][2])
|
||||||
|
{
|
||||||
|
int i,j,k,l,m;
|
||||||
|
#if 0
|
||||||
|
/* "the first three state words x_00000, x_00001, x_00010" */
|
||||||
|
/* "are set to the integers h/8, b, r respectively." */
|
||||||
|
/* "the remaining state words are set to 0." */
|
||||||
|
#pragma unroll 2
|
||||||
|
for (i = 0;i < 2;++i)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (j = 0;j < 2;++j)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (k = 0;k < 2;++k)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (l = 0;l < 2;++l)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (m = 0;m < 2;++m)
|
||||||
|
x[i][j][k][l][m] = 0;
|
||||||
|
x[0][0][0][0][0] = 512/8;
|
||||||
|
x[0][0][0][0][1] = CUBEHASH_BLOCKBYTES;
|
||||||
|
x[0][0][0][1][0] = CUBEHASH_ROUNDS;
|
||||||
|
|
||||||
|
/* "the state is then transformed invertibly through 10r identical rounds */
|
||||||
|
for (i = 0;i < 10;++i) rrounds(x);
|
||||||
|
#else
|
||||||
|
uint32_t *iv = c_IV_512;
|
||||||
|
|
||||||
|
#pragma unroll 2
|
||||||
|
for (i = 0;i < 2;++i)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (j = 0;j < 2;++j)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (k = 0;k < 2;++k)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (l = 0;l < 2;++l)
|
||||||
|
#pragma unroll 2
|
||||||
|
for (m = 0;m < 2;++m)
|
||||||
|
x[i][j][k][l][m] = *iv++;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
void __device__ Update32(uint32_t x[2][2][2][2][2], const BitSequence *data)
|
||||||
|
{
|
||||||
|
/* "xor the block into the first b bytes of the state" */
|
||||||
|
/* "and then transform the state invertibly through r identical rounds" */
|
||||||
|
block_tox((uint32_t*)data, x);
|
||||||
|
rrounds(x);
|
||||||
|
}
|
||||||
|
|
||||||
|
void __device__ Final(uint32_t x[2][2][2][2][2], BitSequence *hashval)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
|
/* "the integer 1 is xored into the last state word x_11111" */
|
||||||
|
x[1][1][1][1][1] ^= 1;
|
||||||
|
|
||||||
|
/* "the state is then transformed invertibly through 10r identical rounds" */
|
||||||
|
#pragma unroll 10
|
||||||
|
for (i = 0;i < 10;++i) rrounds(x);
|
||||||
|
|
||||||
|
/* "output the first h/8 bytes of the state" */
|
||||||
|
hash_fromx((uint32_t*)hashval, x);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/***************************************************/
|
||||||
|
// Die Hash-Funktion
|
||||||
|
__global__ void x11_cubehash512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
|
||||||
|
{
|
||||||
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
|
if (thread < threads)
|
||||||
|
{
|
||||||
|
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
|
||||||
|
|
||||||
|
int hashPosition = nounce - startNounce;
|
||||||
|
uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
|
||||||
|
|
||||||
|
uint32_t x[2][2][2][2][2];
|
||||||
|
Init(x);
|
||||||
|
|
||||||
|
// erste Hälfte des Hashes (32 bytes)
|
||||||
|
Update32(x, (const BitSequence*)Hash);
|
||||||
|
|
||||||
|
// zweite Hälfte des Hashes (32 bytes)
|
||||||
|
Update32(x, (const BitSequence*)(Hash+8));
|
||||||
|
|
||||||
|
// Padding Block
|
||||||
|
uint32_t last[8];
|
||||||
|
last[0] = 0x80;
|
||||||
|
#pragma unroll 7
|
||||||
|
for (int i=1; i < 8; i++) last[i] = 0;
|
||||||
|
Update32(x, (const BitSequence*)last);
|
||||||
|
|
||||||
|
Final(x, (BitSequence*)Hash);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Setup-Funktionen
|
||||||
|
__host__ void x11_cubehash512_cpu_init(int thr_id, int threads)
|
||||||
|
{
|
||||||
|
cudaMemcpyToSymbol( c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice);
|
||||||
|
}
|
||||||
|
|
||||||
|
__host__ void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
|
||||||
|
{
|
||||||
|
const int threadsperblock = 256;
|
||||||
|
|
||||||
|
// berechne wie viele Thread Blocks wir brauchen
|
||||||
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
|
// Größe des dynamischen Shared Memory Bereichs
|
||||||
|
size_t shared_size = 0;
|
||||||
|
|
||||||
|
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||||
|
|
||||||
|
x11_cubehash512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
||||||
|
MyStreamSynchronize(NULL, order, thr_id);
|
||||||
|
}
|
||||||
|
|
232
x11/cuda_x11_echo.cu
Normal file
232
x11/cuda_x11_echo.cu
Normal file
@ -0,0 +1,232 @@
|
|||||||
|
#include <cuda.h>
|
||||||
|
#include "cuda_runtime.h"
|
||||||
|
#include "device_launch_parameters.h"
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <memory.h>
|
||||||
|
|
||||||
|
// Folgende Definitionen später durch header ersetzen
|
||||||
|
typedef unsigned char uint8_t;
|
||||||
|
typedef unsigned int uint32_t;
|
||||||
|
typedef unsigned long long uint64_t;
|
||||||
|
|
||||||
|
// das Hi Word aus einem 64 Bit Typen extrahieren
|
||||||
|
static __device__ uint32_t HIWORD(const uint64_t &x) {
|
||||||
|
#if __CUDA_ARCH__ >= 130
|
||||||
|
return (uint32_t)__double2hiint(__longlong_as_double(x));
|
||||||
|
#else
|
||||||
|
return (uint32_t)(x >> 32);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// das Lo Word aus einem 64 Bit Typen extrahieren
|
||||||
|
static __device__ uint32_t LOWORD(const uint64_t &x) {
|
||||||
|
#if __CUDA_ARCH__ >= 130
|
||||||
|
return (uint32_t)__double2loint(__longlong_as_double(x));
|
||||||
|
#else
|
||||||
|
return (uint32_t)(x & 0xFFFFFFFFULL);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
#define SPH_C64(x) ((uint64_t)(x ## ULL))
|
||||||
|
#define SPH_C32(x) ((uint32_t)(x ## U))
|
||||||
|
|
||||||
|
// aus heavy.cu
|
||||||
|
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||||
|
|
||||||
|
#include "cuda_x11_aes.cu"
|
||||||
|
|
||||||
|
__device__ __forceinline__ void AES_2ROUND(
|
||||||
|
const uint32_t* __restrict__ sharedMemory,
|
||||||
|
uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3,
|
||||||
|
uint32_t &k0, uint32_t &k1, uint32_t &k2, uint32_t &k3)
|
||||||
|
{
|
||||||
|
uint32_t y0, y1, y2, y3;
|
||||||
|
|
||||||
|
aes_round(sharedMemory,
|
||||||
|
x0, x1, x2, x3,
|
||||||
|
k0,
|
||||||
|
y0, y1, y2, y3);
|
||||||
|
|
||||||
|
aes_round(sharedMemory,
|
||||||
|
y0, y1, y2, y3,
|
||||||
|
x0, x1, x2, x3);
|
||||||
|
|
||||||
|
// hier werden wir ein carry brauchen (oder auch nicht)
|
||||||
|
k0++;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void cuda_echo_round(
|
||||||
|
const uint32_t *sharedMemory,
|
||||||
|
uint32_t &k0, uint32_t &k1, uint32_t &k2, uint32_t &k3,
|
||||||
|
uint32_t *W, int round)
|
||||||
|
{
|
||||||
|
// W hat 16*4 als Abmaße
|
||||||
|
|
||||||
|
// Big Sub Words
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int i=0;i<16;i++)
|
||||||
|
{
|
||||||
|
int idx = i<<2; // *4
|
||||||
|
AES_2ROUND(sharedMemory,
|
||||||
|
W[idx+0], W[idx+1], W[idx+2], W[idx+3],
|
||||||
|
k0, k1, k2, k3);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Shift Rows
|
||||||
|
#pragma unroll 4
|
||||||
|
for(int i=0;i<4;i++)
|
||||||
|
{
|
||||||
|
uint32_t t;
|
||||||
|
|
||||||
|
/// 1, 5, 9, 13
|
||||||
|
t = W[4 + i];
|
||||||
|
W[4 + i] = W[20 + i];
|
||||||
|
W[20 + i] = W[36 + i];
|
||||||
|
W[36 + i] = W[52 + i];
|
||||||
|
W[52 + i] = t;
|
||||||
|
|
||||||
|
// 2, 6, 10, 14
|
||||||
|
t = W[8 + i];
|
||||||
|
W[8 + i] = W[40 + i];
|
||||||
|
W[40 + i] = t;
|
||||||
|
t = W[24 + i];
|
||||||
|
W[24 + i] = W[56 + i];
|
||||||
|
W[56 + i] = t;
|
||||||
|
|
||||||
|
// 15, 11, 7, 3
|
||||||
|
t = W[60 + i];
|
||||||
|
W[60 + i] = W[44 + i];
|
||||||
|
W[44 + i] = W[28 + i];
|
||||||
|
W[28 + i] = W[12 + i];
|
||||||
|
W[12 + i] = t;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Mix Columns
|
||||||
|
#pragma unroll 4
|
||||||
|
for(int i=0;i<4;i++) // Schleife über je 2*uint32_t
|
||||||
|
{
|
||||||
|
#pragma unroll 4
|
||||||
|
for(int j=0;j<4;j++) // Schleife über die elemnte
|
||||||
|
{
|
||||||
|
int idx = j<<2; // j*4
|
||||||
|
|
||||||
|
uint32_t a = W[ ((idx + 0)<<2) + i];
|
||||||
|
uint32_t b = W[ ((idx + 1)<<2) + i];
|
||||||
|
uint32_t c = W[ ((idx + 2)<<2) + i];
|
||||||
|
uint32_t d = W[ ((idx + 3)<<2) + i];
|
||||||
|
|
||||||
|
uint32_t ab = a ^ b;
|
||||||
|
uint32_t bc = b ^ c;
|
||||||
|
uint32_t cd = c ^ d;
|
||||||
|
|
||||||
|
uint32_t t;
|
||||||
|
t = ((ab & 0x80808080) >> 7);
|
||||||
|
uint32_t abx = t<<4 ^ t<<3 ^ t<<1 ^ t;
|
||||||
|
t = ((bc & 0x80808080) >> 7);
|
||||||
|
uint32_t bcx = t<<4 ^ t<<3 ^ t<<1 ^ t;
|
||||||
|
t = ((cd & 0x80808080) >> 7);
|
||||||
|
uint32_t cdx = t<<4 ^ t<<3 ^ t<<1 ^ t;
|
||||||
|
|
||||||
|
abx ^= ((ab & 0x7F7F7F7F) << 1);
|
||||||
|
bcx ^= ((bc & 0x7F7F7F7F) << 1);
|
||||||
|
cdx ^= ((cd & 0x7F7F7F7F) << 1);
|
||||||
|
|
||||||
|
W[ ((idx + 0)<<2) + i] = abx ^ bc ^ d;
|
||||||
|
W[ ((idx + 1)<<2) + i] = bcx ^ a ^ cd;
|
||||||
|
W[ ((idx + 2)<<2) + i] = cdx ^ ab ^ d;
|
||||||
|
W[ ((idx + 3)<<2) + i] = abx ^ bcx ^ cdx ^ ab ^ c;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void x11_echo512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
|
||||||
|
{
|
||||||
|
__shared__ uint32_t sharedMemory[1024];
|
||||||
|
|
||||||
|
aes_gpu_init(sharedMemory);
|
||||||
|
|
||||||
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
|
if (thread < threads)
|
||||||
|
{
|
||||||
|
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
|
||||||
|
|
||||||
|
int hashPosition = nounce - startNounce;
|
||||||
|
uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
|
||||||
|
|
||||||
|
uint32_t W[64];
|
||||||
|
uint32_t k0 = 512, k1 = 0, k2 = 0, k3 = 0; // K0 = bitlen
|
||||||
|
/* Initialisierung */
|
||||||
|
#pragma unroll 8
|
||||||
|
for(int i=0;i<32;i+=4)
|
||||||
|
{
|
||||||
|
W[i + 0] = 512;
|
||||||
|
W[i + 1] = 0;
|
||||||
|
W[i + 2] = 0;
|
||||||
|
W[i + 3] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// kopiere 32-byte großen hash
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int i=0;i<16;i++)
|
||||||
|
W[i+32] = Hash[i];
|
||||||
|
W[48] = 0x80; // fest
|
||||||
|
#pragma unroll 10
|
||||||
|
for(int i=49;i<59;i++)
|
||||||
|
W[i] = 0;
|
||||||
|
W[59] = 0x02000000; // fest
|
||||||
|
W[60] = k0; // bitlen
|
||||||
|
W[61] = k1;
|
||||||
|
W[62] = k2;
|
||||||
|
W[63] = k3;
|
||||||
|
|
||||||
|
for(int i=0;i<10;i++)
|
||||||
|
{
|
||||||
|
cuda_echo_round(sharedMemory, k0, k1, k2, k3, W, i);
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for(int i=0;i<32;i+=4)
|
||||||
|
{
|
||||||
|
W[i ] ^= W[32 + i ] ^ 512;
|
||||||
|
W[i+1] ^= W[32 + i + 1];
|
||||||
|
W[i+2] ^= W[32 + i + 2];
|
||||||
|
W[i+3] ^= W[32 + i + 3];
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int i=0;i<16;i++)
|
||||||
|
W[i] ^= Hash[i];
|
||||||
|
|
||||||
|
W[8] ^= 0x10;
|
||||||
|
W[27] ^= 0x02000000;
|
||||||
|
W[28] ^= k0;
|
||||||
|
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int i=0;i<16;i++)
|
||||||
|
Hash[i] = W[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Setup-Funktionen
|
||||||
|
__host__ void x11_echo512_cpu_init(int thr_id, int threads)
|
||||||
|
{
|
||||||
|
aes_cpu_init();
|
||||||
|
}
|
||||||
|
|
||||||
|
__host__ void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
|
||||||
|
{
|
||||||
|
const int threadsperblock = 256;
|
||||||
|
|
||||||
|
// berechne wie viele Thread Blocks wir brauchen
|
||||||
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
|
// Größe des dynamischen Shared Memory Bereichs
|
||||||
|
size_t shared_size = 0;
|
||||||
|
|
||||||
|
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||||
|
|
||||||
|
x11_echo512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
||||||
|
MyStreamSynchronize(NULL, order, thr_id);
|
||||||
|
}
|
384
x11/cuda_x11_luffa512.cu
Normal file
384
x11/cuda_x11_luffa512.cu
Normal file
@ -0,0 +1,384 @@
|
|||||||
|
/*
|
||||||
|
* luffa_for_32.c
|
||||||
|
* Version 2.0 (Sep 15th 2009)
|
||||||
|
*
|
||||||
|
* Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
|
||||||
|
*
|
||||||
|
* Hitachi, Ltd. is the owner of this software and hereby grant
|
||||||
|
* the U.S. Government and any interested party the right to use
|
||||||
|
* this software for the purposes of the SHA-3 evaluation process,
|
||||||
|
* notwithstanding that this software is copyrighted.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||||
|
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||||
|
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||||
|
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||||
|
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||||
|
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||||
|
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// aus heavy.cu
|
||||||
|
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||||
|
|
||||||
|
typedef unsigned char BitSequence;
|
||||||
|
|
||||||
|
typedef unsigned char uint8_t;
|
||||||
|
typedef unsigned int uint32_t;
|
||||||
|
typedef unsigned long long uint64_t;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
uint32_t buffer[8]; /* Buffer to be hashed */
|
||||||
|
uint32_t chainv[40]; /* Chaining values */
|
||||||
|
} hashState;
|
||||||
|
|
||||||
|
|
||||||
|
static __device__ __forceinline__ uint32_t BYTES_SWAP32(uint32_t x)
|
||||||
|
{
|
||||||
|
return __byte_perm(x, x, 0x0123);
|
||||||
|
}
|
||||||
|
|
||||||
|
#define MULT2(a,j)\
|
||||||
|
tmp = a[7+(8*j)];\
|
||||||
|
a[7+(8*j)] = a[6+(8*j)];\
|
||||||
|
a[6+(8*j)] = a[5+(8*j)];\
|
||||||
|
a[5+(8*j)] = a[4+(8*j)];\
|
||||||
|
a[4+(8*j)] = a[3+(8*j)] ^ tmp;\
|
||||||
|
a[3+(8*j)] = a[2+(8*j)] ^ tmp;\
|
||||||
|
a[2+(8*j)] = a[1+(8*j)];\
|
||||||
|
a[1+(8*j)] = a[0+(8*j)] ^ tmp;\
|
||||||
|
a[0+(8*j)] = tmp;
|
||||||
|
|
||||||
|
#define TWEAK(a0,a1,a2,a3,j)\
|
||||||
|
a0 = (a0<<(j))|(a0>>(32-j));\
|
||||||
|
a1 = (a1<<(j))|(a1>>(32-j));\
|
||||||
|
a2 = (a2<<(j))|(a2>>(32-j));\
|
||||||
|
a3 = (a3<<(j))|(a3>>(32-j));
|
||||||
|
|
||||||
|
#define STEP(c0,c1)\
|
||||||
|
SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp);\
|
||||||
|
SUBCRUMB(chainv[5],chainv[6],chainv[7],chainv[4],tmp);\
|
||||||
|
MIXWORD(chainv[0],chainv[4]);\
|
||||||
|
MIXWORD(chainv[1],chainv[5]);\
|
||||||
|
MIXWORD(chainv[2],chainv[6]);\
|
||||||
|
MIXWORD(chainv[3],chainv[7]);\
|
||||||
|
ADD_CONSTANT(chainv[0],chainv[4],c0,c1);
|
||||||
|
|
||||||
|
#define SUBCRUMB(a0,a1,a2,a3,a4)\
|
||||||
|
a4 = a0;\
|
||||||
|
a0 |= a1;\
|
||||||
|
a2 ^= a3;\
|
||||||
|
a1 = ~a1;\
|
||||||
|
a0 ^= a3;\
|
||||||
|
a3 &= a4;\
|
||||||
|
a1 ^= a3;\
|
||||||
|
a3 ^= a2;\
|
||||||
|
a2 &= a0;\
|
||||||
|
a0 = ~a0;\
|
||||||
|
a2 ^= a1;\
|
||||||
|
a1 |= a3;\
|
||||||
|
a4 ^= a1;\
|
||||||
|
a3 ^= a2;\
|
||||||
|
a2 &= a1;\
|
||||||
|
a1 ^= a0;\
|
||||||
|
a0 = a4;
|
||||||
|
|
||||||
|
#define MIXWORD(a0,a4)\
|
||||||
|
a4 ^= a0;\
|
||||||
|
a0 = (a0<<2) | (a0>>(30));\
|
||||||
|
a0 ^= a4;\
|
||||||
|
a4 = (a4<<14) | (a4>>(18));\
|
||||||
|
a4 ^= a0;\
|
||||||
|
a0 = (a0<<10) | (a0>>(22));\
|
||||||
|
a0 ^= a4;\
|
||||||
|
a4 = (a4<<1) | (a4>>(31));
|
||||||
|
|
||||||
|
#define ADD_CONSTANT(a0,b0,c0,c1)\
|
||||||
|
a0 ^= c0;\
|
||||||
|
b0 ^= c1;
|
||||||
|
|
||||||
|
/* initial values of chaining variables */
|
||||||
|
__constant__ uint32_t c_IV[40];
|
||||||
|
const uint32_t h_IV[40] = {
|
||||||
|
0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465,
|
||||||
|
0x6e292011,0x90152df4,0xee058139,0xdef610bb,
|
||||||
|
0xc3b44b95,0xd9d2f256,0x70eee9a0,0xde099fa3,
|
||||||
|
0x5d9b0557,0x8fc944b3,0xcf1ccf0e,0x746cd581,
|
||||||
|
0xf7efc89d,0x5dba5781,0x04016ce5,0xad659c05,
|
||||||
|
0x0306194f,0x666d1836,0x24aa230a,0x8b264ae7,
|
||||||
|
0x858075d5,0x36d79cce,0xe571f7d7,0x204b1f67,
|
||||||
|
0x35870c6a,0x57e9e923,0x14bcb808,0x7cde72ce,
|
||||||
|
0x6c68e9be,0x5ec41e22,0xc825b7c7,0xaffb4363,
|
||||||
|
0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea};
|
||||||
|
|
||||||
|
__constant__ uint32_t c_CNS[80];
|
||||||
|
uint32_t h_CNS[80] = {
|
||||||
|
0x303994a6,0xe0337818,0xc0e65299,0x441ba90d,
|
||||||
|
0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f,
|
||||||
|
0x1e00108f,0xe5a8bce6,0x7800423d,0x5274baf4,
|
||||||
|
0x8f5b7882,0x26889ba7,0x96e1db12,0x9a226e9d,
|
||||||
|
0xb6de10ed,0x01685f3d,0x70f47aae,0x05a17cf4,
|
||||||
|
0x0707a3d4,0xbd09caca,0x1c1e8f51,0xf4272b28,
|
||||||
|
0x707a3d45,0x144ae5cc,0xaeb28562,0xfaa7ae2b,
|
||||||
|
0xbaca1589,0x2e48f1c1,0x40a46f3e,0xb923c704,
|
||||||
|
0xfc20d9d2,0xe25e72c1,0x34552e25,0xe623bb72,
|
||||||
|
0x7ad8818f,0x5c58a4a4,0x8438764a,0x1e38e2e7,
|
||||||
|
0xbb6de032,0x78e38b9d,0xedb780c8,0x27586719,
|
||||||
|
0xd9847356,0x36eda57f,0xa2c78434,0x703aace7,
|
||||||
|
0xb213afa5,0xe028c9bf,0xc84ebe95,0x44756f91,
|
||||||
|
0x4e608a22,0x7e8fce32,0x56d858fe,0x956548be,
|
||||||
|
0x343b138f,0xfe191be2,0xd0ec4e3d,0x3cb226e5,
|
||||||
|
0x2ceb4882,0x5944a28e,0xb3ad2208,0xa1c4c355,
|
||||||
|
0xf0d2e9e3,0x5090d577,0xac11d7fa,0x2d1925ab,
|
||||||
|
0x1bcb66f2,0xb46496ac,0x6f2d9bc9,0xd1925ab0,
|
||||||
|
0x78602649,0x29131ab6,0x8edae952,0x0fc053c3,
|
||||||
|
0x3b6ba548,0x3f014f0c,0xedae9520,0xfc053c31};
|
||||||
|
|
||||||
|
|
||||||
|
/***************************************************/
|
||||||
|
__device__ __forceinline__ void rnd512(hashState *state)
|
||||||
|
{
|
||||||
|
int i,j;
|
||||||
|
uint32_t t[40];
|
||||||
|
uint32_t chainv[8];
|
||||||
|
uint32_t tmp;
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for(i=0;i<8;i++) {
|
||||||
|
t[i]=0;
|
||||||
|
#pragma unroll 5
|
||||||
|
for(j=0;j<5;j++) {
|
||||||
|
t[i] ^= state->chainv[i+8*j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
MULT2(t, 0);
|
||||||
|
|
||||||
|
#pragma unroll 5
|
||||||
|
for(j=0;j<5;j++) {
|
||||||
|
#pragma unroll 8
|
||||||
|
for(i=0;i<8;i++) {
|
||||||
|
state->chainv[i+8*j] ^= t[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll 5
|
||||||
|
for(j=0;j<5;j++) {
|
||||||
|
#pragma unroll 8
|
||||||
|
for(i=0;i<8;i++) {
|
||||||
|
t[i+8*j] = state->chainv[i+8*j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll 5
|
||||||
|
for(j=0;j<5;j++) {
|
||||||
|
MULT2(state->chainv, j);
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll 5
|
||||||
|
for(j=0;j<5;j++) {
|
||||||
|
#pragma unroll 8
|
||||||
|
for(i=0;i<8;i++) {
|
||||||
|
state->chainv[8*j+i] ^= t[8*((j+1)%5)+i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll 5
|
||||||
|
for(j=0;j<5;j++) {
|
||||||
|
#pragma unroll 8
|
||||||
|
for(i=0;i<8;i++) {
|
||||||
|
t[i+8*j] = state->chainv[i+8*j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll 5
|
||||||
|
for(j=0;j<5;j++) {
|
||||||
|
MULT2(state->chainv, j);
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll 5
|
||||||
|
for(j=0;j<5;j++) {
|
||||||
|
#pragma unroll 8
|
||||||
|
for(i=0;i<8;i++) {
|
||||||
|
state->chainv[8*j+i] ^= t[8*((j+4)%5)+i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll 5
|
||||||
|
for(j=0;j<5;j++) {
|
||||||
|
#pragma unroll 8
|
||||||
|
for(i=0;i<8;i++) {
|
||||||
|
state->chainv[i+8*j] ^= state->buffer[i];
|
||||||
|
}
|
||||||
|
MULT2(state->buffer, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for(i=0;i<8;i++) {
|
||||||
|
chainv[i] = state->chainv[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for(i=0;i<8;i++) {
|
||||||
|
STEP(c_CNS[(2*i)],c_CNS[(2*i)+1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for(i=0;i<8;i++) {
|
||||||
|
state->chainv[i] = chainv[i];
|
||||||
|
chainv[i] = state->chainv[i+8];
|
||||||
|
}
|
||||||
|
|
||||||
|
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],1);
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for(i=0;i<8;i++) {
|
||||||
|
STEP(c_CNS[(2*i)+16],c_CNS[(2*i)+16+1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for(i=0;i<8;i++) {
|
||||||
|
state->chainv[i+8] = chainv[i];
|
||||||
|
chainv[i] = state->chainv[i+16];
|
||||||
|
}
|
||||||
|
|
||||||
|
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],2);
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for(i=0;i<8;i++) {
|
||||||
|
STEP(c_CNS[(2*i)+32],c_CNS[(2*i)+32+1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for(i=0;i<8;i++) {
|
||||||
|
state->chainv[i+16] = chainv[i];
|
||||||
|
chainv[i] = state->chainv[i+24];
|
||||||
|
}
|
||||||
|
|
||||||
|
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],3);
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for(i=0;i<8;i++) {
|
||||||
|
STEP(c_CNS[(2*i)+48],c_CNS[(2*i)+48+1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for(i=0;i<8;i++) {
|
||||||
|
state->chainv[i+24] = chainv[i];
|
||||||
|
chainv[i] = state->chainv[i+32];
|
||||||
|
}
|
||||||
|
|
||||||
|
TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],4);
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for(i=0;i<8;i++) {
|
||||||
|
STEP(c_CNS[(2*i)+64],c_CNS[(2*i)+64+1]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for(i=0;i<8;i++) {
|
||||||
|
state->chainv[i+32] = chainv[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
__device__ __forceinline__ void Update512(hashState *state, const BitSequence *data)
|
||||||
|
{
|
||||||
|
#pragma unroll 8
|
||||||
|
for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)data)[i]);
|
||||||
|
rnd512(state);
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+32))[i]);
|
||||||
|
rnd512(state);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/***************************************************/
|
||||||
|
__device__ __forceinline__ void finalization512(hashState *state, uint32_t *b)
|
||||||
|
{
|
||||||
|
int i,j;
|
||||||
|
|
||||||
|
state->buffer[0] = 0x80000000;
|
||||||
|
#pragma unroll 7
|
||||||
|
for(int i=1;i<8;i++) state->buffer[i] = 0;
|
||||||
|
rnd512(state);
|
||||||
|
|
||||||
|
/*---- blank round with m=0 ----*/
|
||||||
|
#pragma unroll 8
|
||||||
|
for(i=0;i<8;i++) state->buffer[i] =0;
|
||||||
|
rnd512(state);
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for(i=0;i<8;i++) {
|
||||||
|
b[i] = 0;
|
||||||
|
#pragma unroll 5
|
||||||
|
for(j=0;j<5;j++) {
|
||||||
|
b[i] ^= state->chainv[i+8*j];
|
||||||
|
}
|
||||||
|
b[i] = BYTES_SWAP32((b[i]));
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for(i=0;i<8;i++) state->buffer[i]=0;
|
||||||
|
rnd512(state);
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for(i=0;i<8;i++) {
|
||||||
|
b[8+i] = 0;
|
||||||
|
#pragma unroll 5
|
||||||
|
for(j=0;j<5;j++) {
|
||||||
|
b[8+i] ^= state->chainv[i+8*j];
|
||||||
|
}
|
||||||
|
b[8+i] = BYTES_SWAP32((b[8+i]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/***************************************************/
|
||||||
|
// Die Hash-Funktion
|
||||||
|
__global__ void x11_luffa512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
|
||||||
|
{
|
||||||
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
|
if (thread < threads)
|
||||||
|
{
|
||||||
|
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
|
||||||
|
|
||||||
|
int hashPosition = nounce - startNounce;
|
||||||
|
uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
|
||||||
|
|
||||||
|
hashState state;
|
||||||
|
#pragma unroll 40
|
||||||
|
for(int i=0;i<40;i++) state.chainv[i] = c_IV[i];
|
||||||
|
#pragma unroll 8
|
||||||
|
for(int i=0;i<8;i++) state.buffer[i] = 0;
|
||||||
|
Update512(&state, (BitSequence*)Hash);
|
||||||
|
finalization512(&state, (uint32_t*)Hash);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Setup-Funktionen
|
||||||
|
__host__ void x11_luffa512_cpu_init(int thr_id, int threads)
|
||||||
|
{
|
||||||
|
cudaMemcpyToSymbol( c_IV, h_IV, sizeof(h_IV), 0, cudaMemcpyHostToDevice );
|
||||||
|
cudaMemcpyToSymbol( c_CNS, h_CNS, sizeof(h_CNS), 0, cudaMemcpyHostToDevice );
|
||||||
|
}
|
||||||
|
|
||||||
|
__host__ void x11_luffa512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
|
||||||
|
{
|
||||||
|
const int threadsperblock = 256;
|
||||||
|
|
||||||
|
// berechne wie viele Thread Blocks wir brauchen
|
||||||
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
|
// Größe des dynamischen Shared Memory Bereichs
|
||||||
|
size_t shared_size = 0;
|
||||||
|
|
||||||
|
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||||
|
|
||||||
|
x11_luffa512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
||||||
|
MyStreamSynchronize(NULL, order, thr_id);
|
||||||
|
}
|
||||||
|
|
1380
x11/cuda_x11_shavite512.cu
Normal file
1380
x11/cuda_x11_shavite512.cu
Normal file
File diff suppressed because it is too large
Load Diff
765
x11/cuda_x11_simd512.cu
Normal file
765
x11/cuda_x11_simd512.cu
Normal file
@ -0,0 +1,765 @@
|
|||||||
|
// aus heavy.cu
|
||||||
|
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||||
|
|
||||||
|
typedef unsigned int uint32_t;
|
||||||
|
typedef unsigned long long uint64_t;
|
||||||
|
|
||||||
|
#define C32(x) ((uint32_t)(x ## U))
|
||||||
|
#define T32(x) ((x) & C32(0xFFFFFFFF))
|
||||||
|
|
||||||
|
#if __CUDA_ARCH__ < 350
|
||||||
|
// Kepler (Compute 3.0)
|
||||||
|
#define ROTL32(x, n) T32(((x) << (n)) | ((x) >> (32 - (n))))
|
||||||
|
#else
|
||||||
|
// Kepler (Compute 3.5)
|
||||||
|
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
|
||||||
|
#endif
|
||||||
|
|
||||||
|
__constant__ uint32_t c_IV_512[32];
|
||||||
|
const uint32_t h_IV_512[32] = {
|
||||||
|
0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558,
|
||||||
|
0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e,
|
||||||
|
0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257,
|
||||||
|
0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22
|
||||||
|
};
|
||||||
|
|
||||||
|
__constant__ int c_FFT[256];
|
||||||
|
const int h_FFT[256] =
|
||||||
|
{
|
||||||
|
// this is the FFT result in revbin permuted order
|
||||||
|
4, -4, 32, -32, -60, 60, 60, -60, 101, -101, 58, -58, 112, -112, -11, 11, -92, 92,
|
||||||
|
-119, 119, 42, -42, -82, 82, 32, -32, 32, -32, 121, -121, 17, -17, -47, 47, 63,
|
||||||
|
-63, 107, -107, -76, 76, -119, 119, -83, 83, 126, -126, 94, -94, -23, 23, -76,
|
||||||
|
76, -47, 47, 92, -92, -117, 117, 73, -73, -53, 53, 88, -88, -80, 80, -47, 47,
|
||||||
|
5, -5, 67, -67, 34, -34, 4, -4, 87, -87, -28, 28, -70, 70, -110, 110, -18, 18, 93,
|
||||||
|
-93, 51, -51, 36, -36, 118, -118, -106, 106, 45, -45, -108, 108, -44, 44, 117,
|
||||||
|
-117, -121, 121, -37, 37, 65, -65, 37, -37, 40, -40, -42, 42, 91, -91, -128, 128,
|
||||||
|
-21, 21, 94, -94, -98, 98, -47, 47, 28, -28, 115, -115, 16, -16, -20, 20, 122,
|
||||||
|
-122, 115, -115, 46, -46, 84, -84, -127, 127, 57, -57, 127, -127, -80, 80, 24,
|
||||||
|
-24, 15, -15, 29, -29, -78, 78, -126, 126, 16, -16, 52, -52, 55, -55, 110, -110,
|
||||||
|
-51, 51, -120, 120, -124, 124, -24, 24, -76, 76, 26, -26, -21, 21, -64, 64, -99,
|
||||||
|
99, 85, -85, -15, 15, -120, 120, -116, 116, 85, -85, 12, -12, -24, 24, 4, -4,
|
||||||
|
79, -79, 76, -76, 23, -23, 4, -4, -108, 108, -20, 20, 73, -73, -42, 42, -7, 7,
|
||||||
|
-29, 29, -123, 123, 49, -49, -96, 96, -68, 68, -112, 112, 116, -116, -24, 24, 93,
|
||||||
|
-93, -125, 125, -86, 86, 117, -117, -91, 91, 42, -42, 87, -87, -117, 117, 102, -102
|
||||||
|
};
|
||||||
|
|
||||||
|
__constant__ int c_P4[32][4];
|
||||||
|
static const int h_P4[32][4] = {
|
||||||
|
{ 2, 34, 18, 50 },
|
||||||
|
{ 6, 38, 22, 54 },
|
||||||
|
{ 0, 32, 16, 48 },
|
||||||
|
{ 4, 36, 20, 52 },
|
||||||
|
{ 14, 46, 30, 62 },
|
||||||
|
{ 10, 42, 26, 58 },
|
||||||
|
{ 12, 44, 28, 60 },
|
||||||
|
{ 8, 40, 24, 56 },
|
||||||
|
{ 15, 47, 31, 63 },
|
||||||
|
{ 13, 45, 29, 61 },
|
||||||
|
{ 3, 35, 19, 51 },
|
||||||
|
{ 1, 33, 17, 49 },
|
||||||
|
{ 9, 41, 25, 57 },
|
||||||
|
{ 11, 43, 27, 59 },
|
||||||
|
{ 5, 37, 21, 53 },
|
||||||
|
{ 7, 39, 23, 55 },
|
||||||
|
{ 8, 40, 24, 56 },
|
||||||
|
{ 4, 36, 20, 52 },
|
||||||
|
{ 14, 46, 30, 62 },
|
||||||
|
{ 2, 34, 18, 50 },
|
||||||
|
{ 6, 38, 22, 54 },
|
||||||
|
{ 10, 42, 26, 58 },
|
||||||
|
{ 0, 32, 16, 48 },
|
||||||
|
{ 12, 44, 28, 60 },
|
||||||
|
{ 70, 102, 86, 118 },
|
||||||
|
{ 64, 96, 80, 112 },
|
||||||
|
{ 72, 104, 88, 120 },
|
||||||
|
{ 78, 110, 94, 126 },
|
||||||
|
{ 76, 108, 92, 124 },
|
||||||
|
{ 74, 106, 90, 122 },
|
||||||
|
{ 66, 98, 82, 114 },
|
||||||
|
{ 68, 100, 84, 116 }
|
||||||
|
};
|
||||||
|
|
||||||
|
__constant__ int c_Q4[32][4];
|
||||||
|
static const int h_Q4[32][4] = {
|
||||||
|
{ 66, 98, 82, 114 },
|
||||||
|
{ 70, 102, 86, 118 },
|
||||||
|
{ 64, 96, 80, 112 },
|
||||||
|
{ 68, 100, 84, 116 },
|
||||||
|
{ 78, 110, 94, 126 },
|
||||||
|
{ 74, 106, 90, 122 },
|
||||||
|
{ 76, 108, 92, 124 },
|
||||||
|
{ 72, 104, 88, 120 },
|
||||||
|
{ 79, 111, 95, 127 },
|
||||||
|
{ 77, 109, 93, 125 },
|
||||||
|
{ 67, 99, 83, 115 },
|
||||||
|
{ 65, 97, 81, 113 },
|
||||||
|
{ 73, 105, 89, 121 },
|
||||||
|
{ 75, 107, 91, 123 },
|
||||||
|
{ 69, 101, 85, 117 },
|
||||||
|
{ 71, 103, 87, 119 },
|
||||||
|
{ 9, 41, 25, 57 },
|
||||||
|
{ 5, 37, 21, 53 },
|
||||||
|
{ 15, 47, 31, 63 },
|
||||||
|
{ 3, 35, 19, 51 },
|
||||||
|
{ 7, 39, 23, 55 },
|
||||||
|
{ 11, 43, 27, 59 },
|
||||||
|
{ 1, 33, 17, 49 },
|
||||||
|
{ 13, 45, 29, 61 },
|
||||||
|
{ 71, 103, 87, 119 },
|
||||||
|
{ 65, 97, 81, 113 },
|
||||||
|
{ 73, 105, 89, 121 },
|
||||||
|
{ 79, 111, 95, 127 },
|
||||||
|
{ 77, 109, 93, 125 },
|
||||||
|
{ 75, 107, 91, 123 },
|
||||||
|
{ 67, 99, 83, 115 },
|
||||||
|
{ 69, 101, 85, 117 }
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
__constant__ int c_P8[32][8];
|
||||||
|
static const int h_P8[32][8] = {
|
||||||
|
{ 2, 66, 34, 98, 18, 82, 50, 114 },
|
||||||
|
{ 6, 70, 38, 102, 22, 86, 54, 118 },
|
||||||
|
{ 0, 64, 32, 96, 16, 80, 48, 112 },
|
||||||
|
{ 4, 68, 36, 100, 20, 84, 52, 116 },
|
||||||
|
{ 14, 78, 46, 110, 30, 94, 62, 126 },
|
||||||
|
{ 10, 74, 42, 106, 26, 90, 58, 122 },
|
||||||
|
{ 12, 76, 44, 108, 28, 92, 60, 124 },
|
||||||
|
{ 8, 72, 40, 104, 24, 88, 56, 120 },
|
||||||
|
{ 15, 79, 47, 111, 31, 95, 63, 127 },
|
||||||
|
{ 13, 77, 45, 109, 29, 93, 61, 125 },
|
||||||
|
{ 3, 67, 35, 99, 19, 83, 51, 115 },
|
||||||
|
{ 1, 65, 33, 97, 17, 81, 49, 113 },
|
||||||
|
{ 9, 73, 41, 105, 25, 89, 57, 121 },
|
||||||
|
{ 11, 75, 43, 107, 27, 91, 59, 123 },
|
||||||
|
{ 5, 69, 37, 101, 21, 85, 53, 117 },
|
||||||
|
{ 7, 71, 39, 103, 23, 87, 55, 119 },
|
||||||
|
{ 8, 72, 40, 104, 24, 88, 56, 120 },
|
||||||
|
{ 4, 68, 36, 100, 20, 84, 52, 116 },
|
||||||
|
{ 14, 78, 46, 110, 30, 94, 62, 126 },
|
||||||
|
{ 2, 66, 34, 98, 18, 82, 50, 114 },
|
||||||
|
{ 6, 70, 38, 102, 22, 86, 54, 118 },
|
||||||
|
{ 10, 74, 42, 106, 26, 90, 58, 122 },
|
||||||
|
{ 0, 64, 32, 96, 16, 80, 48, 112 },
|
||||||
|
{ 12, 76, 44, 108, 28, 92, 60, 124 },
|
||||||
|
{ 134, 198, 166, 230, 150, 214, 182, 246 },
|
||||||
|
{ 128, 192, 160, 224, 144, 208, 176, 240 },
|
||||||
|
{ 136, 200, 168, 232, 152, 216, 184, 248 },
|
||||||
|
{ 142, 206, 174, 238, 158, 222, 190, 254 },
|
||||||
|
{ 140, 204, 172, 236, 156, 220, 188, 252 },
|
||||||
|
{ 138, 202, 170, 234, 154, 218, 186, 250 },
|
||||||
|
{ 130, 194, 162, 226, 146, 210, 178, 242 },
|
||||||
|
{ 132, 196, 164, 228, 148, 212, 180, 244 },
|
||||||
|
};
|
||||||
|
|
||||||
|
__constant__ int c_Q8[32][8];
|
||||||
|
static const int h_Q8[32][8] = {
|
||||||
|
{ 130, 194, 162, 226, 146, 210, 178, 242 },
|
||||||
|
{ 134, 198, 166, 230, 150, 214, 182, 246 },
|
||||||
|
{ 128, 192, 160, 224, 144, 208, 176, 240 },
|
||||||
|
{ 132, 196, 164, 228, 148, 212, 180, 244 },
|
||||||
|
{ 142, 206, 174, 238, 158, 222, 190, 254 },
|
||||||
|
{ 138, 202, 170, 234, 154, 218, 186, 250 },
|
||||||
|
{ 140, 204, 172, 236, 156, 220, 188, 252 },
|
||||||
|
{ 136, 200, 168, 232, 152, 216, 184, 248 },
|
||||||
|
{ 143, 207, 175, 239, 159, 223, 191, 255 },
|
||||||
|
{ 141, 205, 173, 237, 157, 221, 189, 253 },
|
||||||
|
{ 131, 195, 163, 227, 147, 211, 179, 243 },
|
||||||
|
{ 129, 193, 161, 225, 145, 209, 177, 241 },
|
||||||
|
{ 137, 201, 169, 233, 153, 217, 185, 249 },
|
||||||
|
{ 139, 203, 171, 235, 155, 219, 187, 251 },
|
||||||
|
{ 133, 197, 165, 229, 149, 213, 181, 245 },
|
||||||
|
{ 135, 199, 167, 231, 151, 215, 183, 247 },
|
||||||
|
{ 9, 73, 41, 105, 25, 89, 57, 121 },
|
||||||
|
{ 5, 69, 37, 101, 21, 85, 53, 117 },
|
||||||
|
{ 15, 79, 47, 111, 31, 95, 63, 127 },
|
||||||
|
{ 3, 67, 35, 99, 19, 83, 51, 115 },
|
||||||
|
{ 7, 71, 39, 103, 23, 87, 55, 119 },
|
||||||
|
{ 11, 75, 43, 107, 27, 91, 59, 123 },
|
||||||
|
{ 1, 65, 33, 97, 17, 81, 49, 113 },
|
||||||
|
{ 13, 77, 45, 109, 29, 93, 61, 125 },
|
||||||
|
{ 135, 199, 167, 231, 151, 215, 183, 247 },
|
||||||
|
{ 129, 193, 161, 225, 145, 209, 177, 241 },
|
||||||
|
{ 137, 201, 169, 233, 153, 217, 185, 249 },
|
||||||
|
{ 143, 207, 175, 239, 159, 223, 191, 255 },
|
||||||
|
{ 141, 205, 173, 237, 157, 221, 189, 253 },
|
||||||
|
{ 139, 203, 171, 235, 155, 219, 187, 251 },
|
||||||
|
{ 131, 195, 163, 227, 147, 211, 179, 243 },
|
||||||
|
{ 133, 197, 165, 229, 149, 213, 181, 245 },
|
||||||
|
};
|
||||||
|
|
||||||
|
__constant__ int c_FFT64_8_8_Twiddle[64];
|
||||||
|
static const int h_FFT64_8_8_Twiddle[64] = {
|
||||||
|
1, 1, 1, 1, 1, 1, 1, 1,
|
||||||
|
1, 2, 4, 8, 16, 32, 64, 128,
|
||||||
|
1, 60, 2, 120, 4, -17, 8, -34,
|
||||||
|
1, 120, 8, -68, 64, -30, -2, 17,
|
||||||
|
1, 46, 60, -67, 2, 92, 120, 123,
|
||||||
|
1, 92, -17, -22, 32, 117, -30, 67,
|
||||||
|
1, -67, 120, -73, 8, -22, -68, -70,
|
||||||
|
1, 123, -34, -70, 128, 67, 17, 35,
|
||||||
|
};
|
||||||
|
|
||||||
|
__constant__ int c_FFT128_2_64_Twiddle[64];
|
||||||
|
static const int h_FFT128_2_64_Twiddle[64] = {
|
||||||
|
1, -118, 46, -31, 60, 116, -67, -61,
|
||||||
|
2, 21, 92, -62, 120, -25, 123, -122,
|
||||||
|
4, 42, -73, -124, -17, -50, -11, 13,
|
||||||
|
8, 84, 111, 9, -34, -100, -22, 26,
|
||||||
|
16, -89, -35, 18, -68, 57, -44, 52,
|
||||||
|
32, 79, -70, 36, 121, 114, -88, 104,
|
||||||
|
64, -99, 117, 72, -15, -29, 81, -49,
|
||||||
|
128, 59, -23, -113, -30, -58, -95, -98
|
||||||
|
};
|
||||||
|
|
||||||
|
__constant__ int c_FFT128_16_8_Twiddle[128];
|
||||||
|
static const int h_FFT128_16_8_Twiddle[128] = {
|
||||||
|
1, 1, 1, 1, 1, 1, 1, 1,
|
||||||
|
1, 2, 4, 8, 16, 32, 64, 128,
|
||||||
|
1, 60, 2, 120, 4, -17, 8, -34,
|
||||||
|
1, 120, 8, -68, 64, -30, -2, 17,
|
||||||
|
1, 46, 60, -67, 2, 92, 120, 123,
|
||||||
|
1, 92, -17, -22, 32, 117, -30, 67,
|
||||||
|
1, -67, 120, -73, 8, -22, -68, -70,
|
||||||
|
1, 123, -34, -70, 128, 67, 17, 35,
|
||||||
|
1, -118, 46, -31, 60, 116, -67, -61,
|
||||||
|
1, 21, -73, 9, -68, 114, 81, -98,
|
||||||
|
1, 116, 92, -122, -17, 84, -22, 18,
|
||||||
|
1, -25, 111, 52, -15, 118, -123, -9,
|
||||||
|
1, -31, -67, 21, 120, -122, -73, -50,
|
||||||
|
1, -62, -11, -89, 121, -49, -46, 25,
|
||||||
|
1, -61, 123, -50, -34, 18, -70, -99,
|
||||||
|
1, -122, -22, 114, -30, 62, -111, -79 };
|
||||||
|
|
||||||
|
__constant__ int c_FFT128_8_16_Twiddle[128];
|
||||||
|
static const int h_FFT128_8_16_Twiddle[128] = {
|
||||||
|
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||||
|
1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30,
|
||||||
|
1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22,
|
||||||
|
1, -67, 120, -73, 8, -22, -68, -70, 64, 81, -30, -46, -2, -123, 17, -111,
|
||||||
|
1, -118, 46, -31, 60, 116, -67, -61, 2, 21, 92, -62, 120, -25, 123, -122,
|
||||||
|
1, 116, 92, -122, -17, 84, -22, 18, 32, 114, 117, -49, -30, 118, 67, 62,
|
||||||
|
1, -31, -67, 21, 120, -122, -73, -50, 8, 9, -22, -89, -68, 52, -70, 114,
|
||||||
|
1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79};
|
||||||
|
|
||||||
|
|
||||||
|
__constant__ int c_FFT256_2_128_Twiddle[128];
|
||||||
|
static const int h_FFT256_2_128_Twiddle[128] = {
|
||||||
|
1, 41, -118, 45, 46, 87, -31, 14,
|
||||||
|
60, -110, 116, -127, -67, 80, -61, 69,
|
||||||
|
2, 82, 21, 90, 92, -83, -62, 28,
|
||||||
|
120, 37, -25, 3, 123, -97, -122, -119,
|
||||||
|
4, -93, 42, -77, -73, 91, -124, 56,
|
||||||
|
-17, 74, -50, 6, -11, 63, 13, 19,
|
||||||
|
8, 71, 84, 103, 111, -75, 9, 112,
|
||||||
|
-34, -109, -100, 12, -22, 126, 26, 38,
|
||||||
|
16, -115, -89, -51, -35, 107, 18, -33,
|
||||||
|
-68, 39, 57, 24, -44, -5, 52, 76,
|
||||||
|
32, 27, 79, -102, -70, -43, 36, -66,
|
||||||
|
121, 78, 114, 48, -88, -10, 104, -105,
|
||||||
|
64, 54, -99, 53, 117, -86, 72, 125,
|
||||||
|
-15, -101, -29, 96, 81, -20, -49, 47,
|
||||||
|
128, 108, 59, 106, -23, 85, -113, -7,
|
||||||
|
-30, 55, -58, -65, -95, -40, -98, 94};
|
||||||
|
|
||||||
|
#define p8_xor(x) ( ((x)%7) == 0 ? 1 : \
|
||||||
|
((x)%7) == 1 ? 6 : \
|
||||||
|
((x)%7) == 2 ? 2 : \
|
||||||
|
((x)%7) == 3 ? 3 : \
|
||||||
|
((x)%7) == 4 ? 5 : \
|
||||||
|
((x)%7) == 5 ? 7 : \
|
||||||
|
4 )
|
||||||
|
|
||||||
|
/************* the round function ****************/
|
||||||
|
|
||||||
|
#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
|
||||||
|
|
||||||
|
#define MAJ(x, y, z) (((z) & (y)) | (((z) | (y)) & (x)))
|
||||||
|
|
||||||
|
__device__ __forceinline__ void STEP8_IF(const uint32_t *w, const int i, const int r, const int s, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D)
|
||||||
|
{
|
||||||
|
int j;
|
||||||
|
uint32_t R[8];
|
||||||
|
#pragma unroll 8
|
||||||
|
for(j=0; j<8; j++) {
|
||||||
|
R[j] = ROTL32(A[j], r);
|
||||||
|
}
|
||||||
|
#pragma unroll 8
|
||||||
|
for(j=0; j<8; j++) {
|
||||||
|
D[j] = D[j] + w[j] + IF(A[j], B[j], C[j]);
|
||||||
|
D[j] = T32(ROTL32(T32(D[j]), s) + R[j^p8_xor(i)]);
|
||||||
|
A[j] = R[j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void STEP8_MAJ(const uint32_t *w, const int i, const int r, const int s, uint32_t *A, const uint32_t *B, const uint32_t *C, uint32_t *D)
|
||||||
|
{
|
||||||
|
int j;
|
||||||
|
uint32_t R[8];
|
||||||
|
#pragma unroll 8
|
||||||
|
for(j=0; j<8; j++) {
|
||||||
|
R[j] = ROTL32(A[j], r);
|
||||||
|
}
|
||||||
|
#pragma unroll 8
|
||||||
|
for(j=0; j<8; j++) {
|
||||||
|
D[j] = D[j] + w[j] + MAJ(A[j], B[j], C[j]);
|
||||||
|
D[j] = T32(ROTL32(T32(D[j]), s) + R[j^p8_xor(i)]);
|
||||||
|
A[j] = R[j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void Round8(uint32_t A[128], const int y[128], int i,
|
||||||
|
int r, int s, int t, int u) {
|
||||||
|
int code = i<2? 185: 233;
|
||||||
|
uint32_t w[8][8];
|
||||||
|
int a, b;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The FFT output y is in revbin permuted order,
|
||||||
|
* but this is included in the tables P and Q
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for(a=0; a<8; a++)
|
||||||
|
#pragma unroll 8
|
||||||
|
for(b=0; b<8; b++)
|
||||||
|
w[a][b] = __byte_perm( (y[c_P8[8*i+a][b]] * code), (y[c_Q8[8*i+a][b]] * code), 0x5410);
|
||||||
|
|
||||||
|
STEP8_IF(w[0], 8*i+0, r, s, A, &A[8], &A[16], &A[24]);
|
||||||
|
STEP8_IF(w[1], 8*i+1, s, t, &A[24], A, &A[8], &A[16]);
|
||||||
|
STEP8_IF(w[2], 8*i+2, t, u, &A[16], &A[24], A, &A[8]);
|
||||||
|
STEP8_IF(w[3], 8*i+3, u, r, &A[8], &A[16], &A[24], A);
|
||||||
|
|
||||||
|
STEP8_MAJ(w[4], 8*i+4, r, s, A, &A[8], &A[16], &A[24]);
|
||||||
|
STEP8_MAJ(w[5], 8*i+5, s, t, &A[24], A, &A[8], &A[16]);
|
||||||
|
STEP8_MAJ(w[6], 8*i+6, t, u, &A[16], &A[24], A, &A[8]);
|
||||||
|
STEP8_MAJ(w[7], 8*i+7, u, r, &A[8], &A[16], &A[24], A);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/********************* Message expansion ************************/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Reduce modulo 257; result is in [-127; 383]
|
||||||
|
* REDUCE(x) := (x&255) - (x>>8)
|
||||||
|
*/
|
||||||
|
#define REDUCE(x) (((x)&255) - ((x)>>8))
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Reduce from [-127; 383] to [-128; 128]
|
||||||
|
* EXTRA_REDUCE_S(x) := x<=128 ? x : x-257
|
||||||
|
*/
|
||||||
|
#define EXTRA_REDUCE_S(x) \
|
||||||
|
((x)<=128 ? (x) : (x)-257)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Reduce modulo 257; result is in [-128; 128]
|
||||||
|
*/
|
||||||
|
#define REDUCE_FULL_S(x) \
|
||||||
|
EXTRA_REDUCE_S(REDUCE(x))
|
||||||
|
|
||||||
|
__device__ __forceinline__ void FFT_8(int *y, int stripe) {
|
||||||
|
|
||||||
|
/*
|
||||||
|
* FFT_8 using w=4 as 8th root of unity
|
||||||
|
* Unrolled decimation in frequency (DIF) radix-2 NTT.
|
||||||
|
* Output data is in revbin_permuted order.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define X(i) y[stripe*i]
|
||||||
|
|
||||||
|
#define DO_REDUCE(i) \
|
||||||
|
X(i) = REDUCE(X(i))
|
||||||
|
|
||||||
|
#define DO_REDUCE_FULL_S(i) \
|
||||||
|
do { \
|
||||||
|
X(i) = REDUCE(X(i)); \
|
||||||
|
X(i) = EXTRA_REDUCE_S(X(i)); \
|
||||||
|
} while(0)
|
||||||
|
|
||||||
|
#define BUTTERFLY(i,j,n) \
|
||||||
|
do { \
|
||||||
|
int u= X(i); \
|
||||||
|
int v= X(j); \
|
||||||
|
X(i) = u+v; \
|
||||||
|
X(j) = (u-v) << (2*n); \
|
||||||
|
} while(0)
|
||||||
|
|
||||||
|
BUTTERFLY(0, 4, 0);
|
||||||
|
BUTTERFLY(1, 5, 1);
|
||||||
|
BUTTERFLY(2, 6, 2);
|
||||||
|
BUTTERFLY(3, 7, 3);
|
||||||
|
|
||||||
|
DO_REDUCE(6);
|
||||||
|
DO_REDUCE(7);
|
||||||
|
|
||||||
|
BUTTERFLY(0, 2, 0);
|
||||||
|
BUTTERFLY(4, 6, 0);
|
||||||
|
BUTTERFLY(1, 3, 2);
|
||||||
|
BUTTERFLY(5, 7, 2);
|
||||||
|
|
||||||
|
DO_REDUCE(7);
|
||||||
|
|
||||||
|
BUTTERFLY(0, 1, 0);
|
||||||
|
BUTTERFLY(2, 3, 0);
|
||||||
|
BUTTERFLY(4, 5, 0);
|
||||||
|
BUTTERFLY(6, 7, 0);
|
||||||
|
|
||||||
|
DO_REDUCE_FULL_S(0);
|
||||||
|
DO_REDUCE_FULL_S(1);
|
||||||
|
DO_REDUCE_FULL_S(2);
|
||||||
|
DO_REDUCE_FULL_S(3);
|
||||||
|
DO_REDUCE_FULL_S(4);
|
||||||
|
DO_REDUCE_FULL_S(5);
|
||||||
|
DO_REDUCE_FULL_S(6);
|
||||||
|
DO_REDUCE_FULL_S(7);
|
||||||
|
|
||||||
|
#undef X
|
||||||
|
#undef DO_REDUCE
|
||||||
|
#undef DO_REDUCE_FULL_S
|
||||||
|
#undef BUTTERFLY
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void FFT_16(int *y, int stripe) {
|
||||||
|
|
||||||
|
/*
|
||||||
|
* FFT_16 using w=2 as 16th root of unity
|
||||||
|
* Unrolled decimation in frequency (DIF) radix-2 NTT.
|
||||||
|
* Output data is in revbin_permuted order.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define X(i) y[stripe*i]
|
||||||
|
|
||||||
|
#define DO_REDUCE(i) \
|
||||||
|
X(i) = REDUCE(X(i))
|
||||||
|
|
||||||
|
#define DO_REDUCE_FULL_S(i) \
|
||||||
|
do { \
|
||||||
|
X(i) = REDUCE(X(i)); \
|
||||||
|
X(i) = EXTRA_REDUCE_S(X(i)); \
|
||||||
|
} while(0)
|
||||||
|
|
||||||
|
#define BUTTERFLY(i,j,n) \
|
||||||
|
do { \
|
||||||
|
int u= X(i); \
|
||||||
|
int v= X(j); \
|
||||||
|
X(i) = u+v; \
|
||||||
|
X(j) = (u-v) << n; \
|
||||||
|
} while(0)
|
||||||
|
|
||||||
|
BUTTERFLY(0, 8, 0);
|
||||||
|
BUTTERFLY(1, 9, 1);
|
||||||
|
BUTTERFLY(2, 10, 2);
|
||||||
|
BUTTERFLY(3, 11, 3);
|
||||||
|
BUTTERFLY(4, 12, 4);
|
||||||
|
BUTTERFLY(5, 13, 5);
|
||||||
|
BUTTERFLY(6, 14, 6);
|
||||||
|
BUTTERFLY(7, 15, 7);
|
||||||
|
|
||||||
|
DO_REDUCE(11);
|
||||||
|
DO_REDUCE(12);
|
||||||
|
DO_REDUCE(13);
|
||||||
|
DO_REDUCE(14);
|
||||||
|
DO_REDUCE(15);
|
||||||
|
|
||||||
|
BUTTERFLY( 0, 4, 0);
|
||||||
|
BUTTERFLY( 8, 12, 0);
|
||||||
|
BUTTERFLY( 1, 5, 2);
|
||||||
|
BUTTERFLY( 9, 13, 2);
|
||||||
|
BUTTERFLY( 2, 6, 4);
|
||||||
|
BUTTERFLY(10, 14, 4);
|
||||||
|
BUTTERFLY( 3, 7, 6);
|
||||||
|
BUTTERFLY(11, 15, 6);
|
||||||
|
|
||||||
|
DO_REDUCE(5);
|
||||||
|
DO_REDUCE(7);
|
||||||
|
DO_REDUCE(13);
|
||||||
|
DO_REDUCE(15);
|
||||||
|
|
||||||
|
BUTTERFLY( 0, 2, 0);
|
||||||
|
BUTTERFLY( 4, 6, 0);
|
||||||
|
BUTTERFLY( 8, 10, 0);
|
||||||
|
BUTTERFLY(12, 14, 0);
|
||||||
|
BUTTERFLY( 1, 3, 4);
|
||||||
|
BUTTERFLY( 5, 7, 4);
|
||||||
|
BUTTERFLY( 9, 11, 4);
|
||||||
|
BUTTERFLY(13, 15, 4);
|
||||||
|
|
||||||
|
BUTTERFLY( 0, 1, 0);
|
||||||
|
BUTTERFLY( 2, 3, 0);
|
||||||
|
BUTTERFLY( 4, 5, 0);
|
||||||
|
BUTTERFLY( 6, 7, 0);
|
||||||
|
BUTTERFLY( 8, 9, 0);
|
||||||
|
BUTTERFLY(10, 11, 0);
|
||||||
|
BUTTERFLY(12, 13, 0);
|
||||||
|
BUTTERFLY(14, 15, 0);
|
||||||
|
|
||||||
|
DO_REDUCE_FULL_S( 0);
|
||||||
|
DO_REDUCE_FULL_S( 1);
|
||||||
|
DO_REDUCE_FULL_S( 2);
|
||||||
|
DO_REDUCE_FULL_S( 3);
|
||||||
|
DO_REDUCE_FULL_S( 4);
|
||||||
|
DO_REDUCE_FULL_S( 5);
|
||||||
|
DO_REDUCE_FULL_S( 6);
|
||||||
|
DO_REDUCE_FULL_S( 7);
|
||||||
|
DO_REDUCE_FULL_S( 8);
|
||||||
|
DO_REDUCE_FULL_S( 9);
|
||||||
|
DO_REDUCE_FULL_S(10);
|
||||||
|
DO_REDUCE_FULL_S(11);
|
||||||
|
DO_REDUCE_FULL_S(12);
|
||||||
|
DO_REDUCE_FULL_S(13);
|
||||||
|
DO_REDUCE_FULL_S(14);
|
||||||
|
DO_REDUCE_FULL_S(15);
|
||||||
|
|
||||||
|
#undef X
|
||||||
|
#undef DO_REDUCE
|
||||||
|
#undef DO_REDUCE_FULL_S
|
||||||
|
#undef BUTTERFLY
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void FFT_64(int *y) {
|
||||||
|
|
||||||
|
/*
|
||||||
|
* FFT_64 using w=46 as 64th root of unity
|
||||||
|
* decimation in frequency (DIF) radix-8 NTT.
|
||||||
|
* Output data is in revbin_permuted order.
|
||||||
|
*/
|
||||||
|
|
||||||
|
int i;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Begin with 8 parallels DIF FFT_8.
|
||||||
|
*/
|
||||||
|
#pragma unroll 8
|
||||||
|
for (i=0; i<8; i++) {
|
||||||
|
FFT_8(y+i,8);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Multiply by twiddle factors
|
||||||
|
*/
|
||||||
|
#pragma unroll 56
|
||||||
|
for (i=8; i<64; i++)
|
||||||
|
if (i & 7) y[i] = REDUCE(y[i]*c_FFT64_8_8_Twiddle[i]);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Finish with 8 paralles DIF FFT_8.
|
||||||
|
*/
|
||||||
|
#pragma unroll 8
|
||||||
|
for (i=0; i<8; i++) {
|
||||||
|
FFT_8(y+8*i,1);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void FFT_128_halfzero(int *y) {
|
||||||
|
|
||||||
|
/*
|
||||||
|
* FFT_128 using w=139 as 128th root of unity.
|
||||||
|
* Decimation in frequency (DIF) NTT.
|
||||||
|
* Output data is in revbin_permuted order.
|
||||||
|
* In place.
|
||||||
|
*/
|
||||||
|
|
||||||
|
const int tmp = y[63];
|
||||||
|
int i;
|
||||||
|
|
||||||
|
#pragma unroll 63
|
||||||
|
for (i=0; i<63; i++)
|
||||||
|
y[64+i] = REDUCE(y[i] * c_FFT128_2_64_Twiddle[i]);
|
||||||
|
|
||||||
|
/* handle X^127 */
|
||||||
|
y[63] = REDUCE(tmp + 1);
|
||||||
|
y[127] = REDUCE((tmp - 1) * c_FFT128_2_64_Twiddle[63]);
|
||||||
|
|
||||||
|
FFT_64(y);
|
||||||
|
FFT_64(y+64);
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ __forceinline__ void FFT_128_full(int *y) {
|
||||||
|
int i;
|
||||||
|
|
||||||
|
#pragma unroll 16
|
||||||
|
for (i=0; i<16; i++) {
|
||||||
|
FFT_8(y+i,16);
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma unroll 128
|
||||||
|
for (i=0; i<128; i++)
|
||||||
|
/*if (i & 7)*/ y[i] = REDUCE(y[i]*c_FFT128_8_16_Twiddle[i]);
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for (i=0; i<8; i++) {
|
||||||
|
FFT_16(y+16*i,1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
__device__ __forceinline__ void FFT_256_halfzero(int *y) {
|
||||||
|
|
||||||
|
int i;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* FFT_256 using w=41 as 256th root of unity.
|
||||||
|
* Decimation in frequency (DIF) NTT.
|
||||||
|
* Output data is in revbin_permuted order.
|
||||||
|
* In place.
|
||||||
|
*/
|
||||||
|
const int tmp = y[127];
|
||||||
|
|
||||||
|
#pragma unroll 127
|
||||||
|
for (i=0; i<127; i++)
|
||||||
|
y[128+i] = REDUCE(y[i] * c_FFT256_2_128_Twiddle[i]);
|
||||||
|
|
||||||
|
/* handle X^255 with an additionnal butterfly */
|
||||||
|
y[127] = REDUCE(tmp + 1);
|
||||||
|
y[255] = REDUCE((tmp - 1) * c_FFT256_2_128_Twiddle[127]);
|
||||||
|
|
||||||
|
FFT_128_full(y);
|
||||||
|
FFT_128_full(y+128);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
__device__ __forceinline__ void SIMD_Compress(uint32_t A[128], const int *expanded, const uint32_t *M) {
|
||||||
|
|
||||||
|
uint32_t IV[4][8];
|
||||||
|
int i;
|
||||||
|
|
||||||
|
/* Save the chaining value for the feed-forward */
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for(i=0; i<8; i++) {
|
||||||
|
IV[0][i] = A[i];
|
||||||
|
IV[1][i] = (&A[8])[i];
|
||||||
|
IV[2][i] = (&A[16])[i];
|
||||||
|
IV[3][i] = (&A[24])[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
/* XOR the message to the chaining value */
|
||||||
|
/* we can XOR word-by-word */
|
||||||
|
|
||||||
|
{
|
||||||
|
#pragma unroll 8
|
||||||
|
for(i=0; i<8; i++) {
|
||||||
|
A[i] ^= M[i];
|
||||||
|
(&A[8])[i] ^= M[8+i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Run the feistel ladders with the expanded message */
|
||||||
|
{
|
||||||
|
Round8(A, expanded, 0, 3, 23, 17, 27);
|
||||||
|
Round8(A, expanded, 1, 28, 19, 22, 7);
|
||||||
|
Round8(A, expanded, 2, 29, 9, 15, 5);
|
||||||
|
Round8(A, expanded, 3, 4, 13, 10, 25);
|
||||||
|
|
||||||
|
STEP8_IF(IV[0], 32, 4, 13, A, &A[8], &A[16], &A[24]);
|
||||||
|
STEP8_IF(IV[1], 33, 13, 10, &A[24], A, &A[8], &A[16]);
|
||||||
|
STEP8_IF(IV[2], 34, 10, 25, &A[16], &A[24], A, &A[8]);
|
||||||
|
STEP8_IF(IV[3], 35, 25, 4, &A[8], &A[16], &A[24], A);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/***************************************************/
|
||||||
|
|
||||||
|
__device__ __forceinline__ void SIMDHash(const uint32_t *data, uint32_t *hashval) {
|
||||||
|
|
||||||
|
uint32_t A[128];
|
||||||
|
int i;
|
||||||
|
|
||||||
|
uint32_t buffer[16];
|
||||||
|
|
||||||
|
#pragma unroll 32
|
||||||
|
for (i=0; i < 32; i++) A[i] = c_IV_512[i];
|
||||||
|
|
||||||
|
#pragma unroll 16
|
||||||
|
for (i=0; i < 16; i++) buffer[i] = data[i];
|
||||||
|
|
||||||
|
/* Message Expansion using Number Theoretical Transform similar to FFT */
|
||||||
|
int expanded[256];
|
||||||
|
{
|
||||||
|
#pragma unroll 16
|
||||||
|
for(i=0; i<64; i+=4) {
|
||||||
|
expanded[i+0] = __byte_perm(buffer[i/4],0,0x4440);
|
||||||
|
expanded[i+1] = __byte_perm(buffer[i/4],0,0x4441);
|
||||||
|
expanded[i+2] = __byte_perm(buffer[i/4],0,0x4442);
|
||||||
|
expanded[i+3] = __byte_perm(buffer[i/4],0,0x4443);
|
||||||
|
}
|
||||||
|
#pragma unroll 16
|
||||||
|
for(i=64; i<128; i+=4) {
|
||||||
|
expanded[i+0] = 0;
|
||||||
|
expanded[i+1] = 0;
|
||||||
|
expanded[i+2] = 0;
|
||||||
|
expanded[i+3] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
FFT_256_halfzero(expanded);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Compression Function */
|
||||||
|
SIMD_Compress(A, expanded, buffer);
|
||||||
|
|
||||||
|
/* Padding Round with known input (hence the FFT can be precomputed) */
|
||||||
|
buffer[0] = 512;
|
||||||
|
#pragma unroll 15
|
||||||
|
for (i=1; i < 16; i++) buffer[i] = 0;
|
||||||
|
|
||||||
|
SIMD_Compress(A, c_FFT, buffer);
|
||||||
|
|
||||||
|
#pragma unroll 16
|
||||||
|
for (i=0; i < 16; i++)
|
||||||
|
hashval[i] = A[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
/***************************************************/
|
||||||
|
// Die Hash-Funktion
|
||||||
|
__global__ void x11_simd512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
|
||||||
|
{
|
||||||
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
|
if (thread < threads)
|
||||||
|
{
|
||||||
|
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
|
||||||
|
|
||||||
|
int hashPosition = nounce - startNounce;
|
||||||
|
uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
|
||||||
|
|
||||||
|
SIMDHash(Hash, Hash);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Setup-Funktionen
|
||||||
|
__host__ void x11_simd512_cpu_init(int thr_id, int threads)
|
||||||
|
{
|
||||||
|
cudaMemcpyToSymbol( c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice);
|
||||||
|
cudaMemcpyToSymbol( c_FFT, h_FFT, sizeof(h_FFT), 0, cudaMemcpyHostToDevice);
|
||||||
|
cudaMemcpyToSymbol( c_P4, h_P4, sizeof(h_P4), 0, cudaMemcpyHostToDevice);
|
||||||
|
cudaMemcpyToSymbol( c_Q4, h_Q4, sizeof(h_Q4), 0, cudaMemcpyHostToDevice);
|
||||||
|
cudaMemcpyToSymbol( c_P8, h_P8, sizeof(h_P8), 0, cudaMemcpyHostToDevice);
|
||||||
|
cudaMemcpyToSymbol( c_Q8, h_Q8, sizeof(h_Q8), 0, cudaMemcpyHostToDevice);
|
||||||
|
cudaMemcpyToSymbol( c_FFT64_8_8_Twiddle, h_FFT64_8_8_Twiddle, sizeof(h_FFT64_8_8_Twiddle), 0, cudaMemcpyHostToDevice);
|
||||||
|
cudaMemcpyToSymbol( c_FFT128_2_64_Twiddle, h_FFT128_2_64_Twiddle, sizeof(h_FFT128_2_64_Twiddle), 0, cudaMemcpyHostToDevice);
|
||||||
|
cudaMemcpyToSymbol( c_FFT128_16_8_Twiddle, h_FFT128_16_8_Twiddle, sizeof(h_FFT128_16_8_Twiddle), 0, cudaMemcpyHostToDevice);
|
||||||
|
cudaMemcpyToSymbol( c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice);
|
||||||
|
cudaMemcpyToSymbol( c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice);
|
||||||
|
}
|
||||||
|
|
||||||
|
__host__ void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
|
||||||
|
{
|
||||||
|
const int threadsperblock = 256;
|
||||||
|
|
||||||
|
// berechne wie viele Thread Blocks wir brauchen
|
||||||
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
|
// Größe des dynamischen Shared Memory Bereichs
|
||||||
|
size_t shared_size = 0;
|
||||||
|
|
||||||
|
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||||
|
|
||||||
|
x11_simd512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
||||||
|
MyStreamSynchronize(NULL, order, thr_id);
|
||||||
|
}
|
||||||
|
|
262
x11/x11.cu
Normal file
262
x11/x11.cu
Normal file
@ -0,0 +1,262 @@
|
|||||||
|
|
||||||
|
extern "C"
|
||||||
|
{
|
||||||
|
#include "sph/sph_blake.h"
|
||||||
|
#include "sph/sph_bmw.h"
|
||||||
|
#include "sph/sph_groestl.h"
|
||||||
|
#include "sph/sph_skein.h"
|
||||||
|
#include "sph/sph_jh.h"
|
||||||
|
#include "sph/sph_keccak.h"
|
||||||
|
|
||||||
|
#include "sph/sph_luffa.h"
|
||||||
|
#include "sph/sph_cubehash.h"
|
||||||
|
#include "sph/sph_shavite.h"
|
||||||
|
#include "sph/sph_simd.h"
|
||||||
|
#include "sph/sph_echo.h"
|
||||||
|
|
||||||
|
#include "miner.h"
|
||||||
|
}
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
// aus cpu-miner.c
|
||||||
|
extern int device_map[8];
|
||||||
|
|
||||||
|
// Speicher für Input/Output der verketteten Hashfunktionen
|
||||||
|
static uint32_t *d_hash[8];
|
||||||
|
|
||||||
|
extern void quark_blake512_cpu_init(int thr_id, int threads);
|
||||||
|
extern void quark_blake512_cpu_setBlock_80(void *pdata);
|
||||||
|
extern void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
|
||||||
|
extern void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||||
|
|
||||||
|
extern void quark_bmw512_cpu_init(int thr_id, int threads);
|
||||||
|
extern void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||||
|
|
||||||
|
extern void quark_groestl512_cpu_init(int thr_id, int threads);
|
||||||
|
extern void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||||
|
extern void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||||
|
|
||||||
|
extern void quark_skein512_cpu_init(int thr_id, int threads);
|
||||||
|
extern void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||||
|
|
||||||
|
extern void quark_keccak512_cpu_init(int thr_id, int threads);
|
||||||
|
extern void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||||
|
|
||||||
|
extern void quark_jh512_cpu_init(int thr_id, int threads);
|
||||||
|
extern void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||||
|
|
||||||
|
extern void x11_luffa512_cpu_init(int thr_id, int threads);
|
||||||
|
extern void x11_luffa512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||||
|
|
||||||
|
extern void x11_cubehash512_cpu_init(int thr_id, int threads);
|
||||||
|
extern void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||||
|
|
||||||
|
extern void x11_shavite512_cpu_init(int thr_id, int threads);
|
||||||
|
extern void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||||
|
|
||||||
|
extern void x11_simd512_cpu_init(int thr_id, int threads);
|
||||||
|
extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||||
|
|
||||||
|
extern void x11_echo512_cpu_init(int thr_id, int threads);
|
||||||
|
extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
|
||||||
|
|
||||||
|
extern void quark_check_cpu_init(int thr_id, int threads);
|
||||||
|
extern void quark_check_cpu_setTarget(const void *ptarget);
|
||||||
|
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
|
||||||
|
|
||||||
|
extern void quark_compactTest_cpu_init(int thr_id, int threads);
|
||||||
|
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
|
||||||
|
uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse,
|
||||||
|
int order);
|
||||||
|
|
||||||
|
// X11 Hashfunktion
|
||||||
|
inline void x11hash(void *state, const void *input)
|
||||||
|
{
|
||||||
|
// blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11
|
||||||
|
|
||||||
|
sph_blake512_context ctx_blake;
|
||||||
|
sph_bmw512_context ctx_bmw;
|
||||||
|
sph_groestl512_context ctx_groestl;
|
||||||
|
sph_jh512_context ctx_jh;
|
||||||
|
sph_keccak512_context ctx_keccak;
|
||||||
|
sph_skein512_context ctx_skein;
|
||||||
|
sph_luffa512_context ctx_luffa;
|
||||||
|
sph_cubehash512_context ctx_cubehash;
|
||||||
|
sph_shavite512_context ctx_shavite;
|
||||||
|
sph_simd512_context ctx_simd;
|
||||||
|
sph_echo512_context ctx_echo;
|
||||||
|
|
||||||
|
unsigned char hash[64];
|
||||||
|
|
||||||
|
sph_blake512_init(&ctx_blake);
|
||||||
|
// ZBLAKE;
|
||||||
|
sph_blake512 (&ctx_blake, input, 80);
|
||||||
|
sph_blake512_close(&ctx_blake, (void*) hash);
|
||||||
|
|
||||||
|
sph_bmw512_init(&ctx_bmw);
|
||||||
|
// ZBMW;
|
||||||
|
sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
|
||||||
|
sph_bmw512_close(&ctx_bmw, (void*) hash);
|
||||||
|
|
||||||
|
sph_groestl512_init(&ctx_groestl);
|
||||||
|
// ZGROESTL;
|
||||||
|
sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
|
||||||
|
sph_groestl512_close(&ctx_groestl, (void*) hash);
|
||||||
|
|
||||||
|
sph_skein512_init(&ctx_skein);
|
||||||
|
// ZSKEIN;
|
||||||
|
sph_skein512 (&ctx_skein, (const void*) hash, 64);
|
||||||
|
sph_skein512_close(&ctx_skein, (void*) hash);
|
||||||
|
|
||||||
|
sph_jh512_init(&ctx_jh);
|
||||||
|
// ZJH;
|
||||||
|
sph_jh512 (&ctx_jh, (const void*) hash, 64);
|
||||||
|
sph_jh512_close(&ctx_jh, (void*) hash);
|
||||||
|
|
||||||
|
sph_keccak512_init(&ctx_keccak);
|
||||||
|
// ZKECCAK;
|
||||||
|
sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
|
||||||
|
sph_keccak512_close(&ctx_keccak, (void*) hash);
|
||||||
|
|
||||||
|
sph_luffa512_init(&ctx_luffa);
|
||||||
|
// ZLUFFA;
|
||||||
|
sph_luffa512 (&ctx_luffa, (const void*) hash, 64);
|
||||||
|
sph_luffa512_close (&ctx_luffa, (void*) hash);
|
||||||
|
|
||||||
|
#if 1
|
||||||
|
sph_cubehash512_init(&ctx_cubehash);
|
||||||
|
// ZCUBEHASH;
|
||||||
|
sph_cubehash512 (&ctx_cubehash, (const void*) hash, 64);
|
||||||
|
sph_cubehash512_close(&ctx_cubehash, (void*) hash);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if 1
|
||||||
|
sph_shavite512_init(&ctx_shavite);
|
||||||
|
// ZSHAVITE;
|
||||||
|
sph_shavite512 (&ctx_shavite, (const void*) hash, 64);
|
||||||
|
sph_shavite512_close(&ctx_shavite, (void*) hash);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
sph_simd512_init(&ctx_simd);
|
||||||
|
// ZSIMD
|
||||||
|
sph_simd512 (&ctx_simd, (const void*) hash, 64);
|
||||||
|
sph_simd512_close(&ctx_simd, (void*) hash);
|
||||||
|
|
||||||
|
#if 1
|
||||||
|
sph_echo512_init(&ctx_echo);
|
||||||
|
// ZECHO
|
||||||
|
sph_echo512 (&ctx_echo, (const void*) hash, 64);
|
||||||
|
sph_echo512_close(&ctx_echo, (void*) hash);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
memcpy(state, hash, 32);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
extern bool opt_benchmark;
|
||||||
|
|
||||||
|
extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
|
||||||
|
const uint32_t *ptarget, uint32_t max_nonce,
|
||||||
|
unsigned long *hashes_done)
|
||||||
|
{
|
||||||
|
const uint32_t first_nonce = pdata[19];
|
||||||
|
|
||||||
|
// TODO: entfernen für eine Release! Ist nur zum Testen!
|
||||||
|
if (opt_benchmark)
|
||||||
|
((uint32_t*)ptarget)[7] = 0x0000ff;
|
||||||
|
|
||||||
|
const uint32_t Htarg = ptarget[7];
|
||||||
|
|
||||||
|
const int throughput = 256*256; // 100;
|
||||||
|
|
||||||
|
static bool init[8] = {0,0,0,0,0,0,0,0};
|
||||||
|
if (!init[thr_id])
|
||||||
|
{
|
||||||
|
cudaSetDevice(device_map[thr_id]);
|
||||||
|
|
||||||
|
// Konstanten kopieren, Speicher belegen
|
||||||
|
cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
|
||||||
|
quark_blake512_cpu_init(thr_id, throughput);
|
||||||
|
quark_groestl512_cpu_init(thr_id, throughput);
|
||||||
|
quark_skein512_cpu_init(thr_id, throughput);
|
||||||
|
quark_bmw512_cpu_init(thr_id, throughput);
|
||||||
|
quark_keccak512_cpu_init(thr_id, throughput);
|
||||||
|
quark_jh512_cpu_init(thr_id, throughput);
|
||||||
|
x11_luffa512_cpu_init(thr_id, throughput);
|
||||||
|
x11_cubehash512_cpu_init(thr_id, throughput);
|
||||||
|
x11_shavite512_cpu_init(thr_id, throughput);
|
||||||
|
x11_simd512_cpu_init(thr_id, throughput);
|
||||||
|
x11_echo512_cpu_init(thr_id, throughput);
|
||||||
|
quark_check_cpu_init(thr_id, throughput);
|
||||||
|
init[thr_id] = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_t endiandata[20];
|
||||||
|
for (int k=0; k < 20; k++)
|
||||||
|
be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
|
||||||
|
|
||||||
|
quark_blake512_cpu_setBlock_80((void*)endiandata);
|
||||||
|
quark_check_cpu_setTarget(ptarget);
|
||||||
|
|
||||||
|
do {
|
||||||
|
int order = 0;
|
||||||
|
|
||||||
|
// erstes Blake512 Hash mit CUDA
|
||||||
|
quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
|
||||||
|
|
||||||
|
// das ist der unbedingte Branch für BMW512
|
||||||
|
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
|
|
||||||
|
// das ist der unbedingte Branch für Groestl512
|
||||||
|
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
|
|
||||||
|
// das ist der unbedingte Branch für Skein512
|
||||||
|
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
|
|
||||||
|
// das ist der unbedingte Branch für JH512
|
||||||
|
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
|
|
||||||
|
// das ist der unbedingte Branch für Keccak512
|
||||||
|
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
|
|
||||||
|
// das ist der unbedingte Branch für Luffa512
|
||||||
|
x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
|
|
||||||
|
// das ist der unbedingte Branch für Cubehash512
|
||||||
|
x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
|
|
||||||
|
// das ist der unbedingte Branch für Shavite512
|
||||||
|
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
|
|
||||||
|
// das ist der unbedingte Branch für SIMD512
|
||||||
|
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
|
|
||||||
|
// das ist der unbedingte Branch für ECHO512
|
||||||
|
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
|
|
||||||
|
// Scan nach Gewinner Hashes auf der GPU
|
||||||
|
uint32_t foundNonce = quark_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
|
||||||
|
if (foundNonce != 0xffffffff)
|
||||||
|
{
|
||||||
|
uint32_t vhash64[8];
|
||||||
|
be32enc(&endiandata[19], foundNonce);
|
||||||
|
x11hash(vhash64, endiandata);
|
||||||
|
|
||||||
|
if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {
|
||||||
|
|
||||||
|
pdata[19] = foundNonce;
|
||||||
|
*hashes_done = foundNonce - first_nonce + 1;
|
||||||
|
return 1;
|
||||||
|
} else {
|
||||||
|
applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pdata[19] += throughput;
|
||||||
|
|
||||||
|
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
|
||||||
|
|
||||||
|
*hashes_done = pdata[19] - first_nonce + 1;
|
||||||
|
return 0;
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user