update to version v0.5 (2014-03-27)
This commit is contained in:
parent
1bb78f0258
commit
2ca6ede92b
@ -34,4 +34,4 @@ ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -f
|
|||||||
|
|
||||||
# we're now targeting all major compute architectures within one binary.
|
# we're now targeting all major compute architectures within one binary.
|
||||||
.cu.o:
|
.cu.o:
|
||||||
$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_10,code=\"sm_10,compute_10\" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
||||||
|
@ -1035,7 +1035,7 @@ uninstall-am: uninstall-binPROGRAMS
|
|||||||
|
|
||||||
# we're now targeting all major compute architectures within one binary.
|
# we're now targeting all major compute architectures within one binary.
|
||||||
.cu.o:
|
.cu.o:
|
||||||
$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_10,code=\"sm_10,compute_10\" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
||||||
|
|
||||||
# Tell versions [3.59,3.63) of GNU make to not export all variables.
|
# Tell versions [3.59,3.63) of GNU make to not export all variables.
|
||||||
# Otherwise a system limit (for SysV at least) may be exceeded.
|
# Otherwise a system limit (for SysV at least) may be exceeded.
|
||||||
|
42
README.txt
42
README.txt
@ -1,5 +1,5 @@
|
|||||||
|
|
||||||
ccMiner release 0.4 (Mar 24th 2014) - Groestlcoin Pool Release
|
ccMiner release 0.5 (Mar 27th 2014) - "Hefty Optimization"
|
||||||
-------------------------------------------------------------
|
-------------------------------------------------------------
|
||||||
|
|
||||||
***************************************************************
|
***************************************************************
|
||||||
@ -38,6 +38,11 @@ its command line interface and options.
|
|||||||
fugue256 use to mine Fuguecoin
|
fugue256 use to mine Fuguecoin
|
||||||
groestl use to mine Groestlcoin
|
groestl use to mine Groestlcoin
|
||||||
|
|
||||||
|
-d, --devices takes a comma-separated list of CUDA device IDs
|
||||||
|
to operate on. Device IDs start counting from 0!
|
||||||
|
Alternatively, give the string names of your cards, like
|
||||||
|
gtx780ti or gt640#2 (matching 2nd gt640 in the PC).
|
||||||
|
|
||||||
-o, --url=URL URL of mining server (default: " DEF_RPC_URL ")
|
-o, --url=URL URL of mining server (default: " DEF_RPC_URL ")
|
||||||
-O, --userpass=U:P username:password pair for mining server
|
-O, --userpass=U:P username:password pair for mining server
|
||||||
-u, --user=USERNAME username for mining server
|
-u, --user=USERNAME username for mining server
|
||||||
@ -63,8 +68,10 @@ its command line interface and options.
|
|||||||
-V, --version display version information and exit
|
-V, --version display version information and exit
|
||||||
-h, --help display this help text and exit
|
-h, --help display this help text and exit
|
||||||
|
|
||||||
|
|
||||||
>>> Examples <<<
|
>>> Examples <<<
|
||||||
|
|
||||||
|
|
||||||
Example for Heavycoin Mining on heavycoinpool.com with a single gpu in your system
|
Example for Heavycoin Mining on heavycoinpool.com with a single gpu in your system
|
||||||
|
|
||||||
ccminer.exe -t 1 -a heavy -o stratum+tcp://stratum01.heavycoinpool.com:5333 -u <<username.worker>> -p <<workerpassword>> -v 512
|
ccminer.exe -t 1 -a heavy -o stratum+tcp://stratum01.heavycoinpool.com:5333 -u <<username.worker>> -p <<workerpassword>> -v 512
|
||||||
@ -107,22 +114,33 @@ from your old clunkers.
|
|||||||
|
|
||||||
>>> RELEASE HISTORY <<<
|
>>> RELEASE HISTORY <<<
|
||||||
|
|
||||||
March, 24 2014 fixed Groestl pool support
|
March, 27 2014 Heavycoin exchange rates soar, and as a result this coin
|
||||||
|
gets some love: We greatly optimized the Hefty1 kernel
|
||||||
|
for speed. Expect some hefty gains, especially on 750Ti's!
|
||||||
|
|
||||||
went back to Compute 1.x for cuda_hefty1.cu kernel by
|
By popular demand, we added the -d option as known from
|
||||||
default after numerous reports of ccminer v0.2/v0.3
|
cudaminer.
|
||||||
not working with HeavyCoin for some people.
|
|
||||||
|
|
||||||
March, 23 2014 added Groestlcoin support. stratum status unknown
|
different compute capability builds are now provided until
|
||||||
(the only pool is currently down for fixing issues)
|
we figure out how to pack everything into a single executable
|
||||||
|
in a Windows build.
|
||||||
|
|
||||||
March, 21 2014 use of shared memory in Fugue256 kernel boosts hash rates
|
March, 24 2014 fixed Groestl pool support
|
||||||
on Fermi and Maxwell devices. Kepler may suffer slightly
|
|
||||||
(3-5%)
|
|
||||||
|
|
||||||
Fixed Stratum for Fuguecoin. Tested on dwarfpool.
|
went back to Compute 1.x for cuda_hefty1.cu kernel by
|
||||||
|
default after numerous reports of ccminer v0.2/v0.3
|
||||||
|
not working with HeavyCoin for some people.
|
||||||
|
|
||||||
March, 18 2014 initial release.
|
March, 23 2014 added Groestlcoin support. stratum status unknown
|
||||||
|
(the only pool is currently down for fixing issues)
|
||||||
|
|
||||||
|
March, 21 2014 use of shared memory in Fugue256 kernel boosts hash rates
|
||||||
|
on Fermi and Maxwell devices. Kepler may suffer slightly
|
||||||
|
(3-5%)
|
||||||
|
|
||||||
|
Fixed Stratum for Fuguecoin. Tested on dwarfpool.
|
||||||
|
|
||||||
|
March, 18 2014 initial release.
|
||||||
|
|
||||||
|
|
||||||
>>> AUTHORS <<<
|
>>> AUTHORS <<<
|
||||||
|
@ -95,12 +95,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
|
|||||||
<CInterleavedPTX>true</CInterleavedPTX>
|
<CInterleavedPTX>true</CInterleavedPTX>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
<CudaCompile>
|
<CudaCompile>
|
||||||
<MaxRegCount>63</MaxRegCount>
|
<MaxRegCount>80</MaxRegCount>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
<CudaCompile>
|
<CudaCompile>
|
||||||
<PtxAsOptionV>true</PtxAsOptionV>
|
<PtxAsOptionV>true</PtxAsOptionV>
|
||||||
<Keep>true</Keep>
|
<Keep>true</Keep>
|
||||||
<CodeGeneration>compute_20,sm_20</CodeGeneration>
|
<CodeGeneration>compute_35,sm_35</CodeGeneration>
|
||||||
<Include>
|
<Include>
|
||||||
</Include>
|
</Include>
|
||||||
<AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
<AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
@ -127,12 +127,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
|
|||||||
<CInterleavedPTX>true</CInterleavedPTX>
|
<CInterleavedPTX>true</CInterleavedPTX>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
<CudaCompile>
|
<CudaCompile>
|
||||||
<MaxRegCount>63</MaxRegCount>
|
<MaxRegCount>80</MaxRegCount>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
<CudaCompile>
|
<CudaCompile>
|
||||||
<PtxAsOptionV>true</PtxAsOptionV>
|
<PtxAsOptionV>true</PtxAsOptionV>
|
||||||
<Keep>true</Keep>
|
<Keep>true</Keep>
|
||||||
<CodeGeneration>compute_20,sm_20</CodeGeneration>
|
<CodeGeneration>compute_35,sm_35</CodeGeneration>
|
||||||
<Include>
|
<Include>
|
||||||
</Include>
|
</Include>
|
||||||
<AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
<AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
@ -163,12 +163,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
|
|||||||
<CInterleavedPTX>true</CInterleavedPTX>
|
<CInterleavedPTX>true</CInterleavedPTX>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
<CudaCompile>
|
<CudaCompile>
|
||||||
<MaxRegCount>63</MaxRegCount>
|
<MaxRegCount>80</MaxRegCount>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
<CudaCompile>
|
<CudaCompile>
|
||||||
<PtxAsOptionV>true</PtxAsOptionV>
|
<PtxAsOptionV>true</PtxAsOptionV>
|
||||||
<Keep>true</Keep>
|
<Keep>true</Keep>
|
||||||
<CodeGeneration>compute_20,sm_20</CodeGeneration>
|
<CodeGeneration>compute_35,sm_35</CodeGeneration>
|
||||||
<Include>
|
<Include>
|
||||||
</Include>
|
</Include>
|
||||||
<AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
<AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
@ -199,12 +199,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
|
|||||||
<CInterleavedPTX>true</CInterleavedPTX>
|
<CInterleavedPTX>true</CInterleavedPTX>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
<CudaCompile>
|
<CudaCompile>
|
||||||
<MaxRegCount>63</MaxRegCount>
|
<MaxRegCount>80</MaxRegCount>
|
||||||
</CudaCompile>
|
</CudaCompile>
|
||||||
<CudaCompile>
|
<CudaCompile>
|
||||||
<PtxAsOptionV>true</PtxAsOptionV>
|
<PtxAsOptionV>true</PtxAsOptionV>
|
||||||
<Keep>true</Keep>
|
<Keep>true</Keep>
|
||||||
<CodeGeneration>compute_20,sm_20</CodeGeneration>
|
<CodeGeneration>compute_35,sm_35</CodeGeneration>
|
||||||
<Include>
|
<Include>
|
||||||
</Include>
|
</Include>
|
||||||
<AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
<AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||||
@ -277,16 +277,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
|
|||||||
<CudaCompile Include="cuda_fugue256.cu" />
|
<CudaCompile Include="cuda_fugue256.cu" />
|
||||||
<CudaCompile Include="cuda_groestl512.cu" />
|
<CudaCompile Include="cuda_groestl512.cu" />
|
||||||
<CudaCompile Include="cuda_groestlcoin.cu" />
|
<CudaCompile Include="cuda_groestlcoin.cu" />
|
||||||
<CudaCompile Include="cuda_hefty1.cu">
|
<CudaCompile Include="cuda_hefty1.cu" />
|
||||||
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">compute_10,sm_10</CodeGeneration>
|
|
||||||
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">compute_10,sm_10</CodeGeneration>
|
|
||||||
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">124</MaxRegCount>
|
|
||||||
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">124</MaxRegCount>
|
|
||||||
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">compute_10,sm_10</CodeGeneration>
|
|
||||||
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">124</MaxRegCount>
|
|
||||||
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Release|x64'">compute_10,sm_10</CodeGeneration>
|
|
||||||
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Release|x64'">124</MaxRegCount>
|
|
||||||
</CudaCompile>
|
|
||||||
<CudaCompile Include="cuda_keccak512.cu" />
|
<CudaCompile Include="cuda_keccak512.cu" />
|
||||||
<CudaCompile Include="cuda_sha256.cu" />
|
<CudaCompile Include="cuda_sha256.cu" />
|
||||||
<CudaCompile Include="heavy.cu" />
|
<CudaCompile Include="heavy.cu" />
|
||||||
|
20
configure
vendored
20
configure
vendored
@ -1,6 +1,6 @@
|
|||||||
#! /bin/sh
|
#! /bin/sh
|
||||||
# Guess values for system-dependent variables and create Makefiles.
|
# Guess values for system-dependent variables and create Makefiles.
|
||||||
# Generated by GNU Autoconf 2.68 for ccminer 2014.03.24.
|
# Generated by GNU Autoconf 2.68 for ccminer 2014.03.27.
|
||||||
#
|
#
|
||||||
#
|
#
|
||||||
# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
|
# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
|
||||||
@ -557,8 +557,8 @@ MAKEFLAGS=
|
|||||||
# Identity of this package.
|
# Identity of this package.
|
||||||
PACKAGE_NAME='ccminer'
|
PACKAGE_NAME='ccminer'
|
||||||
PACKAGE_TARNAME='ccminer'
|
PACKAGE_TARNAME='ccminer'
|
||||||
PACKAGE_VERSION='2014.03.24'
|
PACKAGE_VERSION='2014.03.27'
|
||||||
PACKAGE_STRING='ccminer 2014.03.24'
|
PACKAGE_STRING='ccminer 2014.03.27'
|
||||||
PACKAGE_BUGREPORT=''
|
PACKAGE_BUGREPORT=''
|
||||||
PACKAGE_URL=''
|
PACKAGE_URL=''
|
||||||
|
|
||||||
@ -1297,7 +1297,7 @@ if test "$ac_init_help" = "long"; then
|
|||||||
# Omit some internal or obsolete options to make the list less imposing.
|
# Omit some internal or obsolete options to make the list less imposing.
|
||||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||||
cat <<_ACEOF
|
cat <<_ACEOF
|
||||||
\`configure' configures ccminer 2014.03.24 to adapt to many kinds of systems.
|
\`configure' configures ccminer 2014.03.27 to adapt to many kinds of systems.
|
||||||
|
|
||||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||||
|
|
||||||
@ -1368,7 +1368,7 @@ fi
|
|||||||
|
|
||||||
if test -n "$ac_init_help"; then
|
if test -n "$ac_init_help"; then
|
||||||
case $ac_init_help in
|
case $ac_init_help in
|
||||||
short | recursive ) echo "Configuration of ccminer 2014.03.24:";;
|
short | recursive ) echo "Configuration of ccminer 2014.03.27:";;
|
||||||
esac
|
esac
|
||||||
cat <<\_ACEOF
|
cat <<\_ACEOF
|
||||||
|
|
||||||
@ -1469,7 +1469,7 @@ fi
|
|||||||
test -n "$ac_init_help" && exit $ac_status
|
test -n "$ac_init_help" && exit $ac_status
|
||||||
if $ac_init_version; then
|
if $ac_init_version; then
|
||||||
cat <<\_ACEOF
|
cat <<\_ACEOF
|
||||||
ccminer configure 2014.03.24
|
ccminer configure 2014.03.27
|
||||||
generated by GNU Autoconf 2.68
|
generated by GNU Autoconf 2.68
|
||||||
|
|
||||||
Copyright (C) 2010 Free Software Foundation, Inc.
|
Copyright (C) 2010 Free Software Foundation, Inc.
|
||||||
@ -1972,7 +1972,7 @@ cat >config.log <<_ACEOF
|
|||||||
This file contains any messages produced by compilers while
|
This file contains any messages produced by compilers while
|
||||||
running configure, to aid debugging if configure makes a mistake.
|
running configure, to aid debugging if configure makes a mistake.
|
||||||
|
|
||||||
It was created by ccminer $as_me 2014.03.24, which was
|
It was created by ccminer $as_me 2014.03.27, which was
|
||||||
generated by GNU Autoconf 2.68. Invocation command line was
|
generated by GNU Autoconf 2.68. Invocation command line was
|
||||||
|
|
||||||
$ $0 $@
|
$ $0 $@
|
||||||
@ -2901,7 +2901,7 @@ fi
|
|||||||
|
|
||||||
# Define the identity of the package.
|
# Define the identity of the package.
|
||||||
PACKAGE='ccminer'
|
PACKAGE='ccminer'
|
||||||
VERSION='2014.03.24'
|
VERSION='2014.03.27'
|
||||||
|
|
||||||
|
|
||||||
cat >>confdefs.h <<_ACEOF
|
cat >>confdefs.h <<_ACEOF
|
||||||
@ -7118,7 +7118,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
|||||||
# report actual input values of CONFIG_FILES etc. instead of their
|
# report actual input values of CONFIG_FILES etc. instead of their
|
||||||
# values after options handling.
|
# values after options handling.
|
||||||
ac_log="
|
ac_log="
|
||||||
This file was extended by ccminer $as_me 2014.03.24, which was
|
This file was extended by ccminer $as_me 2014.03.27, which was
|
||||||
generated by GNU Autoconf 2.68. Invocation command line was
|
generated by GNU Autoconf 2.68. Invocation command line was
|
||||||
|
|
||||||
CONFIG_FILES = $CONFIG_FILES
|
CONFIG_FILES = $CONFIG_FILES
|
||||||
@ -7184,7 +7184,7 @@ _ACEOF
|
|||||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||||
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
||||||
ac_cs_version="\\
|
ac_cs_version="\\
|
||||||
ccminer config.status 2014.03.24
|
ccminer config.status 2014.03.27
|
||||||
configured by $0, generated by GNU Autoconf 2.68,
|
configured by $0, generated by GNU Autoconf 2.68,
|
||||||
with options \\"\$ac_cs_config\\"
|
with options \\"\$ac_cs_config\\"
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
AC_INIT([ccminer], [2014.03.24])
|
AC_INIT([ccminer], [2014.03.27])
|
||||||
|
|
||||||
AC_PREREQ([2.59c])
|
AC_PREREQ([2.59c])
|
||||||
AC_CANONICAL_SYSTEM
|
AC_CANONICAL_SYSTEM
|
||||||
|
50
cpu-miner.c
50
cpu-miner.c
@ -51,8 +51,13 @@
|
|||||||
// from heavy.cu
|
// from heavy.cu
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C"
|
extern "C"
|
||||||
|
{
|
||||||
#endif
|
#endif
|
||||||
int cuda_num_devices();
|
int cuda_num_devices();
|
||||||
|
int cuda_finddevice(char *name);
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifdef __linux /* Linux specific policy and affinity management */
|
#ifdef __linux /* Linux specific policy and affinity management */
|
||||||
@ -144,10 +149,11 @@ static int opt_scantime = 5;
|
|||||||
static json_t *opt_config;
|
static json_t *opt_config;
|
||||||
static const bool opt_time = true;
|
static const bool opt_time = true;
|
||||||
static sha256_algos opt_algo = ALGO_HEAVY;
|
static sha256_algos opt_algo = ALGO_HEAVY;
|
||||||
static int opt_n_threads;
|
static int opt_n_threads = 0;
|
||||||
bool opt_trust_pool = false;
|
bool opt_trust_pool = false;
|
||||||
uint16_t opt_vote = 9999;
|
uint16_t opt_vote = 9999;
|
||||||
static int num_processors;
|
static int num_processors;
|
||||||
|
// Lookup table from thr_id to the CUDA device ordinal passed to cudaSetDevice().
// Defaults to the identity mapping; the -d/--devices option overwrites the
// leading entries in the order the devices are listed on the command line.
int device_map[8] = {0,1,2,3,4,5,6,7}; // CB
|
||||||
static char *rpc_url;
|
static char *rpc_url;
|
||||||
static char *rpc_userpass;
|
static char *rpc_userpass;
|
||||||
static char *rpc_user, *rpc_pass;
|
static char *rpc_user, *rpc_pass;
|
||||||
@ -185,7 +191,11 @@ Options:\n\
|
|||||||
-a, --algo=ALGO specify the algorithm to use\n\
|
-a, --algo=ALGO specify the algorithm to use\n\
|
||||||
fugue256 Fuguecoin hash\n\
|
fugue256 Fuguecoin hash\n\
|
||||||
heavy Heavycoin hash\n\
|
heavy Heavycoin hash\n\
|
||||||
-v, --vote=VOTE block reward vote\n\
|
-d, --devices takes a comma separated list of CUDA devices to use.\n\
|
||||||
|
Device IDs start counting from 0! Alternatively takes\n\
|
||||||
|
string names of your cards like gtx780ti or gt640#2\n\
|
||||||
|
(matching 2nd gt640 in the PC)\n\
|
||||||
|
-v, --vote=VOTE block reward vote (for HeavyCoin)\n\
|
||||||
-m, --trust-pool trust the max block reward vote (maxvote) sent by the pool\n\
|
-m, --trust-pool trust the max block reward vote (maxvote) sent by the pool\n\
|
||||||
-o, --url=URL URL of mining server\n\
|
-o, --url=URL URL of mining server\n\
|
||||||
-O, --userpass=U:P username:password pair for mining server\n\
|
-O, --userpass=U:P username:password pair for mining server\n\
|
||||||
@ -227,7 +237,7 @@ static char const short_options[] =
|
|||||||
#ifdef HAVE_SYSLOG_H
|
#ifdef HAVE_SYSLOG_H
|
||||||
"S"
|
"S"
|
||||||
#endif
|
#endif
|
||||||
"a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vmv:";
|
"a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vd:mv:";
|
||||||
|
|
||||||
static struct option const options[] = {
|
static struct option const options[] = {
|
||||||
{ "algo", 1, NULL, 'a' },
|
{ "algo", 1, NULL, 'a' },
|
||||||
@ -259,6 +269,7 @@ static struct option const options[] = {
|
|||||||
{ "user", 1, NULL, 'u' },
|
{ "user", 1, NULL, 'u' },
|
||||||
{ "userpass", 1, NULL, 'O' },
|
{ "userpass", 1, NULL, 'O' },
|
||||||
{ "version", 0, NULL, 'V' },
|
{ "version", 0, NULL, 'V' },
|
||||||
|
{ "devices", 1, NULL, 'd' },
|
||||||
{ 0, 0, 0, 0 }
|
{ 0, 0, 0, 0 }
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -1251,6 +1262,32 @@ static void parse_arg (int key, char *arg)
|
|||||||
case 'S':
|
case 'S':
|
||||||
use_syslog = true;
|
use_syslog = true;
|
||||||
break;
|
break;
|
||||||
|
case 'd': // CB
|
||||||
|
{
|
||||||
|
char * pch = strtok (arg,",");
|
||||||
|
opt_n_threads = 0;
|
||||||
|
while (pch != NULL) {
|
||||||
|
if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0')
|
||||||
|
{
|
||||||
|
if (atoi(pch) < num_processors)
|
||||||
|
device_map[opt_n_threads++] = atoi(pch);
|
||||||
|
else {
|
||||||
|
applog(LOG_ERR, "Non-existant CUDA device #%d specified in -d option", atoi(pch));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
int device = cuda_finddevice(pch);
|
||||||
|
if (device >= 0 && device < num_processors)
|
||||||
|
device_map[opt_n_threads++] = device;
|
||||||
|
else {
|
||||||
|
applog(LOG_ERR, "Non-existant CUDA device '%s' specified in -d option", pch);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pch = strtok (NULL, ",");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
case 'V':
|
case 'V':
|
||||||
show_version_and_exit();
|
show_version_and_exit();
|
||||||
case 'h':
|
case 'h':
|
||||||
@ -1346,7 +1383,7 @@ static void signal_handler(int sig)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define PROGRAM_VERSION "0.4"
|
#define PROGRAM_VERSION "0.5"
|
||||||
int main(int argc, char *argv[])
|
int main(int argc, char *argv[])
|
||||||
{
|
{
|
||||||
struct thr_info *thr;
|
struct thr_info *thr;
|
||||||
@ -1370,6 +1407,9 @@ int main(int argc, char *argv[])
|
|||||||
rpc_user = strdup("");
|
rpc_user = strdup("");
|
||||||
rpc_pass = strdup("");
|
rpc_pass = strdup("");
|
||||||
|
|
||||||
|
pthread_mutex_init(&applog_lock, NULL);
|
||||||
|
num_processors = cuda_num_devices();
|
||||||
|
|
||||||
/* parse command line */
|
/* parse command line */
|
||||||
parse_cmdline(argc, argv);
|
parse_cmdline(argc, argv);
|
||||||
|
|
||||||
@ -1385,7 +1425,6 @@ int main(int argc, char *argv[])
|
|||||||
sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass);
|
sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass);
|
||||||
}
|
}
|
||||||
|
|
||||||
pthread_mutex_init(&applog_lock, NULL);
|
|
||||||
pthread_mutex_init(&stats_lock, NULL);
|
pthread_mutex_init(&stats_lock, NULL);
|
||||||
pthread_mutex_init(&g_work_lock, NULL);
|
pthread_mutex_init(&g_work_lock, NULL);
|
||||||
pthread_mutex_init(&stratum.sock_lock, NULL);
|
pthread_mutex_init(&stratum.sock_lock, NULL);
|
||||||
@ -1416,7 +1455,6 @@ int main(int argc, char *argv[])
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
num_processors = cuda_num_devices();
|
|
||||||
if (num_processors == 0)
|
if (num_processors == 0)
|
||||||
{
|
{
|
||||||
applog(LOG_ERR, "No CUDA devices found! terminating.");
|
applog(LOG_ERR, "No CUDA devices found! terminating.");
|
||||||
|
@ -152,7 +152,7 @@
|
|||||||
#define PACKAGE_NAME "ccminer"
|
#define PACKAGE_NAME "ccminer"
|
||||||
|
|
||||||
/* Define to the full name and version of this package. */
|
/* Define to the full name and version of this package. */
|
||||||
#define PACKAGE_STRING "ccminer 2014.03.24"
|
#define PACKAGE_STRING "ccminer 2014.03.27"
|
||||||
|
|
||||||
/* Define to the one symbol short name of this package. */
|
/* Define to the one symbol short name of this package. */
|
||||||
#undef PACKAGE_TARNAME
|
#undef PACKAGE_TARNAME
|
||||||
@ -161,7 +161,7 @@
|
|||||||
#undef PACKAGE_URL
|
#undef PACKAGE_URL
|
||||||
|
|
||||||
/* Define to the version of this package. */
|
/* Define to the version of this package. */
|
||||||
#define PACKAGE_VERSION "2014.03.24"
|
#define PACKAGE_VERSION "2014.03.27"
|
||||||
|
|
||||||
/* If using the C implementation of alloca, define if you know the
|
/* If using the C implementation of alloca, define if you know the
|
||||||
direction of stack growth for your system; otherwise it will be
|
direction of stack growth for your system; otherwise it will be
|
||||||
|
@ -292,13 +292,13 @@ __host__ void blake512_cpu_setBlock(void *pdata)
|
|||||||
|
|
||||||
__host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
|
__host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
|
||||||
{
|
{
|
||||||
const int threadsperblock = 128;
|
const int threadsperblock = 256;
|
||||||
|
|
||||||
// berechne wie viele Thread Blocks wir brauchen
|
// berechne wie viele Thread Blocks wir brauchen
|
||||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
dim3 block(threadsperblock);
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
|
// Größe des dynamischen Shared Memory Bereichs
|
||||||
size_t shared_size = 0;
|
size_t shared_size = 0;
|
||||||
|
|
||||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||||
|
@ -138,7 +138,7 @@ void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *h
|
|||||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
dim3 block(threadsperblock);
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
|
// Größe des dynamischen Shared Memory Bereichs
|
||||||
size_t shared_size = 0;
|
size_t shared_size = 0;
|
||||||
|
|
||||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||||
|
@ -9,7 +9,10 @@
|
|||||||
|
|
||||||
#define USE_SHARED 1
|
#define USE_SHARED 1
|
||||||
|
|
||||||
// heavy.cu
|
// from cpu-miner.c
|
||||||
|
extern int device_map[8];
|
||||||
|
|
||||||
|
// from heavy.cu
|
||||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||||
|
|
||||||
// Folgende Definitionen später durch header ersetzen
|
// Folgende Definitionen später durch header ersetzen
|
||||||
@ -732,7 +735,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
|
|||||||
|
|
||||||
void fugue256_cpu_init(int thr_id, int threads)
|
void fugue256_cpu_init(int thr_id, int threads)
|
||||||
{
|
{
|
||||||
cudaSetDevice(thr_id);
|
cudaSetDevice(device_map[thr_id]);
|
||||||
|
|
||||||
// Kopiere die Hash-Tabellen in den GPU-Speicher
|
// Kopiere die Hash-Tabellen in den GPU-Speicher
|
||||||
texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
|
texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
|
||||||
@ -774,7 +777,7 @@ __host__ void fugue256_cpu_hash(int thr_id, int threads, int startNounce, void *
|
|||||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
dim3 block(threadsperblock);
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
|
// Größe des dynamischen Shared Memory Bereichs
|
||||||
#if USE_SHARED
|
#if USE_SHARED
|
||||||
size_t shared_size = 4 * 256 * sizeof(uint32_t);
|
size_t shared_size = 4 * 256 * sizeof(uint32_t);
|
||||||
#else
|
#else
|
||||||
|
@ -813,7 +813,7 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
|
|||||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
dim3 block(threadsperblock);
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
|
// Größe des dynamischen Shared Memory Bereichs
|
||||||
size_t shared_size = 0;
|
size_t shared_size = 0;
|
||||||
|
|
||||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||||
|
@ -9,6 +9,10 @@
|
|||||||
|
|
||||||
#define USE_SHARED 1
|
#define USE_SHARED 1
|
||||||
|
|
||||||
|
// from cpu-miner.c
|
||||||
|
extern int device_map[8];
|
||||||
|
|
||||||
|
// from heavy.cu
|
||||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||||
|
|
||||||
// Folgende Definitionen später durch header ersetzen
|
// Folgende Definitionen später durch header ersetzen
|
||||||
@ -20,13 +24,7 @@ typedef unsigned long long uint64_t;
|
|||||||
__constant__ uint32_t pTarget[8]; // Single GPU
|
__constant__ uint32_t pTarget[8]; // Single GPU
|
||||||
extern uint32_t *d_resultNonce[8];
|
extern uint32_t *d_resultNonce[8];
|
||||||
|
|
||||||
// globaler Speicher für unsere Ergebnisse
|
|
||||||
uint32_t *d_hashGROESTLCOINoutput[8];
|
|
||||||
|
|
||||||
__constant__ uint32_t groestlcoin_gpu_state[32];
|
|
||||||
__constant__ uint32_t groestlcoin_gpu_msg[32];
|
__constant__ uint32_t groestlcoin_gpu_msg[32];
|
||||||
__constant__ uint32_t sha256coin_gpu_constantTable[64];
|
|
||||||
__constant__ uint32_t sha256coin_gpu_register[8];
|
|
||||||
|
|
||||||
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
|
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
|
||||||
|
|
||||||
@ -83,7 +81,13 @@ extern uint32_t T2dn_cpu[];
|
|||||||
extern uint32_t T3up_cpu[];
|
extern uint32_t T3up_cpu[];
|
||||||
extern uint32_t T3dn_cpu[];
|
extern uint32_t T3dn_cpu[];
|
||||||
|
|
||||||
#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
|
#if __CUDA_ARCH__ < 350
|
||||||
|
// Compute capability below 3.5 (e.g. Fermi 2.0, Kepler 3.0): rotate with plain shifts
|
||||||
|
#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
|
||||||
|
#else
|
||||||
|
// Kepler (Compute 3.5)
|
||||||
|
// 32-bit rotate right of x by n using the funnel-shift intrinsic (both input
// words are x). No trailing ';' in the expansion, and arguments parenthesised,
// so S() can be used inside larger expressions such as S(a, 2) ^ S(a, 13).
#define S(x, n) __funnelshift_r((x), (x), (n))
|
||||||
|
#endif
|
||||||
#define R(x, n) ((x) >> (n))
|
#define R(x, n) ((x) >> (n))
|
||||||
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
|
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
|
||||||
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
|
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
|
||||||
@ -95,18 +99,57 @@ extern uint32_t T3dn_cpu[];
|
|||||||
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
|
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
|
||||||
|
|
||||||
|
|
||||||
__device__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
|
__device__ __forceinline__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
|
||||||
{
|
{
|
||||||
uint32_t t[32];
|
uint32_t t[32];
|
||||||
|
|
||||||
//#pragma unroll 14
|
//#pragma unroll 14
|
||||||
for(int r=0;r<14;r++)
|
for(int r=0;r<14;r++)
|
||||||
{
|
{
|
||||||
#pragma unroll 16
|
switch(r)
|
||||||
for(int k=0;k<16;k++)
|
|
||||||
{
|
{
|
||||||
a[(k*2)+0] ^= PC32up(k * 0x10, r);
|
case 0:
|
||||||
//a[(k<<1)+1] ^= PC32dn(k * 0x10, r);
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break;
|
||||||
|
case 1:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break;
|
||||||
|
case 2:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break;
|
||||||
|
case 3:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break;
|
||||||
|
case 4:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break;
|
||||||
|
case 5:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break;
|
||||||
|
case 6:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break;
|
||||||
|
case 7:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break;
|
||||||
|
case 8:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break;
|
||||||
|
case 9:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break;
|
||||||
|
case 10:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break;
|
||||||
|
case 11:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break;
|
||||||
|
case 12:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break;
|
||||||
|
case 13:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break;
|
||||||
}
|
}
|
||||||
|
|
||||||
// RBTT
|
// RBTT
|
||||||
@ -137,18 +180,57 @@ __device__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__device__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
|
__device__ __forceinline__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
|
||||||
{
|
{
|
||||||
//#pragma unroll 14
|
//#pragma unroll 14
|
||||||
for(int r=0;r<14;r++)
|
for(int r=0;r<14;r++)
|
||||||
{
|
{
|
||||||
uint32_t t[32];
|
uint32_t t[32];
|
||||||
|
|
||||||
#pragma unroll 16
|
switch(r)
|
||||||
for(int k=0;k<16;k++)
|
|
||||||
{
|
{
|
||||||
a[(k*2)+0] ^= QC32up(k * 0x10, r);
|
case 0:
|
||||||
a[(k*2)+1] ^= QC32dn(k * 0x10, r);
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break;
|
||||||
|
case 1:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break;
|
||||||
|
case 2:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break;
|
||||||
|
case 3:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break;
|
||||||
|
case 4:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break;
|
||||||
|
case 5:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break;
|
||||||
|
case 6:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break;
|
||||||
|
case 7:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break;
|
||||||
|
case 8:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break;
|
||||||
|
case 9:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break;
|
||||||
|
case 10:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break;
|
||||||
|
case 11:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break;
|
||||||
|
case 12:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break;
|
||||||
|
case 13:
|
||||||
|
#pragma unroll 16
|
||||||
|
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break;
|
||||||
}
|
}
|
||||||
|
|
||||||
// RBTT
|
// RBTT
|
||||||
@ -179,12 +261,12 @@ __device__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#if USE_SHARED
|
#if USE_SHARED
|
||||||
__global__ void __launch_bounds__(256)
|
__global__ void /* __launch_bounds__(256) */
|
||||||
#else
|
#else
|
||||||
__global__ void
|
__global__ void
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
groestlcoin_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
|
groestlcoin_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce)
|
||||||
{
|
{
|
||||||
#if USE_SHARED
|
#if USE_SHARED
|
||||||
extern __shared__ char mixtabs[];
|
extern __shared__ char mixtabs[];
|
||||||
@ -204,146 +286,111 @@ __global__ void
|
|||||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
if (thread < threads)
|
if (thread < threads)
|
||||||
{
|
{
|
||||||
/////
|
|
||||||
///// Lieber groestl, mach, dass es abgeht!!!
|
|
||||||
/////
|
|
||||||
// GROESTL
|
// GROESTL
|
||||||
uint32_t message[32];
|
uint32_t message[32];
|
||||||
uint32_t state[32];
|
uint32_t state[32];
|
||||||
uint32_t g[32];
|
|
||||||
|
|
||||||
|
|
||||||
#pragma unroll 32
|
#pragma unroll 32
|
||||||
for(int k=0;k<32;k++)
|
for(int k=0;k<32;k++) message[k] = groestlcoin_gpu_msg[k];
|
||||||
{
|
|
||||||
// TODO: die Vorbelegung mit Nullen braucht nicht zwingend aus dem
|
|
||||||
// constant Memory zu lesen. Das ist Verschwendung von Bandbreite.
|
|
||||||
state[k] = groestlcoin_gpu_state[k];
|
|
||||||
message[k] = groestlcoin_gpu_msg[k];
|
|
||||||
}
|
|
||||||
|
|
||||||
uint32_t nounce = startNounce + thread;
|
uint32_t nounce = startNounce + thread;
|
||||||
message[19] = SWAB32(nounce);
|
message[19] = SWAB32(nounce);
|
||||||
|
|
||||||
#pragma unroll 32
|
#pragma unroll 32
|
||||||
for(int u=0;u<32;u++)
|
for(int u=0;u<32;u++) state[u] = message[u];
|
||||||
g[u] = message[u] ^ state[u]; // TODO: state ist fast ueberall 0.
|
state[31] ^= 0x20000;
|
||||||
|
|
||||||
// Perm
|
// Perm
|
||||||
#if USE_SHARED
|
#if USE_SHARED
|
||||||
groestlcoin_perm_P(g, mixtabs); // TODO: g[] entspricht fast genau message[]
|
groestlcoin_perm_P(state, mixtabs);
|
||||||
groestlcoin_perm_Q(message, mixtabs); // kann man das ausnutzen?
|
state[31] ^= 0x20000;
|
||||||
|
groestlcoin_perm_Q(message, mixtabs);
|
||||||
#else
|
#else
|
||||||
groestlcoin_perm_P(g, NULL);
|
groestlcoin_perm_P(state, NULL);
|
||||||
|
state[31] ^= 0x20000;
|
||||||
groestlcoin_perm_Q(message, NULL);
|
groestlcoin_perm_Q(message, NULL);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#pragma unroll 32
|
#pragma unroll 32
|
||||||
for(int u=0;u<32;u++)
|
for(int u=0;u<32;u++) state[u] ^= message[u];
|
||||||
{
|
|
||||||
// TODO: kann man evtl. das xor mit g[u] vorziehen hinter die groestlcoin_perm_P Funktion
|
#pragma unroll 32
|
||||||
// was den Registerbedarf senken koennte?
|
for(int u=0;u<32;u++) message[u] = state[u];
|
||||||
state[u] ^= g[u] ^ message[u];
|
|
||||||
g[u] = state[u];
|
|
||||||
}
|
|
||||||
|
|
||||||
#if USE_SHARED
|
#if USE_SHARED
|
||||||
groestlcoin_perm_P(g, mixtabs);
|
groestlcoin_perm_P(message, mixtabs);
|
||||||
#else
|
#else
|
||||||
groestlcoin_perm_P(g, NULL);
|
groestlcoin_perm_P(message, NULL);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#pragma unroll 32
|
#pragma unroll 32
|
||||||
for(int u=0;u<32;u++)
|
for(int u=0;u<32;u++) state[u] ^= message[u];
|
||||||
state[u] ^= g[u];
|
|
||||||
|
|
||||||
////
|
////
|
||||||
//// 2. Runde groestl
|
//// 2. Runde groestl
|
||||||
////
|
////
|
||||||
#pragma unroll 16
|
#pragma unroll 16
|
||||||
for(int k=0;k<16;k++)
|
for(int k=0;k<16;k++) message[k] = state[k + 16];
|
||||||
message[k] = state[k + 16];
|
#pragma unroll 14
|
||||||
|
for(int k=1;k<15;k++)
|
||||||
#pragma unroll 32
|
|
||||||
for(int k=0;k<32;k++)
|
|
||||||
state[k] = groestlcoin_gpu_state[k];
|
|
||||||
|
|
||||||
#pragma unroll 16
|
|
||||||
for(int k=0;k<16;k++)
|
|
||||||
message[k+16] = 0;
|
message[k+16] = 0;
|
||||||
|
|
||||||
message[16] = 0x80;
|
message[16] = 0x80;
|
||||||
message[31] = 0x01000000;
|
message[31] = 0x01000000;
|
||||||
|
|
||||||
#pragma unroll 32
|
#pragma unroll 32
|
||||||
for(int u=0;u<32;u++)
|
for(int u=0;u<32;u++)
|
||||||
g[u] = message[u] ^ state[u];
|
state[u] = message[u];
|
||||||
|
state[31] ^= 0x20000;
|
||||||
|
|
||||||
// Perm
|
// Perm
|
||||||
#if USE_SHARED
|
#if USE_SHARED
|
||||||
groestlcoin_perm_P(g, mixtabs);
|
groestlcoin_perm_P(state, mixtabs);
|
||||||
|
state[31] ^= 0x20000;
|
||||||
groestlcoin_perm_Q(message, mixtabs);
|
groestlcoin_perm_Q(message, mixtabs);
|
||||||
#else
|
#else
|
||||||
groestlcoin_perm_P(g, NULL);
|
groestlcoin_perm_P(state, NULL);
|
||||||
|
state[31] ^= 0x20000;
|
||||||
groestlcoin_perm_Q(message, NULL);
|
groestlcoin_perm_Q(message, NULL);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#pragma unroll 32
|
#pragma unroll 32
|
||||||
for(int u=0;u<32;u++)
|
for(int u=0;u<32;u++) state[u] ^= message[u];
|
||||||
{
|
|
||||||
state[u] ^= g[u] ^ message[u];
|
#pragma unroll 32
|
||||||
g[u] = state[u];
|
for(int u=0;u<32;u++) message[u] = state[u];
|
||||||
}
|
|
||||||
|
|
||||||
#if USE_SHARED
|
#if USE_SHARED
|
||||||
groestlcoin_perm_P(g, mixtabs);
|
groestlcoin_perm_P(message, mixtabs);
|
||||||
#else
|
#else
|
||||||
groestlcoin_perm_P(g, NULL);
|
groestlcoin_perm_P(message, NULL);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#pragma unroll 32
|
#pragma unroll 32
|
||||||
for(int u=0;u<32;u++)
|
for(int u=0;u<32;u++) state[u] ^= message[u];
|
||||||
state[u] ^= g[u];
|
|
||||||
|
|
||||||
/*
|
|
||||||
#pragma unroll 8
|
|
||||||
for(int k=0;k<8;k++)
|
|
||||||
hash[k] = state[k+16];
|
|
||||||
*/
|
|
||||||
|
|
||||||
// kopiere Ergebnis
|
// kopiere Ergebnis
|
||||||
/*
|
int i, position = -1;
|
||||||
#pragma unroll 16
|
|
||||||
for(int k=0;k<16;k++)
|
|
||||||
((uint32_t*)outputHash)[16*thread+k] = state[k + 16];
|
|
||||||
*/
|
|
||||||
int i;
|
|
||||||
bool rc = true;
|
bool rc = true;
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
for (i = 7; i >= 0; i--) {
|
for (i = 7; i >= 0; i--) {
|
||||||
if (state[i+16] > pTarget[i]) {
|
if (state[i+16] > pTarget[i]) {
|
||||||
rc = false;
|
if(position < i) {
|
||||||
break;
|
position = i;
|
||||||
}
|
rc = false;
|
||||||
if (state[i+16] < pTarget[i]) {
|
}
|
||||||
rc = true;
|
}
|
||||||
break;
|
if (state[i+16] < pTarget[i]) {
|
||||||
}
|
if(position < i) {
|
||||||
|
position = i;
|
||||||
|
rc = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if(rc == true)
|
if(rc == true)
|
||||||
{
|
|
||||||
if(resNounce[0] > nounce)
|
if(resNounce[0] > nounce)
|
||||||
{
|
|
||||||
resNounce[0] = nounce;
|
resNounce[0] = nounce;
|
||||||
/*
|
|
||||||
#pragma unroll 8
|
|
||||||
for(int k=0;k<8;k++)
|
|
||||||
((uint32_t*)outputHash)[k] = (hash[k]);
|
|
||||||
*/
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -360,7 +407,7 @@ __global__ void
|
|||||||
// Setup-Funktionen
|
// Setup-Funktionen
|
||||||
__host__ void groestlcoin_cpu_init(int thr_id, int threads)
|
__host__ void groestlcoin_cpu_init(int thr_id, int threads)
|
||||||
{
|
{
|
||||||
cudaSetDevice(thr_id);
|
cudaSetDevice(device_map[thr_id]);
|
||||||
cudaDeviceSetCacheConfig( cudaFuncCachePreferShared );
|
cudaDeviceSetCacheConfig( cudaFuncCachePreferShared );
|
||||||
// Texturen mit obigem Makro initialisieren
|
// Texturen mit obigem Makro initialisieren
|
||||||
texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
|
texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
|
||||||
@ -372,23 +419,8 @@ __host__ void groestlcoin_cpu_init(int thr_id, int threads)
|
|||||||
texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
|
texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
|
||||||
texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
|
texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
|
||||||
|
|
||||||
// setze register
|
// Speicher für Gewinner-Nonce belegen
|
||||||
// TODO: fast vollstaendige Vorbelegung mit Nullen.
|
|
||||||
// da besteht doch Optimierungspotenzial im GPU Kernel
|
|
||||||
// denn mit Nullen braucht man nicht wirklich rechnen.
|
|
||||||
uint32_t groestl_state_init[32];
|
|
||||||
memset(groestl_state_init, 0, sizeof(uint32_t) * 32);
|
|
||||||
groestl_state_init[31] = 0x20000;
|
|
||||||
|
|
||||||
// state speichern
|
|
||||||
cudaMemcpyToSymbol( groestlcoin_gpu_state,
|
|
||||||
groestl_state_init,
|
|
||||||
128);
|
|
||||||
|
|
||||||
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
|
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
|
||||||
|
|
||||||
// Speicher für alle Ergebnisse belegen (nur für Debug)
|
|
||||||
cudaMalloc(&d_hashGROESTLCOINoutput[thr_id], 8 * sizeof(uint32_t) * threads);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
|
__host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
|
||||||
@ -430,7 +462,7 @@ __host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce
|
|||||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
dim3 block(threadsperblock);
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
|
// Größe des dynamischen Shared Memory Bereichs
|
||||||
#if USE_SHARED
|
#if USE_SHARED
|
||||||
size_t shared_size = 8 * 256 * sizeof(uint32_t);
|
size_t shared_size = 8 * 256 * sizeof(uint32_t);
|
||||||
#else
|
#else
|
||||||
@ -440,16 +472,10 @@ __host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce
|
|||||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||||
//fprintf(stderr, "ThrID: %d\n", thr_id);
|
//fprintf(stderr, "ThrID: %d\n", thr_id);
|
||||||
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
|
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
|
||||||
groestlcoin_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hashGROESTLCOINoutput[thr_id], d_resultNonce[thr_id]);
|
groestlcoin_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
|
||||||
|
|
||||||
// Strategisches Sleep Kommando zur Senkung der CPU Last
|
// Strategisches Sleep Kommando zur Senkung der CPU Last
|
||||||
MyStreamSynchronize(NULL, 0, thr_id);
|
MyStreamSynchronize(NULL, 0, thr_id);
|
||||||
|
|
||||||
cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
||||||
|
|
||||||
/// Debug
|
|
||||||
//cudaMemcpy(outputHashes, d_hashGROESTLCOINoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost);
|
|
||||||
|
|
||||||
// Nounce
|
|
||||||
//cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
|
||||||
}
|
}
|
||||||
|
521
cuda_hefty1.cu
521
cuda_hefty1.cu
@ -2,26 +2,40 @@
|
|||||||
#include "cuda_runtime.h"
|
#include "cuda_runtime.h"
|
||||||
#include "device_launch_parameters.h"
|
#include "device_launch_parameters.h"
|
||||||
|
|
||||||
|
// aus cpu-miner.c
|
||||||
|
extern int device_map[8];
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <memory.h>
|
#include <memory.h>
|
||||||
|
|
||||||
|
#define USE_SHARED 1
|
||||||
|
|
||||||
// Folgende Definitionen später durch header ersetzen
|
// Folgende Definitionen später durch header ersetzen
|
||||||
typedef unsigned int uint32_t;
|
typedef unsigned int uint32_t;
|
||||||
typedef unsigned char uint8_t;
|
typedef unsigned char uint8_t;
|
||||||
typedef unsigned short uint16_t;
|
typedef unsigned short uint16_t;
|
||||||
|
|
||||||
|
// diese Struktur wird in der Init Funktion angefordert
|
||||||
|
static cudaDeviceProp props;
|
||||||
|
|
||||||
// globaler Speicher für alle HeftyHashes aller Threads
|
// globaler Speicher für alle HeftyHashes aller Threads
|
||||||
uint32_t *d_heftyHashes[8];
|
uint32_t *d_heftyHashes[8];
|
||||||
|
|
||||||
/* Hash-Tabellen */
|
/* Hash-Tabellen */
|
||||||
__constant__ uint32_t hefty_gpu_constantTable[64];
|
__constant__ uint32_t hefty_gpu_constantTable[64];
|
||||||
|
#if USE_SHARED
|
||||||
|
#define heftyLookUp(x) (*((uint32_t*)heftytab + (x)))
|
||||||
|
#else
|
||||||
|
#define heftyLookUp(x) hefty_gpu_constantTable[x]
|
||||||
|
#endif
|
||||||
|
|
||||||
// muss expandiert werden
|
// muss expandiert werden
|
||||||
__constant__ uint32_t hefty_gpu_blockHeader[16]; // 2x512 Bit Message
|
__constant__ uint32_t hefty_gpu_blockHeader[16]; // 2x512 Bit Message
|
||||||
__constant__ uint32_t hefty_gpu_register[8];
|
__constant__ uint32_t hefty_gpu_register[8];
|
||||||
__constant__ uint32_t hefty_gpu_sponge[4];
|
__constant__ uint32_t hefty_gpu_sponge[4];
|
||||||
|
|
||||||
uint32_t hefty_cpu_hashTable[] = { 0x6a09e667UL,
|
uint32_t hefty_cpu_hashTable[] = {
|
||||||
|
0x6a09e667UL,
|
||||||
0xbb67ae85UL,
|
0xbb67ae85UL,
|
||||||
0x3c6ef372UL,
|
0x3c6ef372UL,
|
||||||
0xa54ff53aUL,
|
0xa54ff53aUL,
|
||||||
@ -29,8 +43,9 @@ uint32_t hefty_cpu_hashTable[] = { 0x6a09e667UL,
|
|||||||
0x9b05688cUL,
|
0x9b05688cUL,
|
||||||
0x1f83d9abUL,
|
0x1f83d9abUL,
|
||||||
0x5be0cd19UL };
|
0x5be0cd19UL };
|
||||||
|
|
||||||
uint32_t hefty_cpu_constantTable[] = {
|
uint32_t hefty_cpu_constantTable[] = {
|
||||||
0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
|
0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
|
||||||
0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
|
0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
|
||||||
0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL,
|
0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL,
|
||||||
0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL,
|
0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL,
|
||||||
@ -48,350 +63,352 @@ uint32_t hefty_cpu_constantTable[] = {
|
|||||||
0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
|
0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
|
||||||
};
|
};
|
||||||
|
|
||||||
#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
|
//#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
|
||||||
#define R(x, n) ((x) >> (n))
|
static __host__ __device__ uint32_t S(uint32_t x, int n)
|
||||||
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
|
{
|
||||||
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
|
return (((x) >> (n)) | ((x) << (32 - (n))));
|
||||||
#define S0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22))
|
}
|
||||||
#define S1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25))
|
#define R(x, n) ((x) >> (n))
|
||||||
#define s0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3))
|
// Bitwise select: each result bit is taken from y where x has a 1 and from z where x has a 0.
// Arguments are parenthesized so expression arguments expand correctly.
#define Ch(x, y, z) (((x) & ((y) ^ (z))) ^ (z))
|
||||||
#define s1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10))
|
// Bitwise majority: each result bit is 1 when at least two of x, y, z have that bit set.
// Arguments are parenthesized so expression arguments expand correctly.
#define Maj(x, y, z) (((x) & ((y) | (z))) | ((y) & (z)))
|
||||||
|
#define S0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22))
|
||||||
|
#define S1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25))
|
||||||
|
#define s0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3))
|
||||||
|
#define s1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10))
|
||||||
|
|
||||||
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
|
// Reverse the byte order of a 32-bit word. The argument is parenthesized at every use
// so that expression arguments (e.g. a ^ b) expand correctly; note x is evaluated four times.
#define SWAB32(x) ( (((x) & 0x000000FF) << 24) | (((x) & 0x0000FF00) << 8) | (((x) & 0x00FF0000) >> 8) | (((x) & 0xFF000000) >> 24) )
|
||||||
|
|
||||||
// uint8_t
|
// uint8_t
|
||||||
#define smoosh4(x) ( ((x)>>4) ^ ((x) & 0x0F) )
|
#define smoosh4(x) ( ((x)>>4) ^ ((x) & 0x0F) )
|
||||||
__host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x)
|
__host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x)
|
||||||
{
|
{
|
||||||
uint16_t w = (x >> 16) ^ (x & 0xffff);
|
uint16_t w = (x >> 16) ^ (x & 0xffff);
|
||||||
uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) );
|
uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) );
|
||||||
return (n >> 2) ^ (n & 0x03);
|
return 24 - (((n >> 2) ^ (n & 0x03)) << 3);
|
||||||
}
|
}
|
||||||
// 4 auf einmal
|
// 4 auf einmal
|
||||||
#define smoosh4Quad(x) ( (((x)>>4) ^ (x)) & 0x0F0F0F0F )
|
#define smoosh4Quad(x) ( (((x)>>4) ^ (x)) & 0x0F0F0F0F )
|
||||||
#define getByte(x,y) ( ((x) >> (y)) & 0xFF )
|
#define getByte(x,y) ( ((x) >> (y)) & 0xFF )
|
||||||
|
|
||||||
__host__ __device__ void Mangle(uint32_t *inp)
|
__host__ __forceinline__ __device__ void Mangle(uint32_t *inp)
|
||||||
{
|
{
|
||||||
uint32_t r = smoosh4Quad(inp[0]);
|
uint32_t r = smoosh4Quad(inp[0]);
|
||||||
//uint8_t r0 = smoosh4( (uint8_t)(inp[0] >> 24) );
|
uint32_t inp0org;
|
||||||
//uint8_t r1 = smoosh4( (uint8_t)(inp[0] >> 16) );
|
uint32_t tmp0Mask, tmp1Mask;
|
||||||
//uint8_t r2 = smoosh4( (uint8_t)(inp[0] >> 8) );
|
uint32_t in1, in2, isAddition;
|
||||||
//uint8_t r3 = smoosh4( (uint8_t)(inp[0] & 0xFF) );
|
uint32_t tmp;
|
||||||
|
uint8_t b;
|
||||||
|
|
||||||
inp[1] = inp[1] ^ S(inp[0], getByte(r, 24));
|
inp[1] = inp[1] ^ S(inp[0], getByte(r, 24));
|
||||||
|
|
||||||
switch (smoosh2(inp[1])) {
|
r += 0x01010101;
|
||||||
case 0: inp[2] ^= S(inp[0], 1 + getByte(r,24)); break;
|
tmp = smoosh2(inp[1]);
|
||||||
case 1: inp[2] += S(~inp[0], 1 + getByte(r,16)); break;
|
b = getByte(r,tmp);
|
||||||
case 2: inp[2] &= S(~inp[0], 1 + getByte(r,8)); break;
|
inp0org = S(inp[0], b);
|
||||||
case 3: inp[2] ^= S(inp[0], 1 + getByte(r,0)); break;
|
tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0
|
||||||
}
|
tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0
|
||||||
|
|
||||||
uint32_t tmp = smoosh2(inp[1] ^ inp[2]);
|
in1 = (inp[2] & ~inp0org) |
|
||||||
switch (tmp) {
|
(tmp1Mask & ~inp[2] & inp0org) |
|
||||||
case 0: inp[3] ^= S(inp[0], 2 + getByte(r,24)); break;
|
(~tmp0Mask & ~inp[2] & inp0org);
|
||||||
case 1: inp[3] += S(~inp[0], 2 + getByte(r,16)); break;
|
in2 = inp[2] += ~inp0org;
|
||||||
case 2: inp[3] &= S(~inp[0], 2 + getByte(r,8)); break;
|
isAddition = ~tmp0Mask & tmp1Mask;
|
||||||
case 3: inp[3] ^= S(inp[0], 2 + getByte(r,0)); break;
|
inp[2] = isAddition ? in2 : in1;
|
||||||
}
|
|
||||||
|
r += 0x01010101;
|
||||||
|
tmp = smoosh2(inp[1] ^ inp[2]);
|
||||||
|
b = getByte(r,tmp);
|
||||||
|
inp0org = S(inp[0], b);
|
||||||
|
tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0
|
||||||
|
tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0
|
||||||
|
|
||||||
inp[0] ^= (inp[1] ^ inp[2]) + inp[3];
|
in1 = (inp[3] & ~inp0org) |
|
||||||
|
(tmp1Mask & ~inp[3] & inp0org) |
|
||||||
|
(~tmp0Mask & ~inp[3] & inp0org);
|
||||||
|
in2 = inp[3] += ~inp0org;
|
||||||
|
isAddition = ~tmp0Mask & tmp1Mask;
|
||||||
|
inp[3] = isAddition ? in2 : in1;
|
||||||
|
|
||||||
|
inp[0] ^= (inp[1] ^ inp[2]) + inp[3];
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__ __forceinline__ __device__ void Absorb(uint32_t *inp, uint32_t x)
|
__host__ __forceinline__ __device__ void Absorb(uint32_t *inp, uint32_t x)
|
||||||
{
|
{
|
||||||
inp[0] ^= x;
|
inp[0] ^= x;
|
||||||
Mangle(inp);
|
Mangle(inp);
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__ __forceinline__ __device__ uint32_t Squeeze(uint32_t *inp)
|
__host__ __forceinline__ __device__ uint32_t Squeeze(uint32_t *inp)
|
||||||
{
|
{
|
||||||
uint32_t y = inp[0];
|
uint32_t y = inp[0];
|
||||||
Mangle(inp);
|
Mangle(inp);
|
||||||
return y;
|
return y;
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x)
|
__host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x)
|
||||||
{
|
{
|
||||||
uint32_t r = Squeeze(sponge);
|
uint32_t r = Squeeze(sponge);
|
||||||
|
uint32_t t = ((r >> 8) & 0x1F);
|
||||||
|
uint32_t y = 1 << t;
|
||||||
|
|
||||||
//uint8_t r0 = r >> 8;
|
uint32_t a = (((r>>1) & 0x01) << t) & y;
|
||||||
uint8_t r1 = r & 0xFF;
|
uint32_t b = ((r & 0x01) << t) & y;
|
||||||
uint32_t y = 1 << ((r >> 8) & 0x1F);
|
uint32_t c = x & y;
|
||||||
|
|
||||||
//uint32_t retVal;
|
uint32_t retVal = (x & ~y) | (~b & c) | (a & ~c);
|
||||||
//retVal = x;
|
return retVal;
|
||||||
|
|
||||||
uint32_t resArr[4];
|
|
||||||
resArr[0] = x;
|
|
||||||
resArr[1] = x & ~y;
|
|
||||||
resArr[2] = x | y;
|
|
||||||
resArr[3] = x ^ y;
|
|
||||||
return resArr[r1 & 0x03];
|
|
||||||
|
|
||||||
/*
|
|
||||||
switch(r1 & 0x03)
|
|
||||||
{
|
|
||||||
case 0:
|
|
||||||
break;
|
|
||||||
case 1:
|
|
||||||
retVal = x & ~y;
|
|
||||||
break;
|
|
||||||
case 2:
|
|
||||||
retVal = x | y;
|
|
||||||
break;
|
|
||||||
case 3:
|
|
||||||
retVal = x ^ y;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
return retVal;
|
|
||||||
*/
|
|
||||||
}
|
}
|
||||||
|
|
||||||
__forceinline__ __device__ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
|
__forceinline__ __device__ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
|
||||||
{
|
{
|
||||||
uint32_t tmpBr;
|
uint32_t tmpBr;
|
||||||
|
|
||||||
uint32_t brG = Br(sponge, regs[6]);
|
uint32_t brG = Br(sponge, regs[6]);
|
||||||
uint32_t brF = Br(sponge, regs[5]);
|
uint32_t brF = Br(sponge, regs[5]);
|
||||||
uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
|
uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
|
||||||
uint32_t brE = Br(sponge, regs[4]);
|
uint32_t brE = Br(sponge, regs[4]);
|
||||||
uint32_t tmp2 = tmp1 + S1(brE);
|
uint32_t tmp2 = tmp1 + S1(brE);
|
||||||
uint32_t brC = Br(sponge, regs[2]);
|
uint32_t brC = Br(sponge, regs[2]);
|
||||||
uint32_t brB = Br(sponge, regs[1]);
|
uint32_t brB = Br(sponge, regs[1]);
|
||||||
uint32_t brA = Br(sponge, regs[0]);
|
uint32_t brA = Br(sponge, regs[0]);
|
||||||
uint32_t tmp3 = Maj(brA, brB, brC);
|
uint32_t tmp3 = Maj(brA, brB, brC);
|
||||||
tmpBr = Br(sponge, regs[0]);
|
tmpBr = Br(sponge, regs[0]);
|
||||||
uint32_t tmp4 = tmp3 + S0(tmpBr);
|
uint32_t tmp4 = tmp3 + S0(tmpBr);
|
||||||
tmpBr = Br(sponge, tmp2);
|
tmpBr = Br(sponge, tmp2);
|
||||||
|
|
||||||
#pragma unroll 7
|
#pragma unroll 7
|
||||||
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
|
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
|
||||||
regs[0] = tmp2 + tmp4;
|
regs[0] = tmp2 + tmp4;
|
||||||
regs[4] += tmpBr;
|
regs[4] += tmpBr;
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
|
__host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
|
||||||
{
|
{
|
||||||
uint32_t tmpBr;
|
uint32_t tmpBr;
|
||||||
|
|
||||||
uint32_t brG = Br(sponge, regs[6]);
|
uint32_t brG = Br(sponge, regs[6]);
|
||||||
uint32_t brF = Br(sponge, regs[5]);
|
uint32_t brF = Br(sponge, regs[5]);
|
||||||
uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
|
uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
|
||||||
uint32_t brE = Br(sponge, regs[4]);
|
uint32_t brE = Br(sponge, regs[4]);
|
||||||
uint32_t tmp2 = tmp1 + S1(brE);
|
uint32_t tmp2 = tmp1 + S1(brE);
|
||||||
uint32_t brC = Br(sponge, regs[2]);
|
uint32_t brC = Br(sponge, regs[2]);
|
||||||
uint32_t brB = Br(sponge, regs[1]);
|
uint32_t brB = Br(sponge, regs[1]);
|
||||||
uint32_t brA = Br(sponge, regs[0]);
|
uint32_t brA = Br(sponge, regs[0]);
|
||||||
uint32_t tmp3 = Maj(brA, brB, brC);
|
uint32_t tmp3 = Maj(brA, brB, brC);
|
||||||
tmpBr = Br(sponge, regs[0]);
|
tmpBr = Br(sponge, regs[0]);
|
||||||
uint32_t tmp4 = tmp3 + S0(tmpBr);
|
uint32_t tmp4 = tmp3 + S0(tmpBr);
|
||||||
tmpBr = Br(sponge, tmp2);
|
tmpBr = Br(sponge, tmp2);
|
||||||
|
|
||||||
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
|
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
|
||||||
regs[0] = tmp2 + tmp4;
|
regs[0] = tmp2 + tmp4;
|
||||||
regs[4] += tmpBr;
|
regs[4] += tmpBr;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Die Hash-Funktion
|
// Die Hash-Funktion
|
||||||
__global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHash)
|
__global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHash)
|
||||||
{
|
{
|
||||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
#if USE_SHARED
|
||||||
if (thread < threads)
|
extern __shared__ char heftytab[];
|
||||||
{
|
if(threadIdx.x < 64)
|
||||||
// bestimme den aktuellen Zähler
|
{
|
||||||
uint32_t nounce = startNounce + thread;
|
*((uint32_t*)heftytab + threadIdx.x) = hefty_gpu_constantTable[threadIdx.x];
|
||||||
|
}
|
||||||
// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
|
|
||||||
#if USE_SHARED
|
__syncthreads();
|
||||||
extern __shared__ unsigned char s[];
|
|
||||||
uint32_t *W = (uint32_t *)(&s[W_ALIGNMENT * sizeof(uint32_t) * threadIdx.x]);
|
|
||||||
#else
|
|
||||||
// reduktion von 256 byte auf 128 byte
|
|
||||||
uint32_t W1[16];
|
|
||||||
uint32_t W2[16];
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Initialisiere die register a bis h mit der Hash-Tabelle
|
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||||
uint32_t regs[8];
|
if (thread < threads)
|
||||||
uint32_t hash[8];
|
{
|
||||||
uint32_t sponge[4];
|
// bestimme den aktuellen Zähler
|
||||||
|
uint32_t nounce = startNounce + thread;
|
||||||
|
|
||||||
|
// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
|
||||||
|
// reduktion von 256 byte auf 128 byte
|
||||||
|
uint32_t W1[16];
|
||||||
|
uint32_t W2[16];
|
||||||
|
|
||||||
|
// Initialisiere die register a bis h mit der Hash-Tabelle
|
||||||
|
uint32_t regs[8];
|
||||||
|
uint32_t hash[8];
|
||||||
|
uint32_t sponge[4];
|
||||||
|
|
||||||
#pragma unroll 4
|
#pragma unroll 4
|
||||||
for(int k=0; k < 4; k++)
|
for(int k=0; k < 4; k++)
|
||||||
sponge[k] = hefty_gpu_sponge[k];
|
sponge[k] = hefty_gpu_sponge[k];
|
||||||
|
|
||||||
// pre
|
// pre
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for (int k=0; k < 8; k++)
|
for (int k=0; k < 8; k++)
|
||||||
{
|
{
|
||||||
regs[k] = hefty_gpu_register[k];
|
regs[k] = hefty_gpu_register[k];
|
||||||
hash[k] = regs[k];
|
hash[k] = regs[k];
|
||||||
}
|
}
|
||||||
|
|
||||||
//memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // verbleibende 20 bytes aus Block 2 plus padding
|
//memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // verbleibende 20 bytes aus Block 2 plus padding
|
||||||
#pragma unroll 16
|
#pragma unroll 16
|
||||||
for(int k=0;k<16;k++)
|
for(int k=0;k<16;k++)
|
||||||
W1[k] = hefty_gpu_blockHeader[k];
|
W1[k] = hefty_gpu_blockHeader[k];
|
||||||
W1[3] = SWAB32(nounce);
|
W1[3] = SWAB32(nounce);
|
||||||
|
|
||||||
|
// 2. Runde
|
||||||
// 2. Runde
|
|
||||||
#pragma unroll 16
|
#pragma unroll 16
|
||||||
for(int j=0;j<16;j++)
|
for(int j=0;j<16;j++)
|
||||||
Absorb(sponge, W1[j] ^ hefty_gpu_constantTable[j]);
|
Absorb(sponge, W1[j] ^ heftyLookUp(j));
|
||||||
|
|
||||||
// Progress W1 (Bytes 0...63)
|
// Progress W1 (Bytes 0...63)
|
||||||
#pragma unroll 16
|
#pragma unroll 16
|
||||||
for(int j=0;j<16;j++)
|
for(int j=0;j<16;j++)
|
||||||
{
|
{
|
||||||
Absorb(sponge, regs[3] ^ regs[7]);
|
Absorb(sponge, regs[3] ^ regs[7]);
|
||||||
hefty_gpu_round(regs, W1[j], hefty_gpu_constantTable[j], sponge);
|
hefty_gpu_round(regs, W1[j], heftyLookUp(j), sponge);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Progress W2 (Bytes 64...127) then W3 (Bytes 128...191) ...
|
// Progress W2 (Bytes 64...127) then W3 (Bytes 128...191) ...
|
||||||
|
|
||||||
#pragma unroll 3
|
#pragma unroll 3
|
||||||
for(int k=0;k<3;k++)
|
for(int k=0;k<3;k++)
|
||||||
{
|
{
|
||||||
#pragma unroll 2
|
#pragma unroll 2
|
||||||
for(int j=0;j<2;j++)
|
for(int j=0;j<2;j++)
|
||||||
W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
|
W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
|
||||||
#pragma unroll 5
|
#pragma unroll 5
|
||||||
for(int j=2;j<7;j++)
|
for(int j=2;j<7;j++)
|
||||||
W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
|
W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
|
||||||
|
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(int j=7;j<15;j++)
|
for(int j=7;j<15;j++)
|
||||||
W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
|
W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
|
||||||
|
|
||||||
W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
|
W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
|
||||||
|
|
||||||
#pragma unroll 16
|
#pragma unroll 16
|
||||||
for(int j=0;j<16;j++)
|
for(int j=0;j<16;j++)
|
||||||
{
|
{
|
||||||
Absorb(sponge, regs[3] + regs[7]);
|
Absorb(sponge, regs[3] + regs[7]);
|
||||||
hefty_gpu_round(regs, W2[j], hefty_gpu_constantTable[j + 16 * (k+1)], sponge);
|
hefty_gpu_round(regs, W2[j], heftyLookUp(j + 16 * (k+1)), sponge);
|
||||||
}
|
}
|
||||||
#pragma unroll 16
|
#pragma unroll 16
|
||||||
for(int j=0;j<16;j++)
|
for(int j=0;j<16;j++)
|
||||||
W1[j] = W2[j];
|
W1[j] = W2[j];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#pragma unroll 8
|
||||||
|
for(int k=0;k<8;k++)
|
||||||
|
hash[k] += regs[k];
|
||||||
|
|
||||||
#pragma unroll 8
|
#pragma unroll 8
|
||||||
for(int k=0;k<8;k++)
|
for(int k=0;k<8;k++)
|
||||||
hash[k] += regs[k];
|
((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]);
|
||||||
|
}
|
||||||
#pragma unroll 8
|
|
||||||
for(int k=0;k<8;k++)
|
|
||||||
((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Setup-Funktionen
|
// Setup-Funktionen
|
||||||
__host__ void hefty_cpu_init(int thr_id, int threads)
|
__host__ void hefty_cpu_init(int thr_id, int threads)
|
||||||
{
|
{
|
||||||
cudaSetDevice(thr_id);
|
cudaSetDevice(device_map[thr_id]);
|
||||||
|
|
||||||
// Kopiere die Hash-Tabellen in den GPU-Speicher
|
cudaGetDeviceProperties(&props, device_map[thr_id]);
|
||||||
cudaMemcpyToSymbol( hefty_gpu_constantTable,
|
|
||||||
hefty_cpu_constantTable,
|
|
||||||
sizeof(uint32_t) * 64 );
|
|
||||||
|
|
||||||
// Speicher für alle Hefty1 hashes belegen
|
// Kopiere die Hash-Tabellen in den GPU-Speicher
|
||||||
cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads);
|
cudaMemcpyToSymbol( hefty_gpu_constantTable,
|
||||||
|
hefty_cpu_constantTable,
|
||||||
|
sizeof(uint32_t) * 64 );
|
||||||
|
|
||||||
|
// Speicher für alle Hefty1 hashes belegen
|
||||||
|
cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads);
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data)
|
__host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data)
|
||||||
// data muss 84-Byte haben!
|
// data muss 84-Byte haben!
|
||||||
{
|
{
|
||||||
// Nachricht expandieren und setzen
|
// Nachricht expandieren und setzen
|
||||||
uint32_t msgBlock[32];
|
uint32_t msgBlock[32];
|
||||||
|
|
||||||
memset(msgBlock, 0, sizeof(uint32_t) * 32);
|
memset(msgBlock, 0, sizeof(uint32_t) * 32);
|
||||||
memcpy(&msgBlock[0], data, 84);
|
memcpy(&msgBlock[0], data, 84);
|
||||||
msgBlock[21] |= 0x80;
|
msgBlock[21] |= 0x80;
|
||||||
msgBlock[31] = 672; // bitlen
|
msgBlock[31] = 672; // bitlen
|
||||||
|
|
||||||
for(int i=0;i<31;i++) // Byteorder drehen
|
for(int i=0;i<31;i++) // Byteorder drehen
|
||||||
msgBlock[i] = SWAB32(msgBlock[i]);
|
msgBlock[i] = SWAB32(msgBlock[i]);
|
||||||
|
|
||||||
// die erste Runde wird auf der CPU durchgeführt, da diese für
|
// die erste Runde wird auf der CPU durchgeführt, da diese für
|
||||||
// alle Threads gleich ist. Der Hash wird dann an die Threads
|
// alle Threads gleich ist. Der Hash wird dann an die Threads
|
||||||
// übergeben
|
// übergeben
|
||||||
|
|
||||||
// Erstelle expandierten Block W
|
// Erstelle expandierten Block W
|
||||||
uint32_t W[64];
|
uint32_t W[64];
|
||||||
memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);
|
memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);
|
||||||
for(int j=16;j<64;j++)
|
for(int j=16;j<64;j++)
|
||||||
W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
|
W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
|
||||||
|
|
||||||
// Initialisiere die register a bis h mit der Hash-Tabelle
|
// Initialisiere die register a bis h mit der Hash-Tabelle
|
||||||
uint32_t regs[8];
|
uint32_t regs[8];
|
||||||
uint32_t hash[8];
|
uint32_t hash[8];
|
||||||
uint32_t sponge[4];
|
uint32_t sponge[4];
|
||||||
|
|
||||||
// pre
|
// pre
|
||||||
memset(sponge, 0, sizeof(uint32_t) * 4);
|
memset(sponge, 0, sizeof(uint32_t) * 4);
|
||||||
for (int k=0; k < 8; k++)
|
for (int k=0; k < 8; k++)
|
||||||
{
|
{
|
||||||
regs[k] = hefty_cpu_hashTable[k];
|
regs[k] = hefty_cpu_hashTable[k];
|
||||||
hash[k] = regs[k];
|
hash[k] = regs[k];
|
||||||
}
|
}
|
||||||
|
|
||||||
// 1. Runde
|
// 1. Runde
|
||||||
for(int j=0;j<16;j++)
|
for(int j=0;j<16;j++)
|
||||||
Absorb(sponge, W[j] ^ hefty_cpu_constantTable[j]);
|
Absorb(sponge, W[j] ^ hefty_cpu_constantTable[j]);
|
||||||
|
|
||||||
for(int j=0;j<16;j++)
|
for(int j=0;j<16;j++)
|
||||||
{
|
{
|
||||||
Absorb(sponge, regs[3] ^ regs[7]);
|
Absorb(sponge, regs[3] ^ regs[7]);
|
||||||
hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge);
|
hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge);
|
||||||
}
|
}
|
||||||
|
|
||||||
for(int j=16;j<64;j++)
|
for(int j=16;j<64;j++)
|
||||||
{
|
{
|
||||||
Absorb(sponge, regs[3] + regs[7]);
|
Absorb(sponge, regs[3] + regs[7]);
|
||||||
hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge);
|
hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge);
|
||||||
}
|
}
|
||||||
|
|
||||||
for(int k=0;k<8;k++)
|
for(int k=0;k<8;k++)
|
||||||
hash[k] += regs[k];
|
hash[k] += regs[k];
|
||||||
|
|
||||||
// sponge speichern
|
// sponge speichern
|
||||||
|
|
||||||
cudaMemcpyToSymbol( hefty_gpu_sponge,
|
cudaMemcpyToSymbol( hefty_gpu_sponge,
|
||||||
sponge,
|
sponge,
|
||||||
sizeof(uint32_t) * 4 );
|
sizeof(uint32_t) * 4 );
|
||||||
// hash speichern
|
// hash speichern
|
||||||
cudaMemcpyToSymbol( hefty_gpu_register,
|
cudaMemcpyToSymbol( hefty_gpu_register,
|
||||||
hash,
|
hash,
|
||||||
sizeof(uint32_t) * 8 );
|
sizeof(uint32_t) * 8 );
|
||||||
|
|
||||||
// Blockheader setzen (korrekte Nonce fehlt da drin noch)
|
// Blockheader setzen (korrekte Nonce fehlt da drin noch)
|
||||||
cudaMemcpyToSymbol( hefty_gpu_blockHeader,
|
cudaMemcpyToSymbol( hefty_gpu_blockHeader,
|
||||||
&msgBlock[16],
|
&msgBlock[16],
|
||||||
64);
|
64);
|
||||||
}
|
}
|
||||||
|
|
||||||
__host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
|
__host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
|
||||||
{
|
{
|
||||||
const int threadsperblock = 128;
|
// Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
|
||||||
|
// alle anderen mit 512 Threads.
|
||||||
|
int threadsperblock = (props.major >= 3) ? 768 : 512;
|
||||||
|
|
||||||
// berechne wie viele Thread Blocks wir brauchen
|
// berechne wie viele Thread Blocks wir brauchen
|
||||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
dim3 block(threadsperblock);
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
|
// Größe des dynamischen Shared Memory Bereichs
|
||||||
#if USE_SHARED
|
#if USE_SHARED
|
||||||
size_t shared_size = W_ALIGNMENT*sizeof(uint32_t)*threadsperblock; // ein uint32_t eingefügt gegen Bank Konflikte
|
size_t shared_size = 8 * 64 * sizeof(uint32_t);
|
||||||
#else
|
#else
|
||||||
size_t shared_size = 0;
|
size_t shared_size = 0;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||||
|
|
||||||
hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]);
|
hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]);
|
||||||
}
|
}
|
||||||
|
@ -264,7 +264,7 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
|
|||||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
dim3 block(threadsperblock);
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
|
// Größe des dynamischen Shared Memory Bereichs
|
||||||
size_t shared_size = 0;
|
size_t shared_size = 0;
|
||||||
|
|
||||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||||
|
@ -5,8 +5,6 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <memory.h>
|
#include <memory.h>
|
||||||
|
|
||||||
#define W_ALIGNMENT 65
|
|
||||||
|
|
||||||
// Folgende Definitionen später durch header ersetzen
|
// Folgende Definitionen später durch header ersetzen
|
||||||
typedef unsigned int uint32_t;
|
typedef unsigned int uint32_t;
|
||||||
|
|
||||||
@ -59,8 +57,6 @@ __global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void *outputH
|
|||||||
nonceVector[thread] = nounce;
|
nonceVector[thread] = nounce;
|
||||||
|
|
||||||
// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
|
// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
|
||||||
//extern __shared__ unsigned char s[];
|
|
||||||
//uint32_t *W = (uint32_t *)(&s[W_ALIGNMENT * sizeof(uint32_t) * threadIdx.x]);
|
|
||||||
uint32_t W1[16];
|
uint32_t W1[16];
|
||||||
uint32_t W2[16];
|
uint32_t W2[16];
|
||||||
|
|
||||||
@ -257,14 +253,13 @@ __host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashe
|
|||||||
|
|
||||||
__host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce)
|
__host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce)
|
||||||
{
|
{
|
||||||
const int threadsperblock = 128;
|
const int threadsperblock = 256;
|
||||||
|
|
||||||
// berechne wie viele Thread Blocks wir brauchen
|
// berechne wie viele Thread Blocks wir brauchen
|
||||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||||
dim3 block(threadsperblock);
|
dim3 block(threadsperblock);
|
||||||
|
|
||||||
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
|
// Größe des dynamischen Shared Memory Bereichs
|
||||||
//size_t shared_size = W_ALIGNMENT*sizeof(uint32_t)*threadsperblock; // ein uint32_t eingefügt gegen Bank Konflikte
|
|
||||||
size_t shared_size = 0;
|
size_t shared_size = 0;
|
||||||
|
|
||||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||||
|
35
heavy.cu
35
heavy.cu
@ -163,6 +163,41 @@ extern "C" int cuda_num_devices()
|
|||||||
return GPU_N;
|
return GPU_N;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Case-insensitive substring search that ignores blanks on both sides.
//
// Looks for `needle` inside `haystack`, comparing characters without regard
// to case and skipping every ' ' in either string, so "gtx780" matches
// "GeForce GTX 780".
//
// A '#' in `needle` acts as an occurrence selector: once everything before
// the '#' has matched, `match` is incremented and the function returns
// whether the new counter value equals the single digit following the '#'
// (e.g. "780#2" is true only on the second call that reaches the '#').
// Note that in this case the function returns immediately, even when the
// result is false; later start positions in `haystack` are not tried.
//
// haystack  NUL-terminated string to search in
// needle    NUL-terminated pattern
// match     running counter, only modified when a '#' selector is reached
// returns   true on a match (or when the '#' selector is satisfied)
static bool substringsearch(const char *haystack, const char *needle, int &match)
{
    const size_t hlen = strlen(haystack);
    const size_t nlen = strlen(needle);

    for (size_t start = 0; start < hlen; ++start)
    {
        // a match never begins on a blank
        if (haystack[start] == ' ') continue;

        size_t h = start;   // read position in haystack
        size_t n = 0;       // read position in needle
        while (n < nlen)
        {
            // Blanks are skipped independently in both strings. The
            // terminating NUL of haystack is neither a blank nor equal to
            // any needle character, so h never runs past the end.
            if (haystack[h] == ' ') { ++h; continue; }
            if (needle[n] == ' ') { ++n; continue; }

            if (needle[n] == '#') return ++match == needle[n + 1] - '0';

            // <ctype.h> functions require a value representable as
            // unsigned char; passing a negative plain char is undefined.
            if (tolower((unsigned char)haystack[h]) != tolower((unsigned char)needle[n])) break;

            ++n; ++h;
        }
        if (n == nlen) return true;
    }
    return false;
}
|
||||||
|
|
||||||
|
// CUDA Gerät nach Namen finden (gibt Geräte-Index zurück oder -1)
|
||||||
|
extern "C" int cuda_finddevice(char *name)
|
||||||
|
{
|
||||||
|
int num = cuda_num_devices();
|
||||||
|
int match = 0;
|
||||||
|
for (int i=0; i < num; ++i)
|
||||||
|
{
|
||||||
|
cudaDeviceProp props;
|
||||||
|
if (cudaGetDeviceProperties(&props, i) == cudaSuccess)
|
||||||
|
if (substringsearch(props.name, name, match)) return i;
|
||||||
|
}
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
// Zeitsynchronisations-Routine von cudaminer mit CPU sleep
|
// Zeitsynchronisations-Routine von cudaminer mit CPU sleep
|
||||||
typedef struct { double value[8]; } tsumarray;
|
typedef struct { double value[8]; } tsumarray;
|
||||||
cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)
|
cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)
|
||||||
|
Loading…
Reference in New Issue
Block a user