update to version v0.5 (2014-03-27)

This commit is contained in:
Christian Buchner 2014-03-27 00:47:12 +01:00
parent 1bb78f0258
commit 2ca6ede92b
17 changed files with 566 additions and 443 deletions

View File

@ -34,4 +34,4 @@ ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -f
# we're now targeting all major compute architectures within one binary. # we're now targeting all major compute architectures within one binary.
.cu.o: .cu.o:
$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_10,code=\"sm_10,compute_10\" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< $(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<

View File

@ -1035,7 +1035,7 @@ uninstall-am: uninstall-binPROGRAMS
# we're now targeting all major compute architectures within one binary. # we're now targeting all major compute architectures within one binary.
.cu.o: .cu.o:
$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_10,code=\"sm_10,compute_10\" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< $(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
# Tell versions [3.59,3.63) of GNU make to not export all variables. # Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded. # Otherwise a system limit (for SysV at least) may be exceeded.

View File

@ -1,5 +1,5 @@
ccMiner release 0.4 (Mar 24th 2014) - Groestlcoin Pool Release ccMiner release 0.5 (Mar 27th 2014) - "Hefty Optimization"
------------------------------------------------------------- -------------------------------------------------------------
*************************************************************** ***************************************************************
@ -38,6 +38,11 @@ its command line interface and options.
fugue256 use to mine Fuguecoin fugue256 use to mine Fuguecoin
groestl use to mine Groestlcoin groestl use to mine Groestlcoin
-d, --devices gives a comma separated list of CUDA device IDs
to operate on. Device IDs start counting from 0!
Alternatively give string names of your card like
gtx780ti or gt640#2 (matching 2nd gt640 in the PC).
-o, --url=URL URL of mining server (default: " DEF_RPC_URL ") -o, --url=URL URL of mining server (default: " DEF_RPC_URL ")
-O, --userpass=U:P username:password pair for mining server -O, --userpass=U:P username:password pair for mining server
-u, --user=USERNAME username for mining server -u, --user=USERNAME username for mining server
@ -63,8 +68,10 @@ its command line interface and options.
-V, --version display version information and exit -V, --version display version information and exit
-h, --help display this help text and exit -h, --help display this help text and exit
>>> Examples <<< >>> Examples <<<
Example for Heavycoin Mining on heavycoinpool.com with a single gpu in your system Example for Heavycoin Mining on heavycoinpool.com with a single gpu in your system
ccminer.exe -t 1 -a heavy -o stratum+tcp://stratum01.heavycoinpool.com:5333 -u <<username.worker>> -p <<workerpassword>> -v 512 ccminer.exe -t 1 -a heavy -o stratum+tcp://stratum01.heavycoinpool.com:5333 -u <<username.worker>> -p <<workerpassword>> -v 512
@ -107,22 +114,33 @@ from your old clunkers.
>>> RELEASE HISTORY <<< >>> RELEASE HISTORY <<<
March, 24 2014 fixed Groestl pool support March, 27 2014 Heavycoin exchange rates soar, and as a result this coin
gets some love: We greatly optimized the Hefty1 kernel
for speed. Expect some hefty gains, especially on 750Ti's!
went back to Compute 1.x for cuda_hefty1.cu kernel by By popular demand, we added the -d option as known from
default after numerous reports of ccminer v0.2/v0.3 cudaminer.
not working with HeavyCoin for some people.
March, 23 2014 added Groestlcoin support. stratum status unknown different compute capability builds are now provided until
(the only pool is currently down for fixing issues) we figure out how to pack everything into a single executable
in a Windows build.
March, 21 2014 use of shared memory in Fugue256 kernel boosts hash rates March, 24 2014 fixed Groestl pool support
on Fermi and Maxwell devices. Kepler may suffer slightly
(3-5%)
Fixed Stratum for Fuguecoin. Tested on dwarfpool. went back to Compute 1.x for cuda_hefty1.cu kernel by
default after numerous reports of ccminer v0.2/v0.3
not working with HeavyCoin for some people.
March, 18 2014 initial release. March, 23 2014 added Groestlcoin support. stratum status unknown
(the only pool is currently down for fixing issues)
March, 21 2014 use of shared memory in Fugue256 kernel boosts hash rates
on Fermi and Maxwell devices. Kepler may suffer slightly
(3-5%)
Fixed Stratum for Fuguecoin. Tested on dwarfpool.
March, 18 2014 initial release.
>>> AUTHORS <<< >>> AUTHORS <<<

View File

@ -95,12 +95,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
<CInterleavedPTX>true</CInterleavedPTX> <CInterleavedPTX>true</CInterleavedPTX>
</CudaCompile> </CudaCompile>
<CudaCompile> <CudaCompile>
<MaxRegCount>63</MaxRegCount> <MaxRegCount>80</MaxRegCount>
</CudaCompile> </CudaCompile>
<CudaCompile> <CudaCompile>
<PtxAsOptionV>true</PtxAsOptionV> <PtxAsOptionV>true</PtxAsOptionV>
<Keep>true</Keep> <Keep>true</Keep>
<CodeGeneration>compute_20,sm_20</CodeGeneration> <CodeGeneration>compute_35,sm_35</CodeGeneration>
<Include> <Include>
</Include> </Include>
<AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@ -127,12 +127,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
<CInterleavedPTX>true</CInterleavedPTX> <CInterleavedPTX>true</CInterleavedPTX>
</CudaCompile> </CudaCompile>
<CudaCompile> <CudaCompile>
<MaxRegCount>63</MaxRegCount> <MaxRegCount>80</MaxRegCount>
</CudaCompile> </CudaCompile>
<CudaCompile> <CudaCompile>
<PtxAsOptionV>true</PtxAsOptionV> <PtxAsOptionV>true</PtxAsOptionV>
<Keep>true</Keep> <Keep>true</Keep>
<CodeGeneration>compute_20,sm_20</CodeGeneration> <CodeGeneration>compute_35,sm_35</CodeGeneration>
<Include> <Include>
</Include> </Include>
<AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@ -163,12 +163,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
<CInterleavedPTX>true</CInterleavedPTX> <CInterleavedPTX>true</CInterleavedPTX>
</CudaCompile> </CudaCompile>
<CudaCompile> <CudaCompile>
<MaxRegCount>63</MaxRegCount> <MaxRegCount>80</MaxRegCount>
</CudaCompile> </CudaCompile>
<CudaCompile> <CudaCompile>
<PtxAsOptionV>true</PtxAsOptionV> <PtxAsOptionV>true</PtxAsOptionV>
<Keep>true</Keep> <Keep>true</Keep>
<CodeGeneration>compute_20,sm_20</CodeGeneration> <CodeGeneration>compute_35,sm_35</CodeGeneration>
<Include> <Include>
</Include> </Include>
<AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@ -199,12 +199,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
<CInterleavedPTX>true</CInterleavedPTX> <CInterleavedPTX>true</CInterleavedPTX>
</CudaCompile> </CudaCompile>
<CudaCompile> <CudaCompile>
<MaxRegCount>63</MaxRegCount> <MaxRegCount>80</MaxRegCount>
</CudaCompile> </CudaCompile>
<CudaCompile> <CudaCompile>
<PtxAsOptionV>true</PtxAsOptionV> <PtxAsOptionV>true</PtxAsOptionV>
<Keep>true</Keep> <Keep>true</Keep>
<CodeGeneration>compute_20,sm_20</CodeGeneration> <CodeGeneration>compute_35,sm_35</CodeGeneration>
<Include> <Include>
</Include> </Include>
<AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@ -277,16 +277,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
<CudaCompile Include="cuda_fugue256.cu" /> <CudaCompile Include="cuda_fugue256.cu" />
<CudaCompile Include="cuda_groestl512.cu" /> <CudaCompile Include="cuda_groestl512.cu" />
<CudaCompile Include="cuda_groestlcoin.cu" /> <CudaCompile Include="cuda_groestlcoin.cu" />
<CudaCompile Include="cuda_hefty1.cu"> <CudaCompile Include="cuda_hefty1.cu" />
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">compute_10,sm_10</CodeGeneration>
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">compute_10,sm_10</CodeGeneration>
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">124</MaxRegCount>
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">124</MaxRegCount>
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">compute_10,sm_10</CodeGeneration>
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">124</MaxRegCount>
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Release|x64'">compute_10,sm_10</CodeGeneration>
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Release|x64'">124</MaxRegCount>
</CudaCompile>
<CudaCompile Include="cuda_keccak512.cu" /> <CudaCompile Include="cuda_keccak512.cu" />
<CudaCompile Include="cuda_sha256.cu" /> <CudaCompile Include="cuda_sha256.cu" />
<CudaCompile Include="heavy.cu" /> <CudaCompile Include="heavy.cu" />

20
configure vendored
View File

@ -1,6 +1,6 @@
#! /bin/sh #! /bin/sh
# Guess values for system-dependent variables and create Makefiles. # Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.68 for ccminer 2014.03.24. # Generated by GNU Autoconf 2.68 for ccminer 2014.03.27.
# #
# #
# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@ -557,8 +557,8 @@ MAKEFLAGS=
# Identity of this package. # Identity of this package.
PACKAGE_NAME='ccminer' PACKAGE_NAME='ccminer'
PACKAGE_TARNAME='ccminer' PACKAGE_TARNAME='ccminer'
PACKAGE_VERSION='2014.03.24' PACKAGE_VERSION='2014.03.27'
PACKAGE_STRING='ccminer 2014.03.24' PACKAGE_STRING='ccminer 2014.03.27'
PACKAGE_BUGREPORT='' PACKAGE_BUGREPORT=''
PACKAGE_URL='' PACKAGE_URL=''
@ -1297,7 +1297,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing. # Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh. # This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF cat <<_ACEOF
\`configure' configures ccminer 2014.03.24 to adapt to many kinds of systems. \`configure' configures ccminer 2014.03.27 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]... Usage: $0 [OPTION]... [VAR=VALUE]...
@ -1368,7 +1368,7 @@ fi
if test -n "$ac_init_help"; then if test -n "$ac_init_help"; then
case $ac_init_help in case $ac_init_help in
short | recursive ) echo "Configuration of ccminer 2014.03.24:";; short | recursive ) echo "Configuration of ccminer 2014.03.27:";;
esac esac
cat <<\_ACEOF cat <<\_ACEOF
@ -1469,7 +1469,7 @@ fi
test -n "$ac_init_help" && exit $ac_status test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then if $ac_init_version; then
cat <<\_ACEOF cat <<\_ACEOF
ccminer configure 2014.03.24 ccminer configure 2014.03.27
generated by GNU Autoconf 2.68 generated by GNU Autoconf 2.68
Copyright (C) 2010 Free Software Foundation, Inc. Copyright (C) 2010 Free Software Foundation, Inc.
@ -1972,7 +1972,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake. running configure, to aid debugging if configure makes a mistake.
It was created by ccminer $as_me 2014.03.24, which was It was created by ccminer $as_me 2014.03.27, which was
generated by GNU Autoconf 2.68. Invocation command line was generated by GNU Autoconf 2.68. Invocation command line was
$ $0 $@ $ $0 $@
@ -2901,7 +2901,7 @@ fi
# Define the identity of the package. # Define the identity of the package.
PACKAGE='ccminer' PACKAGE='ccminer'
VERSION='2014.03.24' VERSION='2014.03.27'
cat >>confdefs.h <<_ACEOF cat >>confdefs.h <<_ACEOF
@ -7118,7 +7118,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their # report actual input values of CONFIG_FILES etc. instead of their
# values after options handling. # values after options handling.
ac_log=" ac_log="
This file was extended by ccminer $as_me 2014.03.24, which was This file was extended by ccminer $as_me 2014.03.27, which was
generated by GNU Autoconf 2.68. Invocation command line was generated by GNU Autoconf 2.68. Invocation command line was
CONFIG_FILES = $CONFIG_FILES CONFIG_FILES = $CONFIG_FILES
@ -7184,7 +7184,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\ ac_cs_version="\\
ccminer config.status 2014.03.24 ccminer config.status 2014.03.27
configured by $0, generated by GNU Autoconf 2.68, configured by $0, generated by GNU Autoconf 2.68,
with options \\"\$ac_cs_config\\" with options \\"\$ac_cs_config\\"

View File

@ -1,4 +1,4 @@
AC_INIT([ccminer], [2014.03.24]) AC_INIT([ccminer], [2014.03.27])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM

View File

@ -51,8 +51,13 @@
// from heavy.cu // from heavy.cu
#ifdef __cplusplus #ifdef __cplusplus
extern "C" extern "C"
{
#endif #endif
int cuda_num_devices(); int cuda_num_devices();
int cuda_finddevice(char *name);
#ifdef __cplusplus
}
#endif
#ifdef __linux /* Linux specific policy and affinity management */ #ifdef __linux /* Linux specific policy and affinity management */
@ -144,10 +149,11 @@ static int opt_scantime = 5;
static json_t *opt_config; static json_t *opt_config;
static const bool opt_time = true; static const bool opt_time = true;
static sha256_algos opt_algo = ALGO_HEAVY; static sha256_algos opt_algo = ALGO_HEAVY;
static int opt_n_threads; static int opt_n_threads = 0;
bool opt_trust_pool = false; bool opt_trust_pool = false;
uint16_t opt_vote = 9999; uint16_t opt_vote = 9999;
static int num_processors; static int num_processors;
int device_map[8] = {0,1,2,3,4,5,6,7}; // CB
static char *rpc_url; static char *rpc_url;
static char *rpc_userpass; static char *rpc_userpass;
static char *rpc_user, *rpc_pass; static char *rpc_user, *rpc_pass;
@ -185,7 +191,11 @@ Options:\n\
-a, --algo=ALGO specify the algorithm to use\n\ -a, --algo=ALGO specify the algorithm to use\n\
fugue256 Fuguecoin hash\n\ fugue256 Fuguecoin hash\n\
heavy Heavycoin hash\n\ heavy Heavycoin hash\n\
-v, --vote=VOTE block reward vote\n\ -d, --devices takes a comma separated list of CUDA devices to use.\n\
Device IDs start counting from 0! Alternatively takes\n\
string names of your cards like gtx780ti or gt640#2\n\
(matching 2nd gt640 in the PC)\n\
-v, --vote=VOTE block reward vote (for HeavyCoin)\n\
-m, --trust-pool trust the max block reward vote (maxvote) sent by the pool\n\ -m, --trust-pool trust the max block reward vote (maxvote) sent by the pool\n\
-o, --url=URL URL of mining server\n\ -o, --url=URL URL of mining server\n\
-O, --userpass=U:P username:password pair for mining server\n\ -O, --userpass=U:P username:password pair for mining server\n\
@ -227,7 +237,7 @@ static char const short_options[] =
#ifdef HAVE_SYSLOG_H #ifdef HAVE_SYSLOG_H
"S" "S"
#endif #endif
"a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vmv:"; "a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vd:mv:";
static struct option const options[] = { static struct option const options[] = {
{ "algo", 1, NULL, 'a' }, { "algo", 1, NULL, 'a' },
@ -259,6 +269,7 @@ static struct option const options[] = {
{ "user", 1, NULL, 'u' }, { "user", 1, NULL, 'u' },
{ "userpass", 1, NULL, 'O' }, { "userpass", 1, NULL, 'O' },
{ "version", 0, NULL, 'V' }, { "version", 0, NULL, 'V' },
{ "devices", 1, NULL, 'd' },
{ 0, 0, 0, 0 } { 0, 0, 0, 0 }
}; };
@ -1251,6 +1262,32 @@ static void parse_arg (int key, char *arg)
case 'S': case 'S':
use_syslog = true; use_syslog = true;
break; break;
case 'd': // CB
{
char * pch = strtok (arg,",");
opt_n_threads = 0;
while (pch != NULL) {
if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0')
{
if (atoi(pch) < num_processors)
device_map[opt_n_threads++] = atoi(pch);
else {
applog(LOG_ERR, "Non-existant CUDA device #%d specified in -d option", atoi(pch));
exit(1);
}
} else {
int device = cuda_finddevice(pch);
if (device >= 0 && device < num_processors)
device_map[opt_n_threads++] = device;
else {
applog(LOG_ERR, "Non-existant CUDA device '%s' specified in -d option", pch);
exit(1);
}
}
pch = strtok (NULL, ",");
}
}
break;
case 'V': case 'V':
show_version_and_exit(); show_version_and_exit();
case 'h': case 'h':
@ -1346,7 +1383,7 @@ static void signal_handler(int sig)
} }
#endif #endif
#define PROGRAM_VERSION "0.4" #define PROGRAM_VERSION "0.5"
int main(int argc, char *argv[]) int main(int argc, char *argv[])
{ {
struct thr_info *thr; struct thr_info *thr;
@ -1370,6 +1407,9 @@ int main(int argc, char *argv[])
rpc_user = strdup(""); rpc_user = strdup("");
rpc_pass = strdup(""); rpc_pass = strdup("");
pthread_mutex_init(&applog_lock, NULL);
num_processors = cuda_num_devices();
/* parse command line */ /* parse command line */
parse_cmdline(argc, argv); parse_cmdline(argc, argv);
@ -1385,7 +1425,6 @@ int main(int argc, char *argv[])
sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass); sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass);
} }
pthread_mutex_init(&applog_lock, NULL);
pthread_mutex_init(&stats_lock, NULL); pthread_mutex_init(&stats_lock, NULL);
pthread_mutex_init(&g_work_lock, NULL); pthread_mutex_init(&g_work_lock, NULL);
pthread_mutex_init(&stratum.sock_lock, NULL); pthread_mutex_init(&stratum.sock_lock, NULL);
@ -1416,7 +1455,6 @@ int main(int argc, char *argv[])
} }
#endif #endif
num_processors = cuda_num_devices();
if (num_processors == 0) if (num_processors == 0)
{ {
applog(LOG_ERR, "No CUDA devices found! terminating."); applog(LOG_ERR, "No CUDA devices found! terminating.");

View File

@ -152,7 +152,7 @@
#define PACKAGE_NAME "ccminer" #define PACKAGE_NAME "ccminer"
/* Define to the full name and version of this package. */ /* Define to the full name and version of this package. */
#define PACKAGE_STRING "ccminer 2014.03.24" #define PACKAGE_STRING "ccminer 2014.03.27"
/* Define to the one symbol short name of this package. */ /* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME #undef PACKAGE_TARNAME
@ -161,7 +161,7 @@
#undef PACKAGE_URL #undef PACKAGE_URL
/* Define to the version of this package. */ /* Define to the version of this package. */
#define PACKAGE_VERSION "2014.03.24" #define PACKAGE_VERSION "2014.03.27"
/* If using the C implementation of alloca, define if you know the /* If using the C implementation of alloca, define if you know the
direction of stack growth for your system; otherwise it will be direction of stack growth for your system; otherwise it will be

View File

@ -292,13 +292,13 @@ __host__ void blake512_cpu_setBlock(void *pdata)
__host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce) __host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
{ {
const int threadsperblock = 128; const int threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl) // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);

View File

@ -138,7 +138,7 @@ void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *h
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl) // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);

View File

@ -9,7 +9,10 @@
#define USE_SHARED 1 #define USE_SHARED 1
// heavy.cu // aus cpu-miner.c
extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Folgende Definitionen später durch header ersetzen // Folgende Definitionen später durch header ersetzen
@ -732,7 +735,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
void fugue256_cpu_init(int thr_id, int threads) void fugue256_cpu_init(int thr_id, int threads)
{ {
cudaSetDevice(thr_id); cudaSetDevice(device_map[thr_id]);
// Kopiere die Hash-Tabellen in den GPU-Speicher // Kopiere die Hash-Tabellen in den GPU-Speicher
texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256); texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
@ -774,7 +777,7 @@ __host__ void fugue256_cpu_hash(int thr_id, int threads, int startNounce, void *
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl) // Größe des dynamischen Shared Memory Bereichs
#if USE_SHARED #if USE_SHARED
size_t shared_size = 4 * 256 * sizeof(uint32_t); size_t shared_size = 4 * 256 * sizeof(uint32_t);
#else #else

View File

@ -813,7 +813,7 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl) // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);

View File

@ -9,6 +9,10 @@
#define USE_SHARED 1 #define USE_SHARED 1
// aus cpu-miner.c
extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Folgende Definitionen später durch header ersetzen // Folgende Definitionen später durch header ersetzen
@ -20,13 +24,7 @@ typedef unsigned long long uint64_t;
__constant__ uint32_t pTarget[8]; // Single GPU __constant__ uint32_t pTarget[8]; // Single GPU
extern uint32_t *d_resultNonce[8]; extern uint32_t *d_resultNonce[8];
// globaler Speicher für unsere Ergebnisse
uint32_t *d_hashGROESTLCOINoutput[8];
__constant__ uint32_t groestlcoin_gpu_state[32];
__constant__ uint32_t groestlcoin_gpu_msg[32]; __constant__ uint32_t groestlcoin_gpu_msg[32];
__constant__ uint32_t sha256coin_gpu_constantTable[64];
__constant__ uint32_t sha256coin_gpu_register[8];
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) #define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
@ -83,7 +81,13 @@ extern uint32_t T2dn_cpu[];
extern uint32_t T3up_cpu[]; extern uint32_t T3up_cpu[];
extern uint32_t T3dn_cpu[]; extern uint32_t T3dn_cpu[];
#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) #if __CUDA_ARCH__ < 350
// Kepler (Compute 3.0)
#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
#else
// Kepler (Compute 3.5)
#define S(x, n) __funnelshift_r( x, x, n );
#endif
#define R(x, n) ((x) >> (n)) #define R(x, n) ((x) >> (n))
#define Ch(x, y, z) ((x & (y ^ z)) ^ z) #define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z)) #define Maj(x, y, z) ((x & (y | z)) | (y & z))
@ -95,18 +99,57 @@ extern uint32_t T3dn_cpu[];
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) #define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
__device__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs) __device__ __forceinline__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
{ {
uint32_t t[32]; uint32_t t[32];
//#pragma unroll 14 //#pragma unroll 14
for(int r=0;r<14;r++) for(int r=0;r<14;r++)
{ {
#pragma unroll 16 switch(r)
for(int k=0;k<16;k++)
{ {
a[(k*2)+0] ^= PC32up(k * 0x10, r); case 0:
//a[(k<<1)+1] ^= PC32dn(k * 0x10, r); #pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break;
case 1:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break;
case 2:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break;
case 3:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break;
case 4:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break;
case 5:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break;
case 6:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break;
case 7:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break;
case 8:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break;
case 9:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break;
case 10:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break;
case 11:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break;
case 12:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break;
case 13:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break;
} }
// RBTT // RBTT
@ -137,18 +180,57 @@ __device__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
} }
} }
__device__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs) __device__ __forceinline__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
{ {
//#pragma unroll 14 //#pragma unroll 14
for(int r=0;r<14;r++) for(int r=0;r<14;r++)
{ {
uint32_t t[32]; uint32_t t[32];
#pragma unroll 16 switch(r)
for(int k=0;k<16;k++)
{ {
a[(k*2)+0] ^= QC32up(k * 0x10, r); case 0:
a[(k*2)+1] ^= QC32dn(k * 0x10, r); #pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break;
case 1:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break;
case 2:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break;
case 3:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break;
case 4:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break;
case 5:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break;
case 6:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break;
case 7:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break;
case 8:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break;
case 9:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break;
case 10:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break;
case 11:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break;
case 12:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break;
case 13:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break;
} }
// RBTT // RBTT
@ -179,12 +261,12 @@ __device__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
} }
} }
#if USE_SHARED #if USE_SHARED
__global__ void __launch_bounds__(256) __global__ void /* __launch_bounds__(256) */
#else #else
__global__ void __global__ void
#endif #endif
groestlcoin_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce) groestlcoin_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce)
{ {
#if USE_SHARED #if USE_SHARED
extern __shared__ char mixtabs[]; extern __shared__ char mixtabs[];
@ -204,146 +286,111 @@ __global__ void
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
/////
///// Lieber groestl, mach, dass es abgeht!!!
/////
// GROESTL // GROESTL
uint32_t message[32]; uint32_t message[32];
uint32_t state[32]; uint32_t state[32];
uint32_t g[32];
#pragma unroll 32 #pragma unroll 32
for(int k=0;k<32;k++) for(int k=0;k<32;k++) message[k] = groestlcoin_gpu_msg[k];
{
// TODO: die Vorbelegung mit Nullen braucht nicht zwingend aus dem
// constant Memory zu lesen. Das ist Verschwendung von Bandbreite.
state[k] = groestlcoin_gpu_state[k];
message[k] = groestlcoin_gpu_msg[k];
}
uint32_t nounce = startNounce + thread; uint32_t nounce = startNounce + thread;
message[19] = SWAB32(nounce); message[19] = SWAB32(nounce);
#pragma unroll 32 #pragma unroll 32
for(int u=0;u<32;u++) for(int u=0;u<32;u++) state[u] = message[u];
g[u] = message[u] ^ state[u]; // TODO: state ist fast ueberall 0. state[31] ^= 0x20000;
// Perm // Perm
#if USE_SHARED #if USE_SHARED
groestlcoin_perm_P(g, mixtabs); // TODO: g[] entspricht fast genau message[] groestlcoin_perm_P(state, mixtabs);
groestlcoin_perm_Q(message, mixtabs); // kann man das ausnutzen? state[31] ^= 0x20000;
groestlcoin_perm_Q(message, mixtabs);
#else #else
groestlcoin_perm_P(g, NULL); groestlcoin_perm_P(state, NULL);
state[31] ^= 0x20000;
groestlcoin_perm_Q(message, NULL); groestlcoin_perm_Q(message, NULL);
#endif #endif
#pragma unroll 32 #pragma unroll 32
for(int u=0;u<32;u++) for(int u=0;u<32;u++) state[u] ^= message[u];
{
// TODO: kann man evtl. das xor mit g[u] vorziehen hinter die groestlcoin_perm_P Funktion #pragma unroll 32
// was den Registerbedarf senken koennte? for(int u=0;u<32;u++) message[u] = state[u];
state[u] ^= g[u] ^ message[u];
g[u] = state[u];
}
#if USE_SHARED #if USE_SHARED
groestlcoin_perm_P(g, mixtabs); groestlcoin_perm_P(message, mixtabs);
#else #else
groestlcoin_perm_P(g, NULL); groestlcoin_perm_P(message, NULL);
#endif #endif
#pragma unroll 32 #pragma unroll 32
for(int u=0;u<32;u++) for(int u=0;u<32;u++) state[u] ^= message[u];
state[u] ^= g[u];
//// ////
//// 2. Runde groestl //// 2. Runde groestl
//// ////
#pragma unroll 16 #pragma unroll 16
for(int k=0;k<16;k++) for(int k=0;k<16;k++) message[k] = state[k + 16];
message[k] = state[k + 16]; #pragma unroll 14
for(int k=1;k<15;k++)
#pragma unroll 32
for(int k=0;k<32;k++)
state[k] = groestlcoin_gpu_state[k];
#pragma unroll 16
for(int k=0;k<16;k++)
message[k+16] = 0; message[k+16] = 0;
message[16] = 0x80; message[16] = 0x80;
message[31] = 0x01000000; message[31] = 0x01000000;
#pragma unroll 32 #pragma unroll 32
for(int u=0;u<32;u++) for(int u=0;u<32;u++)
g[u] = message[u] ^ state[u]; state[u] = message[u];
state[31] ^= 0x20000;
// Perm // Perm
#if USE_SHARED #if USE_SHARED
groestlcoin_perm_P(g, mixtabs); groestlcoin_perm_P(state, mixtabs);
state[31] ^= 0x20000;
groestlcoin_perm_Q(message, mixtabs); groestlcoin_perm_Q(message, mixtabs);
#else #else
groestlcoin_perm_P(g, NULL); groestlcoin_perm_P(state, NULL);
state[31] ^= 0x20000;
groestlcoin_perm_Q(message, NULL); groestlcoin_perm_Q(message, NULL);
#endif #endif
#pragma unroll 32 #pragma unroll 32
for(int u=0;u<32;u++) for(int u=0;u<32;u++) state[u] ^= message[u];
{
state[u] ^= g[u] ^ message[u]; #pragma unroll 32
g[u] = state[u]; for(int u=0;u<32;u++) message[u] = state[u];
}
#if USE_SHARED #if USE_SHARED
groestlcoin_perm_P(g, mixtabs); groestlcoin_perm_P(message, mixtabs);
#else #else
groestlcoin_perm_P(g, NULL); groestlcoin_perm_P(message, NULL);
#endif #endif
#pragma unroll 32 #pragma unroll 32
for(int u=0;u<32;u++) for(int u=0;u<32;u++) state[u] ^= message[u];
state[u] ^= g[u];
/*
#pragma unroll 8
for(int k=0;k<8;k++)
hash[k] = state[k+16];
*/
// kopiere Ergebnis // kopiere Ergebnis
/* int i, position = -1;
#pragma unroll 16
for(int k=0;k<16;k++)
((uint32_t*)outputHash)[16*thread+k] = state[k + 16];
*/
int i;
bool rc = true; bool rc = true;
#pragma unroll 8
for (i = 7; i >= 0; i--) { for (i = 7; i >= 0; i--) {
if (state[i+16] > pTarget[i]) { if (state[i+16] > pTarget[i]) {
rc = false; if(position < i) {
break; position = i;
} rc = false;
if (state[i+16] < pTarget[i]) { }
rc = true; }
break; if (state[i+16] < pTarget[i]) {
} if(position < i) {
position = i;
rc = true;
}
}
} }
if(rc == true) if(rc == true)
{
if(resNounce[0] > nounce) if(resNounce[0] > nounce)
{
resNounce[0] = nounce; resNounce[0] = nounce;
/*
#pragma unroll 8
for(int k=0;k<8;k++)
((uint32_t*)outputHash)[k] = (hash[k]);
*/
}
}
} }
} }
@ -360,7 +407,7 @@ __global__ void
// Setup-Funktionen // Setup-Funktionen
__host__ void groestlcoin_cpu_init(int thr_id, int threads) __host__ void groestlcoin_cpu_init(int thr_id, int threads)
{ {
cudaSetDevice(thr_id); cudaSetDevice(device_map[thr_id]);
cudaDeviceSetCacheConfig( cudaFuncCachePreferShared ); cudaDeviceSetCacheConfig( cudaFuncCachePreferShared );
// Texturen mit obigem Makro initialisieren // Texturen mit obigem Makro initialisieren
texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256); texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
@ -372,23 +419,8 @@ __host__ void groestlcoin_cpu_init(int thr_id, int threads)
texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256); texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256); texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
// setze register // Speicher für Gewinner-Nonce belegen
// TODO: fast vollstaendige Vorbelegung mit Nullen.
// da besteht doch Optimierungspotenzial im GPU Kernel
// denn mit Nullen braucht man nicht wirklich rechnen.
uint32_t groestl_state_init[32];
memset(groestl_state_init, 0, sizeof(uint32_t) * 32);
groestl_state_init[31] = 0x20000;
// state speichern
cudaMemcpyToSymbol( groestlcoin_gpu_state,
groestl_state_init,
128);
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
// Speicher für alle Ergebnisse belegen (nur für Debug)
cudaMalloc(&d_hashGROESTLCOINoutput[thr_id], 8 * sizeof(uint32_t) * threads);
} }
__host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn) __host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
@ -430,7 +462,7 @@ __host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl) // Größe des dynamischen Shared Memory Bereichs
#if USE_SHARED #if USE_SHARED
size_t shared_size = 8 * 256 * sizeof(uint32_t); size_t shared_size = 8 * 256 * sizeof(uint32_t);
#else #else
@ -440,16 +472,10 @@ __host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
//fprintf(stderr, "ThrID: %d\n", thr_id); //fprintf(stderr, "ThrID: %d\n", thr_id);
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
groestlcoin_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hashGROESTLCOINoutput[thr_id], d_resultNonce[thr_id]); groestlcoin_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
// Strategisches Sleep Kommando zur Senkung der CPU Last // Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, 0, thr_id); MyStreamSynchronize(NULL, 0, thr_id);
cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
/// Debug
//cudaMemcpy(outputHashes, d_hashGROESTLCOINoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost);
// Nounce
//cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
} }

View File

@ -2,26 +2,40 @@
#include "cuda_runtime.h" #include "cuda_runtime.h"
#include "device_launch_parameters.h" #include "device_launch_parameters.h"
// aus cpu-miner.c
extern int device_map[8];
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
#define USE_SHARED 1
// Folgende Definitionen später durch header ersetzen // Folgende Definitionen später durch header ersetzen
typedef unsigned int uint32_t; typedef unsigned int uint32_t;
typedef unsigned char uint8_t; typedef unsigned char uint8_t;
typedef unsigned short uint16_t; typedef unsigned short uint16_t;
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props;
// globaler Speicher für alle HeftyHashes aller Threads // globaler Speicher für alle HeftyHashes aller Threads
uint32_t *d_heftyHashes[8]; uint32_t *d_heftyHashes[8];
/* Hash-Tabellen */ /* Hash-Tabellen */
__constant__ uint32_t hefty_gpu_constantTable[64]; __constant__ uint32_t hefty_gpu_constantTable[64];
#if USE_SHARED
#define heftyLookUp(x) (*((uint32_t*)heftytab + (x)))
#else
#define heftyLookUp(x) hefty_gpu_constantTable[x]
#endif
// muss expandiert werden // muss expandiert werden
__constant__ uint32_t hefty_gpu_blockHeader[16]; // 2x512 Bit Message __constant__ uint32_t hefty_gpu_blockHeader[16]; // 2x512 Bit Message
__constant__ uint32_t hefty_gpu_register[8]; __constant__ uint32_t hefty_gpu_register[8];
__constant__ uint32_t hefty_gpu_sponge[4]; __constant__ uint32_t hefty_gpu_sponge[4];
uint32_t hefty_cpu_hashTable[] = { 0x6a09e667UL, uint32_t hefty_cpu_hashTable[] = {
0x6a09e667UL,
0xbb67ae85UL, 0xbb67ae85UL,
0x3c6ef372UL, 0x3c6ef372UL,
0xa54ff53aUL, 0xa54ff53aUL,
@ -29,8 +43,9 @@ uint32_t hefty_cpu_hashTable[] = { 0x6a09e667UL,
0x9b05688cUL, 0x9b05688cUL,
0x1f83d9abUL, 0x1f83d9abUL,
0x5be0cd19UL }; 0x5be0cd19UL };
uint32_t hefty_cpu_constantTable[] = { uint32_t hefty_cpu_constantTable[] = {
0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL, 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL,
0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL, 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL,
@ -48,350 +63,352 @@ uint32_t hefty_cpu_constantTable[] = {
0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
}; };
#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) //#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
#define R(x, n) ((x) >> (n)) static __host__ __device__ uint32_t S(uint32_t x, int n)
#define Ch(x, y, z) ((x & (y ^ z)) ^ z) {
#define Maj(x, y, z) ((x & (y | z)) | (y & z)) return (((x) >> (n)) | ((x) << (32 - (n))));
#define S0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22)) }
#define S1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25)) #define R(x, n) ((x) >> (n))
#define s0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) #define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define s1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) #define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define S0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22))
#define S1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25))
#define s0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3))
#define s1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10))
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) #define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
// uint8_t // uint8_t
#define smoosh4(x) ( ((x)>>4) ^ ((x) & 0x0F) ) #define smoosh4(x) ( ((x)>>4) ^ ((x) & 0x0F) )
__host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x) __host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x)
{ {
uint16_t w = (x >> 16) ^ (x & 0xffff); uint16_t w = (x >> 16) ^ (x & 0xffff);
uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) ); uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) );
return (n >> 2) ^ (n & 0x03); return 24 - (((n >> 2) ^ (n & 0x03)) << 3);
} }
// 4 auf einmal // 4 auf einmal
#define smoosh4Quad(x) ( (((x)>>4) ^ (x)) & 0x0F0F0F0F ) #define smoosh4Quad(x) ( (((x)>>4) ^ (x)) & 0x0F0F0F0F )
#define getByte(x,y) ( ((x) >> (y)) & 0xFF ) #define getByte(x,y) ( ((x) >> (y)) & 0xFF )
__host__ __device__ void Mangle(uint32_t *inp) __host__ __forceinline__ __device__ void Mangle(uint32_t *inp)
{ {
uint32_t r = smoosh4Quad(inp[0]); uint32_t r = smoosh4Quad(inp[0]);
//uint8_t r0 = smoosh4( (uint8_t)(inp[0] >> 24) ); uint32_t inp0org;
//uint8_t r1 = smoosh4( (uint8_t)(inp[0] >> 16) ); uint32_t tmp0Mask, tmp1Mask;
//uint8_t r2 = smoosh4( (uint8_t)(inp[0] >> 8) ); uint32_t in1, in2, isAddition;
//uint8_t r3 = smoosh4( (uint8_t)(inp[0] & 0xFF) ); uint32_t tmp;
uint8_t b;
inp[1] = inp[1] ^ S(inp[0], getByte(r, 24)); inp[1] = inp[1] ^ S(inp[0], getByte(r, 24));
switch (smoosh2(inp[1])) { r += 0x01010101;
case 0: inp[2] ^= S(inp[0], 1 + getByte(r,24)); break; tmp = smoosh2(inp[1]);
case 1: inp[2] += S(~inp[0], 1 + getByte(r,16)); break; b = getByte(r,tmp);
case 2: inp[2] &= S(~inp[0], 1 + getByte(r,8)); break; inp0org = S(inp[0], b);
case 3: inp[2] ^= S(inp[0], 1 + getByte(r,0)); break; tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0
} tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0
uint32_t tmp = smoosh2(inp[1] ^ inp[2]); in1 = (inp[2] & ~inp0org) |
switch (tmp) { (tmp1Mask & ~inp[2] & inp0org) |
case 0: inp[3] ^= S(inp[0], 2 + getByte(r,24)); break; (~tmp0Mask & ~inp[2] & inp0org);
case 1: inp[3] += S(~inp[0], 2 + getByte(r,16)); break; in2 = inp[2] += ~inp0org;
case 2: inp[3] &= S(~inp[0], 2 + getByte(r,8)); break; isAddition = ~tmp0Mask & tmp1Mask;
case 3: inp[3] ^= S(inp[0], 2 + getByte(r,0)); break; inp[2] = isAddition ? in2 : in1;
}
r += 0x01010101;
tmp = smoosh2(inp[1] ^ inp[2]);
b = getByte(r,tmp);
inp0org = S(inp[0], b);
tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0
tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0
inp[0] ^= (inp[1] ^ inp[2]) + inp[3]; in1 = (inp[3] & ~inp0org) |
(tmp1Mask & ~inp[3] & inp0org) |
(~tmp0Mask & ~inp[3] & inp0org);
in2 = inp[3] += ~inp0org;
isAddition = ~tmp0Mask & tmp1Mask;
inp[3] = isAddition ? in2 : in1;
inp[0] ^= (inp[1] ^ inp[2]) + inp[3];
} }
__host__ __forceinline__ __device__ void Absorb(uint32_t *inp, uint32_t x) __host__ __forceinline__ __device__ void Absorb(uint32_t *inp, uint32_t x)
{ {
inp[0] ^= x; inp[0] ^= x;
Mangle(inp); Mangle(inp);
} }
__host__ __forceinline__ __device__ uint32_t Squeeze(uint32_t *inp) __host__ __forceinline__ __device__ uint32_t Squeeze(uint32_t *inp)
{ {
uint32_t y = inp[0]; uint32_t y = inp[0];
Mangle(inp); Mangle(inp);
return y; return y;
} }
__host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x) __host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x)
{ {
uint32_t r = Squeeze(sponge); uint32_t r = Squeeze(sponge);
uint32_t t = ((r >> 8) & 0x1F);
uint32_t y = 1 << t;
//uint8_t r0 = r >> 8; uint32_t a = (((r>>1) & 0x01) << t) & y;
uint8_t r1 = r & 0xFF; uint32_t b = ((r & 0x01) << t) & y;
uint32_t y = 1 << ((r >> 8) & 0x1F); uint32_t c = x & y;
//uint32_t retVal; uint32_t retVal = (x & ~y) | (~b & c) | (a & ~c);
//retVal = x; return retVal;
uint32_t resArr[4];
resArr[0] = x;
resArr[1] = x & ~y;
resArr[2] = x | y;
resArr[3] = x ^ y;
return resArr[r1 & 0x03];
/*
switch(r1 & 0x03)
{
case 0:
break;
case 1:
retVal = x & ~y;
break;
case 2:
retVal = x | y;
break;
case 3:
retVal = x ^ y;
break;
}
return retVal;
*/
} }
__forceinline__ __device__ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge) __forceinline__ __device__ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
{ {
uint32_t tmpBr; uint32_t tmpBr;
uint32_t brG = Br(sponge, regs[6]); uint32_t brG = Br(sponge, regs[6]);
uint32_t brF = Br(sponge, regs[5]); uint32_t brF = Br(sponge, regs[5]);
uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K; uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
uint32_t brE = Br(sponge, regs[4]); uint32_t brE = Br(sponge, regs[4]);
uint32_t tmp2 = tmp1 + S1(brE); uint32_t tmp2 = tmp1 + S1(brE);
uint32_t brC = Br(sponge, regs[2]); uint32_t brC = Br(sponge, regs[2]);
uint32_t brB = Br(sponge, regs[1]); uint32_t brB = Br(sponge, regs[1]);
uint32_t brA = Br(sponge, regs[0]); uint32_t brA = Br(sponge, regs[0]);
uint32_t tmp3 = Maj(brA, brB, brC); uint32_t tmp3 = Maj(brA, brB, brC);
tmpBr = Br(sponge, regs[0]); tmpBr = Br(sponge, regs[0]);
uint32_t tmp4 = tmp3 + S0(tmpBr); uint32_t tmp4 = tmp3 + S0(tmpBr);
tmpBr = Br(sponge, tmp2); tmpBr = Br(sponge, tmp2);
#pragma unroll 7 #pragma unroll 7
for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
regs[0] = tmp2 + tmp4; regs[0] = tmp2 + tmp4;
regs[4] += tmpBr; regs[4] += tmpBr;
} }
__host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge) __host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
{ {
uint32_t tmpBr; uint32_t tmpBr;
uint32_t brG = Br(sponge, regs[6]); uint32_t brG = Br(sponge, regs[6]);
uint32_t brF = Br(sponge, regs[5]); uint32_t brF = Br(sponge, regs[5]);
uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K; uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
uint32_t brE = Br(sponge, regs[4]); uint32_t brE = Br(sponge, regs[4]);
uint32_t tmp2 = tmp1 + S1(brE); uint32_t tmp2 = tmp1 + S1(brE);
uint32_t brC = Br(sponge, regs[2]); uint32_t brC = Br(sponge, regs[2]);
uint32_t brB = Br(sponge, regs[1]); uint32_t brB = Br(sponge, regs[1]);
uint32_t brA = Br(sponge, regs[0]); uint32_t brA = Br(sponge, regs[0]);
uint32_t tmp3 = Maj(brA, brB, brC); uint32_t tmp3 = Maj(brA, brB, brC);
tmpBr = Br(sponge, regs[0]); tmpBr = Br(sponge, regs[0]);
uint32_t tmp4 = tmp3 + S0(tmpBr); uint32_t tmp4 = tmp3 + S0(tmpBr);
tmpBr = Br(sponge, tmp2); tmpBr = Br(sponge, tmp2);
for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
regs[0] = tmp2 + tmp4; regs[0] = tmp2 + tmp4;
regs[4] += tmpBr; regs[4] += tmpBr;
} }
// Die Hash-Funktion // Die Hash-Funktion
__global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHash) __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHash)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); #if USE_SHARED
if (thread < threads) extern __shared__ char heftytab[];
{ if(threadIdx.x < 64)
// bestimme den aktuellen Zähler {
uint32_t nounce = startNounce + thread; *((uint32_t*)heftytab + threadIdx.x) = hefty_gpu_constantTable[threadIdx.x];
}
// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
#if USE_SHARED __syncthreads();
extern __shared__ unsigned char s[];
uint32_t *W = (uint32_t *)(&s[W_ALIGNMENT * sizeof(uint32_t) * threadIdx.x]);
#else
// reduktion von 256 byte auf 128 byte
uint32_t W1[16];
uint32_t W2[16];
#endif #endif
// Initialisiere die register a bis h mit der Hash-Tabelle int thread = (blockDim.x * blockIdx.x + threadIdx.x);
uint32_t regs[8]; if (thread < threads)
uint32_t hash[8]; {
uint32_t sponge[4]; // bestimme den aktuellen Zähler
uint32_t nounce = startNounce + thread;
// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
// reduktion von 256 byte auf 128 byte
uint32_t W1[16];
uint32_t W2[16];
// Initialisiere die register a bis h mit der Hash-Tabelle
uint32_t regs[8];
uint32_t hash[8];
uint32_t sponge[4];
#pragma unroll 4 #pragma unroll 4
for(int k=0; k < 4; k++) for(int k=0; k < 4; k++)
sponge[k] = hefty_gpu_sponge[k]; sponge[k] = hefty_gpu_sponge[k];
// pre // pre
#pragma unroll 8 #pragma unroll 8
for (int k=0; k < 8; k++) for (int k=0; k < 8; k++)
{ {
regs[k] = hefty_gpu_register[k]; regs[k] = hefty_gpu_register[k];
hash[k] = regs[k]; hash[k] = regs[k];
} }
//memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // verbleibende 20 bytes aus Block 2 plus padding //memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // verbleibende 20 bytes aus Block 2 plus padding
#pragma unroll 16 #pragma unroll 16
for(int k=0;k<16;k++) for(int k=0;k<16;k++)
W1[k] = hefty_gpu_blockHeader[k]; W1[k] = hefty_gpu_blockHeader[k];
W1[3] = SWAB32(nounce); W1[3] = SWAB32(nounce);
// 2. Runde
// 2. Runde
#pragma unroll 16 #pragma unroll 16
for(int j=0;j<16;j++) for(int j=0;j<16;j++)
Absorb(sponge, W1[j] ^ hefty_gpu_constantTable[j]); Absorb(sponge, W1[j] ^ heftyLookUp(j));
// Progress W1 (Bytes 0...63) // Progress W1 (Bytes 0...63)
#pragma unroll 16 #pragma unroll 16
for(int j=0;j<16;j++) for(int j=0;j<16;j++)
{ {
Absorb(sponge, regs[3] ^ regs[7]); Absorb(sponge, regs[3] ^ regs[7]);
hefty_gpu_round(regs, W1[j], hefty_gpu_constantTable[j], sponge); hefty_gpu_round(regs, W1[j], heftyLookUp(j), sponge);
} }
// Progress W2 (Bytes 64...127) then W3 (Bytes 128...191) ... // Progress W2 (Bytes 64...127) then W3 (Bytes 128...191) ...
#pragma unroll 3 #pragma unroll 3
for(int k=0;k<3;k++) for(int k=0;k<3;k++)
{ {
#pragma unroll 2 #pragma unroll 2
for(int j=0;j<2;j++) for(int j=0;j<2;j++)
W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
#pragma unroll 5 #pragma unroll 5
for(int j=2;j<7;j++) for(int j=2;j<7;j++)
W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j]; W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
#pragma unroll 8 #pragma unroll 8
for(int j=7;j<15;j++) for(int j=7;j<15;j++)
W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j]; W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
#pragma unroll 16 #pragma unroll 16
for(int j=0;j<16;j++) for(int j=0;j<16;j++)
{ {
Absorb(sponge, regs[3] + regs[7]); Absorb(sponge, regs[3] + regs[7]);
hefty_gpu_round(regs, W2[j], hefty_gpu_constantTable[j + 16 * (k+1)], sponge); hefty_gpu_round(regs, W2[j], heftyLookUp(j + 16 * (k+1)), sponge);
} }
#pragma unroll 16 #pragma unroll 16
for(int j=0;j<16;j++) for(int j=0;j<16;j++)
W1[j] = W2[j]; W1[j] = W2[j];
} }
#pragma unroll 8
for(int k=0;k<8;k++)
hash[k] += regs[k];
#pragma unroll 8 #pragma unroll 8
for(int k=0;k<8;k++) for(int k=0;k<8;k++)
hash[k] += regs[k]; ((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]);
}
#pragma unroll 8
for(int k=0;k<8;k++)
((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]);
}
} }
// Setup-Funktionen // Setup-Funktionen
__host__ void hefty_cpu_init(int thr_id, int threads) __host__ void hefty_cpu_init(int thr_id, int threads)
{ {
cudaSetDevice(thr_id); cudaSetDevice(device_map[thr_id]);
// Kopiere die Hash-Tabellen in den GPU-Speicher cudaGetDeviceProperties(&props, device_map[thr_id]);
cudaMemcpyToSymbol( hefty_gpu_constantTable,
hefty_cpu_constantTable,
sizeof(uint32_t) * 64 );
// Speicher für alle Hefty1 hashes belegen // Kopiere die Hash-Tabellen in den GPU-Speicher
cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads); cudaMemcpyToSymbol( hefty_gpu_constantTable,
hefty_cpu_constantTable,
sizeof(uint32_t) * 64 );
// Speicher für alle Hefty1 hashes belegen
cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads);
} }
__host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data) __host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data)
// data muss 84-Byte haben! // data muss 84-Byte haben!
{ {
// Nachricht expandieren und setzen // Nachricht expandieren und setzen
uint32_t msgBlock[32]; uint32_t msgBlock[32];
memset(msgBlock, 0, sizeof(uint32_t) * 32); memset(msgBlock, 0, sizeof(uint32_t) * 32);
memcpy(&msgBlock[0], data, 84); memcpy(&msgBlock[0], data, 84);
msgBlock[21] |= 0x80; msgBlock[21] |= 0x80;
msgBlock[31] = 672; // bitlen msgBlock[31] = 672; // bitlen
for(int i=0;i<31;i++) // Byteorder drehen for(int i=0;i<31;i++) // Byteorder drehen
msgBlock[i] = SWAB32(msgBlock[i]); msgBlock[i] = SWAB32(msgBlock[i]);
// die erste Runde wird auf der CPU durchgeführt, da diese für // die erste Runde wird auf der CPU durchgeführt, da diese für
// alle Threads gleich ist. Der Hash wird dann an die Threads // alle Threads gleich ist. Der Hash wird dann an die Threads
// übergeben // übergeben
// Erstelle expandierten Block W // Erstelle expandierten Block W
uint32_t W[64]; uint32_t W[64];
memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16); memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);
for(int j=16;j<64;j++) for(int j=16;j<64;j++)
W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16]; W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
// Initialisiere die register a bis h mit der Hash-Tabelle // Initialisiere die register a bis h mit der Hash-Tabelle
uint32_t regs[8]; uint32_t regs[8];
uint32_t hash[8]; uint32_t hash[8];
uint32_t sponge[4]; uint32_t sponge[4];
// pre // pre
memset(sponge, 0, sizeof(uint32_t) * 4); memset(sponge, 0, sizeof(uint32_t) * 4);
for (int k=0; k < 8; k++) for (int k=0; k < 8; k++)
{ {
regs[k] = hefty_cpu_hashTable[k]; regs[k] = hefty_cpu_hashTable[k];
hash[k] = regs[k]; hash[k] = regs[k];
} }
// 1. Runde // 1. Runde
for(int j=0;j<16;j++) for(int j=0;j<16;j++)
Absorb(sponge, W[j] ^ hefty_cpu_constantTable[j]); Absorb(sponge, W[j] ^ hefty_cpu_constantTable[j]);
for(int j=0;j<16;j++) for(int j=0;j<16;j++)
{ {
Absorb(sponge, regs[3] ^ regs[7]); Absorb(sponge, regs[3] ^ regs[7]);
hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge); hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge);
} }
for(int j=16;j<64;j++) for(int j=16;j<64;j++)
{ {
Absorb(sponge, regs[3] + regs[7]); Absorb(sponge, regs[3] + regs[7]);
hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge); hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge);
} }
for(int k=0;k<8;k++) for(int k=0;k<8;k++)
hash[k] += regs[k]; hash[k] += regs[k];
// sponge speichern // sponge speichern
cudaMemcpyToSymbol( hefty_gpu_sponge, cudaMemcpyToSymbol( hefty_gpu_sponge,
sponge, sponge,
sizeof(uint32_t) * 4 ); sizeof(uint32_t) * 4 );
// hash speichern // hash speichern
cudaMemcpyToSymbol( hefty_gpu_register, cudaMemcpyToSymbol( hefty_gpu_register,
hash, hash,
sizeof(uint32_t) * 8 ); sizeof(uint32_t) * 8 );
// Blockheader setzen (korrekte Nonce fehlt da drin noch) // Blockheader setzen (korrekte Nonce fehlt da drin noch)
cudaMemcpyToSymbol( hefty_gpu_blockHeader, cudaMemcpyToSymbol( hefty_gpu_blockHeader,
&msgBlock[16], &msgBlock[16],
64); 64);
} }
__host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce) __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
{ {
const int threadsperblock = 128; // Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
// alle anderen mit 512 Threads.
int threadsperblock = (props.major >= 3) ? 768 : 512;
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl) // Größe des dynamischen Shared Memory Bereichs
#if USE_SHARED #if USE_SHARED
size_t shared_size = W_ALIGNMENT*sizeof(uint32_t)*threadsperblock; // ein uint32_t eingefügt gegen Bank Konflikte size_t shared_size = 8 * 64 * sizeof(uint32_t);
#else #else
size_t shared_size = 0; size_t shared_size = 0;
#endif #endif
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]); hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]);
} }

View File

@ -264,7 +264,7 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl) // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);

View File

@ -5,8 +5,6 @@
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
#define W_ALIGNMENT 65
// Folgende Definitionen später durch header ersetzen // Folgende Definitionen später durch header ersetzen
typedef unsigned int uint32_t; typedef unsigned int uint32_t;
@ -59,8 +57,6 @@ __global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void *outputH
nonceVector[thread] = nounce; nonceVector[thread] = nounce;
// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory // jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
//extern __shared__ unsigned char s[];
//uint32_t *W = (uint32_t *)(&s[W_ALIGNMENT * sizeof(uint32_t) * threadIdx.x]);
uint32_t W1[16]; uint32_t W1[16];
uint32_t W2[16]; uint32_t W2[16];
@ -257,14 +253,13 @@ __host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashe
__host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce) __host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce)
{ {
const int threadsperblock = 128; const int threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl) // Größe des dynamischen Shared Memory Bereichs
//size_t shared_size = W_ALIGNMENT*sizeof(uint32_t)*threadsperblock; // ein uint32_t eingefügt gegen Bank Konflikte
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);

View File

@ -163,6 +163,41 @@ extern "C" int cuda_num_devices()
return GPU_N; return GPU_N;
} }
static bool substringsearch(const char *haystack, const char *needle, int &match)
{
int hlen = strlen(haystack);
int nlen = strlen(needle);
for (int i=0; i < hlen; ++i)
{
if (haystack[i] == ' ') continue;
int j=0, x = 0;
while(j < nlen)
{
if (haystack[i+x] == ' ') {++x; continue;}
if (needle[j] == ' ') {++j; continue;}
if (needle[j] == '#') return ++match == needle[j+1]-'0';
if (tolower(haystack[i+x]) != tolower(needle[j])) break;
++j; ++x;
}
if (j == nlen) return true;
}
return false;
}
// CUDA Gerät nach Namen finden (gibt Geräte-Index zurück oder -1)
extern "C" int cuda_finddevice(char *name)
{
int num = cuda_num_devices();
int match = 0;
for (int i=0; i < num; ++i)
{
cudaDeviceProp props;
if (cudaGetDeviceProperties(&props, i) == cudaSuccess)
if (substringsearch(props.name, name, match)) return i;
}
return -1;
}
// Zeitsynchronisations-Routine von cudaminer mit CPU sleep // Zeitsynchronisations-Routine von cudaminer mit CPU sleep
typedef struct { double value[8]; } tsumarray; typedef struct { double value[8]; } tsumarray;
cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id) cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)