update to version v0.5 (2014-03-27)
This commit is contained in:
parent
1bb78f0258
commit
2ca6ede92b
@ -34,4 +34,4 @@ ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -f
|
||||
|
||||
# we're now targeting all major compute architectures within one binary.
|
||||
.cu.o:
|
||||
$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_10,code=\"sm_10,compute_10\" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
||||
$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
||||
|
@ -1035,7 +1035,7 @@ uninstall-am: uninstall-binPROGRAMS
|
||||
|
||||
# we're now targeting all major compute architectures within one binary.
|
||||
.cu.o:
|
||||
$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_10,code=\"sm_10,compute_10\" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
||||
$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
||||
|
||||
# Tell versions [3.59,3.63) of GNU make to not export all variables.
|
||||
# Otherwise a system limit (for SysV at least) may be exceeded.
|
||||
|
42
README.txt
42
README.txt
@ -1,5 +1,5 @@
|
||||
|
||||
ccMiner release 0.4 (Mar 24th 2014) - Groestlcoin Pool Release
|
||||
ccMiner release 0.5 (Mar 27th 2014) - "Hefty Optimization"
|
||||
-------------------------------------------------------------
|
||||
|
||||
***************************************************************
|
||||
@ -38,6 +38,11 @@ its command line interface and options.
|
||||
fugue256 use to mine Fuguecoin
|
||||
groestl use to mine Groestlcoin
|
||||
|
||||
-d, --devices gives a comma separated list of CUDA device IDs
|
||||
to operate on. Device IDs start counting from 0!
|
||||
Alternatively give string names of your card like
|
||||
gtx780ti or gt640#2 (matching 2nd gt640 in the PC).
|
||||
|
||||
-o, --url=URL URL of mining server (default: " DEF_RPC_URL ")
|
||||
-O, --userpass=U:P username:password pair for mining server
|
||||
-u, --user=USERNAME username for mining server
|
||||
@ -63,8 +68,10 @@ its command line interface and options.
|
||||
-V, --version display version information and exit
|
||||
-h, --help display this help text and exit
|
||||
|
||||
|
||||
>>> Examples <<<
|
||||
|
||||
|
||||
Example for Heavycoin Mining on heavycoinpool.com with a single gpu in your system
|
||||
|
||||
ccminer.exe -t 1 -a heavy -o stratum+tcp://stratum01.heavycoinpool.com:5333 -u <<username.worker>> -p <<workerpassword>> -v 512
|
||||
@ -107,22 +114,33 @@ from your old clunkers.
|
||||
|
||||
>>> RELEASE HISTORY <<<
|
||||
|
||||
March, 24 2014 fixed Groestl pool support
|
||||
March, 27 2014 Heavycoin exchange rates soar, and as a result this coin
|
||||
gets some love: We greatly optimized the Hefty1 kernel
|
||||
for speed. Expect some hefty gains, especially on 750Ti's!
|
||||
|
||||
went back to Compute 1.x for cuda_hefty1.cu kernel by
|
||||
default after numerous reports of ccminer v0.2/v0.3
|
||||
not working with HeavyCoin for some people.
|
||||
By popular demand, we added the -d option as known from
|
||||
cudaminer.
|
||||
|
||||
March, 23 2014 added Groestlcoin support. stratum status unknown
|
||||
(the only pool is currently down for fixing issues)
|
||||
different compute capability builds are now provided until
|
||||
we figure out how to pack everything into a single executable
|
||||
in a Windows build.
|
||||
|
||||
March, 21 2014 use of shared memory in Fugue256 kernel boosts hash rates
|
||||
on Fermi and Maxwell devices. Kepler may suffer slightly
|
||||
(3-5%)
|
||||
March, 24 2014 fixed Groestl pool support
|
||||
|
||||
Fixed Stratum for Fuguecoin. Tested on dwarfpool.
|
||||
went back to Compute 1.x for cuda_hefty1.cu kernel by
|
||||
default after numerous reports of ccminer v0.2/v0.3
|
||||
not working with HeavyCoin for some people.
|
||||
|
||||
March, 18 2014 initial release.
|
||||
March, 23 2014 added Groestlcoin support. stratum status unknown
|
||||
(the only pool is currently down for fixing issues)
|
||||
|
||||
March, 21 2014 use of shared memory in Fugue256 kernel boosts hash rates
|
||||
on Fermi and Maxwell devices. Kepler may suffer slightly
|
||||
(3-5%)
|
||||
|
||||
Fixed Stratum for Fuguecoin. Tested on dwarfpool.
|
||||
|
||||
March, 18 2014 initial release.
|
||||
|
||||
|
||||
>>> AUTHORS <<<
|
||||
|
@ -95,12 +95,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
|
||||
<CInterleavedPTX>true</CInterleavedPTX>
|
||||
</CudaCompile>
|
||||
<CudaCompile>
|
||||
<MaxRegCount>63</MaxRegCount>
|
||||
<MaxRegCount>80</MaxRegCount>
|
||||
</CudaCompile>
|
||||
<CudaCompile>
|
||||
<PtxAsOptionV>true</PtxAsOptionV>
|
||||
<Keep>true</Keep>
|
||||
<CodeGeneration>compute_20,sm_20</CodeGeneration>
|
||||
<CodeGeneration>compute_35,sm_35</CodeGeneration>
|
||||
<Include>
|
||||
</Include>
|
||||
<AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||
@ -127,12 +127,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
|
||||
<CInterleavedPTX>true</CInterleavedPTX>
|
||||
</CudaCompile>
|
||||
<CudaCompile>
|
||||
<MaxRegCount>63</MaxRegCount>
|
||||
<MaxRegCount>80</MaxRegCount>
|
||||
</CudaCompile>
|
||||
<CudaCompile>
|
||||
<PtxAsOptionV>true</PtxAsOptionV>
|
||||
<Keep>true</Keep>
|
||||
<CodeGeneration>compute_20,sm_20</CodeGeneration>
|
||||
<CodeGeneration>compute_35,sm_35</CodeGeneration>
|
||||
<Include>
|
||||
</Include>
|
||||
<AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||
@ -163,12 +163,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
|
||||
<CInterleavedPTX>true</CInterleavedPTX>
|
||||
</CudaCompile>
|
||||
<CudaCompile>
|
||||
<MaxRegCount>63</MaxRegCount>
|
||||
<MaxRegCount>80</MaxRegCount>
|
||||
</CudaCompile>
|
||||
<CudaCompile>
|
||||
<PtxAsOptionV>true</PtxAsOptionV>
|
||||
<Keep>true</Keep>
|
||||
<CodeGeneration>compute_20,sm_20</CodeGeneration>
|
||||
<CodeGeneration>compute_35,sm_35</CodeGeneration>
|
||||
<Include>
|
||||
</Include>
|
||||
<AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||
@ -199,12 +199,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
|
||||
<CInterleavedPTX>true</CInterleavedPTX>
|
||||
</CudaCompile>
|
||||
<CudaCompile>
|
||||
<MaxRegCount>63</MaxRegCount>
|
||||
<MaxRegCount>80</MaxRegCount>
|
||||
</CudaCompile>
|
||||
<CudaCompile>
|
||||
<PtxAsOptionV>true</PtxAsOptionV>
|
||||
<Keep>true</Keep>
|
||||
<CodeGeneration>compute_20,sm_20</CodeGeneration>
|
||||
<CodeGeneration>compute_35,sm_35</CodeGeneration>
|
||||
<Include>
|
||||
</Include>
|
||||
<AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||
@ -277,16 +277,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
|
||||
<CudaCompile Include="cuda_fugue256.cu" />
|
||||
<CudaCompile Include="cuda_groestl512.cu" />
|
||||
<CudaCompile Include="cuda_groestlcoin.cu" />
|
||||
<CudaCompile Include="cuda_hefty1.cu">
|
||||
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">compute_10,sm_10</CodeGeneration>
|
||||
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">compute_10,sm_10</CodeGeneration>
|
||||
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">124</MaxRegCount>
|
||||
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">124</MaxRegCount>
|
||||
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">compute_10,sm_10</CodeGeneration>
|
||||
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">124</MaxRegCount>
|
||||
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Release|x64'">compute_10,sm_10</CodeGeneration>
|
||||
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Release|x64'">124</MaxRegCount>
|
||||
</CudaCompile>
|
||||
<CudaCompile Include="cuda_hefty1.cu" />
|
||||
<CudaCompile Include="cuda_keccak512.cu" />
|
||||
<CudaCompile Include="cuda_sha256.cu" />
|
||||
<CudaCompile Include="heavy.cu" />
|
||||
|
20
configure
vendored
20
configure
vendored
@ -1,6 +1,6 @@
|
||||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.68 for ccminer 2014.03.24.
|
||||
# Generated by GNU Autoconf 2.68 for ccminer 2014.03.27.
|
||||
#
|
||||
#
|
||||
# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
|
||||
@ -557,8 +557,8 @@ MAKEFLAGS=
|
||||
# Identity of this package.
|
||||
PACKAGE_NAME='ccminer'
|
||||
PACKAGE_TARNAME='ccminer'
|
||||
PACKAGE_VERSION='2014.03.24'
|
||||
PACKAGE_STRING='ccminer 2014.03.24'
|
||||
PACKAGE_VERSION='2014.03.27'
|
||||
PACKAGE_STRING='ccminer 2014.03.27'
|
||||
PACKAGE_BUGREPORT=''
|
||||
PACKAGE_URL=''
|
||||
|
||||
@ -1297,7 +1297,7 @@ if test "$ac_init_help" = "long"; then
|
||||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures ccminer 2014.03.24 to adapt to many kinds of systems.
|
||||
\`configure' configures ccminer 2014.03.27 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
@ -1368,7 +1368,7 @@ fi
|
||||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of ccminer 2014.03.24:";;
|
||||
short | recursive ) echo "Configuration of ccminer 2014.03.27:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
@ -1469,7 +1469,7 @@ fi
|
||||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
ccminer configure 2014.03.24
|
||||
ccminer configure 2014.03.27
|
||||
generated by GNU Autoconf 2.68
|
||||
|
||||
Copyright (C) 2010 Free Software Foundation, Inc.
|
||||
@ -1972,7 +1972,7 @@ cat >config.log <<_ACEOF
|
||||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by ccminer $as_me 2014.03.24, which was
|
||||
It was created by ccminer $as_me 2014.03.27, which was
|
||||
generated by GNU Autoconf 2.68. Invocation command line was
|
||||
|
||||
$ $0 $@
|
||||
@ -2901,7 +2901,7 @@ fi
|
||||
|
||||
# Define the identity of the package.
|
||||
PACKAGE='ccminer'
|
||||
VERSION='2014.03.24'
|
||||
VERSION='2014.03.27'
|
||||
|
||||
|
||||
cat >>confdefs.h <<_ACEOF
|
||||
@ -7118,7 +7118,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
||||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by ccminer $as_me 2014.03.24, which was
|
||||
This file was extended by ccminer $as_me 2014.03.27, which was
|
||||
generated by GNU Autoconf 2.68. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
@ -7184,7 +7184,7 @@ _ACEOF
|
||||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
||||
ac_cs_version="\\
|
||||
ccminer config.status 2014.03.24
|
||||
ccminer config.status 2014.03.27
|
||||
configured by $0, generated by GNU Autoconf 2.68,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
AC_INIT([ccminer], [2014.03.24])
|
||||
AC_INIT([ccminer], [2014.03.27])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
50
cpu-miner.c
50
cpu-miner.c
@ -51,8 +51,13 @@
|
||||
// from heavy.cu
|
||||
#ifdef __cplusplus
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
int cuda_num_devices();
|
||||
int cuda_finddevice(char *name);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef __linux /* Linux specific policy and affinity management */
|
||||
@ -144,10 +149,11 @@ static int opt_scantime = 5;
|
||||
static json_t *opt_config;
|
||||
static const bool opt_time = true;
|
||||
static sha256_algos opt_algo = ALGO_HEAVY;
|
||||
static int opt_n_threads;
|
||||
static int opt_n_threads = 0;
|
||||
bool opt_trust_pool = false;
|
||||
uint16_t opt_vote = 9999;
|
||||
static int num_processors;
|
||||
int device_map[8] = {0,1,2,3,4,5,6,7}; // CB
|
||||
static char *rpc_url;
|
||||
static char *rpc_userpass;
|
||||
static char *rpc_user, *rpc_pass;
|
||||
@ -185,7 +191,11 @@ Options:\n\
|
||||
-a, --algo=ALGO specify the algorithm to use\n\
|
||||
fugue256 Fuguecoin hash\n\
|
||||
heavy Heavycoin hash\n\
|
||||
-v, --vote=VOTE block reward vote\n\
|
||||
-d, --devices takes a comma separated list of CUDA devices to use.\n\
|
||||
Device IDs start counting from 0! Alternatively takes\n\
|
||||
string names of your cards like gtx780ti or gt640#2\n\
|
||||
(matching 2nd gt640 in the PC)\n\
|
||||
-v, --vote=VOTE block reward vote (for HeavyCoin)\n\
|
||||
-m, --trust-pool trust the max block reward vote (maxvote) sent by the pool\n\
|
||||
-o, --url=URL URL of mining server\n\
|
||||
-O, --userpass=U:P username:password pair for mining server\n\
|
||||
@ -227,7 +237,7 @@ static char const short_options[] =
|
||||
#ifdef HAVE_SYSLOG_H
|
||||
"S"
|
||||
#endif
|
||||
"a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vmv:";
|
||||
"a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vd:mv:";
|
||||
|
||||
static struct option const options[] = {
|
||||
{ "algo", 1, NULL, 'a' },
|
||||
@ -259,6 +269,7 @@ static struct option const options[] = {
|
||||
{ "user", 1, NULL, 'u' },
|
||||
{ "userpass", 1, NULL, 'O' },
|
||||
{ "version", 0, NULL, 'V' },
|
||||
{ "devices", 1, NULL, 'd' },
|
||||
{ 0, 0, 0, 0 }
|
||||
};
|
||||
|
||||
@ -1251,6 +1262,32 @@ static void parse_arg (int key, char *arg)
|
||||
case 'S':
|
||||
use_syslog = true;
|
||||
break;
|
||||
case 'd': // CB
|
||||
{
|
||||
char * pch = strtok (arg,",");
|
||||
opt_n_threads = 0;
|
||||
while (pch != NULL) {
|
||||
if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0')
|
||||
{
|
||||
if (atoi(pch) < num_processors)
|
||||
device_map[opt_n_threads++] = atoi(pch);
|
||||
else {
|
||||
applog(LOG_ERR, "Non-existant CUDA device #%d specified in -d option", atoi(pch));
|
||||
exit(1);
|
||||
}
|
||||
} else {
|
||||
int device = cuda_finddevice(pch);
|
||||
if (device >= 0 && device < num_processors)
|
||||
device_map[opt_n_threads++] = device;
|
||||
else {
|
||||
applog(LOG_ERR, "Non-existant CUDA device '%s' specified in -d option", pch);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
pch = strtok (NULL, ",");
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 'V':
|
||||
show_version_and_exit();
|
||||
case 'h':
|
||||
@ -1346,7 +1383,7 @@ static void signal_handler(int sig)
|
||||
}
|
||||
#endif
|
||||
|
||||
#define PROGRAM_VERSION "0.4"
|
||||
#define PROGRAM_VERSION "0.5"
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
struct thr_info *thr;
|
||||
@ -1370,6 +1407,9 @@ int main(int argc, char *argv[])
|
||||
rpc_user = strdup("");
|
||||
rpc_pass = strdup("");
|
||||
|
||||
pthread_mutex_init(&applog_lock, NULL);
|
||||
num_processors = cuda_num_devices();
|
||||
|
||||
/* parse command line */
|
||||
parse_cmdline(argc, argv);
|
||||
|
||||
@ -1385,7 +1425,6 @@ int main(int argc, char *argv[])
|
||||
sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass);
|
||||
}
|
||||
|
||||
pthread_mutex_init(&applog_lock, NULL);
|
||||
pthread_mutex_init(&stats_lock, NULL);
|
||||
pthread_mutex_init(&g_work_lock, NULL);
|
||||
pthread_mutex_init(&stratum.sock_lock, NULL);
|
||||
@ -1416,7 +1455,6 @@ int main(int argc, char *argv[])
|
||||
}
|
||||
#endif
|
||||
|
||||
num_processors = cuda_num_devices();
|
||||
if (num_processors == 0)
|
||||
{
|
||||
applog(LOG_ERR, "No CUDA devices found! terminating.");
|
||||
|
@ -152,7 +152,7 @@
|
||||
#define PACKAGE_NAME "ccminer"
|
||||
|
||||
/* Define to the full name and version of this package. */
|
||||
#define PACKAGE_STRING "ccminer 2014.03.24"
|
||||
#define PACKAGE_STRING "ccminer 2014.03.27"
|
||||
|
||||
/* Define to the one symbol short name of this package. */
|
||||
#undef PACKAGE_TARNAME
|
||||
@ -161,7 +161,7 @@
|
||||
#undef PACKAGE_URL
|
||||
|
||||
/* Define to the version of this package. */
|
||||
#define PACKAGE_VERSION "2014.03.24"
|
||||
#define PACKAGE_VERSION "2014.03.27"
|
||||
|
||||
/* If using the C implementation of alloca, define if you know the
|
||||
direction of stack growth for your system; otherwise it will be
|
||||
|
@ -292,13 +292,13 @@ __host__ void blake512_cpu_setBlock(void *pdata)
|
||||
|
||||
__host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
|
||||
{
|
||||
const int threadsperblock = 128;
|
||||
const int threadsperblock = 256;
|
||||
|
||||
// berechne wie viele Thread Blocks wir brauchen
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
|
||||
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
|
@ -138,7 +138,7 @@ void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *h
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
|
||||
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
|
@ -9,7 +9,10 @@
|
||||
|
||||
#define USE_SHARED 1
|
||||
|
||||
// heavy.cu
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
||||
// Folgende Definitionen später durch header ersetzen
|
||||
@ -732,7 +735,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
|
||||
|
||||
void fugue256_cpu_init(int thr_id, int threads)
|
||||
{
|
||||
cudaSetDevice(thr_id);
|
||||
cudaSetDevice(device_map[thr_id]);
|
||||
|
||||
// Kopiere die Hash-Tabellen in den GPU-Speicher
|
||||
texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
|
||||
@ -774,7 +777,7 @@ __host__ void fugue256_cpu_hash(int thr_id, int threads, int startNounce, void *
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
|
||||
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
#if USE_SHARED
|
||||
size_t shared_size = 4 * 256 * sizeof(uint32_t);
|
||||
#else
|
||||
|
@ -813,7 +813,7 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
|
||||
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
|
@ -9,6 +9,10 @@
|
||||
|
||||
#define USE_SHARED 1
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
// aus heavy.cu
|
||||
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
|
||||
|
||||
// Folgende Definitionen später durch header ersetzen
|
||||
@ -20,13 +24,7 @@ typedef unsigned long long uint64_t;
|
||||
__constant__ uint32_t pTarget[8]; // Single GPU
|
||||
extern uint32_t *d_resultNonce[8];
|
||||
|
||||
// globaler Speicher für unsere Ergebnisse
|
||||
uint32_t *d_hashGROESTLCOINoutput[8];
|
||||
|
||||
__constant__ uint32_t groestlcoin_gpu_state[32];
|
||||
__constant__ uint32_t groestlcoin_gpu_msg[32];
|
||||
__constant__ uint32_t sha256coin_gpu_constantTable[64];
|
||||
__constant__ uint32_t sha256coin_gpu_register[8];
|
||||
|
||||
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
|
||||
|
||||
@ -83,7 +81,13 @@ extern uint32_t T2dn_cpu[];
|
||||
extern uint32_t T3up_cpu[];
|
||||
extern uint32_t T3dn_cpu[];
|
||||
|
||||
#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
|
||||
#if __CUDA_ARCH__ < 350
|
||||
// Kepler (Compute 3.0)
|
||||
#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
|
||||
#else
|
||||
// Kepler (Compute 3.5)
|
||||
#define S(x, n) __funnelshift_r( x, x, n );
|
||||
#endif
|
||||
#define R(x, n) ((x) >> (n))
|
||||
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
|
||||
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
|
||||
@ -95,18 +99,57 @@ extern uint32_t T3dn_cpu[];
|
||||
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
|
||||
|
||||
|
||||
__device__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
|
||||
__device__ __forceinline__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
|
||||
{
|
||||
uint32_t t[32];
|
||||
|
||||
//#pragma unroll 14
|
||||
for(int r=0;r<14;r++)
|
||||
{
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++)
|
||||
switch(r)
|
||||
{
|
||||
a[(k*2)+0] ^= PC32up(k * 0x10, r);
|
||||
//a[(k<<1)+1] ^= PC32dn(k * 0x10, r);
|
||||
case 0:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break;
|
||||
case 1:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break;
|
||||
case 2:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break;
|
||||
case 3:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break;
|
||||
case 4:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break;
|
||||
case 5:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break;
|
||||
case 6:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break;
|
||||
case 7:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break;
|
||||
case 8:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break;
|
||||
case 9:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break;
|
||||
case 10:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break;
|
||||
case 11:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break;
|
||||
case 12:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break;
|
||||
case 13:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break;
|
||||
}
|
||||
|
||||
// RBTT
|
||||
@ -137,18 +180,57 @@ __device__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
|
||||
__device__ __forceinline__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
|
||||
{
|
||||
//#pragma unroll 14
|
||||
for(int r=0;r<14;r++)
|
||||
{
|
||||
uint32_t t[32];
|
||||
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++)
|
||||
switch(r)
|
||||
{
|
||||
a[(k*2)+0] ^= QC32up(k * 0x10, r);
|
||||
a[(k*2)+1] ^= QC32dn(k * 0x10, r);
|
||||
case 0:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break;
|
||||
case 1:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break;
|
||||
case 2:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break;
|
||||
case 3:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break;
|
||||
case 4:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break;
|
||||
case 5:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break;
|
||||
case 6:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break;
|
||||
case 7:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break;
|
||||
case 8:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break;
|
||||
case 9:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break;
|
||||
case 10:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break;
|
||||
case 11:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break;
|
||||
case 12:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break;
|
||||
case 13:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break;
|
||||
}
|
||||
|
||||
// RBTT
|
||||
@ -179,12 +261,12 @@ __device__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
|
||||
}
|
||||
}
|
||||
#if USE_SHARED
|
||||
__global__ void __launch_bounds__(256)
|
||||
__global__ void /* __launch_bounds__(256) */
|
||||
#else
|
||||
__global__ void
|
||||
#endif
|
||||
|
||||
groestlcoin_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
|
||||
groestlcoin_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce)
|
||||
{
|
||||
#if USE_SHARED
|
||||
extern __shared__ char mixtabs[];
|
||||
@ -204,146 +286,111 @@ __global__ void
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
if (thread < threads)
|
||||
{
|
||||
/////
|
||||
///// Lieber groestl, mach, dass es abgeht!!!
|
||||
/////
|
||||
// GROESTL
|
||||
uint32_t message[32];
|
||||
uint32_t state[32];
|
||||
uint32_t g[32];
|
||||
|
||||
|
||||
#pragma unroll 32
|
||||
for(int k=0;k<32;k++)
|
||||
{
|
||||
// TODO: die Vorbelegung mit Nullen braucht nicht zwingend aus dem
|
||||
// constant Memory zu lesen. Das ist Verschwendung von Bandbreite.
|
||||
state[k] = groestlcoin_gpu_state[k];
|
||||
message[k] = groestlcoin_gpu_msg[k];
|
||||
}
|
||||
for(int k=0;k<32;k++) message[k] = groestlcoin_gpu_msg[k];
|
||||
|
||||
uint32_t nounce = startNounce + thread;
|
||||
message[19] = SWAB32(nounce);
|
||||
|
||||
#pragma unroll 32
|
||||
for(int u=0;u<32;u++)
|
||||
g[u] = message[u] ^ state[u]; // TODO: state ist fast ueberall 0.
|
||||
for(int u=0;u<32;u++) state[u] = message[u];
|
||||
state[31] ^= 0x20000;
|
||||
|
||||
// Perm
|
||||
#if USE_SHARED
|
||||
groestlcoin_perm_P(g, mixtabs); // TODO: g[] entspricht fast genau message[]
|
||||
groestlcoin_perm_Q(message, mixtabs); // kann man das ausnutzen?
|
||||
groestlcoin_perm_P(state, mixtabs);
|
||||
state[31] ^= 0x20000;
|
||||
groestlcoin_perm_Q(message, mixtabs);
|
||||
#else
|
||||
groestlcoin_perm_P(g, NULL);
|
||||
groestlcoin_perm_P(state, NULL);
|
||||
state[31] ^= 0x20000;
|
||||
groestlcoin_perm_Q(message, NULL);
|
||||
#endif
|
||||
|
||||
#pragma unroll 32
|
||||
for(int u=0;u<32;u++)
|
||||
{
|
||||
// TODO: kann man evtl. das xor mit g[u] vorziehen hinter die groestlcoin_perm_P Funktion
|
||||
// was den Registerbedarf senken koennte?
|
||||
state[u] ^= g[u] ^ message[u];
|
||||
g[u] = state[u];
|
||||
}
|
||||
for(int u=0;u<32;u++) state[u] ^= message[u];
|
||||
|
||||
#pragma unroll 32
|
||||
for(int u=0;u<32;u++) message[u] = state[u];
|
||||
|
||||
#if USE_SHARED
|
||||
groestlcoin_perm_P(g, mixtabs);
|
||||
groestlcoin_perm_P(message, mixtabs);
|
||||
#else
|
||||
groestlcoin_perm_P(g, NULL);
|
||||
groestlcoin_perm_P(message, NULL);
|
||||
#endif
|
||||
|
||||
#pragma unroll 32
|
||||
for(int u=0;u<32;u++)
|
||||
state[u] ^= g[u];
|
||||
for(int u=0;u<32;u++) state[u] ^= message[u];
|
||||
|
||||
////
|
||||
//// 2. Runde groestl
|
||||
////
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++)
|
||||
message[k] = state[k + 16];
|
||||
|
||||
#pragma unroll 32
|
||||
for(int k=0;k<32;k++)
|
||||
state[k] = groestlcoin_gpu_state[k];
|
||||
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++)
|
||||
for(int k=0;k<16;k++) message[k] = state[k + 16];
|
||||
#pragma unroll 14
|
||||
for(int k=1;k<15;k++)
|
||||
message[k+16] = 0;
|
||||
|
||||
message[16] = 0x80;
|
||||
message[16] = 0x80;
|
||||
message[31] = 0x01000000;
|
||||
|
||||
#pragma unroll 32
|
||||
for(int u=0;u<32;u++)
|
||||
g[u] = message[u] ^ state[u];
|
||||
state[u] = message[u];
|
||||
state[31] ^= 0x20000;
|
||||
|
||||
// Perm
|
||||
#if USE_SHARED
|
||||
groestlcoin_perm_P(g, mixtabs);
|
||||
groestlcoin_perm_P(state, mixtabs);
|
||||
state[31] ^= 0x20000;
|
||||
groestlcoin_perm_Q(message, mixtabs);
|
||||
#else
|
||||
groestlcoin_perm_P(g, NULL);
|
||||
groestlcoin_perm_P(state, NULL);
|
||||
state[31] ^= 0x20000;
|
||||
groestlcoin_perm_Q(message, NULL);
|
||||
#endif
|
||||
|
||||
#pragma unroll 32
|
||||
for(int u=0;u<32;u++)
|
||||
{
|
||||
state[u] ^= g[u] ^ message[u];
|
||||
g[u] = state[u];
|
||||
}
|
||||
for(int u=0;u<32;u++) state[u] ^= message[u];
|
||||
|
||||
#pragma unroll 32
|
||||
for(int u=0;u<32;u++) message[u] = state[u];
|
||||
|
||||
#if USE_SHARED
|
||||
groestlcoin_perm_P(g, mixtabs);
|
||||
groestlcoin_perm_P(message, mixtabs);
|
||||
#else
|
||||
groestlcoin_perm_P(g, NULL);
|
||||
groestlcoin_perm_P(message, NULL);
|
||||
#endif
|
||||
|
||||
#pragma unroll 32
|
||||
for(int u=0;u<32;u++)
|
||||
state[u] ^= g[u];
|
||||
|
||||
/*
|
||||
#pragma unroll 8
|
||||
for(int k=0;k<8;k++)
|
||||
hash[k] = state[k+16];
|
||||
*/
|
||||
for(int u=0;u<32;u++) state[u] ^= message[u];
|
||||
|
||||
// kopiere Ergebnis
|
||||
/*
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++)
|
||||
((uint32_t*)outputHash)[16*thread+k] = state[k + 16];
|
||||
*/
|
||||
int i;
|
||||
int i, position = -1;
|
||||
bool rc = true;
|
||||
|
||||
|
||||
#pragma unroll 8
|
||||
for (i = 7; i >= 0; i--) {
|
||||
if (state[i+16] > pTarget[i]) {
|
||||
rc = false;
|
||||
break;
|
||||
}
|
||||
if (state[i+16] < pTarget[i]) {
|
||||
rc = true;
|
||||
break;
|
||||
}
|
||||
if(position < i) {
|
||||
position = i;
|
||||
rc = false;
|
||||
}
|
||||
}
|
||||
if (state[i+16] < pTarget[i]) {
|
||||
if(position < i) {
|
||||
position = i;
|
||||
rc = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(rc == true)
|
||||
{
|
||||
if(resNounce[0] > nounce)
|
||||
{
|
||||
resNounce[0] = nounce;
|
||||
/*
|
||||
#pragma unroll 8
|
||||
for(int k=0;k<8;k++)
|
||||
((uint32_t*)outputHash)[k] = (hash[k]);
|
||||
*/
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -360,7 +407,7 @@ __global__ void
|
||||
// Setup-Funktionen
|
||||
__host__ void groestlcoin_cpu_init(int thr_id, int threads)
|
||||
{
|
||||
cudaSetDevice(thr_id);
|
||||
cudaSetDevice(device_map[thr_id]);
|
||||
cudaDeviceSetCacheConfig( cudaFuncCachePreferShared );
|
||||
// Texturen mit obigem Makro initialisieren
|
||||
texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
|
||||
@ -372,23 +419,8 @@ __host__ void groestlcoin_cpu_init(int thr_id, int threads)
|
||||
texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
|
||||
texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
|
||||
|
||||
// setze register
|
||||
// TODO: fast vollstaendige Vorbelegung mit Nullen.
|
||||
// da besteht doch Optimierungspotenzial im GPU Kernel
|
||||
// denn mit Nullen braucht man nicht wirklich rechnen.
|
||||
uint32_t groestl_state_init[32];
|
||||
memset(groestl_state_init, 0, sizeof(uint32_t) * 32);
|
||||
groestl_state_init[31] = 0x20000;
|
||||
|
||||
// state speichern
|
||||
cudaMemcpyToSymbol( groestlcoin_gpu_state,
|
||||
groestl_state_init,
|
||||
128);
|
||||
|
||||
// Speicher für Gewinner-Nonce belegen
|
||||
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
|
||||
|
||||
// Speicher für alle Ergebnisse belegen (nur für Debug)
|
||||
cudaMalloc(&d_hashGROESTLCOINoutput[thr_id], 8 * sizeof(uint32_t) * threads);
|
||||
}
|
||||
|
||||
__host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
|
||||
@ -430,7 +462,7 @@ __host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
|
||||
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
#if USE_SHARED
|
||||
size_t shared_size = 8 * 256 * sizeof(uint32_t);
|
||||
#else
|
||||
@ -440,16 +472,10 @@ __host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
//fprintf(stderr, "ThrID: %d\n", thr_id);
|
||||
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
|
||||
groestlcoin_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hashGROESTLCOINoutput[thr_id], d_resultNonce[thr_id]);
|
||||
groestlcoin_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
|
||||
|
||||
// Strategisches Sleep Kommando zur Senkung der CPU Last
|
||||
MyStreamSynchronize(NULL, 0, thr_id);
|
||||
|
||||
cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
||||
|
||||
/// Debug
|
||||
//cudaMemcpy(outputHashes, d_hashGROESTLCOINoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost);
|
||||
|
||||
// Nounce
|
||||
//cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
||||
}
|
||||
|
521
cuda_hefty1.cu
521
cuda_hefty1.cu
@ -2,26 +2,40 @@
|
||||
#include "cuda_runtime.h"
|
||||
#include "device_launch_parameters.h"
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
|
||||
#define USE_SHARED 1
|
||||
|
||||
// Folgende Definitionen später durch header ersetzen
|
||||
typedef unsigned int uint32_t;
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned short uint16_t;
|
||||
|
||||
// diese Struktur wird in der Init Funktion angefordert
|
||||
static cudaDeviceProp props;
|
||||
|
||||
// globaler Speicher für alle HeftyHashes aller Threads
|
||||
uint32_t *d_heftyHashes[8];
|
||||
|
||||
/* Hash-Tabellen */
|
||||
__constant__ uint32_t hefty_gpu_constantTable[64];
|
||||
#if USE_SHARED
|
||||
#define heftyLookUp(x) (*((uint32_t*)heftytab + (x)))
|
||||
#else
|
||||
#define heftyLookUp(x) hefty_gpu_constantTable[x]
|
||||
#endif
|
||||
|
||||
// muss expandiert werden
|
||||
__constant__ uint32_t hefty_gpu_blockHeader[16]; // 2x512 Bit Message
|
||||
__constant__ uint32_t hefty_gpu_register[8];
|
||||
__constant__ uint32_t hefty_gpu_sponge[4];
|
||||
|
||||
uint32_t hefty_cpu_hashTable[] = { 0x6a09e667UL,
|
||||
uint32_t hefty_cpu_hashTable[] = {
|
||||
0x6a09e667UL,
|
||||
0xbb67ae85UL,
|
||||
0x3c6ef372UL,
|
||||
0xa54ff53aUL,
|
||||
@ -29,8 +43,9 @@ uint32_t hefty_cpu_hashTable[] = { 0x6a09e667UL,
|
||||
0x9b05688cUL,
|
||||
0x1f83d9abUL,
|
||||
0x5be0cd19UL };
|
||||
|
||||
uint32_t hefty_cpu_constantTable[] = {
|
||||
0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
|
||||
0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
|
||||
0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
|
||||
0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL,
|
||||
0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL,
|
||||
@ -48,350 +63,352 @@ uint32_t hefty_cpu_constantTable[] = {
|
||||
0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
|
||||
};
|
||||
|
||||
#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
|
||||
#define R(x, n) ((x) >> (n))
|
||||
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
|
||||
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
|
||||
#define S0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22))
|
||||
#define S1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25))
|
||||
#define s0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3))
|
||||
#define s1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10))
|
||||
//#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
|
||||
static __host__ __device__ uint32_t S(uint32_t x, int n)
|
||||
{
|
||||
return (((x) >> (n)) | ((x) << (32 - (n))));
|
||||
}
|
||||
#define R(x, n) ((x) >> (n))
|
||||
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
|
||||
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
|
||||
#define S0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22))
|
||||
#define S1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25))
|
||||
#define s0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3))
|
||||
#define s1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10))
|
||||
|
||||
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
|
||||
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
|
||||
|
||||
// uint8_t
|
||||
#define smoosh4(x) ( ((x)>>4) ^ ((x) & 0x0F) )
|
||||
#define smoosh4(x) ( ((x)>>4) ^ ((x) & 0x0F) )
|
||||
__host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x)
|
||||
{
|
||||
uint16_t w = (x >> 16) ^ (x & 0xffff);
|
||||
uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) );
|
||||
return (n >> 2) ^ (n & 0x03);
|
||||
uint16_t w = (x >> 16) ^ (x & 0xffff);
|
||||
uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) );
|
||||
return 24 - (((n >> 2) ^ (n & 0x03)) << 3);
|
||||
}
|
||||
// 4 auf einmal
|
||||
#define smoosh4Quad(x) ( (((x)>>4) ^ (x)) & 0x0F0F0F0F )
|
||||
#define getByte(x,y) ( ((x) >> (y)) & 0xFF )
|
||||
#define smoosh4Quad(x) ( (((x)>>4) ^ (x)) & 0x0F0F0F0F )
|
||||
#define getByte(x,y) ( ((x) >> (y)) & 0xFF )
|
||||
|
||||
__host__ __device__ void Mangle(uint32_t *inp)
|
||||
__host__ __forceinline__ __device__ void Mangle(uint32_t *inp)
|
||||
{
|
||||
uint32_t r = smoosh4Quad(inp[0]);
|
||||
//uint8_t r0 = smoosh4( (uint8_t)(inp[0] >> 24) );
|
||||
//uint8_t r1 = smoosh4( (uint8_t)(inp[0] >> 16) );
|
||||
//uint8_t r2 = smoosh4( (uint8_t)(inp[0] >> 8) );
|
||||
//uint8_t r3 = smoosh4( (uint8_t)(inp[0] & 0xFF) );
|
||||
uint32_t r = smoosh4Quad(inp[0]);
|
||||
uint32_t inp0org;
|
||||
uint32_t tmp0Mask, tmp1Mask;
|
||||
uint32_t in1, in2, isAddition;
|
||||
uint32_t tmp;
|
||||
uint8_t b;
|
||||
|
||||
inp[1] = inp[1] ^ S(inp[0], getByte(r, 24));
|
||||
|
||||
switch (smoosh2(inp[1])) {
|
||||
case 0: inp[2] ^= S(inp[0], 1 + getByte(r,24)); break;
|
||||
case 1: inp[2] += S(~inp[0], 1 + getByte(r,16)); break;
|
||||
case 2: inp[2] &= S(~inp[0], 1 + getByte(r,8)); break;
|
||||
case 3: inp[2] ^= S(inp[0], 1 + getByte(r,0)); break;
|
||||
}
|
||||
inp[1] = inp[1] ^ S(inp[0], getByte(r, 24));
|
||||
|
||||
r += 0x01010101;
|
||||
tmp = smoosh2(inp[1]);
|
||||
b = getByte(r,tmp);
|
||||
inp0org = S(inp[0], b);
|
||||
tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0
|
||||
tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0
|
||||
|
||||
uint32_t tmp = smoosh2(inp[1] ^ inp[2]);
|
||||
switch (tmp) {
|
||||
case 0: inp[3] ^= S(inp[0], 2 + getByte(r,24)); break;
|
||||
case 1: inp[3] += S(~inp[0], 2 + getByte(r,16)); break;
|
||||
case 2: inp[3] &= S(~inp[0], 2 + getByte(r,8)); break;
|
||||
case 3: inp[3] ^= S(inp[0], 2 + getByte(r,0)); break;
|
||||
}
|
||||
in1 = (inp[2] & ~inp0org) |
|
||||
(tmp1Mask & ~inp[2] & inp0org) |
|
||||
(~tmp0Mask & ~inp[2] & inp0org);
|
||||
in2 = inp[2] += ~inp0org;
|
||||
isAddition = ~tmp0Mask & tmp1Mask;
|
||||
inp[2] = isAddition ? in2 : in1;
|
||||
|
||||
r += 0x01010101;
|
||||
tmp = smoosh2(inp[1] ^ inp[2]);
|
||||
b = getByte(r,tmp);
|
||||
inp0org = S(inp[0], b);
|
||||
tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0
|
||||
tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0
|
||||
|
||||
inp[0] ^= (inp[1] ^ inp[2]) + inp[3];
|
||||
in1 = (inp[3] & ~inp0org) |
|
||||
(tmp1Mask & ~inp[3] & inp0org) |
|
||||
(~tmp0Mask & ~inp[3] & inp0org);
|
||||
in2 = inp[3] += ~inp0org;
|
||||
isAddition = ~tmp0Mask & tmp1Mask;
|
||||
inp[3] = isAddition ? in2 : in1;
|
||||
|
||||
inp[0] ^= (inp[1] ^ inp[2]) + inp[3];
|
||||
}
|
||||
|
||||
__host__ __forceinline__ __device__ void Absorb(uint32_t *inp, uint32_t x)
|
||||
{
|
||||
inp[0] ^= x;
|
||||
Mangle(inp);
|
||||
inp[0] ^= x;
|
||||
Mangle(inp);
|
||||
}
|
||||
|
||||
__host__ __forceinline__ __device__ uint32_t Squeeze(uint32_t *inp)
|
||||
{
|
||||
uint32_t y = inp[0];
|
||||
Mangle(inp);
|
||||
return y;
|
||||
uint32_t y = inp[0];
|
||||
Mangle(inp);
|
||||
return y;
|
||||
}
|
||||
|
||||
__host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x)
|
||||
{
|
||||
uint32_t r = Squeeze(sponge);
|
||||
uint32_t r = Squeeze(sponge);
|
||||
uint32_t t = ((r >> 8) & 0x1F);
|
||||
uint32_t y = 1 << t;
|
||||
|
||||
//uint8_t r0 = r >> 8;
|
||||
uint8_t r1 = r & 0xFF;
|
||||
uint32_t y = 1 << ((r >> 8) & 0x1F);
|
||||
uint32_t a = (((r>>1) & 0x01) << t) & y;
|
||||
uint32_t b = ((r & 0x01) << t) & y;
|
||||
uint32_t c = x & y;
|
||||
|
||||
//uint32_t retVal;
|
||||
//retVal = x;
|
||||
|
||||
uint32_t resArr[4];
|
||||
resArr[0] = x;
|
||||
resArr[1] = x & ~y;
|
||||
resArr[2] = x | y;
|
||||
resArr[3] = x ^ y;
|
||||
return resArr[r1 & 0x03];
|
||||
|
||||
/*
|
||||
switch(r1 & 0x03)
|
||||
{
|
||||
case 0:
|
||||
break;
|
||||
case 1:
|
||||
retVal = x & ~y;
|
||||
break;
|
||||
case 2:
|
||||
retVal = x | y;
|
||||
break;
|
||||
case 3:
|
||||
retVal = x ^ y;
|
||||
break;
|
||||
}
|
||||
return retVal;
|
||||
*/
|
||||
uint32_t retVal = (x & ~y) | (~b & c) | (a & ~c);
|
||||
return retVal;
|
||||
}
|
||||
|
||||
__forceinline__ __device__ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
|
||||
{
|
||||
uint32_t tmpBr;
|
||||
uint32_t tmpBr;
|
||||
|
||||
uint32_t brG = Br(sponge, regs[6]);
|
||||
uint32_t brF = Br(sponge, regs[5]);
|
||||
uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
|
||||
uint32_t brE = Br(sponge, regs[4]);
|
||||
uint32_t tmp2 = tmp1 + S1(brE);
|
||||
uint32_t brC = Br(sponge, regs[2]);
|
||||
uint32_t brB = Br(sponge, regs[1]);
|
||||
uint32_t brA = Br(sponge, regs[0]);
|
||||
uint32_t tmp3 = Maj(brA, brB, brC);
|
||||
tmpBr = Br(sponge, regs[0]);
|
||||
uint32_t tmp4 = tmp3 + S0(tmpBr);
|
||||
tmpBr = Br(sponge, tmp2);
|
||||
uint32_t brG = Br(sponge, regs[6]);
|
||||
uint32_t brF = Br(sponge, regs[5]);
|
||||
uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
|
||||
uint32_t brE = Br(sponge, regs[4]);
|
||||
uint32_t tmp2 = tmp1 + S1(brE);
|
||||
uint32_t brC = Br(sponge, regs[2]);
|
||||
uint32_t brB = Br(sponge, regs[1]);
|
||||
uint32_t brA = Br(sponge, regs[0]);
|
||||
uint32_t tmp3 = Maj(brA, brB, brC);
|
||||
tmpBr = Br(sponge, regs[0]);
|
||||
uint32_t tmp4 = tmp3 + S0(tmpBr);
|
||||
tmpBr = Br(sponge, tmp2);
|
||||
|
||||
#pragma unroll 7
|
||||
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
|
||||
regs[0] = tmp2 + tmp4;
|
||||
regs[4] += tmpBr;
|
||||
#pragma unroll 7
|
||||
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
|
||||
regs[0] = tmp2 + tmp4;
|
||||
regs[4] += tmpBr;
|
||||
}
|
||||
|
||||
__host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
|
||||
{
|
||||
uint32_t tmpBr;
|
||||
uint32_t tmpBr;
|
||||
|
||||
uint32_t brG = Br(sponge, regs[6]);
|
||||
uint32_t brF = Br(sponge, regs[5]);
|
||||
uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
|
||||
uint32_t brE = Br(sponge, regs[4]);
|
||||
uint32_t tmp2 = tmp1 + S1(brE);
|
||||
uint32_t brC = Br(sponge, regs[2]);
|
||||
uint32_t brB = Br(sponge, regs[1]);
|
||||
uint32_t brA = Br(sponge, regs[0]);
|
||||
uint32_t tmp3 = Maj(brA, brB, brC);
|
||||
tmpBr = Br(sponge, regs[0]);
|
||||
uint32_t tmp4 = tmp3 + S0(tmpBr);
|
||||
tmpBr = Br(sponge, tmp2);
|
||||
uint32_t brG = Br(sponge, regs[6]);
|
||||
uint32_t brF = Br(sponge, regs[5]);
|
||||
uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
|
||||
uint32_t brE = Br(sponge, regs[4]);
|
||||
uint32_t tmp2 = tmp1 + S1(brE);
|
||||
uint32_t brC = Br(sponge, regs[2]);
|
||||
uint32_t brB = Br(sponge, regs[1]);
|
||||
uint32_t brA = Br(sponge, regs[0]);
|
||||
uint32_t tmp3 = Maj(brA, brB, brC);
|
||||
tmpBr = Br(sponge, regs[0]);
|
||||
uint32_t tmp4 = tmp3 + S0(tmpBr);
|
||||
tmpBr = Br(sponge, tmp2);
|
||||
|
||||
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
|
||||
regs[0] = tmp2 + tmp4;
|
||||
regs[4] += tmpBr;
|
||||
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
|
||||
regs[0] = tmp2 + tmp4;
|
||||
regs[4] += tmpBr;
|
||||
}
|
||||
|
||||
// Die Hash-Funktion
|
||||
__global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHash)
|
||||
{
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
if (thread < threads)
|
||||
{
|
||||
// bestimme den aktuellen Zähler
|
||||
uint32_t nounce = startNounce + thread;
|
||||
|
||||
// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
|
||||
#if USE_SHARED
|
||||
extern __shared__ unsigned char s[];
|
||||
uint32_t *W = (uint32_t *)(&s[W_ALIGNMENT * sizeof(uint32_t) * threadIdx.x]);
|
||||
#else
|
||||
// reduktion von 256 byte auf 128 byte
|
||||
uint32_t W1[16];
|
||||
uint32_t W2[16];
|
||||
#if USE_SHARED
|
||||
extern __shared__ char heftytab[];
|
||||
if(threadIdx.x < 64)
|
||||
{
|
||||
*((uint32_t*)heftytab + threadIdx.x) = hefty_gpu_constantTable[threadIdx.x];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
#endif
|
||||
|
||||
// Initialisiere die register a bis h mit der Hash-Tabelle
|
||||
uint32_t regs[8];
|
||||
uint32_t hash[8];
|
||||
uint32_t sponge[4];
|
||||
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
if (thread < threads)
|
||||
{
|
||||
// bestimme den aktuellen Zähler
|
||||
uint32_t nounce = startNounce + thread;
|
||||
|
||||
// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
|
||||
// reduktion von 256 byte auf 128 byte
|
||||
uint32_t W1[16];
|
||||
uint32_t W2[16];
|
||||
|
||||
// Initialisiere die register a bis h mit der Hash-Tabelle
|
||||
uint32_t regs[8];
|
||||
uint32_t hash[8];
|
||||
uint32_t sponge[4];
|
||||
|
||||
#pragma unroll 4
|
||||
for(int k=0; k < 4; k++)
|
||||
sponge[k] = hefty_gpu_sponge[k];
|
||||
for(int k=0; k < 4; k++)
|
||||
sponge[k] = hefty_gpu_sponge[k];
|
||||
|
||||
// pre
|
||||
// pre
|
||||
#pragma unroll 8
|
||||
for (int k=0; k < 8; k++)
|
||||
{
|
||||
regs[k] = hefty_gpu_register[k];
|
||||
hash[k] = regs[k];
|
||||
}
|
||||
|
||||
//memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // verbleibende 20 bytes aus Block 2 plus padding
|
||||
for (int k=0; k < 8; k++)
|
||||
{
|
||||
regs[k] = hefty_gpu_register[k];
|
||||
hash[k] = regs[k];
|
||||
}
|
||||
|
||||
//memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // verbleibende 20 bytes aus Block 2 plus padding
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++)
|
||||
W1[k] = hefty_gpu_blockHeader[k];
|
||||
W1[3] = SWAB32(nounce);
|
||||
for(int k=0;k<16;k++)
|
||||
W1[k] = hefty_gpu_blockHeader[k];
|
||||
W1[3] = SWAB32(nounce);
|
||||
|
||||
|
||||
// 2. Runde
|
||||
// 2. Runde
|
||||
#pragma unroll 16
|
||||
for(int j=0;j<16;j++)
|
||||
Absorb(sponge, W1[j] ^ hefty_gpu_constantTable[j]);
|
||||
for(int j=0;j<16;j++)
|
||||
Absorb(sponge, W1[j] ^ heftyLookUp(j));
|
||||
|
||||
// Progress W1 (Bytes 0...63)
|
||||
#pragma unroll 16
|
||||
for(int j=0;j<16;j++)
|
||||
{
|
||||
Absorb(sponge, regs[3] ^ regs[7]);
|
||||
hefty_gpu_round(regs, W1[j], hefty_gpu_constantTable[j], sponge);
|
||||
}
|
||||
for(int j=0;j<16;j++)
|
||||
{
|
||||
Absorb(sponge, regs[3] ^ regs[7]);
|
||||
hefty_gpu_round(regs, W1[j], heftyLookUp(j), sponge);
|
||||
}
|
||||
|
||||
// Progress W2 (Bytes 64...127) then W3 (Bytes 128...191) ...
|
||||
|
||||
|
||||
#pragma unroll 3
|
||||
for(int k=0;k<3;k++)
|
||||
{
|
||||
#pragma unroll 2
|
||||
for(int j=0;j<2;j++)
|
||||
W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
|
||||
#pragma unroll 5
|
||||
for(int j=2;j<7;j++)
|
||||
W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
|
||||
for(int k=0;k<3;k++)
|
||||
{
|
||||
#pragma unroll 2
|
||||
for(int j=0;j<2;j++)
|
||||
W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
|
||||
#pragma unroll 5
|
||||
for(int j=2;j<7;j++)
|
||||
W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
|
||||
|
||||
#pragma unroll 8
|
||||
for(int j=7;j<15;j++)
|
||||
W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
|
||||
#pragma unroll 8
|
||||
for(int j=7;j<15;j++)
|
||||
W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
|
||||
|
||||
W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
|
||||
W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
|
||||
|
||||
#pragma unroll 16
|
||||
for(int j=0;j<16;j++)
|
||||
{
|
||||
Absorb(sponge, regs[3] + regs[7]);
|
||||
hefty_gpu_round(regs, W2[j], hefty_gpu_constantTable[j + 16 * (k+1)], sponge);
|
||||
}
|
||||
#pragma unroll 16
|
||||
for(int j=0;j<16;j++)
|
||||
W1[j] = W2[j];
|
||||
}
|
||||
|
||||
#pragma unroll 16
|
||||
for(int j=0;j<16;j++)
|
||||
{
|
||||
Absorb(sponge, regs[3] + regs[7]);
|
||||
hefty_gpu_round(regs, W2[j], heftyLookUp(j + 16 * (k+1)), sponge);
|
||||
}
|
||||
#pragma unroll 16
|
||||
for(int j=0;j<16;j++)
|
||||
W1[j] = W2[j];
|
||||
}
|
||||
|
||||
#pragma unroll 8
|
||||
for(int k=0;k<8;k++)
|
||||
hash[k] += regs[k];
|
||||
|
||||
#pragma unroll 8
|
||||
for(int k=0;k<8;k++)
|
||||
hash[k] += regs[k];
|
||||
|
||||
#pragma unroll 8
|
||||
for(int k=0;k<8;k++)
|
||||
((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]);
|
||||
}
|
||||
for(int k=0;k<8;k++)
|
||||
((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]);
|
||||
}
|
||||
}
|
||||
|
||||
// Setup-Funktionen
|
||||
__host__ void hefty_cpu_init(int thr_id, int threads)
|
||||
{
|
||||
cudaSetDevice(thr_id);
|
||||
cudaSetDevice(device_map[thr_id]);
|
||||
|
||||
// Kopiere die Hash-Tabellen in den GPU-Speicher
|
||||
cudaMemcpyToSymbol( hefty_gpu_constantTable,
|
||||
hefty_cpu_constantTable,
|
||||
sizeof(uint32_t) * 64 );
|
||||
cudaGetDeviceProperties(&props, device_map[thr_id]);
|
||||
|
||||
// Speicher für alle Hefty1 hashes belegen
|
||||
cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads);
|
||||
// Kopiere die Hash-Tabellen in den GPU-Speicher
|
||||
cudaMemcpyToSymbol( hefty_gpu_constantTable,
|
||||
hefty_cpu_constantTable,
|
||||
sizeof(uint32_t) * 64 );
|
||||
|
||||
// Speicher für alle Hefty1 hashes belegen
|
||||
cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads);
|
||||
}
|
||||
|
||||
__host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data)
|
||||
// data muss 84-Byte haben!
|
||||
// data muss 84-Byte haben!
|
||||
{
|
||||
// Nachricht expandieren und setzen
|
||||
uint32_t msgBlock[32];
|
||||
// Nachricht expandieren und setzen
|
||||
uint32_t msgBlock[32];
|
||||
|
||||
memset(msgBlock, 0, sizeof(uint32_t) * 32);
|
||||
memcpy(&msgBlock[0], data, 84);
|
||||
msgBlock[21] |= 0x80;
|
||||
msgBlock[31] = 672; // bitlen
|
||||
|
||||
for(int i=0;i<31;i++) // Byteorder drehen
|
||||
msgBlock[i] = SWAB32(msgBlock[i]);
|
||||
memset(msgBlock, 0, sizeof(uint32_t) * 32);
|
||||
memcpy(&msgBlock[0], data, 84);
|
||||
msgBlock[21] |= 0x80;
|
||||
msgBlock[31] = 672; // bitlen
|
||||
|
||||
for(int i=0;i<31;i++) // Byteorder drehen
|
||||
msgBlock[i] = SWAB32(msgBlock[i]);
|
||||
|
||||
// die erste Runde wird auf der CPU durchgeführt, da diese für
|
||||
// alle Threads gleich ist. Der Hash wird dann an die Threads
|
||||
// übergeben
|
||||
// die erste Runde wird auf der CPU durchgeführt, da diese für
|
||||
// alle Threads gleich ist. Der Hash wird dann an die Threads
|
||||
// übergeben
|
||||
|
||||
// Erstelle expandierten Block W
|
||||
uint32_t W[64];
|
||||
memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);
|
||||
for(int j=16;j<64;j++)
|
||||
W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
|
||||
// Erstelle expandierten Block W
|
||||
uint32_t W[64];
|
||||
memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);
|
||||
for(int j=16;j<64;j++)
|
||||
W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
|
||||
|
||||
// Initialisiere die register a bis h mit der Hash-Tabelle
|
||||
uint32_t regs[8];
|
||||
uint32_t hash[8];
|
||||
uint32_t sponge[4];
|
||||
// Initialisiere die register a bis h mit der Hash-Tabelle
|
||||
uint32_t regs[8];
|
||||
uint32_t hash[8];
|
||||
uint32_t sponge[4];
|
||||
|
||||
// pre
|
||||
memset(sponge, 0, sizeof(uint32_t) * 4);
|
||||
// pre
|
||||
memset(sponge, 0, sizeof(uint32_t) * 4);
|
||||
for (int k=0; k < 8; k++)
|
||||
{
|
||||
regs[k] = hefty_cpu_hashTable[k];
|
||||
hash[k] = regs[k];
|
||||
}
|
||||
{
|
||||
regs[k] = hefty_cpu_hashTable[k];
|
||||
hash[k] = regs[k];
|
||||
}
|
||||
|
||||
// 1. Runde
|
||||
for(int j=0;j<16;j++)
|
||||
Absorb(sponge, W[j] ^ hefty_cpu_constantTable[j]);
|
||||
// 1. Runde
|
||||
for(int j=0;j<16;j++)
|
||||
Absorb(sponge, W[j] ^ hefty_cpu_constantTable[j]);
|
||||
|
||||
for(int j=0;j<16;j++)
|
||||
{
|
||||
Absorb(sponge, regs[3] ^ regs[7]);
|
||||
hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge);
|
||||
}
|
||||
for(int j=0;j<16;j++)
|
||||
{
|
||||
Absorb(sponge, regs[3] ^ regs[7]);
|
||||
hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge);
|
||||
}
|
||||
|
||||
for(int j=16;j<64;j++)
|
||||
{
|
||||
Absorb(sponge, regs[3] + regs[7]);
|
||||
hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge);
|
||||
}
|
||||
for(int j=16;j<64;j++)
|
||||
{
|
||||
Absorb(sponge, regs[3] + regs[7]);
|
||||
hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge);
|
||||
}
|
||||
|
||||
for(int k=0;k<8;k++)
|
||||
hash[k] += regs[k];
|
||||
for(int k=0;k<8;k++)
|
||||
hash[k] += regs[k];
|
||||
|
||||
// sponge speichern
|
||||
// sponge speichern
|
||||
|
||||
cudaMemcpyToSymbol( hefty_gpu_sponge,
|
||||
sponge,
|
||||
sizeof(uint32_t) * 4 );
|
||||
// hash speichern
|
||||
cudaMemcpyToSymbol( hefty_gpu_register,
|
||||
hash,
|
||||
sizeof(uint32_t) * 8 );
|
||||
cudaMemcpyToSymbol( hefty_gpu_sponge,
|
||||
sponge,
|
||||
sizeof(uint32_t) * 4 );
|
||||
// hash speichern
|
||||
cudaMemcpyToSymbol( hefty_gpu_register,
|
||||
hash,
|
||||
sizeof(uint32_t) * 8 );
|
||||
|
||||
// Blockheader setzen (korrekte Nonce fehlt da drin noch)
|
||||
cudaMemcpyToSymbol( hefty_gpu_blockHeader,
|
||||
&msgBlock[16],
|
||||
64);
|
||||
// Blockheader setzen (korrekte Nonce fehlt da drin noch)
|
||||
cudaMemcpyToSymbol( hefty_gpu_blockHeader,
|
||||
&msgBlock[16],
|
||||
64);
|
||||
}
|
||||
|
||||
__host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
|
||||
{
|
||||
const int threadsperblock = 128;
|
||||
// Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
|
||||
// alle anderen mit 512 Threads.
|
||||
int threadsperblock = (props.major >= 3) ? 768 : 512;
|
||||
|
||||
// berechne wie viele Thread Blocks wir brauchen
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
// berechne wie viele Thread Blocks wir brauchen
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
|
||||
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
|
||||
#if USE_SHARED
|
||||
size_t shared_size = W_ALIGNMENT*sizeof(uint32_t)*threadsperblock; // ein uint32_t eingefügt gegen Bank Konflikte
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
#if USE_SHARED
|
||||
size_t shared_size = 8 * 64 * sizeof(uint32_t);
|
||||
#else
|
||||
size_t shared_size = 0;
|
||||
size_t shared_size = 0;
|
||||
#endif
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
|
||||
hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]);
|
||||
hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]);
|
||||
}
|
||||
|
@ -264,7 +264,7 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
|
||||
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
|
@ -5,8 +5,6 @@
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
|
||||
#define W_ALIGNMENT 65
|
||||
|
||||
// Folgende Definitionen später durch header ersetzen
|
||||
typedef unsigned int uint32_t;
|
||||
|
||||
@ -59,8 +57,6 @@ __global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void *outputH
|
||||
nonceVector[thread] = nounce;
|
||||
|
||||
// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
|
||||
//extern __shared__ unsigned char s[];
|
||||
//uint32_t *W = (uint32_t *)(&s[W_ALIGNMENT * sizeof(uint32_t) * threadIdx.x]);
|
||||
uint32_t W1[16];
|
||||
uint32_t W2[16];
|
||||
|
||||
@ -257,14 +253,13 @@ __host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashe
|
||||
|
||||
__host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce)
|
||||
{
|
||||
const int threadsperblock = 128;
|
||||
const int threadsperblock = 256;
|
||||
|
||||
// berechne wie viele Thread Blocks wir brauchen
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
|
||||
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
|
||||
//size_t shared_size = W_ALIGNMENT*sizeof(uint32_t)*threadsperblock; // ein uint32_t eingefügt gegen Bank Konflikte
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
|
35
heavy.cu
35
heavy.cu
@ -163,6 +163,41 @@ extern "C" int cuda_num_devices()
|
||||
return GPU_N;
|
||||
}
|
||||
|
||||
static bool substringsearch(const char *haystack, const char *needle, int &match)
|
||||
{
|
||||
int hlen = strlen(haystack);
|
||||
int nlen = strlen(needle);
|
||||
for (int i=0; i < hlen; ++i)
|
||||
{
|
||||
if (haystack[i] == ' ') continue;
|
||||
int j=0, x = 0;
|
||||
while(j < nlen)
|
||||
{
|
||||
if (haystack[i+x] == ' ') {++x; continue;}
|
||||
if (needle[j] == ' ') {++j; continue;}
|
||||
if (needle[j] == '#') return ++match == needle[j+1]-'0';
|
||||
if (tolower(haystack[i+x]) != tolower(needle[j])) break;
|
||||
++j; ++x;
|
||||
}
|
||||
if (j == nlen) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// CUDA Gerät nach Namen finden (gibt Geräte-Index zurück oder -1)
|
||||
extern "C" int cuda_finddevice(char *name)
|
||||
{
|
||||
int num = cuda_num_devices();
|
||||
int match = 0;
|
||||
for (int i=0; i < num; ++i)
|
||||
{
|
||||
cudaDeviceProp props;
|
||||
if (cudaGetDeviceProperties(&props, i) == cudaSuccess)
|
||||
if (substringsearch(props.name, name, match)) return i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Zeitsynchronisations-Routine von cudaminer mit CPU sleep
|
||||
typedef struct { double value[8]; } tsumarray;
|
||||
cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)
|
||||
|
Loading…
Reference in New Issue
Block a user