Browse Source

update to version v0.5 (2014-03-27)

master v0.5
Christian Buchner 11 years ago
parent
commit
2ca6ede92b
  1. 2
      Makefile.am
  2. 2
      Makefile.in
  3. 20
      README.txt
  4. 27
      ccminer.vcxproj
  5. 20
      configure
  6. 2
      configure.ac
  7. 50
      cpu-miner.c
  8. 4
      cpuminer-config.h
  9. 4
      cuda_blake512.cu
  10. 2
      cuda_combine.cu
  11. 9
      cuda_fugue256.cu
  12. 2
      cuda_groestl512.cu
  13. 260
      cuda_groestlcoin.cu
  14. 149
      cuda_hefty1.cu
  15. 2
      cuda_keccak512.cu
  16. 9
      cuda_sha256.cu
  17. 35
      heavy.cu

2
Makefile.am

@ -34,4 +34,4 @@ ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -f @@ -34,4 +34,4 @@ ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -f
# we're now targeting all major compute architectures within one binary.
.cu.o:
$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_10,code=\"sm_10,compute_10\" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<

2
Makefile.in

@ -1035,7 +1035,7 @@ uninstall-am: uninstall-binPROGRAMS @@ -1035,7 +1035,7 @@ uninstall-am: uninstall-binPROGRAMS
# we're now targeting all major compute architectures within one binary.
.cu.o:
$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_10,code=\"sm_10,compute_10\" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.

20
README.txt

@ -1,5 +1,5 @@ @@ -1,5 +1,5 @@
ccMiner release 0.4 (Mar 24th 2014) - Groestlcoin Pool Release
ccMiner release 0.5 (Mar 27th 2014) - "Hefty Optimization"
-------------------------------------------------------------
***************************************************************
@ -38,6 +38,11 @@ its command line interface and options. @@ -38,6 +38,11 @@ its command line interface and options.
fugue256 use to mine Fuguecoin
groestl use to mine Groestlcoin
-d, --devices gives a comma separated list of CUDA device IDs
to operate on. Device IDs start counting from 0!
Alternatively give string names of your card like
gtx780ti or gt640#2 (matching 2nd gt640 in the PC).
-o, --url=URL URL of mining server (default: " DEF_RPC_URL ")
-O, --userpass=U:P username:password pair for mining server
-u, --user=USERNAME username for mining server
@ -63,8 +68,10 @@ its command line interface and options. @@ -63,8 +68,10 @@ its command line interface and options.
-V, --version display version information and exit
-h, --help display this help text and exit
>>> Examples <<<
Example for Heavycoin Mining on heavycoinpool.com with a single gpu in your system
ccminer.exe -t 1 -a heavy -o stratum+tcp://stratum01.heavycoinpool.com:5333 -u <<username.worker>> -p <<workerpassword>> -v 512
@ -107,6 +114,17 @@ from your old clunkers. @@ -107,6 +114,17 @@ from your old clunkers.
>>> RELEASE HISTORY <<<
March, 27 2014 Heavycoin exchange rates soar, and as a result this coin
gets some love: We greatly optimized the Hefty1 kernel
for speed. Expect some hefty gains, especially on 750Ti's!
By popular demand, we added the -d option as known from
cudaminer.
Different compute capability builds are now provided until
we figure out how to pack everything into a single executable
in a Windows build.
March, 24 2014 fixed Groestl pool support
went back to Compute 1.x for cuda_hefty1.cu kernel by

27
ccminer.vcxproj

@ -95,12 +95,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command> @@ -95,12 +95,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
<CInterleavedPTX>true</CInterleavedPTX>
</CudaCompile>
<CudaCompile>
<MaxRegCount>63</MaxRegCount>
<MaxRegCount>80</MaxRegCount>
</CudaCompile>
<CudaCompile>
<PtxAsOptionV>true</PtxAsOptionV>
<Keep>true</Keep>
<CodeGeneration>compute_20,sm_20</CodeGeneration>
<CodeGeneration>compute_35,sm_35</CodeGeneration>
<Include>
</Include>
<AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@ -127,12 +127,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command> @@ -127,12 +127,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
<CInterleavedPTX>true</CInterleavedPTX>
</CudaCompile>
<CudaCompile>
<MaxRegCount>63</MaxRegCount>
<MaxRegCount>80</MaxRegCount>
</CudaCompile>
<CudaCompile>
<PtxAsOptionV>true</PtxAsOptionV>
<Keep>true</Keep>
<CodeGeneration>compute_20,sm_20</CodeGeneration>
<CodeGeneration>compute_35,sm_35</CodeGeneration>
<Include>
</Include>
<AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@ -163,12 +163,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command> @@ -163,12 +163,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
<CInterleavedPTX>true</CInterleavedPTX>
</CudaCompile>
<CudaCompile>
<MaxRegCount>63</MaxRegCount>
<MaxRegCount>80</MaxRegCount>
</CudaCompile>
<CudaCompile>
<PtxAsOptionV>true</PtxAsOptionV>
<Keep>true</Keep>
<CodeGeneration>compute_20,sm_20</CodeGeneration>
<CodeGeneration>compute_35,sm_35</CodeGeneration>
<Include>
</Include>
<AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@ -199,12 +199,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command> @@ -199,12 +199,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
<CInterleavedPTX>true</CInterleavedPTX>
</CudaCompile>
<CudaCompile>
<MaxRegCount>63</MaxRegCount>
<MaxRegCount>80</MaxRegCount>
</CudaCompile>
<CudaCompile>
<PtxAsOptionV>true</PtxAsOptionV>
<Keep>true</Keep>
<CodeGeneration>compute_20,sm_20</CodeGeneration>
<CodeGeneration>compute_35,sm_35</CodeGeneration>
<Include>
</Include>
<AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@ -277,16 +277,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command> @@ -277,16 +277,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
<CudaCompile Include="cuda_fugue256.cu" />
<CudaCompile Include="cuda_groestl512.cu" />
<CudaCompile Include="cuda_groestlcoin.cu" />
<CudaCompile Include="cuda_hefty1.cu">
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">compute_10,sm_10</CodeGeneration>
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">compute_10,sm_10</CodeGeneration>
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">124</MaxRegCount>
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">124</MaxRegCount>
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">compute_10,sm_10</CodeGeneration>
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">124</MaxRegCount>
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Release|x64'">compute_10,sm_10</CodeGeneration>
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Release|x64'">124</MaxRegCount>
</CudaCompile>
<CudaCompile Include="cuda_hefty1.cu" />
<CudaCompile Include="cuda_keccak512.cu" />
<CudaCompile Include="cuda_sha256.cu" />
<CudaCompile Include="heavy.cu" />

20
configure vendored

@ -1,6 +1,6 @@ @@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.68 for ccminer 2014.03.24.
# Generated by GNU Autoconf 2.68 for ccminer 2014.03.27.
#
#
# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@ -557,8 +557,8 @@ MAKEFLAGS= @@ -557,8 +557,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='ccminer'
PACKAGE_TARNAME='ccminer'
PACKAGE_VERSION='2014.03.24'
PACKAGE_STRING='ccminer 2014.03.24'
PACKAGE_VERSION='2014.03.27'
PACKAGE_STRING='ccminer 2014.03.27'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@ -1297,7 +1297,7 @@ if test "$ac_init_help" = "long"; then @@ -1297,7 +1297,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures ccminer 2014.03.24 to adapt to many kinds of systems.
\`configure' configures ccminer 2014.03.27 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@ -1368,7 +1368,7 @@ fi @@ -1368,7 +1368,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of ccminer 2014.03.24:";;
short | recursive ) echo "Configuration of ccminer 2014.03.27:";;
esac
cat <<\_ACEOF
@ -1469,7 +1469,7 @@ fi @@ -1469,7 +1469,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
ccminer configure 2014.03.24
ccminer configure 2014.03.27
generated by GNU Autoconf 2.68
Copyright (C) 2010 Free Software Foundation, Inc.
@ -1972,7 +1972,7 @@ cat >config.log <<_ACEOF @@ -1972,7 +1972,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by ccminer $as_me 2014.03.24, which was
It was created by ccminer $as_me 2014.03.27, which was
generated by GNU Autoconf 2.68. Invocation command line was
$ $0 $@
@ -2901,7 +2901,7 @@ fi @@ -2901,7 +2901,7 @@ fi
# Define the identity of the package.
PACKAGE='ccminer'
VERSION='2014.03.24'
VERSION='2014.03.27'
cat >>confdefs.h <<_ACEOF
@ -7118,7 +7118,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 @@ -7118,7 +7118,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by ccminer $as_me 2014.03.24, which was
This file was extended by ccminer $as_me 2014.03.27, which was
generated by GNU Autoconf 2.68. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@ -7184,7 +7184,7 @@ _ACEOF @@ -7184,7 +7184,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
ccminer config.status 2014.03.24
ccminer config.status 2014.03.27
configured by $0, generated by GNU Autoconf 2.68,
with options \\"\$ac_cs_config\\"

2
configure.ac

@ -1,4 +1,4 @@ @@ -1,4 +1,4 @@
AC_INIT([ccminer], [2014.03.24])
AC_INIT([ccminer], [2014.03.27])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

50
cpu-miner.c

@ -51,8 +51,13 @@ @@ -51,8 +51,13 @@
// from heavy.cu
// Declared with C linkage so this C file can call the implementations
// compiled as C++/CUDA.
#ifdef __cplusplus
extern "C"
{
#endif
// Result is stored in num_processors by main(); a value of 0 is treated
// there as "No CUDA devices found".
int cuda_num_devices();
// Maps a card name given to the -d option to a device index; the caller
// treats a negative result (or one >= num_processors) as "not found".
int cuda_finddevice(char *name);
#ifdef __cplusplus
}
#endif
#ifdef __linux /* Linux specific policy and affinity management */
@ -144,10 +149,11 @@ static int opt_scantime = 5; @@ -144,10 +149,11 @@ static int opt_scantime = 5;
static json_t *opt_config;
static const bool opt_time = true;
static sha256_algos opt_algo = ALGO_HEAVY;
static int opt_n_threads;
static int opt_n_threads = 0;
bool opt_trust_pool = false;
uint16_t opt_vote = 9999;
static int num_processors;
int device_map[8] = {0,1,2,3,4,5,6,7}; // CB
static char *rpc_url;
static char *rpc_userpass;
static char *rpc_user, *rpc_pass;
@ -185,7 +191,11 @@ Options:\n\ @@ -185,7 +191,11 @@ Options:\n\
-a, --algo=ALGO specify the algorithm to use\n\
fugue256 Fuguecoin hash\n\
heavy Heavycoin hash\n\
-v, --vote=VOTE block reward vote\n\
-d, --devices takes a comma separated list of CUDA devices to use.\n\
Device IDs start counting from 0! Alternatively takes\n\
string names of your cards like gtx780ti or gt640#2\n\
(matching 2nd gt640 in the PC)\n\
-v, --vote=VOTE block reward vote (for HeavyCoin)\n\
-m, --trust-pool trust the max block reward vote (maxvote) sent by the pool\n\
-o, --url=URL URL of mining server\n\
-O, --userpass=U:P username:password pair for mining server\n\
@ -227,7 +237,7 @@ static char const short_options[] = @@ -227,7 +237,7 @@ static char const short_options[] =
#ifdef HAVE_SYSLOG_H
"S"
#endif
"a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vmv:";
"a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vd:mv:";
static struct option const options[] = {
{ "algo", 1, NULL, 'a' },
@ -259,6 +269,7 @@ static struct option const options[] = { @@ -259,6 +269,7 @@ static struct option const options[] = {
{ "user", 1, NULL, 'u' },
{ "userpass", 1, NULL, 'O' },
{ "version", 0, NULL, 'V' },
{ "devices", 1, NULL, 'd' },
{ 0, 0, 0, 0 }
};
@ -1251,6 +1262,32 @@ static void parse_arg (int key, char *arg) @@ -1251,6 +1262,32 @@ static void parse_arg (int key, char *arg)
case 'S':
use_syslog = true;
break;
case 'd': // CB
	{
		/* Parse the comma separated -d list into device_map[].
		 * Each entry is either a single-digit device index or a card name
		 * that is resolved through cuda_finddevice(). On return
		 * opt_n_threads holds the number of devices listed.
		 * Note: strtok() modifies arg in place. */
		const int max_devices = (int)(sizeof(device_map) / sizeof(device_map[0]));
		char * pch = strtok (arg,",");
		opt_n_threads = 0;
		while (pch != NULL) {
			int device;
			if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0')
			{
				/* single digit: numeric device ID */
				device = atoi(pch);
				if (device >= num_processors) {
					applog(LOG_ERR, "Non-existant CUDA device #%d specified in -d option", device);
					exit(1);
				}
			} else {
				/* anything else: look the card up by name */
				device = cuda_finddevice(pch);
				if (device < 0 || device >= num_processors) {
					applog(LOG_ERR, "Non-existant CUDA device '%s' specified in -d option", pch);
					exit(1);
				}
			}
			/* device_map has a fixed size; refuse to write past its end */
			if (opt_n_threads >= max_devices) {
				applog(LOG_ERR, "Too many CUDA devices specified in -d option (maximum is %d)", max_devices);
				exit(1);
			}
			device_map[opt_n_threads++] = device;
			pch = strtok (NULL, ",");
		}
	}
	break;
case 'V':
show_version_and_exit();
case 'h':
@ -1346,7 +1383,7 @@ static void signal_handler(int sig) @@ -1346,7 +1383,7 @@ static void signal_handler(int sig)
}
#endif
#define PROGRAM_VERSION "0.4"
#define PROGRAM_VERSION "0.5"
int main(int argc, char *argv[])
{
struct thr_info *thr;
@ -1370,6 +1407,9 @@ int main(int argc, char *argv[]) @@ -1370,6 +1407,9 @@ int main(int argc, char *argv[])
rpc_user = strdup("");
rpc_pass = strdup("");
pthread_mutex_init(&applog_lock, NULL);
num_processors = cuda_num_devices();
/* parse command line */
parse_cmdline(argc, argv);
@ -1385,7 +1425,6 @@ int main(int argc, char *argv[]) @@ -1385,7 +1425,6 @@ int main(int argc, char *argv[])
sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass);
}
pthread_mutex_init(&applog_lock, NULL);
pthread_mutex_init(&stats_lock, NULL);
pthread_mutex_init(&g_work_lock, NULL);
pthread_mutex_init(&stratum.sock_lock, NULL);
@ -1416,7 +1455,6 @@ int main(int argc, char *argv[]) @@ -1416,7 +1455,6 @@ int main(int argc, char *argv[])
}
#endif
num_processors = cuda_num_devices();
if (num_processors == 0)
{
applog(LOG_ERR, "No CUDA devices found! terminating.");

4
cpuminer-config.h

@ -152,7 +152,7 @@ @@ -152,7 +152,7 @@
#define PACKAGE_NAME "ccminer"
/* Define to the full name and version of this package. */
#define PACKAGE_STRING "ccminer 2014.03.24"
#define PACKAGE_STRING "ccminer 2014.03.27"
/* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME
@ -161,7 +161,7 @@ @@ -161,7 +161,7 @@
#undef PACKAGE_URL
/* Define to the version of this package. */
#define PACKAGE_VERSION "2014.03.24"
#define PACKAGE_VERSION "2014.03.27"
/* If using the C implementation of alloca, define if you know the
direction of stack growth for your system; otherwise it will be

4
cuda_blake512.cu

@ -292,13 +292,13 @@ __host__ void blake512_cpu_setBlock(void *pdata) @@ -292,13 +292,13 @@ __host__ void blake512_cpu_setBlock(void *pdata)
__host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
{
const int threadsperblock = 128;
const int threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);

2
cuda_combine.cu

@ -138,7 +138,7 @@ void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *h @@ -138,7 +138,7 @@ void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *h
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);

9
cuda_fugue256.cu

@ -9,7 +9,10 @@ @@ -9,7 +9,10 @@
#define USE_SHARED 1
// heavy.cu
// aus cpu-miner.c
extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Folgende Definitionen später durch header ersetzen
@ -732,7 +735,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas @@ -732,7 +735,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
void fugue256_cpu_init(int thr_id, int threads)
{
cudaSetDevice(thr_id);
cudaSetDevice(device_map[thr_id]);
// Kopiere die Hash-Tabellen in den GPU-Speicher
texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
@ -774,7 +777,7 @@ __host__ void fugue256_cpu_hash(int thr_id, int threads, int startNounce, void * @@ -774,7 +777,7 @@ __host__ void fugue256_cpu_hash(int thr_id, int threads, int startNounce, void *
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
// Größe des dynamischen Shared Memory Bereichs
#if USE_SHARED
size_t shared_size = 4 * 256 * sizeof(uint32_t);
#else

2
cuda_groestl512.cu

@ -813,7 +813,7 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce) @@ -813,7 +813,7 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);

260
cuda_groestlcoin.cu

@ -9,6 +9,10 @@ @@ -9,6 +9,10 @@
#define USE_SHARED 1
// aus cpu-miner.c
extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Folgende Definitionen später durch header ersetzen
@ -20,13 +24,7 @@ typedef unsigned long long uint64_t; @@ -20,13 +24,7 @@ typedef unsigned long long uint64_t;
__constant__ uint32_t pTarget[8]; // Single GPU
extern uint32_t *d_resultNonce[8];
// globaler Speicher für unsere Ergebnisse
uint32_t *d_hashGROESTLCOINoutput[8];
__constant__ uint32_t groestlcoin_gpu_state[32];
__constant__ uint32_t groestlcoin_gpu_msg[32];
__constant__ uint32_t sha256coin_gpu_constantTable[64];
__constant__ uint32_t sha256coin_gpu_register[8];
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
@ -83,7 +81,13 @@ extern uint32_t T2dn_cpu[]; @@ -83,7 +81,13 @@ extern uint32_t T2dn_cpu[];
extern uint32_t T3up_cpu[];
extern uint32_t T3dn_cpu[];
#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
// S(x, n): rotate the 32-bit value x right by n bits.
#if __CUDA_ARCH__ < 350
// Devices below compute 3.5 (and the host compilation pass, where
// __CUDA_ARCH__ is undefined): plain shift/or rotate.
#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
#else
// Compute 3.5 and above: use the funnel shift intrinsic.
// No trailing semicolon, so the macro stays usable inside expressions.
#define S(x, n) __funnelshift_r((x), (x), (n))
#endif
#define R(x, n) ((x) >> (n))
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
@ -95,18 +99,57 @@ extern uint32_t T3dn_cpu[]; @@ -95,18 +99,57 @@ extern uint32_t T3dn_cpu[];
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
__device__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
__device__ __forceinline__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
{
uint32_t t[32];
//#pragma unroll 14
for(int r=0;r<14;r++)
{
#pragma unroll 16
for(int k=0;k<16;k++)
switch(r)
{
a[(k*2)+0] ^= PC32up(k * 0x10, r);
//a[(k<<1)+1] ^= PC32dn(k * 0x10, r);
case 0:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break;
case 1:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break;
case 2:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break;
case 3:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break;
case 4:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break;
case 5:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break;
case 6:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break;
case 7:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break;
case 8:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break;
case 9:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break;
case 10:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break;
case 11:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break;
case 12:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break;
case 13:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break;
}
// RBTT
@ -137,18 +180,57 @@ __device__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs) @@ -137,18 +180,57 @@ __device__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
}
}
__device__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
__device__ __forceinline__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
{
//#pragma unroll 14
for(int r=0;r<14;r++)
{
uint32_t t[32];
#pragma unroll 16
for(int k=0;k<16;k++)
switch(r)
{
a[(k*2)+0] ^= QC32up(k * 0x10, r);
a[(k*2)+1] ^= QC32dn(k * 0x10, r);
case 0:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break;
case 1:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break;
case 2:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break;
case 3:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break;
case 4:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break;
case 5:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break;
case 6:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break;
case 7:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break;
case 8:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break;
case 9:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break;
case 10:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break;
case 11:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break;
case 12:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break;
case 13:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break;
}
// RBTT
@ -179,12 +261,12 @@ __device__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs) @@ -179,12 +261,12 @@ __device__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
}
}
#if USE_SHARED
__global__ void __launch_bounds__(256)
__global__ void /* __launch_bounds__(256) */
#else
__global__ void
#endif
groestlcoin_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
groestlcoin_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce)
{
#if USE_SHARED
extern __shared__ char mixtabs[];
@ -204,72 +286,52 @@ __global__ void @@ -204,72 +286,52 @@ __global__ void
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
/////
///// Lieber groestl, mach, dass es abgeht!!!
/////
// GROESTL
uint32_t message[32];
uint32_t state[32];
uint32_t g[32];
#pragma unroll 32
for(int k=0;k<32;k++)
{
// TODO: die Vorbelegung mit Nullen braucht nicht zwingend aus dem
// constant Memory zu lesen. Das ist Verschwendung von Bandbreite.
state[k] = groestlcoin_gpu_state[k];
message[k] = groestlcoin_gpu_msg[k];
}
for(int k=0;k<32;k++) message[k] = groestlcoin_gpu_msg[k];
uint32_t nounce = startNounce + thread;
message[19] = SWAB32(nounce);
#pragma unroll 32
for(int u=0;u<32;u++)
g[u] = message[u] ^ state[u]; // TODO: state ist fast ueberall 0.
for(int u=0;u<32;u++) state[u] = message[u];
state[31] ^= 0x20000;
// Perm
#if USE_SHARED
groestlcoin_perm_P(g, mixtabs); // TODO: g[] entspricht fast genau message[]
groestlcoin_perm_Q(message, mixtabs); // kann man das ausnutzen?
groestlcoin_perm_P(state, mixtabs);
state[31] ^= 0x20000;
groestlcoin_perm_Q(message, mixtabs);
#else
groestlcoin_perm_P(g, NULL);
groestlcoin_perm_P(state, NULL);
state[31] ^= 0x20000;
groestlcoin_perm_Q(message, NULL);
#endif
#pragma unroll 32
for(int u=0;u<32;u++) state[u] ^= message[u];
#pragma unroll 32
for(int u=0;u<32;u++)
{
// TODO: kann man evtl. das xor mit g[u] vorziehen hinter die groestlcoin_perm_P Funktion
// was den Registerbedarf senken koennte?
state[u] ^= g[u] ^ message[u];
g[u] = state[u];
}
for(int u=0;u<32;u++) message[u] = state[u];
#if USE_SHARED
groestlcoin_perm_P(g, mixtabs);
groestlcoin_perm_P(message, mixtabs);
#else
groestlcoin_perm_P(g, NULL);
groestlcoin_perm_P(message, NULL);
#endif
#pragma unroll 32
for(int u=0;u<32;u++)
state[u] ^= g[u];
for(int u=0;u<32;u++) state[u] ^= message[u];
////
//// 2. Runde groestl
////
#pragma unroll 16
for(int k=0;k<16;k++)
message[k] = state[k + 16];
#pragma unroll 32
for(int k=0;k<32;k++)
state[k] = groestlcoin_gpu_state[k];
#pragma unroll 16
for(int k=0;k<16;k++)
for(int k=0;k<16;k++) message[k] = state[k + 16];
#pragma unroll 14
for(int k=1;k<15;k++)
message[k+16] = 0;
message[16] = 0x80;
@ -277,73 +339,58 @@ __global__ void @@ -277,73 +339,58 @@ __global__ void
#pragma unroll 32
for(int u=0;u<32;u++)
g[u] = message[u] ^ state[u];
state[u] = message[u];
state[31] ^= 0x20000;
// Perm
#if USE_SHARED
groestlcoin_perm_P(g, mixtabs);
groestlcoin_perm_P(state, mixtabs);
state[31] ^= 0x20000;
groestlcoin_perm_Q(message, mixtabs);
#else
groestlcoin_perm_P(g, NULL);
groestlcoin_perm_P(state, NULL);
state[31] ^= 0x20000;
groestlcoin_perm_Q(message, NULL);
#endif
#pragma unroll 32
for(int u=0;u<32;u++)
{
state[u] ^= g[u] ^ message[u];
g[u] = state[u];
}
for(int u=0;u<32;u++) state[u] ^= message[u];
#pragma unroll 32
for(int u=0;u<32;u++) message[u] = state[u];
#if USE_SHARED
groestlcoin_perm_P(g, mixtabs);
groestlcoin_perm_P(message, mixtabs);
#else
groestlcoin_perm_P(g, NULL);
groestlcoin_perm_P(message, NULL);
#endif
#pragma unroll 32
for(int u=0;u<32;u++)
state[u] ^= g[u];
/*
#pragma unroll 8
for(int k=0;k<8;k++)
hash[k] = state[k+16];
*/
for(int u=0;u<32;u++) state[u] ^= message[u];
// kopiere Ergebnis
/*
#pragma unroll 16
for(int k=0;k<16;k++)
((uint32_t*)outputHash)[16*thread+k] = state[k + 16];
*/
int i;
int i, position = -1;
bool rc = true;
#pragma unroll 8
for (i = 7; i >= 0; i--) {
if (state[i+16] > pTarget[i]) {
if(position < i) {
position = i;
rc = false;
break;
}
}
if (state[i+16] < pTarget[i]) {
if(position < i) {
position = i;
rc = true;
break;
}
}
}
if(rc == true)
{
if(resNounce[0] > nounce)
{
resNounce[0] = nounce;
/*
#pragma unroll 8
for(int k=0;k<8;k++)
((uint32_t*)outputHash)[k] = (hash[k]);
*/
}
}
}
}
@ -360,7 +407,7 @@ __global__ void @@ -360,7 +407,7 @@ __global__ void
// Setup-Funktionen
__host__ void groestlcoin_cpu_init(int thr_id, int threads)
{
cudaSetDevice(thr_id);
cudaSetDevice(device_map[thr_id]);
cudaDeviceSetCacheConfig( cudaFuncCachePreferShared );
// Texturen mit obigem Makro initialisieren
texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
@ -372,23 +419,8 @@ __host__ void groestlcoin_cpu_init(int thr_id, int threads) @@ -372,23 +419,8 @@ __host__ void groestlcoin_cpu_init(int thr_id, int threads)
texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
// setze register
// TODO: fast vollstaendige Vorbelegung mit Nullen.
// da besteht doch Optimierungspotenzial im GPU Kernel
// denn mit Nullen braucht man nicht wirklich rechnen.
uint32_t groestl_state_init[32];
memset(groestl_state_init, 0, sizeof(uint32_t) * 32);
groestl_state_init[31] = 0x20000;
// state speichern
cudaMemcpyToSymbol( groestlcoin_gpu_state,
groestl_state_init,
128);
// Speicher für Gewinner-Nonce belegen
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
// Speicher für alle Ergebnisse belegen (nur für Debug)
cudaMalloc(&d_hashGROESTLCOINoutput[thr_id], 8 * sizeof(uint32_t) * threads);
}
__host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
@ -430,7 +462,7 @@ __host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce @@ -430,7 +462,7 @@ __host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
// Größe des dynamischen Shared Memory Bereichs
#if USE_SHARED
size_t shared_size = 8 * 256 * sizeof(uint32_t);
#else
@ -440,16 +472,10 @@ __host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce @@ -440,16 +472,10 @@ __host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
//fprintf(stderr, "ThrID: %d\n", thr_id);
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
groestlcoin_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hashGROESTLCOINoutput[thr_id], d_resultNonce[thr_id]);
groestlcoin_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
// Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, 0, thr_id);
cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
/// Debug
//cudaMemcpy(outputHashes, d_hashGROESTLCOINoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost);
// Nounce
//cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
}

149
cuda_hefty1.cu

@ -2,26 +2,40 @@ @@ -2,26 +2,40 @@
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// aus cpu-miner.c
extern int device_map[8];
#include <stdio.h>
#include <memory.h>
#define USE_SHARED 1
// Folgende Definitionen später durch header ersetzen
typedef unsigned int uint32_t;
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props;
// globaler Speicher für alle HeftyHashes aller Threads
uint32_t *d_heftyHashes[8];
/* Hash-Tabellen */
__constant__ uint32_t hefty_gpu_constantTable[64];
// heftyLookUp(x): fetch entry x of the Hefty1 constant table.
#if USE_SHARED
// Reads the x-th uint32_t from heftytab (declared elsewhere; presumably the
// shared-memory copy of the table -- confirm against the kernel).
#define heftyLookUp(x) (*((uint32_t*)heftytab + (x)))
#else
// Reads directly from the __constant__ table.
#define heftyLookUp(x) hefty_gpu_constantTable[x]
#endif
// muss expandiert werden
__constant__ uint32_t hefty_gpu_blockHeader[16]; // 2x512 Bit Message
__constant__ uint32_t hefty_gpu_register[8];
__constant__ uint32_t hefty_gpu_sponge[4];
uint32_t hefty_cpu_hashTable[] = { 0x6a09e667UL,
uint32_t hefty_cpu_hashTable[] = {
0x6a09e667UL,
0xbb67ae85UL,
0x3c6ef372UL,
0xa54ff53aUL,
@ -29,6 +43,7 @@ uint32_t hefty_cpu_hashTable[] = { 0x6a09e667UL, @@ -29,6 +43,7 @@ uint32_t hefty_cpu_hashTable[] = { 0x6a09e667UL,
0x9b05688cUL,
0x1f83d9abUL,
0x5be0cd19UL };
uint32_t hefty_cpu_constantTable[] = {
0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
@ -48,7 +63,11 @@ uint32_t hefty_cpu_constantTable[] = { @@ -48,7 +63,11 @@ uint32_t hefty_cpu_constantTable[] = {
0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
};
#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
//#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
// Rotate the 32-bit word x right by n bit positions.
//
// The rotate count is reduced modulo 32 and the complementary left shift is
// masked as well, so n == 0 returns x unchanged instead of evaluating
// x << 32 (a shift by the full word width, which is undefined behaviour).
// For n in 1..31 the result is identical to the plain
// (x >> n) | (x << (32 - n)) formulation.
static __host__ __device__ uint32_t S(uint32_t x, int n)
{
    const uint32_t r = (uint32_t)n & 31u;
    return (x >> r) | (x << ((32u - r) & 31u));
}
#define R(x, n) ((x) >> (n))
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
@ -65,36 +84,50 @@ __host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x) @@ -65,36 +84,50 @@ __host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x)
{
uint16_t w = (x >> 16) ^ (x & 0xffff);
uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) );
return (n >> 2) ^ (n & 0x03);
return 24 - (((n >> 2) ^ (n & 0x03)) << 3);
}
// 4 auf einmal
#define smoosh4Quad(x) ( (((x)>>4) ^ (x)) & 0x0F0F0F0F )
#define getByte(x,y) ( ((x) >> (y)) & 0xFF )
__host__ __device__ void Mangle(uint32_t *inp)
__host__ __forceinline__ __device__ void Mangle(uint32_t *inp)
{
uint32_t r = smoosh4Quad(inp[0]);
//uint8_t r0 = smoosh4( (uint8_t)(inp[0] >> 24) );
//uint8_t r1 = smoosh4( (uint8_t)(inp[0] >> 16) );
//uint8_t r2 = smoosh4( (uint8_t)(inp[0] >> 8) );
//uint8_t r3 = smoosh4( (uint8_t)(inp[0] & 0xFF) );
uint32_t inp0org;
uint32_t tmp0Mask, tmp1Mask;
uint32_t in1, in2, isAddition;
uint32_t tmp;
uint8_t b;
inp[1] = inp[1] ^ S(inp[0], getByte(r, 24));
switch (smoosh2(inp[1])) {
case 0: inp[2] ^= S(inp[0], 1 + getByte(r,24)); break;
case 1: inp[2] += S(~inp[0], 1 + getByte(r,16)); break;
case 2: inp[2] &= S(~inp[0], 1 + getByte(r,8)); break;
case 3: inp[2] ^= S(inp[0], 1 + getByte(r,0)); break;
}
uint32_t tmp = smoosh2(inp[1] ^ inp[2]);
switch (tmp) {
case 0: inp[3] ^= S(inp[0], 2 + getByte(r,24)); break;
case 1: inp[3] += S(~inp[0], 2 + getByte(r,16)); break;
case 2: inp[3] &= S(~inp[0], 2 + getByte(r,8)); break;
case 3: inp[3] ^= S(inp[0], 2 + getByte(r,0)); break;
}
r += 0x01010101;
tmp = smoosh2(inp[1]);
b = getByte(r,tmp);
inp0org = S(inp[0], b);
tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0
tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0
in1 = (inp[2] & ~inp0org) |
(tmp1Mask & ~inp[2] & inp0org) |
(~tmp0Mask & ~inp[2] & inp0org);
in2 = inp[2] += ~inp0org;
isAddition = ~tmp0Mask & tmp1Mask;
inp[2] = isAddition ? in2 : in1;
r += 0x01010101;
tmp = smoosh2(inp[1] ^ inp[2]);
b = getByte(r,tmp);
inp0org = S(inp[0], b);
tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0
tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0
in1 = (inp[3] & ~inp0org) |
(tmp1Mask & ~inp[3] & inp0org) |
(~tmp0Mask & ~inp[3] & inp0org);
in2 = inp[3] += ~inp0org;
isAddition = ~tmp0Mask & tmp1Mask;
inp[3] = isAddition ? in2 : in1;
inp[0] ^= (inp[1] ^ inp[2]) + inp[3];
}
@ -115,38 +148,15 @@ __host__ __forceinline__ __device__ uint32_t Squeeze(uint32_t *inp) @@ -115,38 +148,15 @@ __host__ __forceinline__ __device__ uint32_t Squeeze(uint32_t *inp)
__host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x)
{
uint32_t r = Squeeze(sponge);
uint32_t t = ((r >> 8) & 0x1F);
uint32_t y = 1 << t;
//uint8_t r0 = r >> 8;
uint8_t r1 = r & 0xFF;
uint32_t y = 1 << ((r >> 8) & 0x1F);
//uint32_t retVal;
//retVal = x;
uint32_t resArr[4];
resArr[0] = x;
resArr[1] = x & ~y;
resArr[2] = x | y;
resArr[3] = x ^ y;
return resArr[r1 & 0x03];
uint32_t a = (((r>>1) & 0x01) << t) & y;
uint32_t b = ((r & 0x01) << t) & y;
uint32_t c = x & y;
/*
switch(r1 & 0x03)
{
case 0:
break;
case 1:
retVal = x & ~y;
break;
case 2:
retVal = x | y;
break;
case 3:
retVal = x ^ y;
break;
}
uint32_t retVal = (x & ~y) | (~b & c) | (a & ~c);
return retVal;
*/
}
__forceinline__ __device__ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
@ -197,6 +207,16 @@ __host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t * @@ -197,6 +207,16 @@ __host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *
// Die Hash-Funktion
__global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHash)
{
#if USE_SHARED
extern __shared__ char heftytab[];
if(threadIdx.x < 64)
{
*((uint32_t*)heftytab + threadIdx.x) = hefty_gpu_constantTable[threadIdx.x];
}
__syncthreads();
#endif
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
@ -204,14 +224,9 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa @@ -204,14 +224,9 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
uint32_t nounce = startNounce + thread;
// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
#if USE_SHARED
extern __shared__ unsigned char s[];
uint32_t *W = (uint32_t *)(&s[W_ALIGNMENT * sizeof(uint32_t) * threadIdx.x]);
#else
// reduktion von 256 byte auf 128 byte
uint32_t W1[16];
uint32_t W2[16];
#endif
// Initialisiere die register a bis h mit der Hash-Tabelle
uint32_t regs[8];
@ -236,18 +251,17 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa @@ -236,18 +251,17 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
W1[k] = hefty_gpu_blockHeader[k];
W1[3] = SWAB32(nounce);
// 2. Runde
#pragma unroll 16
for(int j=0;j<16;j++)
Absorb(sponge, W1[j] ^ hefty_gpu_constantTable[j]);
Absorb(sponge, W1[j] ^ heftyLookUp(j));
// Progress W1 (Bytes 0...63)
#pragma unroll 16
for(int j=0;j<16;j++)
{
Absorb(sponge, regs[3] ^ regs[7]);
hefty_gpu_round(regs, W1[j], hefty_gpu_constantTable[j], sponge);
hefty_gpu_round(regs, W1[j], heftyLookUp(j), sponge);
}
// Progress W2 (Bytes 64...127) then W3 (Bytes 128...191) ...
@ -272,14 +286,13 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa @@ -272,14 +286,13 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
for(int j=0;j<16;j++)
{
Absorb(sponge, regs[3] + regs[7]);
hefty_gpu_round(regs, W2[j], hefty_gpu_constantTable[j + 16 * (k+1)], sponge);
hefty_gpu_round(regs, W2[j], heftyLookUp(j + 16 * (k+1)), sponge);
}
#pragma unroll 16
for(int j=0;j<16;j++)
W1[j] = W2[j];
}
#pragma unroll 8
for(int k=0;k<8;k++)
hash[k] += regs[k];
@ -293,7 +306,9 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa @@ -293,7 +306,9 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
// Setup-Funktionen
__host__ void hefty_cpu_init(int thr_id, int threads)
{
cudaSetDevice(thr_id);
cudaSetDevice(device_map[thr_id]);
cudaGetDeviceProperties(&props, device_map[thr_id]);
// Kopiere die Hash-Tabellen in den GPU-Speicher
cudaMemcpyToSymbol( hefty_gpu_constantTable,
@ -378,15 +393,17 @@ __host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data) @@ -378,15 +393,17 @@ __host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data)
__host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
{
const int threadsperblock = 128;
// Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
// alle anderen mit 512 Threads.
int threadsperblock = (props.major >= 3) ? 768 : 512;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
#if USE_SHARED
size_t shared_size = W_ALIGNMENT*sizeof(uint32_t)*threadsperblock; // ein uint32_t eingefügt gegen Bank Konflikte
// Größe des dynamischen Shared Memory Bereichs
#if USE_SHARED
size_t shared_size = 8 * 64 * sizeof(uint32_t);
#else
size_t shared_size = 0;
#endif

2
cuda_keccak512.cu

@ -264,7 +264,7 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce) @@ -264,7 +264,7 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);

9
cuda_sha256.cu

@ -5,8 +5,6 @@ @@ -5,8 +5,6 @@
#include <stdio.h>
#include <memory.h>
#define W_ALIGNMENT 65
// Folgende Definitionen später durch header ersetzen
typedef unsigned int uint32_t;
@ -59,8 +57,6 @@ __global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void *outputH @@ -59,8 +57,6 @@ __global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void *outputH
nonceVector[thread] = nounce;
// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
//extern __shared__ unsigned char s[];
//uint32_t *W = (uint32_t *)(&s[W_ALIGNMENT * sizeof(uint32_t) * threadIdx.x]);
uint32_t W1[16];
uint32_t W2[16];
@ -257,14 +253,13 @@ __host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashe @@ -257,14 +253,13 @@ __host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashe
__host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce)
{
const int threadsperblock = 128;
const int threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
//size_t shared_size = W_ALIGNMENT*sizeof(uint32_t)*threadsperblock; // ein uint32_t eingefügt gegen Bank Konflikte
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);

35
heavy.cu

@ -163,6 +163,41 @@ extern "C" int cuda_num_devices() @@ -163,6 +163,41 @@ extern "C" int cuda_num_devices()
return GPU_N;
}
// Case-insensitive substring search that ignores blanks (' ') in both strings.
//
// haystack: text to search in.
// needle:   pattern to look for. A '#' in the pattern acts as an ordinal
//           selector: once everything before the '#' has matched at some
//           start position, `match` is incremented and the function returns
//           immediately, reporting whether the new count equals the single
//           decimal digit that follows the '#'.
// match:    running counter, incremented only via the '#' rule above; the
//           caller keeps it across calls.
//
// Returns true if the whole needle matched (or the '#' ordinal test
// succeeded), false otherwise.
static bool substringsearch(const char *haystack, const char *needle, int &match)
{
    int hlen = (int)strlen(haystack);
    int nlen = (int)strlen(needle);
    for (int i = 0; i < hlen; ++i)
    {
        // a candidate match never starts on a blank
        if (haystack[i] == ' ') continue;
        int j = 0, x = 0;
        while (j < nlen)
        {
            // skip blanks on either side independently
            if (haystack[i+x] == ' ') { ++x; continue; }
            if (needle[j] == ' ') { ++j; continue; }
            if (needle[j] == '#') return ++match == needle[j+1] - '0';
            // tolower() is only defined for values representable as unsigned
            // char (or EOF); cast so bytes >= 0x80 are not passed as negatives.
            // At the end of haystack the '\0' mismatches and ends the attempt.
            if (tolower((unsigned char)haystack[i+x]) != tolower((unsigned char)needle[j])) break;
            ++j; ++x;
        }
        if (j == nlen) return true;
    }
    return false;
}
// Find a CUDA device by name (returns the device index, or -1 if none matches)
extern "C" int cuda_finddevice(char *name)
{
    const int deviceCount = cuda_num_devices();
    // counter handed to substringsearch() and shared across all devices
    int hits = 0;
    int dev = 0;
    while (dev < deviceCount)
    {
        cudaDeviceProp info;
        // devices whose properties cannot be queried are skipped
        const bool queried = (cudaGetDeviceProperties(&info, dev) == cudaSuccess);
        if (queried && substringsearch(info.name, name, hits))
            return dev;
        ++dev;
    }
    return -1;
}
// Zeitsynchronisations-Routine von cudaminer mit CPU sleep
typedef struct { double value[8]; } tsumarray;
cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)

Loading…
Cancel
Save