update to version v0.5 (2014-03-27)

This commit is contained in:
Christian Buchner 2014-03-27 00:47:12 +01:00
parent 1bb78f0258
commit 2ca6ede92b
17 changed files with 566 additions and 443 deletions

View File

@ -34,4 +34,4 @@ ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -f
# we're now targeting all major compute architectures within one binary.
.cu.o:
$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_10,code=\"sm_10,compute_10\" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<

View File

@ -1035,7 +1035,7 @@ uninstall-am: uninstall-binPROGRAMS
# we're now targeting all major compute architectures within one binary.
.cu.o:
$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_10,code=\"sm_10,compute_10\" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=63 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
$(NVCC) @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_20,code=\"sm_20,compute_20\" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.

View File

@ -1,5 +1,5 @@
ccMiner release 0.4 (Mar 24th 2014) - Groestlcoin Pool Release
ccMiner release 0.5 (Mar 27th 2014) - "Hefty Optimization"
-------------------------------------------------------------
***************************************************************
@ -38,6 +38,11 @@ its command line interface and options.
fugue256 use to mine Fuguecoin
groestl use to mine Groestlcoin
-d, --devices gives a comma separated list of CUDA device IDs
to operate on. Device IDs start counting from 0!
Alternatively give string names of your card like
gtx780ti or gt640#2 (matching 2nd gt640 in the PC).
-o, --url=URL URL of mining server (default: " DEF_RPC_URL ")
-O, --userpass=U:P username:password pair for mining server
-u, --user=USERNAME username for mining server
@ -63,8 +68,10 @@ its command line interface and options.
-V, --version display version information and exit
-h, --help display this help text and exit
>>> Examples <<<
Example for Heavycoin Mining on heavycoinpool.com with a single gpu in your system
ccminer.exe -t 1 -a heavy -o stratum+tcp://stratum01.heavycoinpool.com:5333 -u <<username.worker>> -p <<workerpassword>> -v 512
@ -107,22 +114,33 @@ from your old clunkers.
>>> RELEASE HISTORY <<<
March, 24 2014 fixed Groestl pool support
March, 27 2014 Heavycoin exchange rates soar, and as a result this coin
gets some love: We greatly optimized the Hefty1 kernel
for speed. Expect some hefty gains, especially on 750Ti's!
went back to Compute 1.x for cuda_hefty1.cu kernel by
default after numerous reports of ccminer v0.2/v0.3
not working with HeavyCoin for some people.
By popular demand, we added the -d option as known from
cudaminer.
March, 23 2014 added Groestlcoin support. stratum status unknown
(the only pool is currently down for fixing issues)
different compute capability builds are now provided until
we figure out how to pack everything into a single executable
in a Windows build.
March, 21 2014 use of shared memory in Fugue256 kernel boosts hash rates
on Fermi and Maxwell devices. Kepler may suffer slightly
(3-5%)
March, 24 2014 fixed Groestl pool support
Fixed Stratum for Fuguecoin. Tested on dwarfpool.
went back to Compute 1.x for cuda_hefty1.cu kernel by
default after numerous reports of ccminer v0.2/v0.3
not working with HeavyCoin for some people.
March, 18 2014 initial release.
March, 23 2014 added Groestlcoin support. stratum status unknown
(the only pool is currently down for fixing issues)
March, 21 2014 use of shared memory in Fugue256 kernel boosts hash rates
on Fermi and Maxwell devices. Kepler may suffer slightly
(3-5%)
Fixed Stratum for Fuguecoin. Tested on dwarfpool.
March, 18 2014 initial release.
>>> AUTHORS <<<

View File

@ -95,12 +95,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
<CInterleavedPTX>true</CInterleavedPTX>
</CudaCompile>
<CudaCompile>
<MaxRegCount>63</MaxRegCount>
<MaxRegCount>80</MaxRegCount>
</CudaCompile>
<CudaCompile>
<PtxAsOptionV>true</PtxAsOptionV>
<Keep>true</Keep>
<CodeGeneration>compute_20,sm_20</CodeGeneration>
<CodeGeneration>compute_35,sm_35</CodeGeneration>
<Include>
</Include>
<AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@ -127,12 +127,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
<CInterleavedPTX>true</CInterleavedPTX>
</CudaCompile>
<CudaCompile>
<MaxRegCount>63</MaxRegCount>
<MaxRegCount>80</MaxRegCount>
</CudaCompile>
<CudaCompile>
<PtxAsOptionV>true</PtxAsOptionV>
<Keep>true</Keep>
<CodeGeneration>compute_20,sm_20</CodeGeneration>
<CodeGeneration>compute_35,sm_35</CodeGeneration>
<Include>
</Include>
<AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@ -163,12 +163,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
<CInterleavedPTX>true</CInterleavedPTX>
</CudaCompile>
<CudaCompile>
<MaxRegCount>63</MaxRegCount>
<MaxRegCount>80</MaxRegCount>
</CudaCompile>
<CudaCompile>
<PtxAsOptionV>true</PtxAsOptionV>
<Keep>true</Keep>
<CodeGeneration>compute_20,sm_20</CodeGeneration>
<CodeGeneration>compute_35,sm_35</CodeGeneration>
<Include>
</Include>
<AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@ -199,12 +199,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
<CInterleavedPTX>true</CInterleavedPTX>
</CudaCompile>
<CudaCompile>
<MaxRegCount>63</MaxRegCount>
<MaxRegCount>80</MaxRegCount>
</CudaCompile>
<CudaCompile>
<PtxAsOptionV>true</PtxAsOptionV>
<Keep>true</Keep>
<CodeGeneration>compute_20,sm_20</CodeGeneration>
<CodeGeneration>compute_35,sm_35</CodeGeneration>
<Include>
</Include>
<AdditionalOptions>-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@ -277,16 +277,7 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
<CudaCompile Include="cuda_fugue256.cu" />
<CudaCompile Include="cuda_groestl512.cu" />
<CudaCompile Include="cuda_groestlcoin.cu" />
<CudaCompile Include="cuda_hefty1.cu">
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">compute_10,sm_10</CodeGeneration>
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">compute_10,sm_10</CodeGeneration>
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">124</MaxRegCount>
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">124</MaxRegCount>
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">compute_10,sm_10</CodeGeneration>
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">124</MaxRegCount>
<CodeGeneration Condition="'$(Configuration)|$(Platform)'=='Release|x64'">compute_10,sm_10</CodeGeneration>
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Release|x64'">124</MaxRegCount>
</CudaCompile>
<CudaCompile Include="cuda_hefty1.cu" />
<CudaCompile Include="cuda_keccak512.cu" />
<CudaCompile Include="cuda_sha256.cu" />
<CudaCompile Include="heavy.cu" />

20
configure vendored
View File

@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.68 for ccminer 2014.03.24.
# Generated by GNU Autoconf 2.68 for ccminer 2014.03.27.
#
#
# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@ -557,8 +557,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='ccminer'
PACKAGE_TARNAME='ccminer'
PACKAGE_VERSION='2014.03.24'
PACKAGE_STRING='ccminer 2014.03.24'
PACKAGE_VERSION='2014.03.27'
PACKAGE_STRING='ccminer 2014.03.27'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@ -1297,7 +1297,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures ccminer 2014.03.24 to adapt to many kinds of systems.
\`configure' configures ccminer 2014.03.27 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@ -1368,7 +1368,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of ccminer 2014.03.24:";;
short | recursive ) echo "Configuration of ccminer 2014.03.27:";;
esac
cat <<\_ACEOF
@ -1469,7 +1469,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
ccminer configure 2014.03.24
ccminer configure 2014.03.27
generated by GNU Autoconf 2.68
Copyright (C) 2010 Free Software Foundation, Inc.
@ -1972,7 +1972,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by ccminer $as_me 2014.03.24, which was
It was created by ccminer $as_me 2014.03.27, which was
generated by GNU Autoconf 2.68. Invocation command line was
$ $0 $@
@ -2901,7 +2901,7 @@ fi
# Define the identity of the package.
PACKAGE='ccminer'
VERSION='2014.03.24'
VERSION='2014.03.27'
cat >>confdefs.h <<_ACEOF
@ -7118,7 +7118,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by ccminer $as_me 2014.03.24, which was
This file was extended by ccminer $as_me 2014.03.27, which was
generated by GNU Autoconf 2.68. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@ -7184,7 +7184,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
ccminer config.status 2014.03.24
ccminer config.status 2014.03.27
configured by $0, generated by GNU Autoconf 2.68,
with options \\"\$ac_cs_config\\"

View File

@ -1,4 +1,4 @@
AC_INIT([ccminer], [2014.03.24])
AC_INIT([ccminer], [2014.03.27])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

View File

@ -51,8 +51,13 @@
// from heavy.cu
#ifdef __cplusplus
extern "C"
{
#endif
int cuda_num_devices();
int cuda_finddevice(char *name);
#ifdef __cplusplus
}
#endif
#ifdef __linux /* Linux specific policy and affinity management */
@ -144,10 +149,11 @@ static int opt_scantime = 5;
static json_t *opt_config;
static const bool opt_time = true;
static sha256_algos opt_algo = ALGO_HEAVY;
static int opt_n_threads;
static int opt_n_threads = 0;
bool opt_trust_pool = false;
uint16_t opt_vote = 9999;
static int num_processors;
int device_map[8] = {0,1,2,3,4,5,6,7}; // CB
static char *rpc_url;
static char *rpc_userpass;
static char *rpc_user, *rpc_pass;
@ -185,7 +191,11 @@ Options:\n\
-a, --algo=ALGO specify the algorithm to use\n\
fugue256 Fuguecoin hash\n\
heavy Heavycoin hash\n\
-v, --vote=VOTE block reward vote\n\
-d, --devices takes a comma separated list of CUDA devices to use.\n\
Device IDs start counting from 0! Alternatively takes\n\
string names of your cards like gtx780ti or gt640#2\n\
(matching 2nd gt640 in the PC)\n\
-v, --vote=VOTE block reward vote (for HeavyCoin)\n\
-m, --trust-pool trust the max block reward vote (maxvote) sent by the pool\n\
-o, --url=URL URL of mining server\n\
-O, --userpass=U:P username:password pair for mining server\n\
@ -227,7 +237,7 @@ static char const short_options[] =
#ifdef HAVE_SYSLOG_H
"S"
#endif
"a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vmv:";
"a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vd:mv:";
static struct option const options[] = {
{ "algo", 1, NULL, 'a' },
@ -259,6 +269,7 @@ static struct option const options[] = {
{ "user", 1, NULL, 'u' },
{ "userpass", 1, NULL, 'O' },
{ "version", 0, NULL, 'V' },
{ "devices", 1, NULL, 'd' },
{ 0, 0, 0, 0 }
};
@ -1251,6 +1262,32 @@ static void parse_arg (int key, char *arg)
case 'S':
use_syslog = true;
break;
case 'd': // CB
{
char * pch = strtok (arg,",");
opt_n_threads = 0;
while (pch != NULL) {
if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0')
{
if (atoi(pch) < num_processors)
device_map[opt_n_threads++] = atoi(pch);
else {
applog(LOG_ERR, "Non-existant CUDA device #%d specified in -d option", atoi(pch));
exit(1);
}
} else {
int device = cuda_finddevice(pch);
if (device >= 0 && device < num_processors)
device_map[opt_n_threads++] = device;
else {
applog(LOG_ERR, "Non-existant CUDA device '%s' specified in -d option", pch);
exit(1);
}
}
pch = strtok (NULL, ",");
}
}
break;
case 'V':
show_version_and_exit();
case 'h':
@ -1346,7 +1383,7 @@ static void signal_handler(int sig)
}
#endif
#define PROGRAM_VERSION "0.4"
#define PROGRAM_VERSION "0.5"
int main(int argc, char *argv[])
{
struct thr_info *thr;
@ -1370,6 +1407,9 @@ int main(int argc, char *argv[])
rpc_user = strdup("");
rpc_pass = strdup("");
pthread_mutex_init(&applog_lock, NULL);
num_processors = cuda_num_devices();
/* parse command line */
parse_cmdline(argc, argv);
@ -1385,7 +1425,6 @@ int main(int argc, char *argv[])
sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass);
}
pthread_mutex_init(&applog_lock, NULL);
pthread_mutex_init(&stats_lock, NULL);
pthread_mutex_init(&g_work_lock, NULL);
pthread_mutex_init(&stratum.sock_lock, NULL);
@ -1416,7 +1455,6 @@ int main(int argc, char *argv[])
}
#endif
num_processors = cuda_num_devices();
if (num_processors == 0)
{
applog(LOG_ERR, "No CUDA devices found! terminating.");

View File

@ -152,7 +152,7 @@
#define PACKAGE_NAME "ccminer"
/* Define to the full name and version of this package. */
#define PACKAGE_STRING "ccminer 2014.03.24"
#define PACKAGE_STRING "ccminer 2014.03.27"
/* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME
@ -161,7 +161,7 @@
#undef PACKAGE_URL
/* Define to the version of this package. */
#define PACKAGE_VERSION "2014.03.24"
#define PACKAGE_VERSION "2014.03.27"
/* If using the C implementation of alloca, define if you know the
direction of stack growth for your system; otherwise it will be

View File

@ -292,13 +292,13 @@ __host__ void blake512_cpu_setBlock(void *pdata)
__host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
{
const int threadsperblock = 128;
const int threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);

View File

@ -138,7 +138,7 @@ void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *h
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);

View File

@ -9,7 +9,10 @@
#define USE_SHARED 1
// heavy.cu
// aus cpu-miner.c
extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Folgende Definitionen später durch header ersetzen
@ -732,7 +735,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
void fugue256_cpu_init(int thr_id, int threads)
{
cudaSetDevice(thr_id);
cudaSetDevice(device_map[thr_id]);
// Kopiere die Hash-Tabellen in den GPU-Speicher
texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
@ -774,7 +777,7 @@ __host__ void fugue256_cpu_hash(int thr_id, int threads, int startNounce, void *
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
// Größe des dynamischen Shared Memory Bereichs
#if USE_SHARED
size_t shared_size = 4 * 256 * sizeof(uint32_t);
#else

View File

@ -813,7 +813,7 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);

View File

@ -9,6 +9,10 @@
#define USE_SHARED 1
// aus cpu-miner.c
extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Folgende Definitionen später durch header ersetzen
@ -20,13 +24,7 @@ typedef unsigned long long uint64_t;
__constant__ uint32_t pTarget[8]; // Single GPU
extern uint32_t *d_resultNonce[8];
// globaler Speicher für unsere Ergebnisse
uint32_t *d_hashGROESTLCOINoutput[8];
__constant__ uint32_t groestlcoin_gpu_state[32];
__constant__ uint32_t groestlcoin_gpu_msg[32];
__constant__ uint32_t sha256coin_gpu_constantTable[64];
__constant__ uint32_t sha256coin_gpu_register[8];
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
@ -83,7 +81,13 @@ extern uint32_t T2dn_cpu[];
extern uint32_t T3up_cpu[];
extern uint32_t T3dn_cpu[];
#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
#if __CUDA_ARCH__ < 350
// Kepler (Compute 3.0)
#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
#else
// Kepler (Compute 3.5)
#define S(x, n) __funnelshift_r( x, x, n );
#endif
#define R(x, n) ((x) >> (n))
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
@ -95,18 +99,57 @@ extern uint32_t T3dn_cpu[];
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
__device__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
__device__ __forceinline__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
{
uint32_t t[32];
//#pragma unroll 14
for(int r=0;r<14;r++)
{
#pragma unroll 16
for(int k=0;k<16;k++)
switch(r)
{
a[(k*2)+0] ^= PC32up(k * 0x10, r);
//a[(k<<1)+1] ^= PC32dn(k * 0x10, r);
case 0:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break;
case 1:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break;
case 2:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break;
case 3:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break;
case 4:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break;
case 5:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break;
case 6:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break;
case 7:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break;
case 8:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break;
case 9:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break;
case 10:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break;
case 11:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break;
case 12:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break;
case 13:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break;
}
// RBTT
@ -137,18 +180,57 @@ __device__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
}
}
__device__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
__device__ __forceinline__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
{
//#pragma unroll 14
for(int r=0;r<14;r++)
{
uint32_t t[32];
#pragma unroll 16
for(int k=0;k<16;k++)
switch(r)
{
a[(k*2)+0] ^= QC32up(k * 0x10, r);
a[(k*2)+1] ^= QC32dn(k * 0x10, r);
case 0:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break;
case 1:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break;
case 2:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break;
case 3:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break;
case 4:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break;
case 5:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break;
case 6:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break;
case 7:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break;
case 8:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break;
case 9:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break;
case 10:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break;
case 11:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break;
case 12:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break;
case 13:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break;
}
// RBTT
@ -179,12 +261,12 @@ __device__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
}
}
#if USE_SHARED
__global__ void __launch_bounds__(256)
__global__ void /* __launch_bounds__(256) */
#else
__global__ void
#endif
groestlcoin_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
groestlcoin_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce)
{
#if USE_SHARED
extern __shared__ char mixtabs[];
@ -204,146 +286,111 @@ __global__ void
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
/////
///// Lieber groestl, mach, dass es abgeht!!!
/////
// GROESTL
uint32_t message[32];
uint32_t state[32];
uint32_t g[32];
#pragma unroll 32
for(int k=0;k<32;k++)
{
// TODO: die Vorbelegung mit Nullen braucht nicht zwingend aus dem
// constant Memory zu lesen. Das ist Verschwendung von Bandbreite.
state[k] = groestlcoin_gpu_state[k];
message[k] = groestlcoin_gpu_msg[k];
}
for(int k=0;k<32;k++) message[k] = groestlcoin_gpu_msg[k];
uint32_t nounce = startNounce + thread;
message[19] = SWAB32(nounce);
#pragma unroll 32
for(int u=0;u<32;u++)
g[u] = message[u] ^ state[u]; // TODO: state ist fast ueberall 0.
for(int u=0;u<32;u++) state[u] = message[u];
state[31] ^= 0x20000;
// Perm
#if USE_SHARED
groestlcoin_perm_P(g, mixtabs); // TODO: g[] entspricht fast genau message[]
groestlcoin_perm_Q(message, mixtabs); // kann man das ausnutzen?
groestlcoin_perm_P(state, mixtabs);
state[31] ^= 0x20000;
groestlcoin_perm_Q(message, mixtabs);
#else
groestlcoin_perm_P(g, NULL);
groestlcoin_perm_P(state, NULL);
state[31] ^= 0x20000;
groestlcoin_perm_Q(message, NULL);
#endif
#pragma unroll 32
for(int u=0;u<32;u++)
{
// TODO: kann man evtl. das xor mit g[u] vorziehen hinter die groestlcoin_perm_P Funktion
// was den Registerbedarf senken koennte?
state[u] ^= g[u] ^ message[u];
g[u] = state[u];
}
for(int u=0;u<32;u++) state[u] ^= message[u];
#pragma unroll 32
for(int u=0;u<32;u++) message[u] = state[u];
#if USE_SHARED
groestlcoin_perm_P(g, mixtabs);
groestlcoin_perm_P(message, mixtabs);
#else
groestlcoin_perm_P(g, NULL);
groestlcoin_perm_P(message, NULL);
#endif
#pragma unroll 32
for(int u=0;u<32;u++)
state[u] ^= g[u];
for(int u=0;u<32;u++) state[u] ^= message[u];
////
//// 2. Runde groestl
////
#pragma unroll 16
for(int k=0;k<16;k++)
message[k] = state[k + 16];
#pragma unroll 32
for(int k=0;k<32;k++)
state[k] = groestlcoin_gpu_state[k];
#pragma unroll 16
for(int k=0;k<16;k++)
for(int k=0;k<16;k++) message[k] = state[k + 16];
#pragma unroll 14
for(int k=1;k<15;k++)
message[k+16] = 0;
message[16] = 0x80;
message[16] = 0x80;
message[31] = 0x01000000;
#pragma unroll 32
for(int u=0;u<32;u++)
g[u] = message[u] ^ state[u];
state[u] = message[u];
state[31] ^= 0x20000;
// Perm
#if USE_SHARED
groestlcoin_perm_P(g, mixtabs);
groestlcoin_perm_P(state, mixtabs);
state[31] ^= 0x20000;
groestlcoin_perm_Q(message, mixtabs);
#else
groestlcoin_perm_P(g, NULL);
groestlcoin_perm_P(state, NULL);
state[31] ^= 0x20000;
groestlcoin_perm_Q(message, NULL);
#endif
#pragma unroll 32
for(int u=0;u<32;u++)
{
state[u] ^= g[u] ^ message[u];
g[u] = state[u];
}
for(int u=0;u<32;u++) state[u] ^= message[u];
#pragma unroll 32
for(int u=0;u<32;u++) message[u] = state[u];
#if USE_SHARED
groestlcoin_perm_P(g, mixtabs);
groestlcoin_perm_P(message, mixtabs);
#else
groestlcoin_perm_P(g, NULL);
groestlcoin_perm_P(message, NULL);
#endif
#pragma unroll 32
for(int u=0;u<32;u++)
state[u] ^= g[u];
/*
#pragma unroll 8
for(int k=0;k<8;k++)
hash[k] = state[k+16];
*/
for(int u=0;u<32;u++) state[u] ^= message[u];
// kopiere Ergebnis
/*
#pragma unroll 16
for(int k=0;k<16;k++)
((uint32_t*)outputHash)[16*thread+k] = state[k + 16];
*/
int i;
int i, position = -1;
bool rc = true;
#pragma unroll 8
for (i = 7; i >= 0; i--) {
if (state[i+16] > pTarget[i]) {
rc = false;
break;
}
if (state[i+16] < pTarget[i]) {
rc = true;
break;
}
if(position < i) {
position = i;
rc = false;
}
}
if (state[i+16] < pTarget[i]) {
if(position < i) {
position = i;
rc = true;
}
}
}
if(rc == true)
{
if(resNounce[0] > nounce)
{
resNounce[0] = nounce;
/*
#pragma unroll 8
for(int k=0;k<8;k++)
((uint32_t*)outputHash)[k] = (hash[k]);
*/
}
}
}
}
@ -360,7 +407,7 @@ __global__ void
// Setup-Funktionen
__host__ void groestlcoin_cpu_init(int thr_id, int threads)
{
cudaSetDevice(thr_id);
cudaSetDevice(device_map[thr_id]);
cudaDeviceSetCacheConfig( cudaFuncCachePreferShared );
// Texturen mit obigem Makro initialisieren
texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
@ -372,23 +419,8 @@ __host__ void groestlcoin_cpu_init(int thr_id, int threads)
texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
// setze register
// TODO: fast vollstaendige Vorbelegung mit Nullen.
// da besteht doch Optimierungspotenzial im GPU Kernel
// denn mit Nullen braucht man nicht wirklich rechnen.
uint32_t groestl_state_init[32];
memset(groestl_state_init, 0, sizeof(uint32_t) * 32);
groestl_state_init[31] = 0x20000;
// state speichern
cudaMemcpyToSymbol( groestlcoin_gpu_state,
groestl_state_init,
128);
// Speicher für Gewinner-Nonce belegen
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
// Speicher für alle Ergebnisse belegen (nur für Debug)
cudaMalloc(&d_hashGROESTLCOINoutput[thr_id], 8 * sizeof(uint32_t) * threads);
}
__host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
@ -430,7 +462,7 @@ __host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
// Größe des dynamischen Shared Memory Bereichs
#if USE_SHARED
size_t shared_size = 8 * 256 * sizeof(uint32_t);
#else
@ -440,16 +472,10 @@ __host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
//fprintf(stderr, "ThrID: %d\n", thr_id);
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
groestlcoin_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hashGROESTLCOINoutput[thr_id], d_resultNonce[thr_id]);
groestlcoin_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
// Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, 0, thr_id);
cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
/// Debug
//cudaMemcpy(outputHashes, d_hashGROESTLCOINoutput[thr_id], 8 * sizeof(uint32_t) * threads, cudaMemcpyDeviceToHost);
// Nounce
//cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
}

View File

@ -2,26 +2,40 @@
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// aus cpu-miner.c
extern int device_map[8];
#include <stdio.h>
#include <memory.h>
#define USE_SHARED 1
// Folgende Definitionen später durch header ersetzen
typedef unsigned int uint32_t;
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props;
// globaler Speicher für alle HeftyHashes aller Threads
uint32_t *d_heftyHashes[8];
/* Hash-Tabellen */
__constant__ uint32_t hefty_gpu_constantTable[64];
#if USE_SHARED
#define heftyLookUp(x) (*((uint32_t*)heftytab + (x)))
#else
#define heftyLookUp(x) hefty_gpu_constantTable[x]
#endif
// muss expandiert werden
__constant__ uint32_t hefty_gpu_blockHeader[16]; // 2x512 Bit Message
__constant__ uint32_t hefty_gpu_register[8];
__constant__ uint32_t hefty_gpu_sponge[4];
uint32_t hefty_cpu_hashTable[] = { 0x6a09e667UL,
uint32_t hefty_cpu_hashTable[] = {
0x6a09e667UL,
0xbb67ae85UL,
0x3c6ef372UL,
0xa54ff53aUL,
@ -29,8 +43,9 @@ uint32_t hefty_cpu_hashTable[] = { 0x6a09e667UL,
0x9b05688cUL,
0x1f83d9abUL,
0x5be0cd19UL };
uint32_t hefty_cpu_constantTable[] = {
0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL,
0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL,
@ -48,350 +63,352 @@ uint32_t hefty_cpu_constantTable[] = {
0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
};
#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
#define R(x, n) ((x) >> (n))
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define S0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22))
#define S1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25))
#define s0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3))
#define s1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10))
//#define S(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
static __host__ __device__ uint32_t S(uint32_t x, int n)
{
return (((x) >> (n)) | ((x) << (32 - (n))));
}
#define R(x, n) ((x) >> (n))
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define S0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22))
#define S1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25))
#define s0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3))
#define s1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10))
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
// uint8_t
#define smoosh4(x) ( ((x)>>4) ^ ((x) & 0x0F) )
#define smoosh4(x) ( ((x)>>4) ^ ((x) & 0x0F) )
__host__ __forceinline__ __device__ uint8_t smoosh2(uint32_t x)
{
uint16_t w = (x >> 16) ^ (x & 0xffff);
uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) );
return (n >> 2) ^ (n & 0x03);
uint16_t w = (x >> 16) ^ (x & 0xffff);
uint8_t n = smoosh4( (uint8_t)( (w >> 8) ^ (w & 0xFF) ) );
return 24 - (((n >> 2) ^ (n & 0x03)) << 3);
}
// 4 auf einmal
#define smoosh4Quad(x) ( (((x)>>4) ^ (x)) & 0x0F0F0F0F )
#define getByte(x,y) ( ((x) >> (y)) & 0xFF )
#define smoosh4Quad(x) ( (((x)>>4) ^ (x)) & 0x0F0F0F0F )
#define getByte(x,y) ( ((x) >> (y)) & 0xFF )
__host__ __device__ void Mangle(uint32_t *inp)
__host__ __forceinline__ __device__ void Mangle(uint32_t *inp)
{
uint32_t r = smoosh4Quad(inp[0]);
//uint8_t r0 = smoosh4( (uint8_t)(inp[0] >> 24) );
//uint8_t r1 = smoosh4( (uint8_t)(inp[0] >> 16) );
//uint8_t r2 = smoosh4( (uint8_t)(inp[0] >> 8) );
//uint8_t r3 = smoosh4( (uint8_t)(inp[0] & 0xFF) );
uint32_t r = smoosh4Quad(inp[0]);
uint32_t inp0org;
uint32_t tmp0Mask, tmp1Mask;
uint32_t in1, in2, isAddition;
uint32_t tmp;
uint8_t b;
inp[1] = inp[1] ^ S(inp[0], getByte(r, 24));
switch (smoosh2(inp[1])) {
case 0: inp[2] ^= S(inp[0], 1 + getByte(r,24)); break;
case 1: inp[2] += S(~inp[0], 1 + getByte(r,16)); break;
case 2: inp[2] &= S(~inp[0], 1 + getByte(r,8)); break;
case 3: inp[2] ^= S(inp[0], 1 + getByte(r,0)); break;
}
inp[1] = inp[1] ^ S(inp[0], getByte(r, 24));
r += 0x01010101;
tmp = smoosh2(inp[1]);
b = getByte(r,tmp);
inp0org = S(inp[0], b);
tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0
tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0
uint32_t tmp = smoosh2(inp[1] ^ inp[2]);
switch (tmp) {
case 0: inp[3] ^= S(inp[0], 2 + getByte(r,24)); break;
case 1: inp[3] += S(~inp[0], 2 + getByte(r,16)); break;
case 2: inp[3] &= S(~inp[0], 2 + getByte(r,8)); break;
case 3: inp[3] ^= S(inp[0], 2 + getByte(r,0)); break;
}
in1 = (inp[2] & ~inp0org) |
(tmp1Mask & ~inp[2] & inp0org) |
(~tmp0Mask & ~inp[2] & inp0org);
in2 = inp[2] += ~inp0org;
isAddition = ~tmp0Mask & tmp1Mask;
inp[2] = isAddition ? in2 : in1;
r += 0x01010101;
tmp = smoosh2(inp[1] ^ inp[2]);
b = getByte(r,tmp);
inp0org = S(inp[0], b);
tmp0Mask = -((tmp >> 3)&1); // Bit 3 an Position 0
tmp1Mask = -((tmp >> 4)&1); // Bit 4 an Position 0
inp[0] ^= (inp[1] ^ inp[2]) + inp[3];
in1 = (inp[3] & ~inp0org) |
(tmp1Mask & ~inp[3] & inp0org) |
(~tmp0Mask & ~inp[3] & inp0org);
in2 = inp[3] += ~inp0org;
isAddition = ~tmp0Mask & tmp1Mask;
inp[3] = isAddition ? in2 : in1;
inp[0] ^= (inp[1] ^ inp[2]) + inp[3];
}
__host__ __forceinline__ __device__ void Absorb(uint32_t *inp, uint32_t x)
{
inp[0] ^= x;
Mangle(inp);
inp[0] ^= x;
Mangle(inp);
}
__host__ __forceinline__ __device__ uint32_t Squeeze(uint32_t *inp)
{
uint32_t y = inp[0];
Mangle(inp);
return y;
uint32_t y = inp[0];
Mangle(inp);
return y;
}
__host__ __forceinline__ __device__ uint32_t Br(uint32_t *sponge, uint32_t x)
{
uint32_t r = Squeeze(sponge);
uint32_t r = Squeeze(sponge);
uint32_t t = ((r >> 8) & 0x1F);
uint32_t y = 1 << t;
//uint8_t r0 = r >> 8;
uint8_t r1 = r & 0xFF;
uint32_t y = 1 << ((r >> 8) & 0x1F);
uint32_t a = (((r>>1) & 0x01) << t) & y;
uint32_t b = ((r & 0x01) << t) & y;
uint32_t c = x & y;
//uint32_t retVal;
//retVal = x;
uint32_t resArr[4];
resArr[0] = x;
resArr[1] = x & ~y;
resArr[2] = x | y;
resArr[3] = x ^ y;
return resArr[r1 & 0x03];
/*
switch(r1 & 0x03)
{
case 0:
break;
case 1:
retVal = x & ~y;
break;
case 2:
retVal = x | y;
break;
case 3:
retVal = x ^ y;
break;
}
return retVal;
*/
uint32_t retVal = (x & ~y) | (~b & c) | (a & ~c);
return retVal;
}
__forceinline__ __device__ void hefty_gpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
{
uint32_t tmpBr;
uint32_t tmpBr;
uint32_t brG = Br(sponge, regs[6]);
uint32_t brF = Br(sponge, regs[5]);
uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
uint32_t brE = Br(sponge, regs[4]);
uint32_t tmp2 = tmp1 + S1(brE);
uint32_t brC = Br(sponge, regs[2]);
uint32_t brB = Br(sponge, regs[1]);
uint32_t brA = Br(sponge, regs[0]);
uint32_t tmp3 = Maj(brA, brB, brC);
tmpBr = Br(sponge, regs[0]);
uint32_t tmp4 = tmp3 + S0(tmpBr);
tmpBr = Br(sponge, tmp2);
uint32_t brG = Br(sponge, regs[6]);
uint32_t brF = Br(sponge, regs[5]);
uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
uint32_t brE = Br(sponge, regs[4]);
uint32_t tmp2 = tmp1 + S1(brE);
uint32_t brC = Br(sponge, regs[2]);
uint32_t brB = Br(sponge, regs[1]);
uint32_t brA = Br(sponge, regs[0]);
uint32_t tmp3 = Maj(brA, brB, brC);
tmpBr = Br(sponge, regs[0]);
uint32_t tmp4 = tmp3 + S0(tmpBr);
tmpBr = Br(sponge, tmp2);
#pragma unroll 7
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
regs[0] = tmp2 + tmp4;
regs[4] += tmpBr;
#pragma unroll 7
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
regs[0] = tmp2 + tmp4;
regs[4] += tmpBr;
}
__host__ void hefty_cpu_round(uint32_t *regs, uint32_t W, uint32_t K, uint32_t *sponge)
{
uint32_t tmpBr;
uint32_t tmpBr;
uint32_t brG = Br(sponge, regs[6]);
uint32_t brF = Br(sponge, regs[5]);
uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
uint32_t brE = Br(sponge, regs[4]);
uint32_t tmp2 = tmp1 + S1(brE);
uint32_t brC = Br(sponge, regs[2]);
uint32_t brB = Br(sponge, regs[1]);
uint32_t brA = Br(sponge, regs[0]);
uint32_t tmp3 = Maj(brA, brB, brC);
tmpBr = Br(sponge, regs[0]);
uint32_t tmp4 = tmp3 + S0(tmpBr);
tmpBr = Br(sponge, tmp2);
uint32_t brG = Br(sponge, regs[6]);
uint32_t brF = Br(sponge, regs[5]);
uint32_t tmp1 = Ch(regs[4], brF, brG) + regs[7] + W + K;
uint32_t brE = Br(sponge, regs[4]);
uint32_t tmp2 = tmp1 + S1(brE);
uint32_t brC = Br(sponge, regs[2]);
uint32_t brB = Br(sponge, regs[1]);
uint32_t brA = Br(sponge, regs[0]);
uint32_t tmp3 = Maj(brA, brB, brC);
tmpBr = Br(sponge, regs[0]);
uint32_t tmp4 = tmp3 + S0(tmpBr);
tmpBr = Br(sponge, tmp2);
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
regs[0] = tmp2 + tmp4;
regs[4] += tmpBr;
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
regs[0] = tmp2 + tmp4;
regs[4] += tmpBr;
}
// Die Hash-Funktion
__global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHash)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
// bestimme den aktuellen Zähler
uint32_t nounce = startNounce + thread;
// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
#if USE_SHARED
extern __shared__ unsigned char s[];
uint32_t *W = (uint32_t *)(&s[W_ALIGNMENT * sizeof(uint32_t) * threadIdx.x]);
#else
// reduktion von 256 byte auf 128 byte
uint32_t W1[16];
uint32_t W2[16];
#if USE_SHARED
extern __shared__ char heftytab[];
if(threadIdx.x < 64)
{
*((uint32_t*)heftytab + threadIdx.x) = hefty_gpu_constantTable[threadIdx.x];
}
__syncthreads();
#endif
// Initialisiere die register a bis h mit der Hash-Tabelle
uint32_t regs[8];
uint32_t hash[8];
uint32_t sponge[4];
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
// bestimme den aktuellen Zähler
uint32_t nounce = startNounce + thread;
// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
// reduktion von 256 byte auf 128 byte
uint32_t W1[16];
uint32_t W2[16];
// Initialisiere die register a bis h mit der Hash-Tabelle
uint32_t regs[8];
uint32_t hash[8];
uint32_t sponge[4];
#pragma unroll 4
for(int k=0; k < 4; k++)
sponge[k] = hefty_gpu_sponge[k];
for(int k=0; k < 4; k++)
sponge[k] = hefty_gpu_sponge[k];
// pre
// pre
#pragma unroll 8
for (int k=0; k < 8; k++)
{
regs[k] = hefty_gpu_register[k];
hash[k] = regs[k];
}
//memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // verbleibende 20 bytes aus Block 2 plus padding
for (int k=0; k < 8; k++)
{
regs[k] = hefty_gpu_register[k];
hash[k] = regs[k];
}
//memcpy(W, &hefty_gpu_blockHeader[0], sizeof(uint32_t) * 16); // verbleibende 20 bytes aus Block 2 plus padding
#pragma unroll 16
for(int k=0;k<16;k++)
W1[k] = hefty_gpu_blockHeader[k];
W1[3] = SWAB32(nounce);
for(int k=0;k<16;k++)
W1[k] = hefty_gpu_blockHeader[k];
W1[3] = SWAB32(nounce);
// 2. Runde
// 2. Runde
#pragma unroll 16
for(int j=0;j<16;j++)
Absorb(sponge, W1[j] ^ hefty_gpu_constantTable[j]);
for(int j=0;j<16;j++)
Absorb(sponge, W1[j] ^ heftyLookUp(j));
// Progress W1 (Bytes 0...63)
#pragma unroll 16
for(int j=0;j<16;j++)
{
Absorb(sponge, regs[3] ^ regs[7]);
hefty_gpu_round(regs, W1[j], hefty_gpu_constantTable[j], sponge);
}
for(int j=0;j<16;j++)
{
Absorb(sponge, regs[3] ^ regs[7]);
hefty_gpu_round(regs, W1[j], heftyLookUp(j), sponge);
}
// Progress W2 (Bytes 64...127) then W3 (Bytes 128...191) ...
#pragma unroll 3
for(int k=0;k<3;k++)
{
#pragma unroll 2
for(int j=0;j<2;j++)
W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
#pragma unroll 5
for(int j=2;j<7;j++)
W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
for(int k=0;k<3;k++)
{
#pragma unroll 2
for(int j=0;j<2;j++)
W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
#pragma unroll 5
for(int j=2;j<7;j++)
W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
#pragma unroll 8
for(int j=7;j<15;j++)
W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
#pragma unroll 8
for(int j=7;j<15;j++)
W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
#pragma unroll 16
for(int j=0;j<16;j++)
{
Absorb(sponge, regs[3] + regs[7]);
hefty_gpu_round(regs, W2[j], hefty_gpu_constantTable[j + 16 * (k+1)], sponge);
}
#pragma unroll 16
for(int j=0;j<16;j++)
W1[j] = W2[j];
}
#pragma unroll 16
for(int j=0;j<16;j++)
{
Absorb(sponge, regs[3] + regs[7]);
hefty_gpu_round(regs, W2[j], heftyLookUp(j + 16 * (k+1)), sponge);
}
#pragma unroll 16
for(int j=0;j<16;j++)
W1[j] = W2[j];
}
#pragma unroll 8
for(int k=0;k<8;k++)
hash[k] += regs[k];
#pragma unroll 8
for(int k=0;k<8;k++)
hash[k] += regs[k];
#pragma unroll 8
for(int k=0;k<8;k++)
((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]);
}
for(int k=0;k<8;k++)
((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]);
}
}
// Setup-Funktionen
__host__ void hefty_cpu_init(int thr_id, int threads)
{
cudaSetDevice(thr_id);
cudaSetDevice(device_map[thr_id]);
// Kopiere die Hash-Tabellen in den GPU-Speicher
cudaMemcpyToSymbol( hefty_gpu_constantTable,
hefty_cpu_constantTable,
sizeof(uint32_t) * 64 );
cudaGetDeviceProperties(&props, device_map[thr_id]);
// Speicher für alle Hefty1 hashes belegen
cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads);
// Kopiere die Hash-Tabellen in den GPU-Speicher
cudaMemcpyToSymbol( hefty_gpu_constantTable,
hefty_cpu_constantTable,
sizeof(uint32_t) * 64 );
// Speicher für alle Hefty1 hashes belegen
cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads);
}
__host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data)
// data muss 84-Byte haben!
// data muss 84-Byte haben!
{
// Nachricht expandieren und setzen
uint32_t msgBlock[32];
// Nachricht expandieren und setzen
uint32_t msgBlock[32];
memset(msgBlock, 0, sizeof(uint32_t) * 32);
memcpy(&msgBlock[0], data, 84);
msgBlock[21] |= 0x80;
msgBlock[31] = 672; // bitlen
for(int i=0;i<31;i++) // Byteorder drehen
msgBlock[i] = SWAB32(msgBlock[i]);
memset(msgBlock, 0, sizeof(uint32_t) * 32);
memcpy(&msgBlock[0], data, 84);
msgBlock[21] |= 0x80;
msgBlock[31] = 672; // bitlen
for(int i=0;i<31;i++) // Byteorder drehen
msgBlock[i] = SWAB32(msgBlock[i]);
// die erste Runde wird auf der CPU durchgeführt, da diese für
// alle Threads gleich ist. Der Hash wird dann an die Threads
// übergeben
// die erste Runde wird auf der CPU durchgeführt, da diese für
// alle Threads gleich ist. Der Hash wird dann an die Threads
// übergeben
// Erstelle expandierten Block W
uint32_t W[64];
memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);
for(int j=16;j<64;j++)
W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
// Erstelle expandierten Block W
uint32_t W[64];
memcpy(W, &msgBlock[0], sizeof(uint32_t) * 16);
for(int j=16;j<64;j++)
W[j] = s1(W[j-2]) + W[j-7] + s0(W[j-15]) + W[j-16];
// Initialisiere die register a bis h mit der Hash-Tabelle
uint32_t regs[8];
uint32_t hash[8];
uint32_t sponge[4];
// Initialisiere die register a bis h mit der Hash-Tabelle
uint32_t regs[8];
uint32_t hash[8];
uint32_t sponge[4];
// pre
memset(sponge, 0, sizeof(uint32_t) * 4);
// pre
memset(sponge, 0, sizeof(uint32_t) * 4);
for (int k=0; k < 8; k++)
{
regs[k] = hefty_cpu_hashTable[k];
hash[k] = regs[k];
}
{
regs[k] = hefty_cpu_hashTable[k];
hash[k] = regs[k];
}
// 1. Runde
for(int j=0;j<16;j++)
Absorb(sponge, W[j] ^ hefty_cpu_constantTable[j]);
// 1. Runde
for(int j=0;j<16;j++)
Absorb(sponge, W[j] ^ hefty_cpu_constantTable[j]);
for(int j=0;j<16;j++)
{
Absorb(sponge, regs[3] ^ regs[7]);
hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge);
}
for(int j=0;j<16;j++)
{
Absorb(sponge, regs[3] ^ regs[7]);
hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge);
}
for(int j=16;j<64;j++)
{
Absorb(sponge, regs[3] + regs[7]);
hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge);
}
for(int j=16;j<64;j++)
{
Absorb(sponge, regs[3] + regs[7]);
hefty_cpu_round(regs, W[j], hefty_cpu_constantTable[j], sponge);
}
for(int k=0;k<8;k++)
hash[k] += regs[k];
for(int k=0;k<8;k++)
hash[k] += regs[k];
// sponge speichern
// sponge speichern
cudaMemcpyToSymbol( hefty_gpu_sponge,
sponge,
sizeof(uint32_t) * 4 );
// hash speichern
cudaMemcpyToSymbol( hefty_gpu_register,
hash,
sizeof(uint32_t) * 8 );
cudaMemcpyToSymbol( hefty_gpu_sponge,
sponge,
sizeof(uint32_t) * 4 );
// hash speichern
cudaMemcpyToSymbol( hefty_gpu_register,
hash,
sizeof(uint32_t) * 8 );
// Blockheader setzen (korrekte Nonce fehlt da drin noch)
cudaMemcpyToSymbol( hefty_gpu_blockHeader,
&msgBlock[16],
64);
// Blockheader setzen (korrekte Nonce fehlt da drin noch)
cudaMemcpyToSymbol( hefty_gpu_blockHeader,
&msgBlock[16],
64);
}
__host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
{
const int threadsperblock = 128;
// Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
// alle anderen mit 512 Threads.
int threadsperblock = (props.major >= 3) ? 768 : 512;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
#if USE_SHARED
size_t shared_size = W_ALIGNMENT*sizeof(uint32_t)*threadsperblock; // ein uint32_t eingefügt gegen Bank Konflikte
// Größe des dynamischen Shared Memory Bereichs
#if USE_SHARED
size_t shared_size = 8 * 64 * sizeof(uint32_t);
#else
size_t shared_size = 0;
size_t shared_size = 0;
#endif
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]);
hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]);
}

View File

@ -264,7 +264,7 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);

View File

@ -5,8 +5,6 @@
#include <stdio.h>
#include <memory.h>
#define W_ALIGNMENT 65
// Folgende Definitionen später durch header ersetzen
typedef unsigned int uint32_t;
@ -59,8 +57,6 @@ __global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void *outputH
nonceVector[thread] = nounce;
// jeder thread in diesem Block bekommt sein eigenes W Array im Shared memory
//extern __shared__ unsigned char s[];
//uint32_t *W = (uint32_t *)(&s[W_ALIGNMENT * sizeof(uint32_t) * threadIdx.x]);
uint32_t W1[16];
uint32_t W2[16];
@ -257,14 +253,13 @@ __host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashe
__host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce)
{
const int threadsperblock = 128;
const int threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs (abhängig von der Threadanzahl)
//size_t shared_size = W_ALIGNMENT*sizeof(uint32_t)*threadsperblock; // ein uint32_t eingefügt gegen Bank Konflikte
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);

View File

@ -163,6 +163,41 @@ extern "C" int cuda_num_devices()
return GPU_N;
}
static bool substringsearch(const char *haystack, const char *needle, int &match)
{
int hlen = strlen(haystack);
int nlen = strlen(needle);
for (int i=0; i < hlen; ++i)
{
if (haystack[i] == ' ') continue;
int j=0, x = 0;
while(j < nlen)
{
if (haystack[i+x] == ' ') {++x; continue;}
if (needle[j] == ' ') {++j; continue;}
if (needle[j] == '#') return ++match == needle[j+1]-'0';
if (tolower(haystack[i+x]) != tolower(needle[j])) break;
++j; ++x;
}
if (j == nlen) return true;
}
return false;
}
// CUDA Gerät nach Namen finden (gibt Geräte-Index zurück oder -1)
extern "C" int cuda_finddevice(char *name)
{
int num = cuda_num_devices();
int match = 0;
for (int i=0; i < num; ++i)
{
cudaDeviceProp props;
if (cudaGetDeviceProperties(&props, i) == cudaSuccess)
if (substringsearch(props.name, name, match)) return i;
}
return -1;
}
// Zeitsynchronisations-Routine von cudaminer mit CPU sleep
typedef struct { double value[8]; } tsumarray;
cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)