1
0
mirror of https://github.com/GOSTSec/ccminer synced 2025-01-09 22:38:05 +00:00

bump to revision v0.8

This commit is contained in:
Christian Buchner 2014-05-03 21:01:50 +02:00
parent 61cbdc62d0
commit 6c8eff98c0
22 changed files with 316 additions and 256 deletions

View File

@ -18,13 +18,13 @@ ccminer_SOURCES = elist.h miner.h compat.h \
compat/sys/time.h compat/getopt/getopt.h \ compat/sys/time.h compat/getopt/getopt.h \
cpu-miner.c util.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c hefty1.c scrypt.c sha2.c \ cpu-miner.c util.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c hefty1.c scrypt.c sha2.c \
sph/sph_blake.h sph/sph_groestl.h sph/sph_jh.h sph/sph_keccak.h sph/sph_skein.h sph/sph_types.h \ sph/sph_blake.h sph/sph_groestl.h sph/sph_jh.h sph/sph_keccak.h sph/sph_skein.h sph/sph_types.h \
heavy.cu \ heavy/heavy.cu \
cuda_blake512.cu cuda_blake512.h \ heavy/cuda_blake512.cu heavy/cuda_blake512.h \
cuda_combine.cu cuda_combine.h \ heavy/cuda_combine.cu heavy/cuda_combine.h \
cuda_groestl512.cu cuda_groestl512.h \ heavy/cuda_groestl512.cu heavy/cuda_groestl512.h \
cuda_hefty1.cu cuda_hefty1.h \ heavy/cuda_hefty1.cu heavy/cuda_hefty1.h \
cuda_keccak512.cu cuda_keccak512.h \ heavy/cuda_keccak512.cu heavy/cuda_keccak512.h \
cuda_sha256.cu cuda_sha256.h \ heavy/cuda_sha256.cu heavy/cuda_sha256.h \
fuguecoin.cpp cuda_fugue256.cu sph/fugue.c sph/sph_fugue.h uint256.h \ fuguecoin.cpp cuda_fugue256.cu sph/fugue.c sph/sph_fugue.h uint256.h \
groestlcoin.cpp cuda_groestlcoin.cu cuda_groestlcoin.h \ groestlcoin.cpp cuda_groestlcoin.cu cuda_groestlcoin.h \
JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \ JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \

View File

@ -12,9 +12,8 @@ If you find this tool useful and like to support its continued
VTC donation address: VrjeFzMgvteCGarLw85KivBzmsiH9fqp4a VTC donation address: VrjeFzMgvteCGarLw85KivBzmsiH9fqp4a
MAX donation address: mHrhQP9EFArechWxTFJ97s9D3jvcCvEEnt MAX donation address: mHrhQP9EFArechWxTFJ97s9D3jvcCvEEnt
DOGE donation address: DT9ghsGmez6ojVdEZgvaZbT2Z3TruXG6yP DOGE donation address: DT9ghsGmez6ojVdEZgvaZbT2Z3TruXG6yP
PANDA donation address: PvgtxJ2ZKaudRogCXfUMLXVaWUMcKQgRed
MRC donation address: 1Lxc4JPDpQRJB8BN4YwhmSQ3Rcu8gjj2Kd
HVC donation address: HNN3PyyTMkDo4RkEjkWSGMwqia1yD8mwJN HVC donation address: HNN3PyyTMkDo4RkEjkWSGMwqia1yD8mwJN
GRS donation address: FmJKJAhvyHWPeEVeLQHefr2naqgWc9ABTM
*************************************************************** ***************************************************************
>>> Introduction <<< >>> Introduction <<<
@ -35,6 +34,7 @@ its command line interface and options.
-a, --algo=ALGO specify the algorithm to use -a, --algo=ALGO specify the algorithm to use
heavy use to mine Heavycoin heavy use to mine Heavycoin
mjollnir use to mine Mjollnircoin
fugue256 use to mine Fuguecoin fugue256 use to mine Fuguecoin
groestl use to mine Groestlcoin groestl use to mine Groestlcoin
myr-gr use to mine Myriad-Groestl myr-gr use to mine Myriad-Groestl
@ -45,11 +45,12 @@ its command line interface and options.
Alternatively give string names of your card like Alternatively give string names of your card like
gtx780ti or gt640#2 (matching 2nd gt640 in the PC). gtx780ti or gt640#2 (matching 2nd gt640 in the PC).
-f, --diff Divide difficulty by this factor (std is 1) \n\
-v, --vote Heavycoin block vote (default: 512)
-o, --url=URL URL of mining server (default: " DEF_RPC_URL ") -o, --url=URL URL of mining server (default: " DEF_RPC_URL ")
-O, --userpass=U:P username:password pair for mining server -O, --userpass=U:P username:password pair for mining server
-u, --user=USERNAME username for mining server -u, --user=USERNAME username for mining server
-p, --pass=PASSWORD password for mining server -p, --pass=PASSWORD password for mining server
-v, --vote Heavycoin block vote (default: 512)
--cert=FILE certificate for mining server using SSL --cert=FILE certificate for mining server using SSL
-x, --proxy=[PROTOCOL://]HOST[:PORT] connect through a proxy -x, --proxy=[PROTOCOL://]HOST[:PORT] connect through a proxy
-t, --threads=N number of miner threads (default: number of nVidia GPUs in your system) -t, --threads=N number of miner threads (default: number of nVidia GPUs in your system)
@ -116,6 +117,13 @@ from your old clunkers.
>>> RELEASE HISTORY <<< >>> RELEASE HISTORY <<<
May 3rd 2014 add the MjollnirCoin hash algorithm for the upcomin
MjollnirCoin relaunch.
Add the -f (--diff) option to adjust the difficulty
e.g. for the erebor Dwarfpool myr-gr SaffronCoin pool.
Use -f 256 there.
May 1st 2014 adapt the Jackpot algorithms to changes made by the May 1st 2014 adapt the Jackpot algorithms to changes made by the
coin developers. We keep our unique nVidia advantage coin developers. We keep our unique nVidia advantage
because we have a way to break up the divergence. because we have a way to break up the divergence.

View File

@ -264,14 +264,15 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
<ClInclude Include="compat\sys\time.h" /> <ClInclude Include="compat\sys\time.h" />
<ClInclude Include="compat\unistd.h" /> <ClInclude Include="compat\unistd.h" />
<ClInclude Include="cpuminer-config.h" /> <ClInclude Include="cpuminer-config.h" />
<ClInclude Include="cuda_blake512.h" />
<ClInclude Include="cuda_combine.h" />
<ClInclude Include="cuda_groestl512.h" />
<ClInclude Include="cuda_groestlcoin.h" /> <ClInclude Include="cuda_groestlcoin.h" />
<ClInclude Include="cuda_hefty1.h" /> <ClInclude Include="cuda_helper.h" />
<ClInclude Include="cuda_keccak512.h" />
<ClInclude Include="cuda_sha256.h" />
<ClInclude Include="elist.h" /> <ClInclude Include="elist.h" />
<ClInclude Include="heavy\cuda_blake512.h" />
<ClInclude Include="heavy\cuda_combine.h" />
<ClInclude Include="heavy\cuda_groestl512.h" />
<ClInclude Include="heavy\cuda_hefty1.h" />
<ClInclude Include="heavy\cuda_keccak512.h" />
<ClInclude Include="heavy\cuda_sha256.h" />
<ClInclude Include="hefty1.h" /> <ClInclude Include="hefty1.h" />
<ClInclude Include="miner.h" /> <ClInclude Include="miner.h" />
<ClInclude Include="sph\sph_blake.h" /> <ClInclude Include="sph\sph_blake.h" />
@ -290,16 +291,16 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
<ClInclude Include="uint256.h" /> <ClInclude Include="uint256.h" />
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<CudaCompile Include="cuda_blake512.cu" />
<CudaCompile Include="cuda_combine.cu" />
<CudaCompile Include="cuda_fugue256.cu" /> <CudaCompile Include="cuda_fugue256.cu" />
<CudaCompile Include="cuda_groestl512.cu" />
<CudaCompile Include="cuda_groestlcoin.cu" /> <CudaCompile Include="cuda_groestlcoin.cu" />
<CudaCompile Include="cuda_hefty1.cu" />
<CudaCompile Include="cuda_keccak512.cu" />
<CudaCompile Include="cuda_myriadgroestl.cu" /> <CudaCompile Include="cuda_myriadgroestl.cu" />
<CudaCompile Include="cuda_sha256.cu" /> <CudaCompile Include="heavy\cuda_blake512.cu" />
<CudaCompile Include="heavy.cu" /> <CudaCompile Include="heavy\cuda_combine.cu" />
<CudaCompile Include="heavy\cuda_groestl512.cu" />
<CudaCompile Include="heavy\cuda_hefty1.cu" />
<CudaCompile Include="heavy\cuda_keccak512.cu" />
<CudaCompile Include="heavy\cuda_sha256.cu" />
<CudaCompile Include="heavy\heavy.cu" />
<CudaCompile Include="JHA\cuda_jha_compactionTest.cu" /> <CudaCompile Include="JHA\cuda_jha_compactionTest.cu" />
<CudaCompile Include="JHA\cuda_jha_keccak512.cu" /> <CudaCompile Include="JHA\cuda_jha_keccak512.cu" />
<CudaCompile Include="JHA\jackpotcoin.cu" /> <CudaCompile Include="JHA\jackpotcoin.cu" />

View File

@ -46,6 +46,12 @@
<Filter Include="Header Files\sph"> <Filter Include="Header Files\sph">
<UniqueIdentifier>{7c2a98c6-064c-4a69-b803-d6f6ff5edd0b}</UniqueIdentifier> <UniqueIdentifier>{7c2a98c6-064c-4a69-b803-d6f6ff5edd0b}</UniqueIdentifier>
</Filter> </Filter>
<Filter Include="Source Files\CUDA\heavy">
<UniqueIdentifier>{c3222908-22ba-4586-a637-6363f455b06d}</UniqueIdentifier>
</Filter>
<Filter Include="Header Files\CUDA\heavy">
<UniqueIdentifier>{3281db48-f394-49ea-a1ef-6ebd09828d50}</UniqueIdentifier>
</Filter>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<ClCompile Include="compat\jansson\dump.c"> <ClCompile Include="compat\jansson\dump.c">
@ -167,24 +173,6 @@
<ClInclude Include="hefty1.h"> <ClInclude Include="hefty1.h">
<Filter>Header Files</Filter> <Filter>Header Files</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="cuda_sha256.h">
<Filter>Header Files\CUDA</Filter>
</ClInclude>
<ClInclude Include="cuda_hefty1.h">
<Filter>Header Files\CUDA</Filter>
</ClInclude>
<ClInclude Include="cuda_keccak512.h">
<Filter>Header Files\CUDA</Filter>
</ClInclude>
<ClInclude Include="cuda_combine.h">
<Filter>Header Files\CUDA</Filter>
</ClInclude>
<ClInclude Include="cuda_blake512.h">
<Filter>Header Files\CUDA</Filter>
</ClInclude>
<ClInclude Include="cuda_groestl512.h">
<Filter>Header Files\CUDA</Filter>
</ClInclude>
<ClInclude Include="uint256.h"> <ClInclude Include="uint256.h">
<Filter>Header Files</Filter> <Filter>Header Files</Filter>
</ClInclude> </ClInclude>
@ -230,29 +218,29 @@
<ClInclude Include="sph\sph_types.h"> <ClInclude Include="sph\sph_types.h">
<Filter>Header Files\sph</Filter> <Filter>Header Files\sph</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="heavy\cuda_blake512.h">
<Filter>Header Files\CUDA\heavy</Filter>
</ClInclude>
<ClInclude Include="heavy\cuda_combine.h">
<Filter>Header Files\CUDA\heavy</Filter>
</ClInclude>
<ClInclude Include="heavy\cuda_groestl512.h">
<Filter>Header Files\CUDA\heavy</Filter>
</ClInclude>
<ClInclude Include="heavy\cuda_hefty1.h">
<Filter>Header Files\CUDA\heavy</Filter>
</ClInclude>
<ClInclude Include="heavy\cuda_keccak512.h">
<Filter>Header Files\CUDA\heavy</Filter>
</ClInclude>
<ClInclude Include="heavy\cuda_sha256.h">
<Filter>Header Files\CUDA\heavy</Filter>
</ClInclude>
<ClInclude Include="cuda_helper.h">
<Filter>Header Files\CUDA</Filter>
</ClInclude>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<CudaCompile Include="cuda_sha256.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="cuda_blake512.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="cuda_groestl512.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="cuda_hefty1.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="cuda_keccak512.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="cuda_combine.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="heavy.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="cuda_fugue256.cu"> <CudaCompile Include="cuda_fugue256.cu">
<Filter>Source Files\CUDA</Filter> <Filter>Source Files\CUDA</Filter>
</CudaCompile> </CudaCompile>
@ -286,5 +274,26 @@
<CudaCompile Include="quark\cuda_skein512.cu"> <CudaCompile Include="quark\cuda_skein512.cu">
<Filter>Source Files\CUDA\quark</Filter> <Filter>Source Files\CUDA\quark</Filter>
</CudaCompile> </CudaCompile>
<CudaCompile Include="heavy\cuda_blake512.cu">
<Filter>Source Files\CUDA\heavy</Filter>
</CudaCompile>
<CudaCompile Include="heavy\cuda_combine.cu">
<Filter>Source Files\CUDA\heavy</Filter>
</CudaCompile>
<CudaCompile Include="heavy\cuda_groestl512.cu">
<Filter>Source Files\CUDA\heavy</Filter>
</CudaCompile>
<CudaCompile Include="heavy\cuda_hefty1.cu">
<Filter>Source Files\CUDA\heavy</Filter>
</CudaCompile>
<CudaCompile Include="heavy\cuda_keccak512.cu">
<Filter>Source Files\CUDA\heavy</Filter>
</CudaCompile>
<CudaCompile Include="heavy\cuda_sha256.cu">
<Filter>Source Files\CUDA\heavy</Filter>
</CudaCompile>
<CudaCompile Include="heavy\heavy.cu">
<Filter>Source Files\CUDA\heavy</Filter>
</CudaCompile>
</ItemGroup> </ItemGroup>
</Project> </Project>

View File

@ -1,4 +1,4 @@
AC_INIT([ccminer], [2014.05.01]) AC_INIT([ccminer], [2014.05.03])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM

View File

@ -47,6 +47,7 @@
#define PROGRAM_NAME "minerd" #define PROGRAM_NAME "minerd"
#define LP_SCANTIME 60 #define LP_SCANTIME 60
#define HEAVYCOIN_BLKHDR_SZ 84 #define HEAVYCOIN_BLKHDR_SZ 84
#define MNR_BLKHDR_SZ 80
// from heavy.cu // from heavy.cu
#ifdef __cplusplus #ifdef __cplusplus
@ -121,6 +122,7 @@ struct workio_cmd {
typedef enum { typedef enum {
ALGO_HEAVY, /* Heavycoin hash */ ALGO_HEAVY, /* Heavycoin hash */
ALGO_MJOLLNIR, /* Mjollnir hash */
ALGO_FUGUE256, /* Fugue256 */ ALGO_FUGUE256, /* Fugue256 */
ALGO_GROESTL, ALGO_GROESTL,
ALGO_MYR_GR, ALGO_MYR_GR,
@ -129,6 +131,7 @@ typedef enum {
static const char *algo_names[] = { static const char *algo_names[] = {
"heavy", "heavy",
"mjollnir",
"fugue256", "fugue256",
"groestl", "groestl",
"myr-gr", "myr-gr",
@ -154,6 +157,7 @@ static json_t *opt_config;
static const bool opt_time = true; static const bool opt_time = true;
static sha256_algos opt_algo = ALGO_HEAVY; static sha256_algos opt_algo = ALGO_HEAVY;
static int opt_n_threads = 0; static int opt_n_threads = 0;
static double opt_difficulty = 1; // CH
bool opt_trust_pool = false; bool opt_trust_pool = false;
uint16_t opt_vote = 9999; uint16_t opt_vote = 9999;
static int num_processors; static int num_processors;
@ -195,6 +199,7 @@ Options:\n\
-a, --algo=ALGO specify the algorithm to use\n\ -a, --algo=ALGO specify the algorithm to use\n\
fugue256 Fuguecoin hash\n\ fugue256 Fuguecoin hash\n\
heavy Heavycoin hash\n\ heavy Heavycoin hash\n\
mjollnir Mjollnircoin hash\n\
groestl Groestlcoin hash\n\ groestl Groestlcoin hash\n\
myr-gr Myriad-Groestl hash\n\ myr-gr Myriad-Groestl hash\n\
jackpot Jackpot hash\n\ jackpot Jackpot hash\n\
@ -244,7 +249,7 @@ static char const short_options[] =
#ifdef HAVE_SYSLOG_H #ifdef HAVE_SYSLOG_H
"S" "S"
#endif #endif
"a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vd:mv:"; "a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vd:f:mv:";
static struct option const options[] = { static struct option const options[] = {
{ "algo", 1, NULL, 'a' }, { "algo", 1, NULL, 'a' },
@ -277,6 +282,7 @@ static struct option const options[] = {
{ "userpass", 1, NULL, 'O' }, { "userpass", 1, NULL, 'O' },
{ "version", 0, NULL, 'V' }, { "version", 0, NULL, 'V' },
{ "devices", 1, NULL, 'd' }, { "devices", 1, NULL, 'd' },
{ "diff", 1, NULL, 'f' },
{ 0, 0, 0, 0 } { 0, 0, 0, 0 }
}; };
@ -684,7 +690,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
memcpy(work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size); memcpy(work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size);
/* Generate merkle root */ /* Generate merkle root */
if (opt_algo == ALGO_HEAVY) if (opt_algo == ALGO_HEAVY || opt_algo == ALGO_MJOLLNIR)
heavycoin_hash(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); heavycoin_hash(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size);
else else
if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL) if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL)
@ -694,7 +700,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
for (i = 0; i < sctx->job.merkle_count; i++) { for (i = 0; i < sctx->job.merkle_count; i++) {
memcpy(merkle_root + 32, sctx->job.merkle[i], 32); memcpy(merkle_root + 32, sctx->job.merkle[i], 32);
if (opt_algo == ALGO_HEAVY) if (opt_algo == ALGO_HEAVY || opt_algo == ALGO_MJOLLNIR)
heavycoin_hash(merkle_root, merkle_root, 64); heavycoin_hash(merkle_root, merkle_root, 64);
else else
sha256d(merkle_root, merkle_root, 64); sha256d(merkle_root, merkle_root, 64);
@ -738,11 +744,11 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
} }
if (opt_algo == ALGO_JACKPOT) if (opt_algo == ALGO_JACKPOT)
diff_to_target(work->target, sctx->job.diff / 65536.0); diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty));
else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL) else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL)
diff_to_target(work->target, sctx->job.diff / 256.0); diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty));
else else
diff_to_target(work->target, sctx->job.diff); diff_to_target(work->target, sctx->job.diff / opt_difficulty);
} }
static void *miner_thread(void *userdata) static void *miner_thread(void *userdata)
@ -836,7 +842,12 @@ static void *miner_thread(void *userdata)
case ALGO_HEAVY: case ALGO_HEAVY:
rc = scanhash_heavy(thr_id, work.data, work.target, rc = scanhash_heavy(thr_id, work.data, work.target,
max_nonce, &hashes_done, work.maxvote); max_nonce, &hashes_done, work.maxvote, HEAVYCOIN_BLKHDR_SZ);
break;
case ALGO_MJOLLNIR:
rc = scanhash_heavy(thr_id, work.data, work.target,
max_nonce, &hashes_done, 0, MNR_BLKHDR_SZ);
break; break;
case ALGO_FUGUE256: case ALGO_FUGUE256:
@ -1112,6 +1123,7 @@ static void parse_arg (int key, char *arg)
{ {
char *p; char *p;
int v, i; int v, i;
double d;
switch(key) { switch(key) {
case 'a': case 'a':
@ -1309,6 +1321,12 @@ static void parse_arg (int key, char *arg)
} }
} }
break; break;
case 'f': // CH - Divisor for Difficulty
d = atof(arg);
if (d == 0) /* sanity check */
show_usage_and_exit(1);
opt_difficulty = d;
break;
case 'V': case 'V':
show_version_and_exit(); show_version_and_exit();
case 'h': case 'h':
@ -1404,7 +1422,7 @@ static void signal_handler(int sig)
} }
#endif #endif
#define PROGRAM_VERSION "0.7" #define PROGRAM_VERSION "0.8"
int main(int argc, char *argv[]) int main(int argc, char *argv[])
{ {
struct thr_info *thr; struct thr_info *thr;

View File

@ -152,7 +152,7 @@
#define PACKAGE_NAME "ccminer" #define PACKAGE_NAME "ccminer"
/* Define to the full name and version of this package. */ /* Define to the full name and version of this package. */
#define PACKAGE_STRING "ccminer 2014.05.01" #define PACKAGE_STRING "ccminer 2014.05.03"
/* Define to the one symbol short name of this package. */ /* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME #undef PACKAGE_TARNAME
@ -161,7 +161,7 @@
#undef PACKAGE_URL #undef PACKAGE_URL
/* Define to the version of this package. */ /* Define to the version of this package. */
#define PACKAGE_VERSION "2014.05.01" #define PACKAGE_VERSION "2014.05.03"
/* If using the C implementation of alloca, define if you know the /* If using the C implementation of alloca, define if you know the
direction of stack growth for your system; otherwise it will be direction of stack growth for your system; otherwise it will be

View File

@ -16,9 +16,6 @@ extern int device_map[8];
// aus heavy.cu // aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// aus driver.c
extern "C" void set_device(int device);
// Folgende Definitionen später durch header ersetzen // Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t; typedef unsigned char uint8_t;
typedef unsigned int uint32_t; typedef unsigned int uint32_t;

View File

@ -17,8 +17,8 @@ extern uint32_t *d_nonceVector[8];
// globaler Speicher für unsere Ergebnisse // globaler Speicher für unsere Ergebnisse
uint32_t *d_hash5output[8]; uint32_t *d_hash5output[8];
// die Message (116 Bytes) mit Padding zur Berechnung auf der GPU // die Message (112 bzw. 116 Bytes) mit Padding zur Berechnung auf der GPU
__constant__ uint64_t c_PaddedMessage[16]; // padded message (84+32 bytes + padding) __constant__ uint64_t c_PaddedMessage[16]; // padded message (80/84+32 bytes + padding)
// ---------------------------- BEGIN CUDA blake512 functions ------------------------------------ // ---------------------------- BEGIN CUDA blake512 functions ------------------------------------
@ -44,10 +44,12 @@ const uint8_t host_sigma[16][16] =
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
}; };
// Diese Makros besser nur für Compile Time Konstanten verwenden. Sie sind langsam.
#define SWAP32(x) \ #define SWAP32(x) \
((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \
(((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
// Diese Makros besser nur für Compile Time Konstanten verwenden. Sie sind langsam.
#define SWAP64(x) \ #define SWAP64(x) \
((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \ ((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \
(((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \ (((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \
@ -58,11 +60,11 @@ const uint8_t host_sigma[16][16] =
(((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \ (((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \
(((uint64_t)(x) & 0x00000000000000ffULL) << 56))) (((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
__constant__ uint64_t c_SecondRound[16]; __constant__ uint64_t c_SecondRound[15];
const uint64_t host_SecondRound[16] = const uint64_t host_SecondRound[15] =
{ {
0,0,0,0,0,0,0,0,0,0,0,0,0,SWAP64(1),0,SWAP64(0x3A0) 0,0,0,0,0,0,0,0,0,0,0,0,0,SWAP64(1),0
}; };
__constant__ uint64_t c_u512[16]; __constant__ uint64_t c_u512[16];
@ -80,24 +82,22 @@ const uint64_t host_u512[16] =
}; };
#define ROTR(x,n) (((x)<<(64-n))|( (x)>>(n)))
#define G(a,b,c,d,e) \ #define G(a,b,c,d,e) \
v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\ v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\
v[d] = ROTR( v[d] ^ v[a],32); \ v[d] = ROTR64( v[d] ^ v[a],32); \
v[c] += v[d]; \ v[c] += v[d]; \
v[b] = ROTR( v[b] ^ v[c],25); \ v[b] = ROTR64( v[b] ^ v[c],25); \
v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b]; \ v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b]; \
v[d] = ROTR( v[d] ^ v[a],16); \ v[d] = ROTR64( v[d] ^ v[a],16); \
v[c] += v[d]; \ v[c] += v[d]; \
v[b] = ROTR( v[b] ^ v[c],11); v[b] = ROTR64( v[b] ^ v[c],11);
__device__ void blake512_compress( uint64_t *h, const uint64_t *block, int nullt, const uint8_t ((*sigma)[16]), const uint64_t *u512 ) template <int BLOCKSIZE> __device__ void blake512_compress( uint64_t *h, const uint64_t *block, int nullt, const uint8_t ((*sigma)[16]), const uint64_t *u512 )
{ {
uint64_t v[16], m[16], i; uint64_t v[16], m[16], i;
#pragma unroll 16 #pragma unroll 16
for( i = 0; i < 16; ++i ) m[i] = SWAP64(block[i]); for( i = 0; i < 16; ++i ) m[i] = cuda_swab64(block[i]);
#pragma unroll 8 #pragma unroll 8
for( i = 0; i < 8; ++i ) v[i] = h[i]; for( i = 0; i < 8; ++i ) v[i] = h[i];
@ -113,11 +113,11 @@ __device__ void blake512_compress( uint64_t *h, const uint64_t *block, int nullt
/* don't xor t when the block is only padding */ /* don't xor t when the block is only padding */
if ( !nullt ) { if ( !nullt ) {
v[12] ^= 928; v[12] ^= 8*(BLOCKSIZE+32);
v[13] ^= 928; v[13] ^= 8*(BLOCKSIZE+32);
} }
#pragma unroll 16 //#pragma unroll 16
for( i = 0; i < 16; ++i ) for( i = 0; i < 16; ++i )
{ {
/* column step */ /* column step */
@ -136,49 +136,9 @@ __device__ void blake512_compress( uint64_t *h, const uint64_t *block, int nullt
for( i = 0; i < 16; ++i ) h[i % 8] ^= v[i]; for( i = 0; i < 16; ++i ) h[i % 8] ^= v[i];
} }
// Endian Drehung für 32 Bit Typen #include "cuda_helper.h"
static __device__ uint32_t cuda_swab32(uint32_t x)
{
return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u)
| ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu));
}
// Endian Drehung für 64 Bit Typen template <int BLOCKSIZE> __global__ void blake512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector)
static __device__ uint64_t cuda_swab64(uint64_t x) {
uint32_t h = (x >> 32);
uint32_t l = (x & 0xFFFFFFFFULL);
return (((uint64_t)cuda_swab32(l)) << 32) | ((uint64_t)cuda_swab32(h));
}
// das Hi Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t HIWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2hiint(__longlong_as_double(x));
#else
return (uint32_t)(x >> 32);
#endif
}
// das Hi Word in einem 64 Bit Typen ersetzen
static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) {
return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL);
}
// das Lo Word aus einem 64 Bit Typen extrahieren
static __device__ uint32_t LOWORD(const uint64_t &x) {
#if __CUDA_ARCH__ >= 130
return (uint32_t)__double2loint(__longlong_as_double(x));
#else
return (uint32_t)(x & 0xFFFFFFFFULL);
#endif
}
// das Lo Word in einem 64 Bit Typen ersetzen
static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) {
return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
}
__global__ void blake512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
@ -211,8 +171,10 @@ __global__ void blake512_gpu_hash(int threads, uint32_t startNounce, void *outpu
// die Nounce durch die thread-spezifische ersetzen // die Nounce durch die thread-spezifische ersetzen
buf[9] = REPLACE_HIWORD(buf[9], nounce); buf[9] = REPLACE_HIWORD(buf[9], nounce);
// den thread-spezifischen Hefty1 hash einsetzen
uint32_t *hefty = heftyHashes + 8 * hashPosition; uint32_t *hefty = heftyHashes + 8 * hashPosition;
if (BLOCKSIZE == 84) {
// den thread-spezifischen Hefty1 hash einsetzen
// aufwändig, weil das nicht mit uint64_t Wörtern aligned ist.
buf[10] = REPLACE_HIWORD(buf[10], hefty[0]); buf[10] = REPLACE_HIWORD(buf[10], hefty[0]);
buf[11] = REPLACE_LOWORD(buf[11], hefty[1]); buf[11] = REPLACE_LOWORD(buf[11], hefty[1]);
buf[11] = REPLACE_HIWORD(buf[11], hefty[2]); buf[11] = REPLACE_HIWORD(buf[11], hefty[2]);
@ -221,30 +183,28 @@ __global__ void blake512_gpu_hash(int threads, uint32_t startNounce, void *outpu
buf[13] = REPLACE_LOWORD(buf[13], hefty[5]); buf[13] = REPLACE_LOWORD(buf[13], hefty[5]);
buf[13] = REPLACE_HIWORD(buf[13], hefty[6]); buf[13] = REPLACE_HIWORD(buf[13], hefty[6]);
buf[14] = REPLACE_LOWORD(buf[14], hefty[7]); buf[14] = REPLACE_LOWORD(buf[14], hefty[7]);
}
else if (BLOCKSIZE == 80) {
buf[10] = MAKE_ULONGLONG(hefty[0], hefty[1]);
buf[11] = MAKE_ULONGLONG(hefty[2], hefty[3]);
buf[12] = MAKE_ULONGLONG(hefty[4], hefty[5]);
buf[13] = MAKE_ULONGLONG(hefty[6], hefty[7]);
}
// erste Runde // erste Runde
blake512_compress( h, buf, 0, c_sigma, c_u512 ); blake512_compress<BLOCKSIZE>( h, buf, 0, c_sigma, c_u512 );
// zweite Runde // zweite Runde
#pragma unroll 16 #pragma unroll 15
for (int i=0; i < 16; ++i) buf[i] = c_SecondRound[i]; for (int i=0; i < 15; ++i) buf[i] = c_SecondRound[i];
blake512_compress( h, buf, 1, c_sigma, c_u512 ); buf[15] = SWAP64(8*(BLOCKSIZE+32)); // Blocksize in Bits einsetzen
blake512_compress<BLOCKSIZE>( h, buf, 1, c_sigma, c_u512 );
// Hash rauslassen // Hash rauslassen
#if 0
// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind
uint32_t *outHash = (uint32_t *)outputHash + 16 * hashPosition;
#pragma unroll 8
for (int i=0; i < 8; ++i) {
outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) );
outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) );
}
#else
// in dieser Version passieren auch ein paar 64 Bit Shifts
uint64_t *outHash = (uint64_t *)outputHash + 8 * hashPosition; uint64_t *outHash = (uint64_t *)outputHash + 8 * hashPosition;
#pragma unroll 8 #pragma unroll 8
for (int i=0; i < 8; ++i) outHash[i] = cuda_swab64( h[i] ); for (int i=0; i < 8; ++i) outHash[i] = cuda_swab64( h[i] );
#endif
} }
} }
@ -274,22 +234,30 @@ __host__ void blake512_cpu_init(int thr_id, int threads)
cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads); cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads);
} }
__host__ void blake512_cpu_setBlock(void *pdata) static int BLOCKSIZE = 84;
__host__ void blake512_cpu_setBlock(void *pdata, int len)
// data muss 84-Byte haben! // data muss 84-Byte haben!
// heftyHash hat 32-Byte // heftyHash hat 32-Byte
{ {
// Message mit Padding für erste Runde bereitstellen
unsigned char PaddedMessage[128]; unsigned char PaddedMessage[128];
if (len == 84) {
// Message mit Padding für erste Runde bereitstellen
memcpy(PaddedMessage, pdata, 84); memcpy(PaddedMessage, pdata, 84);
memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen
memset(PaddedMessage+116, 0, 12); memset(PaddedMessage+116, 0, 12);
PaddedMessage[116] = 0x80; PaddedMessage[116] = 0x80;
} else if (len == 80) {
memcpy(PaddedMessage, pdata, 80);
memset(PaddedMessage+80, 0, 32); // leeres Hefty Hash einfüllen
memset(PaddedMessage+112, 0, 16);
PaddedMessage[112] = 0x80;
}
// die Message (116 Bytes) ohne Padding zur Berechnung auf der GPU // die Message (116 Bytes) ohne Padding zur Berechnung auf der GPU
cudaMemcpyToSymbol( c_PaddedMessage, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol( c_PaddedMessage, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
BLOCKSIZE = len;
} }
__host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce) __host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
{ {
const int threadsperblock = 256; const int threadsperblock = 256;
@ -303,5 +271,8 @@ __host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
blake512_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); if (BLOCKSIZE == 80)
blake512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
else if (BLOCKSIZE == 84)
blake512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
} }

View File

@ -2,7 +2,6 @@
#define _CUDA_BLAKE512_H #define _CUDA_BLAKE512_H
void blake512_cpu_init(int thr_id, int threads); void blake512_cpu_init(int thr_id, int threads);
void blake512_cpu_setBlock(void *pdata); void blake512_cpu_setBlock(void *pdata, int len);
void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce); void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
#endif #endif

View File

@ -676,7 +676,7 @@ __device__ void groestl512_perm_Q(uint32_t *a)
} }
} }
__global__ void groestl512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) template <int BLOCKSIZE> __global__ void groestl512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
@ -706,7 +706,7 @@ __global__ void groestl512_gpu_hash(int threads, uint32_t startNounce, void *out
uint32_t *heftyHash = &heftyHashes[8 * hashPosition]; uint32_t *heftyHash = &heftyHashes[8 * hashPosition];
#pragma unroll 8 #pragma unroll 8
for (int k=0; k<8; ++k) for (int k=0; k<8; ++k)
message[21+k] = heftyHash[k]; message[BLOCKSIZE/4+k] = heftyHash[k];
uint32_t g[32]; uint32_t g[32];
#pragma unroll 32 #pragma unroll 32
@ -764,21 +764,27 @@ __host__ void groestl512_cpu_init(int thr_id, int threads)
cudaMalloc(&d_hash4output[thr_id], 16 * sizeof(uint32_t) * threads); cudaMalloc(&d_hash4output[thr_id], 16 * sizeof(uint32_t) * threads);
} }
__host__ void groestl512_cpu_setBlock(void *data) static int BLOCKSIZE = 84;
// data muss 84-Byte haben!
__host__ void groestl512_cpu_setBlock(void *data, int len)
// data muss 80/84-Byte haben!
// heftyHash hat 32-Byte // heftyHash hat 32-Byte
{ {
// Nachricht expandieren und setzen // Nachricht expandieren und setzen
uint32_t msgBlock[32]; uint32_t msgBlock[32];
memset(msgBlock, 0, sizeof(uint32_t) * 32); memset(msgBlock, 0, sizeof(uint32_t) * 32);
memcpy(&msgBlock[0], data, 84); memcpy(&msgBlock[0], data, len);
// Erweitere die Nachricht auf den Nachrichtenblock (padding) // Erweitere die Nachricht auf den Nachrichtenblock (padding)
// Unsere Nachricht hat 116 Byte // Unsere Nachricht hat 112/116 Byte
if (len == 84) {
msgBlock[29] = 0x80; msgBlock[29] = 0x80;
msgBlock[31] = 0x01000000; msgBlock[31] = 0x01000000;
} else if (len == 80) {
msgBlock[28] = 0x80;
msgBlock[31] = 0x01000000;
}
// groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird
// auf der GPU ausgeführt) // auf der GPU ausgeführt)
@ -796,6 +802,8 @@ __host__ void groestl512_cpu_setBlock(void *data)
cudaMemcpyToSymbol( groestl_gpu_msg, cudaMemcpyToSymbol( groestl_gpu_msg,
msgBlock, msgBlock,
128); 128);
BLOCKSIZE = len;
} }
__host__ void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy) __host__ void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
@ -818,5 +826,8 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
groestl512_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); if (BLOCKSIZE == 84)
groestl512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
else if (BLOCKSIZE == 80)
groestl512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
} }

View File

@ -3,7 +3,7 @@
void groestl512_cpu_init(int thr_id, int threads); void groestl512_cpu_init(int thr_id, int threads);
void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
void groestl512_cpu_setBlock(void *data); void groestl512_cpu_setBlock(void *data, int len);
void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce); void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce);
#endif #endif

View File

@ -2,21 +2,24 @@
#include "cuda_runtime.h" #include "cuda_runtime.h"
#include "device_launch_parameters.h" #include "device_launch_parameters.h"
// aus cpu-miner.c
extern int device_map[8];
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
#define USE_SHARED 1 #define USE_SHARED 1
// aus cpu-miner.c
extern int device_map[8];
// aus heavy.cu
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// Folgende Definitionen später durch header ersetzen // Folgende Definitionen später durch header ersetzen
typedef unsigned int uint32_t; typedef unsigned int uint32_t;
typedef unsigned char uint8_t; typedef unsigned char uint8_t;
typedef unsigned short uint16_t; typedef unsigned short uint16_t;
// diese Struktur wird in der Init Funktion angefordert // diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props; static cudaDeviceProp props[8];
// globaler Speicher für alle HeftyHashes aller Threads // globaler Speicher für alle HeftyHashes aller Threads
uint32_t *d_heftyHashes[8]; uint32_t *d_heftyHashes[8];
@ -286,7 +289,7 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
for(int j=0;j<16;j++) for(int j=0;j<16;j++)
{ {
Absorb(sponge, regs[3] + regs[7]); Absorb(sponge, regs[3] + regs[7]);
hefty_gpu_round(regs, W2[j], heftyLookUp(j + 16 * (k+1)), sponge); hefty_gpu_round(regs, W2[j], heftyLookUp(j + ((k+1)<<4)), sponge);
} }
#pragma unroll 16 #pragma unroll 16
for(int j=0;j<16;j++) for(int j=0;j<16;j++)
@ -299,7 +302,7 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa
#pragma unroll 8 #pragma unroll 8
for(int k=0;k<8;k++) for(int k=0;k<8;k++)
((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]); ((uint32_t*)outputHash)[(thread<<3)+k] = SWAB32(hash[k]);
} }
} }
@ -308,7 +311,7 @@ __host__ void hefty_cpu_init(int thr_id, int threads)
{ {
cudaSetDevice(device_map[thr_id]); cudaSetDevice(device_map[thr_id]);
cudaGetDeviceProperties(&props, device_map[thr_id]); cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
// Kopiere die Hash-Tabellen in den GPU-Speicher // Kopiere die Hash-Tabellen in den GPU-Speicher
cudaMemcpyToSymbol( hefty_gpu_constantTable, cudaMemcpyToSymbol( hefty_gpu_constantTable,
@ -319,16 +322,21 @@ __host__ void hefty_cpu_init(int thr_id, int threads)
cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads); cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads);
} }
__host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data) __host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len)
// data muss 84-Byte haben! // data muss 80/84-Byte haben!
{ {
// Nachricht expandieren und setzen // Nachricht expandieren und setzen
uint32_t msgBlock[32]; uint32_t msgBlock[32];
memset(msgBlock, 0, sizeof(uint32_t) * 32); memset(msgBlock, 0, sizeof(uint32_t) * 32);
memcpy(&msgBlock[0], data, 84); memcpy(&msgBlock[0], data, len);
if (len == 84) {
msgBlock[21] |= 0x80; msgBlock[21] |= 0x80;
msgBlock[31] = 672; // bitlen msgBlock[31] = 672; // bitlen
} else if (len == 80) {
msgBlock[20] |= 0x80;
msgBlock[31] = 640; // bitlen
}
for(int i=0;i<31;i++) // Byteorder drehen for(int i=0;i<31;i++) // Byteorder drehen
msgBlock[i] = SWAB32(msgBlock[i]); msgBlock[i] = SWAB32(msgBlock[i]);
@ -395,7 +403,7 @@ __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
{ {
// Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern, // Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
// alle anderen mit 512 Threads. // alle anderen mit 512 Threads.
int threadsperblock = (props.major >= 3) ? 768 : 512; int threadsperblock = (props[thr_id].major >= 3) ? 768 : 512;
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid((threads + threadsperblock-1)/threadsperblock);
@ -411,4 +419,7 @@ __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]); hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]);
// Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, 0, thr_id);
} }

View File

@ -2,7 +2,7 @@
#define _CUDA_HEFTY1_H #define _CUDA_HEFTY1_H
void hefty_cpu_hash(int thr_id, int threads, int startNounce); void hefty_cpu_hash(int thr_id, int threads, int startNounce);
void hefty_cpu_setBlock(int thr_id, int threads, void *data); void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len);
void hefty_cpu_init(int thr_id, int threads); void hefty_cpu_init(int thr_id, int threads);
#endif #endif

View File

@ -16,6 +16,8 @@ extern uint32_t *d_nonceVector[8];
// globaler Speicher für unsere Ergebnisse // globaler Speicher für unsere Ergebnisse
uint32_t *d_hash3output[8]; uint32_t *d_hash3output[8];
extern uint32_t *d_hash4output[8];
extern uint32_t *d_hash5output[8];
// der Keccak512 State nach der ersten Runde (72 Bytes) // der Keccak512 State nach der ersten Runde (72 Bytes)
__constant__ uint64_t c_State[25]; __constant__ uint64_t c_State[25];
@ -25,7 +27,7 @@ __constant__ uint32_t c_PaddedMessage2[18]; // 44 bytes of remaining message (No
// ---------------------------- BEGIN CUDA keccak512 functions ------------------------------------ // ---------------------------- BEGIN CUDA keccak512 functions ------------------------------------
#define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - b))) #include "cuda_helper.h"
#define U32TO64_LE(p) \ #define U32TO64_LE(p) \
(((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))
@ -145,7 +147,7 @@ keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_const
} }
// Die Hash-Funktion // Die Hash-Funktion
__global__ void keccak512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) template <int BLOCKSIZE> __global__ void keccak512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
@ -171,7 +173,7 @@ __global__ void keccak512_gpu_hash(int threads, uint32_t startNounce, void *outp
msgBlock[1] = nounce; msgBlock[1] = nounce;
// den individuellen Hefty1 Hash einsetzen // den individuellen Hefty1 Hash einsetzen
mycpy32(&msgBlock[3], &heftyHashes[8 * hashPosition]); mycpy32(&msgBlock[(BLOCKSIZE-72)/sizeof(uint32_t)], &heftyHashes[8 * hashPosition]);
// den Block einmal gut durchschütteln // den Block einmal gut durchschütteln
keccak_block(keccak_gpu_state, msgBlock, c_keccak_round_constants); keccak_block(keccak_gpu_state, msgBlock, c_keccak_round_constants);
@ -184,7 +186,6 @@ __global__ void keccak512_gpu_hash(int threads, uint32_t startNounce, void *outp
U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]); U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]);
} }
// und ins Global Memory rausschreiben // und ins Global Memory rausschreiben
#pragma unroll 16 #pragma unroll 16
for(int k=0;k<16;k++) for(int k=0;k<16;k++)
@ -217,38 +218,49 @@ __host__ void keccak512_cpu_init(int thr_id, int threads)
// --------------- END keccak512 CPU version from scrypt-jane code -------------------- // --------------- END keccak512 CPU version from scrypt-jane code --------------------
__host__ void keccak512_cpu_setBlock(void *data) static int BLOCKSIZE = 84;
// data muss 84-Byte haben!
__host__ void keccak512_cpu_setBlock(void *data, int len)
// data muss 80 oder 84-Byte haben!
// heftyHash hat 32-Byte // heftyHash hat 32-Byte
{ {
// CH // CH
// state init // state init
uint64_t keccak_cpu_state[25]; uint64_t keccak_cpu_state[25];
memset(keccak_cpu_state, 0, 200); memset(keccak_cpu_state, 0, sizeof(keccak_cpu_state));
// erste Runde
keccak_block((uint64_t*)&keccak_cpu_state, (const uint32_t*)data, host_keccak_round_constants);
// state kopieren
cudaMemcpyToSymbol( c_State, keccak_cpu_state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
// keccak hat 72-Byte blöcke, d.h. in unserem Fall zwei Blöcke // keccak hat 72-Byte blöcke, d.h. in unserem Fall zwei Blöcke
// zu jeweils // zu jeweils
uint32_t msgBlock[18]; uint32_t msgBlock[18];
memset(msgBlock, 0, 18 * sizeof(uint32_t)); memset(msgBlock, 0, 18 * sizeof(uint32_t));
// kopiere die Daten rein (aber nur alles nach Bit 72) // kopiere die restlichen Daten rein (aber nur alles nach Byte 72)
if (len == 84)
memcpy(&msgBlock[0], &((uint8_t*)data)[72], 12); memcpy(&msgBlock[0], &((uint8_t*)data)[72], 12);
else if (len == 80)
memcpy(&msgBlock[0], &((uint8_t*)data)[72], 8);
// Nachricht abschließen // Nachricht abschließen
if (len == 84)
msgBlock[11] = 0x01; msgBlock[11] = 0x01;
else if (len == 80)
msgBlock[10] = 0x01;
msgBlock[17] = 0x80000000; msgBlock[17] = 0x80000000;
// erste Runde
keccak_block((uint64_t*)&keccak_cpu_state, (const uint32_t*)data, host_keccak_round_constants);
// Message 2 ins Constant Memory kopieren (die variable Nonce und // Message 2 ins Constant Memory kopieren (die variable Nonce und
// der Hefty1 Anteil muss aber auf der GPU erst noch ersetzt werden) // der Hefty1 Anteil muss aber auf der GPU erst noch ersetzt werden)
cudaMemcpyToSymbol( c_PaddedMessage2, msgBlock, 18*sizeof(uint32_t), 0, cudaMemcpyHostToDevice ); cudaMemcpyToSymbol( c_PaddedMessage2, msgBlock, 18*sizeof(uint32_t), 0, cudaMemcpyHostToDevice );
// state kopieren BLOCKSIZE = len;
cudaMemcpyToSymbol( c_State, keccak_cpu_state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
} }
__host__ void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy) __host__ void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
{ {
// Hefty1 Hashes kopieren // Hefty1 Hashes kopieren
@ -268,6 +280,8 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
if (BLOCKSIZE==84)
keccak512_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); keccak512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
else if (BLOCKSIZE==80)
keccak512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
} }

View File

@ -2,7 +2,7 @@
#define _CUDA_KECCAK512_H #define _CUDA_KECCAK512_H
void keccak512_cpu_init(int thr_id, int threads); void keccak512_cpu_init(int thr_id, int threads);
void keccak512_cpu_setBlock(void *data); void keccak512_cpu_setBlock(void *data, int len);
void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce); void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce);

View File

@ -47,7 +47,7 @@ uint32_t sha256_cpu_constantTable[] = {
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) #define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
// Die Hash-Funktion // Die Hash-Funktion
__global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) template <int BLOCKSIZE> __global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector)
{ {
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
@ -82,11 +82,10 @@ __global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void *outputH
uint32_t offset = 8 * (blockDim.x * blockIdx.x + threadIdx.x); uint32_t offset = 8 * (blockDim.x * blockIdx.x + threadIdx.x);
#pragma unroll 8 #pragma unroll 8
for(int k=0;k<8;k++) for(int k=0;k<8;k++)
W1[5+k] = heftyHashes[offset + k]; W1[((BLOCKSIZE-64)/4)+k] = heftyHashes[offset + k];
#pragma unroll 8 #pragma unroll 8
for (int i=5; i <5+8; ++i) W1[i] = SWAB32(W1[i]); // die Hefty1 Hashes brauchen eine Drehung ;) for (int i=((BLOCKSIZE-64)/4); i < ((BLOCKSIZE-64)/4)+8; ++i) W1[i] = SWAB32(W1[i]); // die Hefty1 Hashes brauchen eine Drehung ;)
W1[3] = SWAB32(nounce); W1[3] = SWAB32(nounce);
// Progress W1 // Progress W1
@ -178,18 +177,26 @@ __host__ void sha256_cpu_init(int thr_id, int threads)
cudaMalloc(&d_hash2output[thr_id], 8 * sizeof(uint32_t) * threads); cudaMalloc(&d_hash2output[thr_id], 8 * sizeof(uint32_t) * threads);
} }
__host__ void sha256_cpu_setBlock(void *data) static int BLOCKSIZE = 84;
// data muss 84-Byte haben!
__host__ void sha256_cpu_setBlock(void *data, int len)
// data muss 80/84-Byte haben!
// heftyHash hat 32-Byte // heftyHash hat 32-Byte
{ {
// Nachricht expandieren und setzen // Nachricht expandieren und setzen
uint32_t msgBlock[32]; uint32_t msgBlock[32];
memset(msgBlock, 0, sizeof(uint32_t) * 32); memset(msgBlock, 0, sizeof(uint32_t) * 32);
memcpy(&msgBlock[0], data, 84); memcpy(&msgBlock[0], data, len);
if (len == 84) {
memset(&msgBlock[21], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen memset(&msgBlock[21], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen
msgBlock[29] |= 0x80; msgBlock[29] |= 0x80;
msgBlock[31] = 928; // bitlen msgBlock[31] = 928; // bitlen
} else if (len == 80) {
memset(&msgBlock[20], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen
msgBlock[28] |= 0x80;
msgBlock[31] = 896; // bitlen
}
for(int i=0;i<31;i++) // Byteorder drehen for(int i=0;i<31;i++) // Byteorder drehen
msgBlock[i] = SWAB32(msgBlock[i]); msgBlock[i] = SWAB32(msgBlock[i]);
@ -242,6 +249,8 @@ __host__ void sha256_cpu_setBlock(void *data)
cudaMemcpyToSymbol( sha256_gpu_blockHeader, cudaMemcpyToSymbol( sha256_gpu_blockHeader,
&msgBlock[16], &msgBlock[16],
64); 64);
BLOCKSIZE = len;
} }
__host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy) __host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy)
@ -263,6 +272,9 @@ __host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce)
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
if (BLOCKSIZE == 84)
sha256_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); sha256_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
else if (BLOCKSIZE == 80) {
sha256_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
}
} }

View File

@ -2,7 +2,7 @@
#define _CUDA_SHA256_H #define _CUDA_SHA256_H
void sha256_cpu_init(int thr_id, int threads); void sha256_cpu_init(int thr_id, int threads);
void sha256_cpu_setBlock(void *data); void sha256_cpu_setBlock(void *data, int len);
void sha256_cpu_hash(int thr_id, int threads, int startNounce); void sha256_cpu_hash(int thr_id, int threads, int startNounce);
void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy);
#endif #endif

View File

@ -22,12 +22,12 @@
#include "sph/sph_blake.h" #include "sph/sph_blake.h"
#include "sph/sph_groestl.h" #include "sph/sph_groestl.h"
#include "cuda_hefty1.h" #include "heavy/cuda_hefty1.h"
#include "cuda_sha256.h" #include "heavy/cuda_sha256.h"
#include "cuda_keccak512.h" #include "heavy/cuda_keccak512.h"
#include "cuda_groestl512.h" #include "heavy/cuda_groestl512.h"
#include "cuda_blake512.h" #include "heavy/cuda_blake512.h"
#include "cuda_combine.h" #include "heavy/cuda_combine.h"
extern uint32_t *d_hash2output[8]; extern uint32_t *d_hash2output[8];
extern uint32_t *d_hash3output[8]; extern uint32_t *d_hash3output[8];
@ -35,6 +35,7 @@ extern uint32_t *d_hash4output[8];
extern uint32_t *d_hash5output[8]; extern uint32_t *d_hash5output[8];
#define HEAVYCOIN_BLKHDR_SZ 84 #define HEAVYCOIN_BLKHDR_SZ 84
#define MNR_BLKHDR_SZ 80
// nonce-array für die threads // nonce-array für die threads
uint32_t *d_nonceVector[8]; uint32_t *d_nonceVector[8];
@ -230,24 +231,29 @@ cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)
int scanhash_heavy_cpp(int thr_id, uint32_t *pdata, int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done, uint32_t maxvote); unsigned long *hashes_done, uint32_t maxvote, int blocklen);
extern "C" extern "C"
int scanhash_heavy(int thr_id, uint32_t *pdata, int scanhash_heavy(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done, uint32_t maxvote) unsigned long *hashes_done, uint32_t maxvote, int blocklen)
{ {
return scanhash_heavy_cpp(thr_id, pdata, return scanhash_heavy_cpp(thr_id, pdata,
ptarget, max_nonce, hashes_done, maxvote); ptarget, max_nonce, hashes_done, maxvote, blocklen);
} }
extern bool opt_benchmark;
int scanhash_heavy_cpp(int thr_id, uint32_t *pdata, int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done, uint32_t maxvote) unsigned long *hashes_done, uint32_t maxvote, int blocklen)
{ {
// CUDA will process thousands of threads. // CUDA will process thousands of threads.
const int throughput = 4096 * 128; const int throughput = 4096 * 128;
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x000000ff;
int rc = 0; int rc = 0;
uint32_t *hash = NULL; uint32_t *hash = NULL;
cudaMallocHost(&hash, throughput*8*sizeof(uint32_t)); cudaMallocHost(&hash, throughput*8*sizeof(uint32_t));
@ -258,7 +264,6 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
memset(nrmCalls, 0, sizeof(int) * 6); memset(nrmCalls, 0, sizeof(int) * 6);
uint32_t start_nonce = pdata[19]; uint32_t start_nonce = pdata[19];
uint16_t *ext = (uint16_t *)&pdata[20];
// für jeden Hash ein individuelles Target erstellen basierend // für jeden Hash ein individuelles Target erstellen basierend
// auf dem höchsten Bit, das in ptarget gesetzt ist. // auf dem höchsten Bit, das in ptarget gesetzt ist.
@ -282,6 +287,9 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
cudaMalloc(&d_nonceVector[thr_id], sizeof(uint32_t) * throughput); cudaMalloc(&d_nonceVector[thr_id], sizeof(uint32_t) * throughput);
} }
if (blocklen == HEAVYCOIN_BLKHDR_SZ)
{
uint16_t *ext = (uint16_t *)&pdata[20];
if (opt_vote > maxvote) { if (opt_vote > maxvote) {
printf("Warning: Your block reward vote (%hu) exceeds " printf("Warning: Your block reward vote (%hu) exceeds "
@ -295,13 +303,14 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
} }
else else
ext[0] = opt_vote; ext[0] = opt_vote;
}
// Setze die Blockdaten // Setze die Blockdaten
hefty_cpu_setBlock(thr_id, throughput, pdata); hefty_cpu_setBlock(thr_id, throughput, pdata, blocklen);
sha256_cpu_setBlock(pdata); sha256_cpu_setBlock(pdata, blocklen);
keccak512_cpu_setBlock(pdata); keccak512_cpu_setBlock(pdata, blocklen);
groestl512_cpu_setBlock(pdata); groestl512_cpu_setBlock(pdata, blocklen);
blake512_cpu_setBlock(pdata); blake512_cpu_setBlock(pdata, blocklen);
do { do {
int i; int i;
@ -370,7 +379,7 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata,
if (fulltest(foundhash, ptarget)) { if (fulltest(foundhash, ptarget)) {
uint32_t verification[8]; uint32_t verification[8];
pdata[19] += nonce - pdata[19]; pdata[19] += nonce - pdata[19];
heavycoin_hash((unsigned char *)verification, (const unsigned char *)pdata, HEAVYCOIN_BLKHDR_SZ); heavycoin_hash((unsigned char *)verification, (const unsigned char *)pdata, blocklen);
if (memcmp(verification, foundhash, 8*sizeof(uint32_t))) { if (memcmp(verification, foundhash, 8*sizeof(uint32_t))) {
applog(LOG_ERR, "hash for nonce=$%08X does not validate on CPU!\n", nonce); applog(LOG_ERR, "hash for nonce=$%08X does not validate on CPU!\n", nonce);
} }

View File

@ -205,7 +205,7 @@ extern int scanhash_scrypt(int thr_id, uint32_t *pdata,
extern int scanhash_heavy(int thr_id, uint32_t *pdata, extern int scanhash_heavy(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done, uint32_t maxvote); unsigned long *hashes_done, uint32_t maxvote, int blocklen);
extern int scanhash_fugue256(int thr_id, uint32_t *pdata, extern int scanhash_fugue256(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce, const uint32_t *ptarget, uint32_t max_nonce,