From 6c8eff98c07bdf8f9e337b1f8fffe2951b0ba93b Mon Sep 17 00:00:00 2001 From: Christian Buchner Date: Sat, 3 May 2014 21:01:50 +0200 Subject: [PATCH] bump to revision v0.8 --- Makefile.am | 14 +- README.txt | 14 +- ccminer.vcxproj | 27 +-- ccminer.vcxproj.filters | 87 +++++----- configure.ac | 2 +- cpu-miner.c | 34 +++- cpuminer-config.h | 4 +- cuda_groestlcoin.cu | 3 - cuda_blake512.cu => heavy/cuda_blake512.cu | 157 +++++++----------- cuda_blake512.h => heavy/cuda_blake512.h | 3 +- cuda_combine.cu => heavy/cuda_combine.cu | 0 cuda_combine.h => heavy/cuda_combine.h | 0 .../cuda_groestl512.cu | 31 ++-- cuda_groestl512.h => heavy/cuda_groestl512.h | 2 +- cuda_hefty1.cu => heavy/cuda_hefty1.cu | 37 +++-- cuda_hefty1.h => heavy/cuda_hefty1.h | 2 +- cuda_keccak512.cu => heavy/cuda_keccak512.cu | 48 ++++-- cuda_keccak512.h => heavy/cuda_keccak512.h | 2 +- cuda_sha256.cu => heavy/cuda_sha256.cu | 38 +++-- cuda_sha256.h => heavy/cuda_sha256.h | 2 +- heavy.cu => heavy/heavy.cu | 63 ++++--- miner.h | 2 +- 22 files changed, 316 insertions(+), 256 deletions(-) rename cuda_blake512.cu => heavy/cuda_blake512.cu (63%) rename cuda_blake512.h => heavy/cuda_blake512.h (78%) rename cuda_combine.cu => heavy/cuda_combine.cu (100%) rename cuda_combine.h => heavy/cuda_combine.h (100%) rename cuda_groestl512.cu => heavy/cuda_groestl512.cu (98%) rename cuda_groestl512.h => heavy/cuda_groestl512.h (82%) rename cuda_hefty1.cu => heavy/cuda_hefty1.cu (93%) rename cuda_hefty1.h => heavy/cuda_hefty1.h (67%) rename cuda_keccak512.cu => heavy/cuda_keccak512.cu (88%) rename cuda_keccak512.h => heavy/cuda_keccak512.h (84%) rename cuda_sha256.cu => heavy/cuda_sha256.cu (86%) rename cuda_sha256.h => heavy/cuda_sha256.h (84%) rename heavy.cu => heavy/heavy.cu (90%) diff --git a/Makefile.am b/Makefile.am index 6543670..390e525 100644 --- a/Makefile.am +++ b/Makefile.am @@ -18,13 +18,13 @@ ccminer_SOURCES = elist.h miner.h compat.h \ compat/sys/time.h compat/getopt/getopt.h \ cpu-miner.c util.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c hefty1.c scrypt.c sha2.c \ sph/sph_blake.h sph/sph_groestl.h sph/sph_jh.h sph/sph_keccak.h sph/sph_skein.h sph/sph_types.h \ - heavy.cu \ - cuda_blake512.cu cuda_blake512.h \ - cuda_combine.cu cuda_combine.h \ - cuda_groestl512.cu cuda_groestl512.h \ - cuda_hefty1.cu cuda_hefty1.h \ - cuda_keccak512.cu cuda_keccak512.h \ - cuda_sha256.cu cuda_sha256.h \ + heavy/heavy.cu \ + heavy/cuda_blake512.cu heavy/cuda_blake512.h \ + heavy/cuda_combine.cu heavy/cuda_combine.h \ + heavy/cuda_groestl512.cu heavy/cuda_groestl512.h \ + heavy/cuda_hefty1.cu heavy/cuda_hefty1.h \ + heavy/cuda_keccak512.cu heavy/cuda_keccak512.h \ + heavy/cuda_sha256.cu heavy/cuda_sha256.h \ fuguecoin.cpp cuda_fugue256.cu sph/fugue.c sph/sph_fugue.h uint256.h \ groestlcoin.cpp cuda_groestlcoin.cu cuda_groestlcoin.h \ JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \ diff --git a/README.txt b/README.txt index fd05121..666101b 100644 --- a/README.txt +++ b/README.txt @@ -12,9 +12,8 @@ If you find this tool useful and like to support its continued VTC donation address: VrjeFzMgvteCGarLw85KivBzmsiH9fqp4a MAX donation address: mHrhQP9EFArechWxTFJ97s9D3jvcCvEEnt DOGE donation address: DT9ghsGmez6ojVdEZgvaZbT2Z3TruXG6yP - PANDA donation address: PvgtxJ2ZKaudRogCXfUMLXVaWUMcKQgRed - MRC donation address: 1Lxc4JPDpQRJB8BN4YwhmSQ3Rcu8gjj2Kd HVC donation address: HNN3PyyTMkDo4RkEjkWSGMwqia1yD8mwJN + GRS donation address: FmJKJAhvyHWPeEVeLQHefr2naqgWc9ABTM *************************************************************** >>> Introduction <<< @@ -35,6 +34,7 @@ its command line interface and options. -a, --algo=ALGO specify the algorithm to use heavy use to mine Heavycoin + mjollnir use to mine Mjollnircoin fugue256 use to mine Fuguecoin groestl use to mine Groestlcoin myr-gr use to mine Myriad-Groestl @@ -45,11 +45,12 @@ its command line interface and options. Alternatively give string names of your card like gtx780ti or gt640#2 (matching 2nd gt640 in the PC). + -f, --diff Divide difficulty by this factor (std is 1) \n\ + -v, --vote Heavycoin block vote (default: 512) -o, --url=URL URL of mining server (default: " DEF_RPC_URL ") -O, --userpass=U:P username:password pair for mining server -u, --user=USERNAME username for mining server -p, --pass=PASSWORD password for mining server - -v, --vote Heavycoin block vote (default: 512) --cert=FILE certificate for mining server using SSL -x, --proxy=[PROTOCOL://]HOST[:PORT] connect through a proxy -t, --threads=N number of miner threads (default: number of nVidia GPUs in your system) @@ -116,6 +117,13 @@ from your old clunkers. >>> RELEASE HISTORY <<< + May 3rd 2014 add the MjollnirCoin hash algorithm for the upcomin + MjollnirCoin relaunch. + + Add the -f (--diff) option to adjust the difficulty + e.g. for the erebor Dwarfpool myr-gr SaffronCoin pool. + Use -f 256 there. + May 1st 2014 adapt the Jackpot algorithms to changes made by the coin developers. We keep our unique nVidia advantage because we have a way to break up the divergence. diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 8c58c9d..32094af 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -264,14 +264,15 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" - - - - - - + + + + + + + @@ -290,16 +291,16 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)" - - - - - - - + + + + + + + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index f9ca3e1..c331f60 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -46,6 +46,12 @@ {7c2a98c6-064c-4a69-b803-d6f6ff5edd0b} + + {c3222908-22ba-4586-a637-6363f455b06d} + + + {3281db48-f394-49ea-a1ef-6ebd09828d50} + @@ -167,24 +173,6 @@ Header Files - - Header Files\CUDA - - - Header Files\CUDA - - - Header Files\CUDA - - - Header Files\CUDA - - - Header Files\CUDA - - - Header Files\CUDA - Header Files @@ -230,29 +218,29 @@ Header Files\sph + + Header Files\CUDA\heavy + + + Header Files\CUDA\heavy + + + Header Files\CUDA\heavy + + + Header Files\CUDA\heavy + + + Header Files\CUDA\heavy + + + Header Files\CUDA\heavy + + + Header Files\CUDA + - - Source Files\CUDA - - - Source Files\CUDA - - - Source Files\CUDA - - - Source Files\CUDA - - - Source Files\CUDA - - - Source Files\CUDA - - - Source Files\CUDA - Source Files\CUDA @@ -286,5 +274,26 @@ Source Files\CUDA\quark + + Source Files\CUDA\heavy + + + Source Files\CUDA\heavy + + + Source Files\CUDA\heavy + + + Source Files\CUDA\heavy + + + Source Files\CUDA\heavy + + + Source Files\CUDA\heavy + + + Source Files\CUDA\heavy + \ No newline at end of file diff --git a/configure.ac b/configure.ac index 3b4e438..9fa301d 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2014.05.01]) +AC_INIT([ccminer], [2014.05.03]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 784aa1b..933b443 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -47,6 +47,7 @@ #define PROGRAM_NAME "minerd" #define LP_SCANTIME 60 #define HEAVYCOIN_BLKHDR_SZ 84 +#define MNR_BLKHDR_SZ 80 // from heavy.cu #ifdef __cplusplus @@ -121,6 +122,7 @@ struct workio_cmd { typedef enum { ALGO_HEAVY, /* Heavycoin hash */ + ALGO_MJOLLNIR, /* Mjollnir hash */ ALGO_FUGUE256, /* Fugue256 */ ALGO_GROESTL, ALGO_MYR_GR, @@ -129,6 +131,7 @@ typedef enum { static const char *algo_names[] = { "heavy", + "mjollnir", "fugue256", "groestl", "myr-gr", @@ -154,6 +157,7 @@ static json_t *opt_config; static const bool opt_time = true; static sha256_algos opt_algo = ALGO_HEAVY; static int opt_n_threads = 0; +static double opt_difficulty = 1; // CH bool opt_trust_pool = false; uint16_t opt_vote = 9999; static int num_processors; @@ -195,6 +199,7 @@ Options:\n\ -a, --algo=ALGO specify the algorithm to use\n\ fugue256 Fuguecoin hash\n\ heavy Heavycoin hash\n\ + mjollnir Mjollnircoin hash\n\ groestl Groestlcoin hash\n\ myr-gr Myriad-Groestl hash\n\ jackpot Jackpot hash\n\ @@ -244,7 +249,7 @@ static char const short_options[] = #ifdef HAVE_SYSLOG_H "S" #endif - "a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vd:mv:"; + "a:c:Dhp:Px:qr:R:s:t:T:o:u:O:Vd:f:mv:"; static struct option const options[] = { { "algo", 1, NULL, 'a' }, @@ -277,6 +282,7 @@ static struct option const options[] = { { "userpass", 1, NULL, 'O' }, { "version", 0, NULL, 'V' }, { "devices", 1, NULL, 'd' }, + { "diff", 1, NULL, 'f' }, { 0, 0, 0, 0 } }; @@ -684,7 +690,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) memcpy(work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size); /* Generate merkle root */ - if (opt_algo == ALGO_HEAVY) + if (opt_algo == ALGO_HEAVY || opt_algo == ALGO_MJOLLNIR) heavycoin_hash(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL) @@ -694,7 +700,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) for (i = 0; i < sctx->job.merkle_count; i++) { memcpy(merkle_root + 32, sctx->job.merkle[i], 32); - if (opt_algo == ALGO_HEAVY) + if (opt_algo == ALGO_HEAVY || opt_algo == ALGO_MJOLLNIR) heavycoin_hash(merkle_root, merkle_root, 64); else sha256d(merkle_root, merkle_root, 64); @@ -738,11 +744,11 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) } if (opt_algo == ALGO_JACKPOT) - diff_to_target(work->target, sctx->job.diff / 65536.0); + diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty)); else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL) - diff_to_target(work->target, sctx->job.diff / 256.0); + diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty)); else - diff_to_target(work->target, sctx->job.diff); + diff_to_target(work->target, sctx->job.diff / opt_difficulty); } static void *miner_thread(void *userdata) @@ -836,7 +842,12 @@ static void *miner_thread(void *userdata) case ALGO_HEAVY: rc = scanhash_heavy(thr_id, work.data, work.target, - max_nonce, &hashes_done, work.maxvote); + max_nonce, &hashes_done, work.maxvote, HEAVYCOIN_BLKHDR_SZ); + break; + + case ALGO_MJOLLNIR: + rc = scanhash_heavy(thr_id, work.data, work.target, + max_nonce, &hashes_done, 0, MNR_BLKHDR_SZ); break; case ALGO_FUGUE256: @@ -1112,6 +1123,7 @@ static void parse_arg (int key, char *arg) { char *p; int v, i; + double d; switch(key) { case 'a': @@ -1309,6 +1321,12 @@ static void parse_arg (int key, char *arg) } } break; + case 'f': // CH - Divisor for Difficulty + d = atof(arg); + if (d == 0) /* sanity check */ + show_usage_and_exit(1); + opt_difficulty = d; + break; case 'V': show_version_and_exit(); case 'h': @@ -1404,7 +1422,7 @@ static void signal_handler(int sig) } #endif -#define PROGRAM_VERSION "0.7" +#define PROGRAM_VERSION "0.8" int main(int argc, char *argv[]) { struct thr_info *thr; diff --git a/cpuminer-config.h b/cpuminer-config.h index 61cbc3a..31f94fc 100644 --- a/cpuminer-config.h +++ b/cpuminer-config.h @@ -152,7 +152,7 @@ #define PACKAGE_NAME "ccminer" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "ccminer 2014.05.01" +#define PACKAGE_STRING "ccminer 2014.05.03" /* Define to the one symbol short name of this package. */ #undef PACKAGE_TARNAME @@ -161,7 +161,7 @@ #undef PACKAGE_URL /* Define to the version of this package. */ -#define PACKAGE_VERSION "2014.05.01" +#define PACKAGE_VERSION "2014.05.03" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be diff --git a/cuda_groestlcoin.cu b/cuda_groestlcoin.cu index 258f11b..e153e5c 100644 --- a/cuda_groestlcoin.cu +++ b/cuda_groestlcoin.cu @@ -16,9 +16,6 @@ extern int device_map[8]; // aus heavy.cu extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); -// aus driver.c -extern "C" void set_device(int device); - // Folgende Definitionen später durch header ersetzen typedef unsigned char uint8_t; typedef unsigned int uint32_t; diff --git a/cuda_blake512.cu b/heavy/cuda_blake512.cu similarity index 63% rename from cuda_blake512.cu rename to heavy/cuda_blake512.cu index 013b7e1..dd1c737 100644 --- a/cuda_blake512.cu +++ b/heavy/cuda_blake512.cu @@ -17,8 +17,8 @@ extern uint32_t *d_nonceVector[8]; // globaler Speicher für unsere Ergebnisse uint32_t *d_hash5output[8]; -// die Message (116 Bytes) mit Padding zur Berechnung auf der GPU -__constant__ uint64_t c_PaddedMessage[16]; // padded message (84+32 bytes + padding) +// die Message (112 bzw. 116 Bytes) mit Padding zur Berechnung auf der GPU +__constant__ uint64_t c_PaddedMessage[16]; // padded message (80/84+32 bytes + padding) // ---------------------------- BEGIN CUDA blake512 functions ------------------------------------ @@ -44,10 +44,12 @@ const uint8_t host_sigma[16][16] = { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } }; +// Diese Makros besser nur für Compile Time Konstanten verwenden. Sie sind langsam. #define SWAP32(x) \ ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) +// Diese Makros besser nur für Compile Time Konstanten verwenden. Sie sind langsam. #define SWAP64(x) \ ((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \ (((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \ @@ -58,11 +60,11 @@ const uint8_t host_sigma[16][16] = (((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \ (((uint64_t)(x) & 0x00000000000000ffULL) << 56))) -__constant__ uint64_t c_SecondRound[16]; +__constant__ uint64_t c_SecondRound[15]; -const uint64_t host_SecondRound[16] = +const uint64_t host_SecondRound[15] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,SWAP64(1),0,SWAP64(0x3A0) + 0,0,0,0,0,0,0,0,0,0,0,0,0,SWAP64(1),0 }; __constant__ uint64_t c_u512[16]; @@ -80,24 +82,22 @@ const uint64_t host_u512[16] = }; -#define ROTR(x,n) (((x)<<(64-n))|( (x)>>(n))) - #define G(a,b,c,d,e) \ v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\ - v[d] = ROTR( v[d] ^ v[a],32); \ + v[d] = ROTR64( v[d] ^ v[a],32); \ v[c] += v[d]; \ - v[b] = ROTR( v[b] ^ v[c],25); \ + v[b] = ROTR64( v[b] ^ v[c],25); \ v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b]; \ - v[d] = ROTR( v[d] ^ v[a],16); \ + v[d] = ROTR64( v[d] ^ v[a],16); \ v[c] += v[d]; \ - v[b] = ROTR( v[b] ^ v[c],11); + v[b] = ROTR64( v[b] ^ v[c],11); -__device__ void blake512_compress( uint64_t *h, const uint64_t *block, int nullt, const uint8_t ((*sigma)[16]), const uint64_t *u512 ) +template __device__ void blake512_compress( uint64_t *h, const uint64_t *block, int nullt, const uint8_t ((*sigma)[16]), const uint64_t *u512 ) { uint64_t v[16], m[16], i; #pragma unroll 16 - for( i = 0; i < 16; ++i ) m[i] = SWAP64(block[i]); + for( i = 0; i < 16; ++i ) m[i] = cuda_swab64(block[i]); #pragma unroll 8 for( i = 0; i < 8; ++i ) v[i] = h[i]; @@ -113,11 +113,11 @@ __device__ void blake512_compress( uint64_t *h, const uint64_t *block, int nullt /* don't xor t when the block is only padding */ if ( !nullt ) { - v[12] ^= 928; - v[13] ^= 928; + v[12] ^= 8*(BLOCKSIZE+32); + v[13] ^= 8*(BLOCKSIZE+32); } -#pragma unroll 16 +//#pragma unroll 16 for( i = 0; i < 16; ++i ) { /* column step */ @@ -136,49 +136,9 @@ __device__ void blake512_compress( uint64_t *h, const uint64_t *block, int nullt for( i = 0; i < 16; ++i ) h[i % 8] ^= v[i]; } -// Endian Drehung für 32 Bit Typen -static __device__ uint32_t cuda_swab32(uint32_t x) -{ - return (((x << 24) & 0xff000000u) | ((x << 8) & 0x00ff0000u) - | ((x >> 8) & 0x0000ff00u) | ((x >> 24) & 0x000000ffu)); -} +#include "cuda_helper.h" -// Endian Drehung für 64 Bit Typen -static __device__ uint64_t cuda_swab64(uint64_t x) { - uint32_t h = (x >> 32); - uint32_t l = (x & 0xFFFFFFFFULL); - return (((uint64_t)cuda_swab32(l)) << 32) | ((uint64_t)cuda_swab32(h)); -} - -// das Hi Word aus einem 64 Bit Typen extrahieren -static __device__ uint32_t HIWORD(const uint64_t &x) { -#if __CUDA_ARCH__ >= 130 - return (uint32_t)__double2hiint(__longlong_as_double(x)); -#else - return (uint32_t)(x >> 32); -#endif -} - -// das Hi Word in einem 64 Bit Typen ersetzen -static __device__ uint64_t REPLACE_HIWORD(const uint64_t &x, const uint32_t &y) { - return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32ULL); -} - -// das Lo Word aus einem 64 Bit Typen extrahieren -static __device__ uint32_t LOWORD(const uint64_t &x) { -#if __CUDA_ARCH__ >= 130 - return (uint32_t)__double2loint(__longlong_as_double(x)); -#else - return (uint32_t)(x & 0xFFFFFFFFULL); -#endif -} - -// das Lo Word in einem 64 Bit Typen ersetzen -static __device__ uint64_t REPLACE_LOWORD(const uint64_t &x, const uint32_t &y) { - return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y); -} - -__global__ void blake512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) +template __global__ void blake512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) { int thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -211,40 +171,40 @@ __global__ void blake512_gpu_hash(int threads, uint32_t startNounce, void *outpu // die Nounce durch die thread-spezifische ersetzen buf[9] = REPLACE_HIWORD(buf[9], nounce); - // den thread-spezifischen Hefty1 hash einsetzen uint32_t *hefty = heftyHashes + 8 * hashPosition; - buf[10] = REPLACE_HIWORD(buf[10], hefty[0]); - buf[11] = REPLACE_LOWORD(buf[11], hefty[1]); - buf[11] = REPLACE_HIWORD(buf[11], hefty[2]); - buf[12] = REPLACE_LOWORD(buf[12], hefty[3]); - buf[12] = REPLACE_HIWORD(buf[12], hefty[4]); - buf[13] = REPLACE_LOWORD(buf[13], hefty[5]); - buf[13] = REPLACE_HIWORD(buf[13], hefty[6]); - buf[14] = REPLACE_LOWORD(buf[14], hefty[7]); + if (BLOCKSIZE == 84) { + // den thread-spezifischen Hefty1 hash einsetzen + // aufwändig, weil das nicht mit uint64_t Wörtern aligned ist. + buf[10] = REPLACE_HIWORD(buf[10], hefty[0]); + buf[11] = REPLACE_LOWORD(buf[11], hefty[1]); + buf[11] = REPLACE_HIWORD(buf[11], hefty[2]); + buf[12] = REPLACE_LOWORD(buf[12], hefty[3]); + buf[12] = REPLACE_HIWORD(buf[12], hefty[4]); + buf[13] = REPLACE_LOWORD(buf[13], hefty[5]); + buf[13] = REPLACE_HIWORD(buf[13], hefty[6]); + buf[14] = REPLACE_LOWORD(buf[14], hefty[7]); + } + else if (BLOCKSIZE == 80) { + buf[10] = MAKE_ULONGLONG(hefty[0], hefty[1]); + buf[11] = MAKE_ULONGLONG(hefty[2], hefty[3]); + buf[12] = MAKE_ULONGLONG(hefty[4], hefty[5]); + buf[13] = MAKE_ULONGLONG(hefty[6], hefty[7]); + } // erste Runde - blake512_compress( h, buf, 0, c_sigma, c_u512 ); - + blake512_compress( h, buf, 0, c_sigma, c_u512 ); + + // zweite Runde -#pragma unroll 16 - for (int i=0; i < 16; ++i) buf[i] = c_SecondRound[i]; - blake512_compress( h, buf, 1, c_sigma, c_u512 ); - +#pragma unroll 15 + for (int i=0; i < 15; ++i) buf[i] = c_SecondRound[i]; + buf[15] = SWAP64(8*(BLOCKSIZE+32)); // Blocksize in Bits einsetzen + blake512_compress( h, buf, 1, c_sigma, c_u512 ); + // Hash rauslassen -#if 0 - // ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind - uint32_t *outHash = (uint32_t *)outputHash + 16 * hashPosition; -#pragma unroll 8 - for (int i=0; i < 8; ++i) { - outHash[2*i+0] = cuda_swab32( HIWORD(h[i]) ); - outHash[2*i+1] = cuda_swab32( LOWORD(h[i]) ); - } -#else - // in dieser Version passieren auch ein paar 64 Bit Shifts uint64_t *outHash = (uint64_t *)outputHash + 8 * hashPosition; #pragma unroll 8 for (int i=0; i < 8; ++i) outHash[i] = cuda_swab64( h[i] ); -#endif } } @@ -274,22 +234,30 @@ __host__ void blake512_cpu_init(int thr_id, int threads) cudaMalloc(&d_hash5output[thr_id], 16 * sizeof(uint32_t) * threads); } -__host__ void blake512_cpu_setBlock(void *pdata) +static int BLOCKSIZE = 84; + +__host__ void blake512_cpu_setBlock(void *pdata, int len) // data muss 84-Byte haben! // heftyHash hat 32-Byte { - // Message mit Padding für erste Runde bereitstellen unsigned char PaddedMessage[128]; - memcpy(PaddedMessage, pdata, 84); - memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen - memset(PaddedMessage+116, 0, 12); - PaddedMessage[116] = 0x80; - + if (len == 84) { + // Message mit Padding für erste Runde bereitstellen + memcpy(PaddedMessage, pdata, 84); + memset(PaddedMessage+84, 0, 32); // leeres Hefty Hash einfüllen + memset(PaddedMessage+116, 0, 12); + PaddedMessage[116] = 0x80; + } else if (len == 80) { + memcpy(PaddedMessage, pdata, 80); + memset(PaddedMessage+80, 0, 32); // leeres Hefty Hash einfüllen + memset(PaddedMessage+112, 0, 16); + PaddedMessage[112] = 0x80; + } // die Message (116 Bytes) ohne Padding zur Berechnung auf der GPU cudaMemcpyToSymbol( c_PaddedMessage, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); + BLOCKSIZE = len; } - __host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce) { const int threadsperblock = 256; @@ -303,5 +271,8 @@ __host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce) // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - blake512_gpu_hash<<>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); + if (BLOCKSIZE == 80) + blake512_gpu_hash<80><<>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); + else if (BLOCKSIZE == 84) + blake512_gpu_hash<84><<>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); } diff --git a/cuda_blake512.h b/heavy/cuda_blake512.h similarity index 78% rename from cuda_blake512.h rename to heavy/cuda_blake512.h index 48bd3ff..7e24973 100644 --- a/cuda_blake512.h +++ b/heavy/cuda_blake512.h @@ -2,7 +2,6 @@ #define _CUDA_BLAKE512_H void blake512_cpu_init(int thr_id, int threads); -void blake512_cpu_setBlock(void *pdata); +void blake512_cpu_setBlock(void *pdata, int len); void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce); - #endif diff --git a/cuda_combine.cu b/heavy/cuda_combine.cu similarity index 100% rename from cuda_combine.cu rename to heavy/cuda_combine.cu diff --git a/cuda_combine.h b/heavy/cuda_combine.h similarity index 100% rename from cuda_combine.h rename to heavy/cuda_combine.h diff --git a/cuda_groestl512.cu b/heavy/cuda_groestl512.cu similarity index 98% rename from cuda_groestl512.cu rename to heavy/cuda_groestl512.cu index 6875404..bf86105 100644 --- a/cuda_groestl512.cu +++ b/heavy/cuda_groestl512.cu @@ -676,7 +676,7 @@ __device__ void groestl512_perm_Q(uint32_t *a) } } -__global__ void groestl512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) +template __global__ void groestl512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) { int thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -706,7 +706,7 @@ __global__ void groestl512_gpu_hash(int threads, uint32_t startNounce, void *out uint32_t *heftyHash = &heftyHashes[8 * hashPosition]; #pragma unroll 8 for (int k=0; k<8; ++k) - message[21+k] = heftyHash[k]; + message[BLOCKSIZE/4+k] = heftyHash[k]; uint32_t g[32]; #pragma unroll 32 @@ -764,21 +764,27 @@ __host__ void groestl512_cpu_init(int thr_id, int threads) cudaMalloc(&d_hash4output[thr_id], 16 * sizeof(uint32_t) * threads); } -__host__ void groestl512_cpu_setBlock(void *data) - // data muss 84-Byte haben! +static int BLOCKSIZE = 84; + +__host__ void groestl512_cpu_setBlock(void *data, int len) + // data muss 80/84-Byte haben! // heftyHash hat 32-Byte { // Nachricht expandieren und setzen uint32_t msgBlock[32]; memset(msgBlock, 0, sizeof(uint32_t) * 32); - memcpy(&msgBlock[0], data, 84); + memcpy(&msgBlock[0], data, len); // Erweitere die Nachricht auf den Nachrichtenblock (padding) - // Unsere Nachricht hat 116 Byte - msgBlock[29] = 0x80; - msgBlock[31] = 0x01000000; - + // Unsere Nachricht hat 112/116 Byte + if (len == 84) { + msgBlock[29] = 0x80; + msgBlock[31] = 0x01000000; + } else if (len == 80) { + msgBlock[28] = 0x80; + msgBlock[31] = 0x01000000; + } // groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird // auf der GPU ausgeführt) @@ -796,6 +802,8 @@ __host__ void groestl512_cpu_setBlock(void *data) cudaMemcpyToSymbol( groestl_gpu_msg, msgBlock, 128); + + BLOCKSIZE = len; } __host__ void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy) @@ -818,5 +826,8 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce) // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - groestl512_gpu_hash<<>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); + if (BLOCKSIZE == 84) + groestl512_gpu_hash<84><<>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); + else if (BLOCKSIZE == 80) + groestl512_gpu_hash<80><<>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); } diff --git a/cuda_groestl512.h b/heavy/cuda_groestl512.h similarity index 82% rename from cuda_groestl512.h rename to heavy/cuda_groestl512.h index 0e77f2f..0cdc13b 100644 --- a/cuda_groestl512.h +++ b/heavy/cuda_groestl512.h @@ -3,7 +3,7 @@ void groestl512_cpu_init(int thr_id, int threads); void groestl512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); -void groestl512_cpu_setBlock(void *data); +void groestl512_cpu_setBlock(void *data, int len); void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce); #endif \ No newline at end of file diff --git a/cuda_hefty1.cu b/heavy/cuda_hefty1.cu similarity index 93% rename from cuda_hefty1.cu rename to heavy/cuda_hefty1.cu index 239752f..db5fca3 100644 --- a/cuda_hefty1.cu +++ b/heavy/cuda_hefty1.cu @@ -2,21 +2,24 @@ #include "cuda_runtime.h" #include "device_launch_parameters.h" -// aus cpu-miner.c -extern int device_map[8]; - #include #include #define USE_SHARED 1 +// aus cpu-miner.c +extern int device_map[8]; + +// aus heavy.cu +extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); + // Folgende Definitionen später durch header ersetzen typedef unsigned int uint32_t; typedef unsigned char uint8_t; typedef unsigned short uint16_t; // diese Struktur wird in der Init Funktion angefordert -static cudaDeviceProp props; +static cudaDeviceProp props[8]; // globaler Speicher für alle HeftyHashes aller Threads uint32_t *d_heftyHashes[8]; @@ -286,7 +289,7 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa for(int j=0;j<16;j++) { Absorb(sponge, regs[3] + regs[7]); - hefty_gpu_round(regs, W2[j], heftyLookUp(j + 16 * (k+1)), sponge); + hefty_gpu_round(regs, W2[j], heftyLookUp(j + ((k+1)<<4)), sponge); } #pragma unroll 16 for(int j=0;j<16;j++) @@ -299,7 +302,7 @@ __global__ void hefty_gpu_hash(int threads, uint32_t startNounce, void *outputHa #pragma unroll 8 for(int k=0;k<8;k++) - ((uint32_t*)outputHash)[8*thread+k] = SWAB32(hash[k]); + ((uint32_t*)outputHash)[(thread<<3)+k] = SWAB32(hash[k]); } } @@ -308,7 +311,7 @@ __host__ void hefty_cpu_init(int thr_id, int threads) { cudaSetDevice(device_map[thr_id]); - cudaGetDeviceProperties(&props, device_map[thr_id]); + cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]); // Kopiere die Hash-Tabellen in den GPU-Speicher cudaMemcpyToSymbol( hefty_gpu_constantTable, @@ -319,16 +322,21 @@ __host__ void hefty_cpu_init(int thr_id, int threads) cudaMalloc(&d_heftyHashes[thr_id], 8 * sizeof(uint32_t) * threads); } -__host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data) - // data muss 84-Byte haben! +__host__ void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len) +// data muss 80/84-Byte haben! { // Nachricht expandieren und setzen uint32_t msgBlock[32]; memset(msgBlock, 0, sizeof(uint32_t) * 32); - memcpy(&msgBlock[0], data, 84); - msgBlock[21] |= 0x80; - msgBlock[31] = 672; // bitlen + memcpy(&msgBlock[0], data, len); + if (len == 84) { + msgBlock[21] |= 0x80; + msgBlock[31] = 672; // bitlen + } else if (len == 80) { + msgBlock[20] |= 0x80; + msgBlock[31] = 640; // bitlen + } for(int i=0;i<31;i++) // Byteorder drehen msgBlock[i] = SWAB32(msgBlock[i]); @@ -395,7 +403,7 @@ __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce) { // Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern, // alle anderen mit 512 Threads. - int threadsperblock = (props.major >= 3) ? 768 : 512; + int threadsperblock = (props[thr_id].major >= 3) ? 768 : 512; // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); @@ -411,4 +419,7 @@ __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce) // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); hefty_gpu_hash<<>>(threads, startNounce, (void*)d_heftyHashes[thr_id]); + + // Strategisches Sleep Kommando zur Senkung der CPU Last + MyStreamSynchronize(NULL, 0, thr_id); } diff --git a/cuda_hefty1.h b/heavy/cuda_hefty1.h similarity index 67% rename from cuda_hefty1.h rename to heavy/cuda_hefty1.h index 9e72d3d..17b196c 100644 --- a/cuda_hefty1.h +++ b/heavy/cuda_hefty1.h @@ -2,7 +2,7 @@ #define _CUDA_HEFTY1_H void hefty_cpu_hash(int thr_id, int threads, int startNounce); -void hefty_cpu_setBlock(int thr_id, int threads, void *data); +void hefty_cpu_setBlock(int thr_id, int threads, void *data, int len); void hefty_cpu_init(int thr_id, int threads); #endif \ No newline at end of file diff --git a/cuda_keccak512.cu b/heavy/cuda_keccak512.cu similarity index 88% rename from cuda_keccak512.cu rename to heavy/cuda_keccak512.cu index 13e5255..9585793 100644 --- a/cuda_keccak512.cu +++ b/heavy/cuda_keccak512.cu @@ -16,6 +16,8 @@ extern uint32_t *d_nonceVector[8]; // globaler Speicher für unsere Ergebnisse uint32_t *d_hash3output[8]; +extern uint32_t *d_hash4output[8]; +extern uint32_t *d_hash5output[8]; // der Keccak512 State nach der ersten Runde (72 Bytes) __constant__ uint64_t c_State[25]; @@ -25,7 +27,7 @@ __constant__ uint32_t c_PaddedMessage2[18]; // 44 bytes of remaining message (No // ---------------------------- BEGIN CUDA keccak512 functions ------------------------------------ -#define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - b))) +#include "cuda_helper.h" #define U32TO64_LE(p) \ (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32)) @@ -145,7 +147,7 @@ keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_const } // Die Hash-Funktion -__global__ void keccak512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) +template __global__ void keccak512_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) { int thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -171,7 +173,7 @@ __global__ void keccak512_gpu_hash(int threads, uint32_t startNounce, void *outp msgBlock[1] = nounce; // den individuellen Hefty1 Hash einsetzen - mycpy32(&msgBlock[3], &heftyHashes[8 * hashPosition]); + mycpy32(&msgBlock[(BLOCKSIZE-72)/sizeof(uint32_t)], &heftyHashes[8 * hashPosition]); // den Block einmal gut durchschütteln keccak_block(keccak_gpu_state, msgBlock, c_keccak_round_constants); @@ -184,7 +186,6 @@ __global__ void keccak512_gpu_hash(int threads, uint32_t startNounce, void *outp U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]); } - // und ins Global Memory rausschreiben #pragma unroll 16 for(int k=0;k<16;k++) @@ -217,38 +218,49 @@ __host__ void keccak512_cpu_init(int thr_id, int threads) // --------------- END keccak512 CPU version from scrypt-jane code -------------------- -__host__ void keccak512_cpu_setBlock(void *data) - // data muss 84-Byte haben! +static int BLOCKSIZE = 84; + +__host__ void keccak512_cpu_setBlock(void *data, int len) + // data muss 80 oder 84-Byte haben! // heftyHash hat 32-Byte { // CH // state init uint64_t keccak_cpu_state[25]; - memset(keccak_cpu_state, 0, 200); + memset(keccak_cpu_state, 0, sizeof(keccak_cpu_state)); + + // erste Runde + keccak_block((uint64_t*)&keccak_cpu_state, (const uint32_t*)data, host_keccak_round_constants); + + // state kopieren + cudaMemcpyToSymbol( c_State, keccak_cpu_state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); // keccak hat 72-Byte blöcke, d.h. in unserem Fall zwei Blöcke // zu jeweils uint32_t msgBlock[18]; memset(msgBlock, 0, 18 * sizeof(uint32_t)); - // kopiere die Daten rein (aber nur alles nach Bit 72) - memcpy(&msgBlock[0], &((uint8_t*)data)[72], 12); + // kopiere die restlichen Daten rein (aber nur alles nach Byte 72) + if (len == 84) + memcpy(&msgBlock[0], &((uint8_t*)data)[72], 12); + else if (len == 80) + memcpy(&msgBlock[0], &((uint8_t*)data)[72], 8); // Nachricht abschließen - msgBlock[11] = 0x01; + if (len == 84) + msgBlock[11] = 0x01; + else if (len == 80) + msgBlock[10] = 0x01; msgBlock[17] = 0x80000000; - // erste Runde - keccak_block((uint64_t*)&keccak_cpu_state, (const uint32_t*)data, host_keccak_round_constants); - // Message 2 ins Constant Memory kopieren (die variable Nonce und // der Hefty1 Anteil muss aber auf der GPU erst noch ersetzt werden) cudaMemcpyToSymbol( c_PaddedMessage2, msgBlock, 18*sizeof(uint32_t), 0, cudaMemcpyHostToDevice ); - // state kopieren - cudaMemcpyToSymbol( c_State, keccak_cpu_state, 25*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); + BLOCKSIZE = len; } + __host__ void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy) { // Hefty1 Hashes kopieren @@ -268,6 +280,8 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce) size_t shared_size = 0; // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - - keccak512_gpu_hash<<>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); + if (BLOCKSIZE==84) + keccak512_gpu_hash<84><<>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); + else if (BLOCKSIZE==80) + keccak512_gpu_hash<80><<>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); } diff --git a/cuda_keccak512.h b/heavy/cuda_keccak512.h similarity index 84% rename from cuda_keccak512.h rename to heavy/cuda_keccak512.h index 003f40f..1182447 100644 --- a/cuda_keccak512.h +++ b/heavy/cuda_keccak512.h @@ -2,7 +2,7 @@ #define _CUDA_KECCAK512_H void keccak512_cpu_init(int thr_id, int threads); -void keccak512_cpu_setBlock(void *data); +void keccak512_cpu_setBlock(void *data, int len); void keccak512_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce); diff --git a/cuda_sha256.cu b/heavy/cuda_sha256.cu similarity index 86% rename from cuda_sha256.cu rename to heavy/cuda_sha256.cu index b26021d..404a2a2 100644 --- a/cuda_sha256.cu +++ b/heavy/cuda_sha256.cu @@ -47,7 +47,7 @@ uint32_t sha256_cpu_constantTable[] = { #define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) // Die Hash-Funktion -__global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) +template __global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void *outputHash, uint32_t *heftyHashes, uint32_t *nonceVector) { int thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -82,11 +82,10 @@ __global__ void sha256_gpu_hash(int threads, uint32_t startNounce, void *outputH uint32_t offset = 8 * (blockDim.x * blockIdx.x + threadIdx.x); #pragma unroll 8 for(int k=0;k<8;k++) - W1[5+k] = heftyHashes[offset + k]; - + W1[((BLOCKSIZE-64)/4)+k] = heftyHashes[offset + k]; #pragma unroll 8 - for (int i=5; i <5+8; ++i) W1[i] = SWAB32(W1[i]); // die Hefty1 Hashes brauchen eine Drehung ;) + for (int i=((BLOCKSIZE-64)/4); i < ((BLOCKSIZE-64)/4)+8; ++i) W1[i] = SWAB32(W1[i]); // die Hefty1 Hashes brauchen eine Drehung ;) W1[3] = SWAB32(nounce); // Progress W1 @@ -178,18 +177,26 @@ __host__ void sha256_cpu_init(int thr_id, int threads) cudaMalloc(&d_hash2output[thr_id], 8 * sizeof(uint32_t) * threads); } -__host__ void sha256_cpu_setBlock(void *data) - // data muss 84-Byte haben! +static int BLOCKSIZE = 84; + +__host__ void sha256_cpu_setBlock(void *data, int len) + // data muss 80/84-Byte haben! // heftyHash hat 32-Byte { // Nachricht expandieren und setzen uint32_t msgBlock[32]; memset(msgBlock, 0, sizeof(uint32_t) * 32); - memcpy(&msgBlock[0], data, 84); - memset(&msgBlock[21], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen - msgBlock[29] |= 0x80; - msgBlock[31] = 928; // bitlen + memcpy(&msgBlock[0], data, len); + if (len == 84) { + memset(&msgBlock[21], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen + msgBlock[29] |= 0x80; + msgBlock[31] = 928; // bitlen + } else if (len == 80) { + memset(&msgBlock[20], 0, 32); // vorläufig Nullen anstatt der Hefty1 Hashes einfüllen + msgBlock[28] |= 0x80; + msgBlock[31] = 896; // bitlen + } for(int i=0;i<31;i++) // Byteorder drehen msgBlock[i] = SWAB32(msgBlock[i]); @@ -209,7 +216,7 @@ __host__ void sha256_cpu_setBlock(void *data) uint32_t hash[8]; // pre - for (int k=0; k < 8; k++) + for (int k=0; k < 8; k++) { regs[k] = sha256_cpu_hashTable[k]; hash[k] = regs[k]; @@ -242,6 +249,8 @@ __host__ void sha256_cpu_setBlock(void *data) cudaMemcpyToSymbol( sha256_gpu_blockHeader, &msgBlock[16], 64); + + BLOCKSIZE = len; } __host__ void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy) @@ -263,6 +272,9 @@ __host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce) size_t shared_size = 0; // fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); - - sha256_gpu_hash<<>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); + if (BLOCKSIZE == 84) + sha256_gpu_hash<84><<>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); + else if (BLOCKSIZE == 80) { + sha256_gpu_hash<80><<>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); + } } diff --git a/cuda_sha256.h b/heavy/cuda_sha256.h similarity index 84% rename from cuda_sha256.h rename to heavy/cuda_sha256.h index 9efd170..03385d1 100644 --- a/cuda_sha256.h +++ b/heavy/cuda_sha256.h @@ -2,7 +2,7 @@ #define _CUDA_SHA256_H void sha256_cpu_init(int thr_id, int threads); -void sha256_cpu_setBlock(void *data); +void sha256_cpu_setBlock(void *data, int len); void sha256_cpu_hash(int thr_id, int threads, int startNounce); void sha256_cpu_copyHeftyHash(int thr_id, int threads, void *heftyHashes, int copy); #endif diff --git a/heavy.cu b/heavy/heavy.cu similarity index 90% rename from heavy.cu rename to heavy/heavy.cu index 266275a..de86b42 100644 --- a/heavy.cu +++ b/heavy/heavy.cu @@ -22,12 +22,12 @@ #include "sph/sph_blake.h" #include "sph/sph_groestl.h" -#include "cuda_hefty1.h" -#include "cuda_sha256.h" -#include "cuda_keccak512.h" -#include "cuda_groestl512.h" -#include "cuda_blake512.h" -#include "cuda_combine.h" +#include "heavy/cuda_hefty1.h" +#include "heavy/cuda_sha256.h" +#include "heavy/cuda_keccak512.h" +#include "heavy/cuda_groestl512.h" +#include "heavy/cuda_blake512.h" +#include "heavy/cuda_combine.h" extern uint32_t *d_hash2output[8]; extern uint32_t *d_hash3output[8]; @@ -35,6 +35,7 @@ extern uint32_t *d_hash4output[8]; extern uint32_t *d_hash5output[8]; #define HEAVYCOIN_BLKHDR_SZ 84 +#define MNR_BLKHDR_SZ 80 // nonce-array für die threads uint32_t *d_nonceVector[8]; @@ -230,24 +231,29 @@ cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id) int scanhash_heavy_cpp(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done, uint32_t maxvote); + unsigned long *hashes_done, uint32_t maxvote, int blocklen); extern "C" int scanhash_heavy(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done, uint32_t maxvote) + unsigned long *hashes_done, uint32_t maxvote, int blocklen) { return scanhash_heavy_cpp(thr_id, pdata, - ptarget, max_nonce, hashes_done, maxvote); + ptarget, max_nonce, hashes_done, maxvote, blocklen); } +extern bool opt_benchmark; + int scanhash_heavy_cpp(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done, uint32_t maxvote) + unsigned long *hashes_done, uint32_t maxvote, int blocklen) { // CUDA will process thousands of threads. const int throughput = 4096 * 128; + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x000000ff; + int rc = 0; uint32_t *hash = NULL; cudaMallocHost(&hash, throughput*8*sizeof(uint32_t)); @@ -258,7 +264,6 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata, memset(nrmCalls, 0, sizeof(int) * 6); uint32_t start_nonce = pdata[19]; - uint16_t *ext = (uint16_t *)&pdata[20]; // für jeden Hash ein individuelles Target erstellen basierend // auf dem höchsten Bit, das in ptarget gesetzt ist. @@ -282,26 +287,30 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata, cudaMalloc(&d_nonceVector[thr_id], sizeof(uint32_t) * throughput); } + if (blocklen == HEAVYCOIN_BLKHDR_SZ) + { + uint16_t *ext = (uint16_t *)&pdata[20]; - if (opt_vote > maxvote) { - printf("Warning: Your block reward vote (%hu) exceeds " - "the maxvote reported by the pool (%hu).\n", - opt_vote, maxvote); - } + if (opt_vote > maxvote) { + printf("Warning: Your block reward vote (%hu) exceeds " + "the maxvote reported by the pool (%hu).\n", + opt_vote, maxvote); + } - if (opt_trust_pool && opt_vote > maxvote) { - printf("Warning: Capping block reward vote to maxvote reported by pool.\n"); - ext[0] = maxvote; + if (opt_trust_pool && opt_vote > maxvote) { + printf("Warning: Capping block reward vote to maxvote reported by pool.\n"); + ext[0] = maxvote; + } + else + ext[0] = opt_vote; } - else - ext[0] = opt_vote; // Setze die Blockdaten - hefty_cpu_setBlock(thr_id, throughput, pdata); - sha256_cpu_setBlock(pdata); - keccak512_cpu_setBlock(pdata); - groestl512_cpu_setBlock(pdata); - blake512_cpu_setBlock(pdata); + hefty_cpu_setBlock(thr_id, throughput, pdata, blocklen); + sha256_cpu_setBlock(pdata, blocklen); + keccak512_cpu_setBlock(pdata, blocklen); + groestl512_cpu_setBlock(pdata, blocklen); + blake512_cpu_setBlock(pdata, blocklen); do { int i; @@ -370,7 +379,7 @@ int scanhash_heavy_cpp(int thr_id, uint32_t *pdata, if (fulltest(foundhash, ptarget)) { uint32_t verification[8]; pdata[19] += nonce - pdata[19]; - heavycoin_hash((unsigned char *)verification, (const unsigned char *)pdata, HEAVYCOIN_BLKHDR_SZ); + heavycoin_hash((unsigned char *)verification, (const unsigned char *)pdata, blocklen); if (memcmp(verification, foundhash, 8*sizeof(uint32_t))) { applog(LOG_ERR, "hash for nonce=$%08X does not validate on CPU!\n", nonce); } diff --git a/miner.h b/miner.h index ffea67c..7a68b37 100644 --- a/miner.h +++ b/miner.h @@ -205,7 +205,7 @@ extern int scanhash_scrypt(int thr_id, uint32_t *pdata, extern int scanhash_heavy(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, - unsigned long *hashes_done, uint32_t maxvote); + unsigned long *hashes_done, uint32_t maxvote, int blocklen); extern int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce,