Browse Source

Merge branch 'blake-dev' into blake

2upstream 1.4.2-tpruvot
Tanguy Pruvot 10 years ago
parent
commit
474ee97d6e
  1. 4
      Makefile.am
  2. 4
      README.txt
  3. 79
      blake32.cu
  4. 7
      ccminer.vcxproj
  5. 6
      ccminer.vcxproj.filters
  6. 2
      configure.ac
  7. 142
      cpu-miner.c
  8. 119
      crc32.c
  9. 2
      cuda_helper.h
  10. 9
      cuda_nist5.cu
  11. 8
      miner.h
  12. 600
      pentablake.cu
  13. 156
      quark/cuda_quark_blake512.cu
  14. 31
      quark/quarkcoin.cu
  15. 29
      util.c
  16. 18
      x11/x11.cu
  17. 4
      x15/x14.cu
  18. 31
      x15/x15.cu
  19. 18
      x17/x17.cu

4
Makefile.am

@ -16,7 +16,7 @@ bin_PROGRAMS = ccminer @@ -16,7 +16,7 @@ bin_PROGRAMS = ccminer
ccminer_SOURCES = elist.h miner.h compat.h \
compat/inttypes.h compat/stdbool.h compat/unistd.h \
compat/sys/time.h compat/getopt/getopt.h \
cpu-miner.c util.c hefty1.c scrypt.c \
cpu-miner.c util.c crc32.c hefty1.c scrypt.c \
hashlog.cpp \
heavy/heavy.cu \
heavy/cuda_blake512.cu heavy/cuda_blake512.h \
@ -33,7 +33,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ @@ -33,7 +33,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \
quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu quark/quarkcoin.cu quark/animecoin.cu \
quark/cuda_quark_compactionTest.cu \
cuda_nist5.cu blake32.cu \
cuda_nist5.cu blake32.cu pentablake.cu \
sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \
sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \

4
README.txt

@ -1,5 +1,5 @@ @@ -1,5 +1,5 @@
ccMiner release 1.4.1-tpruvot (Sep 06th 2014) - "Cached Blake"
ccMiner release 1.4.2-tpruvot (Sep 09th 2014) - "Pentablake"
---------------------------------------------------------------
***************************************************************
@ -35,6 +35,7 @@ TalkCoin @@ -35,6 +35,7 @@ TalkCoin
DarkCoin and other X11 coins
NEOS blake (256 14-rounds)
BlakeCoin (256 8-rounds)
Pentablake (Blake 512 x5)
where some of these coins have a VERY NOTABLE nVidia advantage
over competing AMD (OpenCL) implementations.
@ -65,6 +66,7 @@ its command line interface and options. @@ -65,6 +66,7 @@ its command line interface and options.
blake use to mine NEOS (Blake 256)
blakecoin use to mine Old Blake 256
nist5 use to mine TalkCoin
penta use to mine Joincoin / Pentablake
fresh use to mine Freshcoin
whirl use to mine Whirlcoin
x11 use to mine DarkCoin

79
blake32.cu

@ -15,6 +15,9 @@ extern "C" { @@ -15,6 +15,9 @@ extern "C" {
/* threads per block */
#define TPB 128
/* crc32.c */
extern "C" uint32_t crc32_u32t(const uint32_t *buf, size_t size);
extern "C" int blake256_rounds = 14;
/* hash by cpu with blake 256 */
@ -41,8 +44,6 @@ extern bool opt_n_threads; @@ -41,8 +44,6 @@ extern bool opt_n_threads;
extern bool opt_benchmark;
extern int device_map[8];
uint32_t crc32(const uint32_t *buf, size_t size);
__constant__
static uint32_t __align__(32) c_Target[8];
@ -331,7 +332,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt @@ -331,7 +332,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
blake256_cpu_setBlock_80(pdata, ptarget);
#if USE_CACHE
crcsum = crc32(pdata, 64);
crcsum = crc32_u32t(pdata, 64);
#endif
do {
@ -360,7 +361,8 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt @@ -360,7 +361,8 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
be32enc(&endiandata[19], extra_results[0]);
blake256hash(vhashcpu, endiandata, blakerounds);
if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) {
applog(LOG_NOTICE, "GPU found more than one result yippee!");
applog(LOG_NOTICE, "GPU found more than one result " CL_GRN "yippee!");
rc = 2;
} else {
extra_results[0] = MAXU;
}
@ -379,9 +381,14 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt @@ -379,9 +381,14 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
}
}
if ((uint64_t) pdata[19] + throughput > (uint64_t) max_nonce) {
pdata[19] = max_nonce - first_nonce + 1;
break;
}
pdata[19] += throughput;
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
} while (!work_restart[thr_id].restart);
exit_scan:
*hashes_done = pdata[19] - first_nonce + 1;
@ -394,66 +401,6 @@ exit_scan: @@ -394,66 +401,6 @@ exit_scan:
}
#endif
// wait proper end of all threads
cudaDeviceSynchronize();
//cudaDeviceSynchronize();
return rc;
}
static uint32_t crc32_tab[] = {
0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2,
0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c,
0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106,
0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa,
0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84,
0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e,
0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28,
0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc,
0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
};
uint32_t crc32(const uint32_t *buf, size_t size)
{
const uint8_t *p;
uint32_t crc = 0;
p = (uint8_t *) buf;
crc = crc ^ ~0U;
while (size--)
crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8);
return crc ^ ~0U;
}

7
ccminer.vcxproj

@ -241,6 +241,7 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command> @@ -241,6 +241,7 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
<TreatWChar_tAsBuiltInType>false</TreatWChar_tAsBuiltInType>
<Optimization Condition="'$(Configuration)'=='Release'">Full</Optimization>
</ClCompile>
<ClCompile Include="crc32.c" />
<ClCompile Include="fuguecoin.cpp" />
<ClCompile Include="groestlcoin.cpp" />
<ClCompile Include="hashlog.cpp" />
@ -404,6 +405,12 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command> @@ -404,6 +405,12 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
<AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
<FastMath>true</FastMath>
</CudaCompile>
<CudaCompile Include="pentablake.cu">
<MaxRegCount>80</MaxRegCount>
<AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options="-O2 -dlcm=cg" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>
<FastMath>true</FastMath>
</CudaCompile>
<CudaCompile Include="quark\animecoin.cu">
<AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options=-O2 %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)'=='Debug'">%(AdditionalOptions)</AdditionalOptions>

6
ccminer.vcxproj.filters

@ -96,6 +96,9 @@ @@ -96,6 +96,9 @@
<ClCompile Include="cpu-miner.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="crc32.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="hefty1.c">
<Filter>Source Files</Filter>
</ClCompile>
@ -442,5 +445,8 @@ @@ -442,5 +445,8 @@
<CudaCompile Include="blake32.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="pentablake.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
</ItemGroup>
</Project>

2
configure.ac

@ -1,4 +1,4 @@ @@ -1,4 +1,4 @@
AC_INIT([ccminer], [2014.09.06])
AC_INIT([ccminer], [2014.09.09])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

142
cpu-miner.c

@ -136,8 +136,9 @@ typedef enum { @@ -136,8 +136,9 @@ typedef enum {
ALGO_JACKPOT,
ALGO_MJOLLNIR, /* Mjollnir hash */
ALGO_MYR_GR,
ALGO_QUARK,
ALGO_NIST5,
ALGO_PENTABLAKE,
ALGO_QUARK,
ALGO_WHC,
ALGO_X11,
ALGO_X13,
@ -159,6 +160,7 @@ static const char *algo_names[] = { @@ -159,6 +160,7 @@ static const char *algo_names[] = {
"mjollnir",
"myr-gr",
"nist5",
"penta",
"quark",
"whirl",
"x11",
@ -170,7 +172,6 @@ static const char *algo_names[] = { @@ -170,7 +172,6 @@ static const char *algo_names[] = {
};
bool opt_debug = false;
bool opt_debug_rpc = false;
bool opt_protocol = false;
bool opt_benchmark = false;
bool want_longpoll = true;
@ -179,7 +180,7 @@ bool want_stratum = true; @@ -179,7 +180,7 @@ bool want_stratum = true;
bool have_stratum = false;
static bool submit_old = false;
bool use_syslog = false;
bool use_colors = false;
bool use_colors = true;
static bool opt_background = false;
bool opt_quiet = false;
static int opt_retries = -1;
@ -242,6 +243,7 @@ Options:\n\ @@ -242,6 +243,7 @@ Options:\n\
mjollnir Mjollnircoin hash\n\
myr-gr Myriad-Groestl hash\n\
nist5 NIST5 (TalkCoin) hash\n\
penta Pentablake hash (5x Blake 512)\n\
quark Quark hash\n\
whirl Whirlcoin (old whirlpool)\n\
x11 X11 (DarkCoin) hash\n\
@ -437,7 +439,7 @@ static int share_result(int result, const char *reason) @@ -437,7 +439,7 @@ static int share_result(int result, const char *reason)
(result ? CL_GRN "yay!!!" : CL_RED "booooo")
: (result ? "(yay!!!)" : "(booooo)"));
if (reason && !opt_quiet) {
if (reason) {
applog(LOG_WARNING, "reject reason: %s", reason);
if (strncmp(reason, "low difficulty share", 20) == 0) {
opt_difficulty = (opt_difficulty * 2.0) / 3.0;
@ -457,11 +459,14 @@ static bool submit_upstream_work(CURL *curl, struct work *work) @@ -457,11 +459,14 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
bool rc = false;
/* pass if the previous hash is not the current previous hash */
pthread_mutex_lock(&g_work_lock);
if (memcmp(work->data + 1, g_work.data + 1, 32)) {
pthread_mutex_unlock(&g_work_lock);
if (opt_debug)
applog(LOG_DEBUG, "DEBUG: stale work detected, discarding");
return true;
}
pthread_mutex_unlock(&g_work_lock);
if (have_stratum) {
uint32_t sent;
@ -544,10 +549,6 @@ static bool submit_upstream_work(CURL *curl, struct work *work) @@ -544,10 +549,6 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
json_decref(val);
}
if (opt_debug_rpc) {
applog(LOG_DEBUG, "submit: %s", s);
}
rc = true;
out:
@ -786,13 +787,20 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) @@ -786,13 +787,20 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
memcpy(work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size);
/* Generate merkle root */
if (opt_algo == ALGO_HEAVY || opt_algo == ALGO_MJOLLNIR)
heavycoin_hash(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size);
else
if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_WHC || opt_algo == ALGO_BLAKECOIN)
SHA256((unsigned char*)sctx->job.coinbase, sctx->job.coinbase_size, (unsigned char*)merkle_root);
else
sha256d(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size);
switch (opt_algo) {
case ALGO_HEAVY:
case ALGO_MJOLLNIR:
heavycoin_hash(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size);
break;
case ALGO_FUGUE256:
case ALGO_GROESTL:
case ALGO_BLAKECOIN:
case ALGO_WHC:
SHA256((uint8_t*)sctx->job.coinbase, sctx->job.coinbase_size, (uint8_t*)merkle_root);
break;
default:
sha256d(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size);
}
for (i = 0; i < sctx->job.merkle_count; i++) {
memcpy(merkle_root + 32, sctx->job.merkle[i], 32);
@ -864,7 +872,9 @@ static void *miner_thread(void *userdata) @@ -864,7 +872,9 @@ static void *miner_thread(void *userdata)
uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - (thr_id + 1);
unsigned char *scratchbuf = NULL;
bool work_done = false;
bool extrajob = false;
char s[16];
int rc = 0;
memset(&work, 0, sizeof(work)); // prevent work from being used uninitialized
@ -891,20 +901,31 @@ static void *miner_thread(void *userdata) @@ -891,20 +901,31 @@ static void *miner_thread(void *userdata)
struct timeval tv_start, tv_end, diff;
int64_t max64;
uint64_t umax64;
int rc;
// &work.data[19]
int wcmplen = 76;
uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
if (have_stratum) {
while (time(NULL) >= (g_work_time + opt_scantime) && !work_done)
usleep(500*1000);
work_done = false;
pthread_mutex_lock(&g_work_lock);
uint32_t sleeptime = 0;
while (!work_done && time(NULL) >= (g_work_time + opt_scantime)) {
usleep(100*1000);
if (sleeptime > 4) {
extrajob = true;
break;
}
sleeptime++;
}
if (sleeptime && opt_debug && !opt_quiet)
applog(LOG_DEBUG, "sleeptime: %u ms", sleeptime*100);
nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
if ((*nonceptr) >= end_nonce)
pthread_mutex_lock(&g_work_lock);
extrajob |= work_done;
if ((*nonceptr) >= end_nonce || extrajob) {
work_done = false;
extrajob = false;
stratum_gen_work(&stratum, &g_work);
}
} else {
int min_scantime = have_longpoll ? LP_SCANTIME : opt_scantime;
/* obtain new work from internal workio thread */
@ -920,34 +941,54 @@ static void *miner_thread(void *userdata) @@ -920,34 +941,54 @@ static void *miner_thread(void *userdata)
g_work_time = time(NULL);
}
}
#if 0
if (!opt_benchmark && g_work.xnonce2_len == 0) {
applog(LOG_ERR, "work data not read yet");
extrajob = true;
work_done = true;
sleep(1);
continue;
}
#endif
if (rc > 1) {
/* if we found more than one on last loop */
/* todo: handle an array to get them directly */
pthread_mutex_unlock(&g_work_lock);
goto continue_scan;
}
if (memcmp(work.target, g_work.target, sizeof(work.target))) {
if (opt_debug) {
applog(LOG_DEBUG, "job %s target change:", g_work.job_id);
applog_hash((uint8_t*) work.target);
applog_compare_hash((uint8_t*) g_work.target, (uint8_t*) work.target);
}
memcpy(work.target, g_work.target, sizeof(work.target));
(*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr
/* on new target, ignoring nonce, clear sent data (hashlog) */
if (memcmp(work.target, g_work.target, sizeof(work.target))) {
hashlog_purge_job(work.job_id);
}
}
if (memcmp(work.data, g_work.data, wcmplen)) {
if (opt_debug) {
applog(LOG_DEBUG, "job %s work updated", g_work.job_id);
for (int n=0; n<wcmplen; n+=8) {
for (int n=0; n <= (wcmplen-8); n+=8) {
if (memcmp(work.data + n, g_work.data + n, 8)) {
applog(LOG_DEBUG, "diff detected at offset %d", n);
applog(LOG_DEBUG, "job %s work updated at offset %d:", g_work.job_id, n);
applog_hash((uint8_t*) work.data + n);
applog_hash((uint8_t*) g_work.data + n);
applog_compare_hash((uint8_t*) g_work.data + n, (uint8_t*) work.data + n);
}
}
}
memcpy(&work, &g_work, sizeof(struct work));
(*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr
} else if (memcmp(work.target, g_work.target, sizeof(work.target))) {
if (opt_debug) {
applog(LOG_DEBUG, "job %s target change", g_work.job_id);
applog_hash((uint8_t*) work.target);
applog_hash((uint8_t*) g_work.target);
}
memcpy(work.target, g_work.target, sizeof(work.target));
(*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr
} else
(*nonceptr)++; //??
pthread_mutex_unlock(&g_work_lock);
work_restart[thr_id].restart = 0;
if (opt_debug)
applog(LOG_WARNING, "job %s %08x", g_work.job_id, (*nonceptr));
pthread_mutex_unlock(&g_work_lock);
/* adjust max_nonce to meet target scan time */
if (have_stratum)
@ -959,15 +1000,18 @@ static void *miner_thread(void *userdata) @@ -959,15 +1000,18 @@ static void *miner_thread(void *userdata)
max64 *= (int64_t)thr_hashrates[thr_id];
if (max64 <= 0) {
/* should not be set too high,
else you can miss multiple nounces */
switch (opt_algo) {
case ALGO_JACKPOT:
max64 = 0x1fffLL;
break;
case ALGO_BLAKECOIN:
max64 = 0x3ffffffLL;
break;
case ALGO_BLAKE:
/* based on the 750Ti hashrate (100kH) */
max64 = 0x3ffffffLL;
max64 = 0x1ffffffLL;
break;
default:
max64 = 0xfffffLL;
@ -1000,12 +1044,12 @@ static void *miner_thread(void *userdata) @@ -1000,12 +1044,12 @@ static void *miner_thread(void *userdata)
stall |= (start_nonce > range.scanned[0] && start_nonce < range.scanned[1]);
if (stall) {
if (opt_algo)
if (opt_debug && !opt_quiet)
applog(LOG_DEBUG, "job done, wait for a new one...");
work_restart[thr_id].restart = 1;
hashlog_purge_old();
// wait a bit for a new job...
sleep(1);
usleep(500*1000);
(*nonceptr) = end_nonce + 1;
work_done = true;
continue;
@ -1023,6 +1067,7 @@ static void *miner_thread(void *userdata) @@ -1023,6 +1067,7 @@ static void *miner_thread(void *userdata)
(*nonceptr) = start_nonce;
hashes_done = 0;
continue_scan:
gettimeofday(&tv_start, NULL);
/* scan nonces for a proof-of-work hash */
@ -1089,6 +1134,11 @@ static void *miner_thread(void *userdata) @@ -1089,6 +1134,11 @@ static void *miner_thread(void *userdata)
max_nonce, &hashes_done);
break;
case ALGO_PENTABLAKE:
rc = scanhash_pentablake(thr_id, work.data, work.target,
max_nonce, &hashes_done);
break;
case ALGO_WHC:
rc = scanhash_whc(thr_id, work.data, work.target,
max_nonce, &hashes_done);
@ -1133,8 +1183,11 @@ static void *miner_thread(void *userdata) @@ -1133,8 +1183,11 @@ static void *miner_thread(void *userdata)
timeval_subtract(&diff, &tv_end, &tv_start);
if (diff.tv_usec || diff.tv_sec) {
pthread_mutex_lock(&stats_lock);
thr_hashrates[thr_id] =
hashes_done / (diff.tv_sec + 1e-6 * diff.tv_usec);
if (diff.tv_sec + 1e-6 * diff.tv_usec > 0.0) {
thr_hashrates[thr_id] = hashes_done / (diff.tv_sec + 1e-6 * diff.tv_usec);
if (rc > 1)
thr_hashrates[thr_id] = (rc * hashes_done) / (diff.tv_sec + 1e-6 * diff.tv_usec);
}
pthread_mutex_unlock(&stats_lock);
}
if (!opt_quiet) {
@ -1342,7 +1395,6 @@ static void *stratum_thread(void *userdata) @@ -1342,7 +1395,6 @@ static void *stratum_thread(void *userdata)
pthread_mutex_lock(&g_work_lock);
stratum_gen_work(&stratum, &g_work);
time(&g_work_time);
pthread_mutex_unlock(&g_work_lock);
if (stratum.job.clean) {
if (!opt_quiet)
applog(LOG_BLUE, "%s send a new %s block %d", short_url, algo_names[opt_algo],
@ -1353,6 +1405,7 @@ static void *stratum_thread(void *userdata) @@ -1353,6 +1405,7 @@ static void *stratum_thread(void *userdata)
applog(LOG_BLUE, "%s send job %d for block %d", short_url,
strtoul(stratum.job.job_id, NULL, 16), stratum.bloc_height);
}
pthread_mutex_unlock(&g_work_lock);
}
if (!stratum_socket_full(&stratum, 120)) {
@ -1374,7 +1427,7 @@ out: @@ -1374,7 +1427,7 @@ out:
return NULL;
}
#define PROGRAM_VERSION "1.4.1"
#define PROGRAM_VERSION "1.4.2"
static void show_version_and_exit(void)
{
printf("%s v%s\n"
@ -1438,12 +1491,11 @@ static void parse_arg (int key, char *arg) @@ -1438,12 +1491,11 @@ static void parse_arg (int key, char *arg)
case 'C':
use_colors = true;
break;
case 'q':
opt_quiet = true;
break;
case 'D':
opt_debug = true;
opt_debug_rpc = true;
break;
case 'q':
opt_quiet = true;
break;
case 'p':
free(rpc_pass);

119
crc32.c

@ -0,0 +1,119 @@ @@ -0,0 +1,119 @@
/*-
* COPYRIGHT (C) 1986 Gary S. Brown. You may use this program, or
* code or tables extracted from it, as desired without restriction.
*
* First, the polynomial itself and its table of feedback terms. The
* polynomial is
* X^32+X^26+X^23+X^22+X^16+X^12+X^11+X^10+X^8+X^7+X^5+X^4+X^2+X^1+X^0
*
* Note that we take it "backwards" and put the highest-order term in
* the lowest-order bit. The X^32 term is "implied"; the LSB is the
* X^31 term, etc. The X^0 term (usually shown as "+1") results in
* the MSB being 1
*
* Note that the usual hardware shift register implementation, which
* is what we're using (we're merely optimizing it by doing eight-bit
* chunks at a time) shifts bits into the lowest-order term. In our
* implementation, that means shifting towards the right. Why do we
* do it this way? Because the calculated CRC must be transmitted in
* order from highest-order term to lowest-order term. UARTs transmit
* characters in order from LSB to MSB. By storing the CRC this way
* we hand it to the UART in the order low-byte to high-byte; the UART
* sends each low-bit to hight-bit; and the result is transmission bit
* by bit from highest- to lowest-order term without requiring any bit
* shuffling on our part. Reception works similarly
*
* The feedback terms table consists of 256, 32-bit entries. Notes
*
* The table can be generated at runtime if desired; code to do so
* is shown later. It might not be obvious, but the feedback
* terms simply represent the results of eight shift/xor opera
* tions for all combinations of data and CRC register values
*
* The values must be right-shifted by eight bits by the "updcrc
* logic; the shift must be unsigned (bring in zeroes). On some
* hardware you could probably optimize the shift in assembler by
* using byte-swap instructions
* polynomial $edb88320
*
*
* CRC32 code derived from work by Gary S. Brown.
*/
#include <stdlib.h>
#include <stdint.h>
static uint32_t crc32_tab[] = {
0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2,
0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c,
0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106,
0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa,
0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84,
0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e,
0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28,
0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc,
0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
};
/* Real CRC32 Function */
extern uint32_t crc32(uint32_t crc, const void *buf, size_t size)
{
const uint8_t *p;
p = buf;
crc = crc ^ ~0U;
while (size--)
crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8);
return crc ^ ~0U;
}
/* CRC32 Function simplified for ccminer */
extern uint32_t crc32_u32t(const uint32_t *buf, size_t size)
{
const uint8_t *p;
uint32_t crc = 0;
p = (uint8_t *) buf;
crc = crc ^ ~0U;
while (size--)
crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8);
return crc ^ ~0U;
}

2
cuda_helper.h

@ -12,6 +12,8 @@ @@ -12,6 +12,8 @@
#include <stdint.h>
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
extern __device__ __device_builtin__ void __syncthreads(void);
#ifndef __CUDA_ARCH__

9
cuda_nist5.cu

@ -5,9 +5,11 @@ extern "C" @@ -5,9 +5,11 @@ extern "C"
#include "sph/sph_skein.h"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
}
#include "miner.h"
#include "cuda_helper.h"
}
// aus cpu-miner.c
extern int device_map[8];
@ -74,9 +76,6 @@ extern "C" void nist5hash(void *state, const void *input) @@ -74,9 +76,6 @@ extern "C" void nist5hash(void *state, const void *input)
memcpy(state, hash, 32);
}
extern bool opt_benchmark;
extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done)
@ -84,7 +83,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata, @@ -84,7 +83,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
const uint32_t first_nonce = pdata[19];
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;
((uint32_t*)ptarget)[7] = 0x00FF;
const uint32_t Htarg = ptarget[7];

8
miner.h

@ -249,6 +249,10 @@ extern int scanhash_nist5(int thr_id, uint32_t *pdata, @@ -249,6 +249,10 @@ extern int scanhash_nist5(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
extern int scanhash_pentablake(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
extern int scanhash_whc(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
@ -284,8 +288,8 @@ struct work_restart { @@ -284,8 +288,8 @@ struct work_restart {
char padding[128 - sizeof(unsigned long)];
};
extern bool opt_benchmark;
extern bool opt_debug;
extern bool opt_debug_rpc;
extern bool opt_quiet;
extern bool opt_protocol;
extern int opt_timeout;
@ -417,6 +421,7 @@ size_t time2str(char* buf, time_t timer); @@ -417,6 +421,7 @@ size_t time2str(char* buf, time_t timer);
char* atime2str(time_t timer);
void applog_hash(unsigned char *hash);
void applog_compare_hash(unsigned char *hash, unsigned char *hash2);
void print_hash_tests(void);
void animehash(void *state, const void *input);
@ -428,6 +433,7 @@ unsigned int jackpothash(void *state, const void *input); @@ -428,6 +433,7 @@ unsigned int jackpothash(void *state, const void *input);
void groestlhash(void *state, const void *input);
void myriadhash(void *state, const void *input);
void nist5hash(void *state, const void *input);
void pentablakehash(void *output, const void *input);
void quarkhash(void *state, const void *input);
void wcoinhash(void *state, const void *input);
void x11hash(void *output, const void *input);

600
pentablake.cu

@ -0,0 +1,600 @@ @@ -0,0 +1,600 @@
/**
* Penta Blake-512 Cuda Kernel (Tested on SM 5.0)
*
* Tanguy Pruvot - Aug. 2014
*/
#include "miner.h"
extern "C" {
#include "sph/sph_blake.h"
#include <stdint.h>
#include <memory.h>
}
/* threads per block */
#define TPB 192
/* hash by cpu with blake 256 */
extern "C" void pentablakehash(void *output, const void *input)
{
unsigned char hash[128];
#define hashB hash + 64
sph_blake512_context ctx;
sph_blake512_init(&ctx);
sph_blake512(&ctx, input, 80);
sph_blake512_close(&ctx, hash);
sph_blake512(&ctx, hash, 64);
sph_blake512_close(&ctx, hashB);
sph_blake512(&ctx, hashB, 64);
sph_blake512_close(&ctx, hash);
sph_blake512(&ctx, hash, 64);
sph_blake512_close(&ctx, hashB);
sph_blake512(&ctx, hashB, 64);
sph_blake512_close(&ctx, hash);
memcpy(output, hash, 32);
}
#include "cuda_helper.h"
#define MAXU 0xffffffffU
// in cpu-miner.c
extern bool opt_n_threads;
extern bool opt_benchmark;
extern int device_map[8];
__constant__
static uint32_t __align__(32) c_Target[8];
__constant__
static uint64_t __align__(32) c_data[32];
static uint32_t *d_hash[8];
static uint32_t *d_resNounce[8];
static uint32_t *h_resNounce[8];
static uint32_t extra_results[2] = { MAXU, MAXU };
/* prefer uint32_t to prevent size conversions = speed +5/10 % */
__constant__
static uint32_t __align__(32) c_sigma[16][16];
const uint32_t host_sigma[16][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};
__device__ __constant__
static const uint64_t __align__(32) c_IV512[8] = {
0x6a09e667f3bcc908ULL,
0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL,
0xa54ff53a5f1d36f1ULL,
0x510e527fade682d1ULL,
0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL,
0x5be0cd19137e2179ULL
};
__device__ __constant__
const uint64_t c_u512[16] =
{
0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL,
0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL,
0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL,
0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL,
0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL,
0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL,
0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
};
#define G(a,b,c,d,x) { \
uint32_t idx1 = c_sigma[i][x]; \
uint32_t idx2 = c_sigma[i][x+1]; \
v[a] += (m[idx1] ^ c_u512[idx2]) + v[b]; \
v[d] = ROTR64(v[d] ^ v[a], 32); \
v[c] += v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 25); \
v[a] += (m[idx2] ^ c_u512[idx1]) + v[b]; \
v[d] = ROTR64(v[d] ^ v[a], 16); \
v[c] += v[d]; \
v[b] = ROTR64(v[b] ^ v[c], 11); \
}
// Hash-Padding
__device__ __constant__
static const uint64_t d_constHashPadding[8] = {
0x0000000000000080ull,
0,
0,
0,
0,
0x0100000000000000ull,
0,
0x0002000000000000ull
};
#if 0
__device__ __constant__
static const uint64_t __align__(32) c_Padding[16] = {
0, 0, 0, 0,
0x80000000ULL, 0, 0, 0,
0, 0, 0, 0,
0, 1, 0, 640,
};
__device__ static
void pentablake_compress(uint64_t *h, const uint64_t *block, const uint32_t T0)
{
uint64_t v[16], m[16];
m[0] = block[0];
m[1] = block[1];
m[2] = block[2];
m[3] = block[3];
for (uint32_t i = 4; i < 16; i++) {
m[i] = (T0 == 0x200) ? block[i] : c_Padding[i];
}
//#pragma unroll 8
for(uint32_t i = 0; i < 8; i++)
v[i] = h[i];
v[ 8] = c_u512[0];
v[ 9] = c_u512[1];
v[10] = c_u512[2];
v[11] = c_u512[3];
v[12] = xor1(c_u512[4], T0);
v[13] = xor1(c_u512[5], T0);
v[14] = c_u512[6];
v[15] = c_u512[7];
for (uint32_t i = 0; i < 16; i++) {
/* column step */
G(0, 4, 0x8, 0xC, 0x0);
G(1, 5, 0x9, 0xD, 0x2);
G(2, 6, 0xA, 0xE, 0x4);
G(3, 7, 0xB, 0xF, 0x6);
/* diagonal step */
G(0, 5, 0xA, 0xF, 0x8);
G(1, 6, 0xB, 0xC, 0xA);
G(2, 7, 0x8, 0xD, 0xC);
G(3, 4, 0x9, 0xE, 0xE);
}
//#pragma unroll 16
for (uint32_t i = 0; i < 16; i++) {
uint32_t j = i % 8;
h[j] ^= v[i];
}
}
__global__
void pentablake_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint32_t nounce = startNounce + thread;
uint64_t h[8];
#pragma unroll
for(int i=0; i<8; i++) {
h[i] = c_IV512[i];
}
uint64_t ending[4];
ending[0] = c_data[16];
ending[1] = c_data[17];
ending[2] = c_data[18];
ending[3] = nounce; /* our tested value */
pentablake_compress(h, ending, 640);
// -----------------------------------
for (int r = 0; r < 4; r++) {
uint64_t data[8];
for (int i = 0; i < 7; i++) {
data[i] = h[i];
}
pentablake_compress(h, data, 512); /* todo: use h,h when ok*/
}
}
}
#endif
__device__ static
void pentablake_compress(uint64_t *h, const uint64_t *block, const uint64_t T0)
{
uint64_t v[16], m[16], i;
#pragma unroll 16
for(i = 0; i < 16; i++) {
m[i] = cuda_swab64(block[i]);
}
#pragma unroll 8
for (i = 0; i < 8; i++)
v[i] = h[i];
v[ 8] = c_u512[0];
v[ 9] = c_u512[1];
v[10] = c_u512[2];
v[11] = c_u512[3];
v[12] = c_u512[4] ^ T0;
v[13] = c_u512[5] ^ T0;
v[14] = c_u512[6];
v[15] = c_u512[7];
//#pragma unroll 16
for( i = 0; i < 16; i++)
{
/* column step */
G(0, 4, 0x8, 0xC, 0x0);
G(1, 5, 0x9, 0xD, 0x2);
G(2, 6, 0xA, 0xE, 0x4);
G(3, 7, 0xB, 0xF, 0x6);
/* diagonal step */
G(0, 5, 0xA, 0xF, 0x8);
G(1, 6, 0xB, 0xC, 0xA);
G(2, 7, 0x8, 0xD, 0xC);
G(3, 4, 0x9, 0xE, 0xE);
}
//#pragma unroll 16
for (i = 0; i < 16; i++) {
uint32_t idx = i % 8;
h[idx] ^= v[i];
}
}
__global__
void pentablake_gpu_hash_80(int threads, const uint32_t startNounce, void *outputHash)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint64_t h[8];
uint64_t buf[16];
uint32_t nounce = startNounce + thread;
//#pragma unroll 8
for(int i=0; i<8; i++)
h[i] = c_IV512[i];
//#pragma unroll 16
for (int i=0; i < 16; i++)
buf[i] = c_data[i];
// The test Nonce
((uint32_t*)buf)[19] = cuda_swab32(nounce);
pentablake_compress(h, buf, 640ULL);
#if __CUDA_ARCH__ < 300
uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
#pragma unroll 8
for (uint32_t i=0; i < 8; i++) {
outHash[2*i] = cuda_swab32( _HIWORD(h[i]) );
outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
}
#else
uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
for (uint32_t i=0; i < 8; i++) {
outHash[i] = cuda_swab64( h[i] );
}
#endif
}
}
__host__
void pentablake_cpu_hash_80(int thr_id, int threads, const uint32_t startNounce, uint32_t *d_outputHash, int order)
{
const int threadsperblock = TPB;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
size_t shared_size = 0;
pentablake_gpu_hash_80 <<<grid, block, shared_size>>> (threads, startNounce, d_outputHash);
//MyStreamSynchronize(NULL, order, thr_id);
cudaDeviceSynchronize();
}
__global__
void pentablake_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint64_t *inpHash = &g_hash[thread<<3]; // hashPosition * 8
uint64_t buf[16]; // 128 Bytes
uint64_t h[8]; // State
#pragma unroll 8
for (int i=0; i<8; i++)
h[i] = c_IV512[i];
// Message for first round
#pragma unroll 8
for (int i=0; i < 8; ++i)
buf[i] = inpHash[i];
#pragma unroll 8
for (int i=0; i < 8; i++)
buf[i+8] = d_constHashPadding[i];
// Ending round
pentablake_compress(h, buf, 512);
#if __CUDA_ARCH__ < 300
uint32_t *outHash = (uint32_t*)&g_hash[thread<<3];
#pragma unroll 8
for (int i=0; i < 8; i++) {
outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
}
#else
uint64_t *outHash = &g_hash[thread<<3];
for (int i=0; i < 8; i++) {
outHash[i] = cuda_swab64(h[i]);
}
#endif
}
}
__host__
void pentablake_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
{
const int threadsperblock = TPB;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
size_t shared_size = 0;
pentablake_gpu_hash_64 <<<grid, block, shared_size>>> (threads, startNounce, (uint64_t*)d_outputHash);
//MyStreamSynchronize(NULL, order, thr_id);
cudaDeviceSynchronize();
}
#if 0
__host__
uint32_t pentablake_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce)
{
const int threadsperblock = TPB;
uint32_t result = MAXU;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
size_t shared_size = 0;
/* Check error on Ctrl+C or kill to prevent segfaults on exit */
if (cudaMemset(d_resNounce[thr_id], 0xff, 2*sizeof(uint32_t)) != cudaSuccess)
return result;
pentablake_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_resNounce[thr_id]);
cudaDeviceSynchronize();
if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
cudaThreadSynchronize();
result = h_resNounce[thr_id][0];
extra_results[0] = h_resNounce[thr_id][1];
}
return result;
}
#endif
__global__
void pentablake_gpu_check_hash(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *resNounce)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t nounce = startNounce + thread;
uint32_t *inpHash = &g_hash[thread<<4];
uint32_t h[8];
#pragma unroll 8
for (int i=0; i < 8; i++)
h[i] = inpHash[i];
for (int i = 7; i >= 0; i--) {
uint32_t hash = h[i]; // cuda_swab32(h[i]);
if (hash > c_Target[i]) {
return;
}
if (hash < c_Target[i]) {
break;
}
}
/* keep the smallest nounce, + extra one if found */
if (resNounce[0] > nounce) {
resNounce[1] = resNounce[0];
resNounce[0] = nounce;
}
else
resNounce[1] = nounce;
}
}
__host__ static
uint32_t pentablake_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, int order)
{
const int threadsperblock = TPB;
uint32_t result = MAXU;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
size_t shared_size = 0;
/* Check error on Ctrl+C or kill to prevent segfaults on exit */
if (cudaMemset(d_resNounce[thr_id], 0xff, 2*sizeof(uint32_t)) != cudaSuccess)
return result;
pentablake_gpu_check_hash <<<grid, block, shared_size>>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]);
CUDA_SAFE_CALL(cudaDeviceSynchronize());
if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
cudaThreadSynchronize();
result = h_resNounce[thr_id][0];
extra_results[0] = h_resNounce[thr_id][1];
}
return result;
}
__host__
void pentablake_cpu_setBlock_80(uint32_t *pdata, const uint32_t *ptarget)
{
uint8_t data[128];
memcpy((void*) data, (void*) pdata, 80);
memset(data+80, 0, 48);
// to swab...
data[80] = 0x80;
data[111] = 1;
data[126] = 0x02;
data[127] = 0x80;
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_sigma, host_sigma, sizeof(host_sigma), 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Target, ptarget, 32, 0, cudaMemcpyHostToDevice));
}
extern "C" int scanhash_pentablake(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
uint32_t max_nonce, unsigned long *hashes_done)
{
const uint32_t first_nonce = pdata[19];
static bool init[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
uint32_t throughput = min(128 * 2560, max_nonce - first_nonce);
uint32_t endiandata[20];
int rc = 0;
if (extra_results[0] != MAXU) {
// possible extra result found in previous call
if (first_nonce <= extra_results[0] && max_nonce >= extra_results[0]) {
pdata[19] = extra_results[0];
*hashes_done = pdata[19] - first_nonce + 1;
extra_results[0] = MAXU;
rc = 1;
goto exit_scan;
}
}
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x000F;
if (!init[thr_id]) {
if (opt_n_threads > 1) {
CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
}
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 64 * throughput));
CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], 2*sizeof(uint32_t)));
CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], 2*sizeof(uint32_t)));
init[thr_id] = true;
}
for (int k=0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
pentablake_cpu_setBlock_80(endiandata, ptarget);
do {
int order = 0;
// GPU HASH
pentablake_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
uint32_t foundNonce = pentablake_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
if (foundNonce != MAXU)
{
uint32_t vhashcpu[8];
uint32_t Htarg = ptarget[7];
be32enc(&endiandata[19], foundNonce);
pentablakehash(vhashcpu, endiandata);
if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget))
{
pdata[19] = foundNonce;
rc = 1;
// Rare but possible if the throughput is big
be32enc(&endiandata[19], extra_results[0]);
pentablakehash(vhashcpu, endiandata);
if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) {
applog(LOG_NOTICE, "GPU found more than one result yippee!");
rc = 2;
} else {
extra_results[0] = MAXU;
}
goto exit_scan;
}
else if (vhashcpu[7] > Htarg) {
applog(LOG_WARNING, "GPU #%d: result for nounce %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[7], Htarg);
}
else if (vhashcpu[6] > ptarget[6]) {
applog(LOG_WARNING, "GPU #%d: hash[6] for nounce %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[6], ptarget[6]);
}
else {
applog(LOG_WARNING, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce);
}
}
pdata[19] += throughput;
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
exit_scan:
*hashes_done = pdata[19] - first_nonce + 1;
#if 0
/* reset the device to allow multiple instances
* could be made in cpu-miner... check later if required */
if (opt_n_threads == 1) {
CUDA_SAFE_CALL(cudaDeviceReset());
init[thr_id] = false;
}
#endif
cudaDeviceSynchronize();
return rc;
}

156
quark/cuda_quark_blake512.cu

@ -50,59 +50,60 @@ const uint64_t c_u512[16] = @@ -50,59 +50,60 @@ const uint64_t c_u512[16] =
0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
};
#define G(a,b,c,d,e) \
v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\
v[d] = ROTR( v[d] ^ v[a],32); \
v[c] += v[d]; \
v[b] = ROTR( v[b] ^ v[c],25); \
v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b]; \
v[d] = ROTR( v[d] ^ v[a],16); \
v[c] += v[d]; \
v[b] = ROTR( v[b] ^ v[c],11);
#define G(a,b,c,d,x) { \
uint32_t idx1 = sigma[i][x]; \
uint32_t idx2 = sigma[i][x+1]; \
v[a] += (m[idx1] ^ u512[idx2]) + v[b]; \
v[d] = ROTR( v[d] ^ v[a], 32); \
v[c] += v[d]; \
v[b] = ROTR( v[b] ^ v[c], 25); \
v[a] += (m[idx2] ^ u512[idx1]) + v[b]; \
v[d] = ROTR( v[d] ^ v[a], 16); \
v[c] += v[d]; \
v[b] = ROTR( v[b] ^ v[c], 11); \
}
__device__ static
void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits )
void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int T0)
{
uint64_t v[16], m[16], i;
#pragma unroll 16
for( i = 0; i < 16; ++i ) {
m[i] = cuda_swab64(block[i]);
}
#pragma unroll 8
for( i = 0; i < 8; ++i ) v[i] = h[i];
v[ 8] = u512[0];
v[ 9] = u512[1];
v[10] = u512[2];
v[11] = u512[3];
v[12] = u512[4];
v[13] = u512[5];
v[14] = u512[6];
v[15] = u512[7];
v[12] ^= bits;
v[13] ^= bits;
//#pragma unroll 16
for( i = 0; i < 16; ++i )
{
/* column step */
G( 0, 4, 8, 12, 0 );
G( 1, 5, 9, 13, 2 );
G( 2, 6, 10, 14, 4 );
G( 3, 7, 11, 15, 6 );
/* diagonal step */
G( 0, 5, 10, 15, 8 );
G( 1, 6, 11, 12, 10 );
G( 2, 7, 8, 13, 12 );
G( 3, 4, 9, 14, 14 );
}
#pragma unroll 16
for( i = 0; i < 16; ++i ) h[i % 8] ^= v[i];
#pragma unroll 16
for( i = 0; i < 16; i++) {
m[i] = cuda_swab64(block[i]);
}
#pragma unroll 8
for (i = 0; i < 8; i++)
v[i] = h[i];
v[ 8] = u512[0];
v[ 9] = u512[1];
v[10] = u512[2];
v[11] = u512[3];
v[12] = u512[4] ^ T0;
v[13] = u512[5] ^ T0;
v[14] = u512[6];
v[15] = u512[7];
//#pragma unroll 16
for( i = 0; i < 16; ++i )
{
/* column step */
G( 0, 4, 8, 12, 0 );
G( 1, 5, 9, 13, 2 );
G( 2, 6, 10, 14, 4 );
G( 3, 7, 11, 15, 6 );
/* diagonal step */
G( 0, 5, 10, 15, 8 );
G( 1, 6, 11, 12, 10 );
G( 2, 7, 8, 13, 12 );
G( 3, 4, 9, 14, 14 );
}
#pragma unroll 16
for( i = 0; i < 16; ++i )
h[i % 8] ^= v[i];
}
__device__ __constant__
@ -114,7 +115,8 @@ static const uint64_t d_constMem[8] = { @@ -114,7 +115,8 @@ static const uint64_t d_constMem[8] = {
0x510e527fade682d1ULL,
0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL,
0x5be0cd19137e2179ULL };
0x5be0cd19137e2179ULL
};
// Hash-Padding
__device__ __constant__
@ -126,7 +128,8 @@ static const uint64_t d_constHashPadding[8] = { @@ -126,7 +128,8 @@ static const uint64_t d_constHashPadding[8] = {
0,
0x0100000000000000ull,
0,
0x0002000000000000ull };
0x0002000000000000ull
};
__global__ __launch_bounds__(256, 4)
void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash)
@ -145,48 +148,42 @@ void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_n @@ -145,48 +148,42 @@ void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_n
if (thread < threads)
#endif
{
uint8_t i;
// bestimme den aktuellen Zähler
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce;
uint64_t *inpHash = &g_hash[hashPosition<<3]; // hashPosition * 8
// 128 Byte für die Message
// 128 Bytes
uint64_t buf[16];
// State vorbereiten
// State
uint64_t h[8];
#pragma unroll 8
for (i=0;i<8;i++)
for (int i=0;i<8;i++)
h[i] = d_constMem[i];
// Message für die erste Runde in Register holen
// Message for first round
#pragma unroll 8
for (i=0; i < 8; ++i)
for (int i=0; i < 8; ++i)
buf[i] = inpHash[i];
#pragma unroll 8
for (i=0; i < 8; i++)
for (int i=0; i < 8; i++)
buf[i+8] = d_constHashPadding[i];
// die einzige Hashing-Runde
// Ending round
quark_blake512_compress( h, buf, c_sigma, c_u512, 512 );
#if __CUDA_ARCH__ >= 130
// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind
#if __CUDA_ARCH__ <= 350
uint32_t *outHash = (uint32_t*)&g_hash[8 * hashPosition];
#pragma unroll 8
for (i=0; i < 8; ++i) {
for (int i=0; i < 8; i++) {
outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
}
#else
// in dieser Version passieren auch ein paar 64 Bit Shifts
uint64_t *outHash = &g_hash[8 * hashPosition];
#pragma unroll 8
for (i=0; i < 8; ++i)
{
for (int i=0; i < 8; i++) {
outHash[i] = cuda_swab64(h[i]);
}
#endif
@ -198,45 +195,38 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo @@ -198,45 +195,38 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
// State vorbereiten
uint64_t h[8];
// 128 Byte für die Message
uint64_t buf[16];
uint8_t i;
// bestimme den aktuellen Zähler
uint32_t nounce = startNounce + thread;
#pragma unroll 8
for(i=0;i<8;i++)
for(int i=0; i<8; i++)
h[i] = d_constMem[i];
// Message für die erste Runde in Register holen
#pragma unroll 16
for (i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i];
for (int i=0; i < 16; ++i)
buf[i] = c_PaddedMessage80[i];
// die Nounce durch die thread-spezifische ersetzen
buf[9] = REPLACE_HIWORD(buf[9], cuda_swab32(nounce));
// The test Nonce
((uint32_t*)buf)[19] = cuda_swab32(nounce);
// die einzige Hashing-Runde
quark_blake512_compress( h, buf, c_sigma, c_u512, 640 );
// Hash rauslassen
#if __CUDA_ARCH__ >= 130
// ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind
#if __CUDA_ARCH__ <= 350
uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
#pragma unroll 8
for (i=0; i < 8; ++i) {
outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) );
for (uint32_t i=0; i < 8; i++) {
outHash[2*i] = cuda_swab32( _HIWORD(h[i]) );
outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) );
}
#else
// in dieser Version passieren auch ein paar 64 Bit Shifts
uint64_t *outHash = (uint64_t *)outputHash + 8 * thread;
#pragma unroll 8
for (i=0; i < 8; ++i) {
for (uint32_t i=0; i < 8; i++) {
outHash[i] = cuda_swab64( h[i] );
}
#endif
}
}

31
quark/quarkcoin.cu

@ -6,12 +6,12 @@ extern "C" @@ -6,12 +6,12 @@ extern "C"
#include "sph/sph_skein.h"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
}
#include "miner.h"
#include "cuda_helper.h"
}
// aus cpu-miner.c
extern int device_map[8];
// Speicher für Input/Output der verketteten Hashfunktionen
@ -70,76 +70,64 @@ extern "C" void quarkhash(void *state, const void *input) @@ -70,76 +70,64 @@ extern "C" void quarkhash(void *state, const void *input)
unsigned char hash[64];
sph_blake512_init(&ctx_blake);
// ZBLAKE;
sph_blake512 (&ctx_blake, input, 80);
sph_blake512_close(&ctx_blake, (void*) hash);
sph_bmw512_init(&ctx_bmw);
// ZBMW;
sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
sph_bmw512_close(&ctx_bmw, (void*) hash);
if (hash[0] & 0x8)
{
sph_groestl512_init(&ctx_groestl);
// ZGROESTL;
sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
sph_groestl512_close(&ctx_groestl, (void*) hash);
}
else
{
sph_skein512_init(&ctx_skein);
// ZSKEIN;
sph_skein512 (&ctx_skein, (const void*) hash, 64);
sph_skein512_close(&ctx_skein, (void*) hash);
}
sph_groestl512_init(&ctx_groestl);
// ZGROESTL;
sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
sph_groestl512_close(&ctx_groestl, (void*) hash);
sph_jh512_init(&ctx_jh);
// ZJH;
sph_jh512 (&ctx_jh, (const void*) hash, 64);
sph_jh512_close(&ctx_jh, (void*) hash);
if (hash[0] & 0x8)
{
sph_blake512_init(&ctx_blake);
// ZBLAKE;
sph_blake512 (&ctx_blake, (const void*) hash, 64);
sph_blake512_close(&ctx_blake, (void*) hash);
}
else
{
sph_bmw512_init(&ctx_bmw);
// ZBMW;
sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
sph_bmw512_close(&ctx_bmw, (void*) hash);
}
sph_keccak512_init(&ctx_keccak);
// ZKECCAK;
sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
sph_keccak512_close(&ctx_keccak, (void*) hash);
sph_skein512_init(&ctx_skein);
// SKEIN;
sph_skein512 (&ctx_skein, (const void*) hash, 64);
sph_skein512_close(&ctx_skein, (void*) hash);
if (hash[0] & 0x8)
{
sph_keccak512_init(&ctx_keccak);
// ZKECCAK;
sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
sph_keccak512_close(&ctx_keccak, (void*) hash);
}
else
{
sph_jh512_init(&ctx_jh);
// ZJH;
sph_jh512 (&ctx_jh, (const void*) hash, 64);
sph_jh512_close(&ctx_jh, (void*) hash);
}
@ -147,23 +135,17 @@ extern "C" void quarkhash(void *state, const void *input) @@ -147,23 +135,17 @@ extern "C" void quarkhash(void *state, const void *input)
memcpy(state, hash, 32);
}
extern bool opt_benchmark;
extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done)
{
const uint32_t first_nonce = pdata[19];
const int throughput = 256*4096; // 100;
static bool init[8] = {0,0,0,0,0,0,0,0};
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;
const uint32_t Htarg = ptarget[7];
((uint32_t*)ptarget)[7] = 0x00FF;
const int throughput = 256*4096; // 100;
static bool init[8] = {0,0,0,0,0,0,0,0};
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
@ -252,11 +234,12 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata, @@ -252,11 +234,12 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
{
const uint32_t Htarg = ptarget[7];
uint32_t vhash64[8];
be32enc(&endiandata[19], foundNonce);
quarkhash(vhash64, endiandata);
if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {
if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
pdata[19] = foundNonce;
*hashes_done = (foundNonce - first_nonce + 1)/2;

29
util.c

@ -1020,7 +1020,7 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) @@ -1020,7 +1020,7 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
int merkle_count, i;
json_t *merkle_arr;
unsigned char **merkle;
int ntime;
int ntime, hoffset;
job_id = json_string_value(json_array_get(params, 0));
prevhash = json_string_value(json_array_get(params, 1));
@ -1078,7 +1078,8 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) @@ -1078,7 +1078,8 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
hex2bin(sctx->job.coinbase, coinb1, coinb1_size);
memcpy(sctx->job.coinbase + coinb1_size, sctx->xnonce1, sctx->xnonce1_size);
sctx->bloc_height = le16dec((uint8_t*) sctx->job.coinbase + 43);
hoffset = coinb1_size - 15; // 43;
sctx->bloc_height = le16dec((uint8_t*) sctx->job.coinbase + hoffset);
if (!sctx->job.job_id || strcmp(sctx->job.job_id, job_id))
memset(sctx->job.xnonce2, 0, sctx->xnonce2_size);
hex2bin(sctx->job.xnonce2 + sctx->xnonce2_size, coinb2, coinb2_size);
@ -1125,7 +1126,7 @@ static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params) @@ -1125,7 +1126,7 @@ static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params)
sctx->next_diff = diff;
pthread_mutex_unlock(&sctx->work_lock);
applog(LOG_INFO, "Stratum difficulty set to %g", diff);
applog(LOG_WARNING, "Stratum difficulty set to %g", diff);
return true;
}
@ -1221,10 +1222,6 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s) @@ -1221,10 +1222,6 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s)
id = json_object_get(val, "id");
params = json_object_get(val, "params");
if (opt_debug_rpc) {
applog(LOG_DEBUG, "method: %s", s);
}
if (!strcasecmp(method, "mining.notify")) {
ret = stratum_notify(sctx, params);
goto out;
@ -1400,6 +1397,20 @@ static char* format_hash(char* buf, unsigned char *hash) @@ -1400,6 +1397,20 @@ static char* format_hash(char* buf, unsigned char *hash)
return buf;
}
/* to debug diff in data */
extern void applog_compare_hash(unsigned char *hash, unsigned char *hash2)
{
char s[256] = "";
int len = 0;
for (int i=0; i < 32; i += 4) {
char *color = memcmp(hash+i, hash2+i, 4) ? CL_RED : CL_GRY;
len += sprintf(s+len, "%s%02x%02x%02x%02x " CL_GRY, color,
hash[i], hash[i+1], hash[i+2], hash[i+3]);
s[len] = '\0';
}
applog(LOG_DEBUG, "%s", s);
}
extern void applog_hash(unsigned char *hash)
{
char s[128] = {'\0'};
@ -1457,6 +1468,10 @@ void print_hash_tests(void) @@ -1457,6 +1468,10 @@ void print_hash_tests(void)
nist5hash(&hash[0], &buf[0]);
printpfx("nist5", hash);
memset(hash, 0, sizeof hash);
pentablakehash(&hash[0], &buf[0]);
printpfx("pentablake", hash);
memset(hash, 0, sizeof hash);
quarkhash(&hash[0], &buf[0]);
printpfx("quark", hash);

18
x11/x11.cu

@ -21,10 +21,9 @@ extern "C" @@ -21,10 +21,9 @@ extern "C"
#include <memory.h>
}
// aus cpu-miner.c
// in cpu-miner.c
extern int device_map[8];
// Speicher für Input/Output der verketteten Hashfunktionen
static uint32_t *d_hash[8];
extern void quark_blake512_cpu_init(int thr_id, int threads);
@ -140,22 +139,17 @@ extern "C" void x11hash(void *output, const void *input) @@ -140,22 +139,17 @@ extern "C" void x11hash(void *output, const void *input)
}
extern bool opt_benchmark;
extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done)
{
const uint32_t first_nonce = pdata[19];
const int throughput = 256*256*8;
static bool init[8] = {0,0,0,0,0,0,0,0};
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;
const uint32_t Htarg = ptarget[7];
const int throughput = 256*256*8;
static bool init[8] = {0,0,0,0,0,0,0,0};
if (!init[thr_id])
{
CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
@ -186,8 +180,10 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, @@ -186,8 +180,10 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
cuda_check_cpu_setTarget(ptarget);
do {
uint32_t foundNonce;
const uint32_t Htarg = ptarget[7];
int order = 0;
uint32_t foundNonce;
// Hash with CUDA
quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
@ -204,7 +200,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, @@ -204,7 +200,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
// Scan nach Gewinner Hashes auf der GPU
foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (foundNonce != 0xffffffff)
if (foundNonce != 0xffffffff)
{
uint32_t vhash64[8];
be32enc(&endiandata[19], foundNonce);

4
x15/x14.cu

@ -20,11 +20,11 @@ extern "C" { @@ -20,11 +20,11 @@ extern "C" {
#include "sph/sph_hamsi.h"
#include "sph/sph_fugue.h"
#include "sph/sph_shabal.h"
}
#include "miner.h"
#include "cuda_helper.h"
}
// from cpu-miner.c
extern int device_map[8];
@ -167,8 +167,6 @@ extern "C" void x14hash(void *output, const void *input) @@ -167,8 +167,6 @@ extern "C" void x14hash(void *output, const void *input)
}
extern bool opt_benchmark;
extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done)

31
x15/x15.cu

@ -21,14 +21,11 @@ extern "C" { @@ -21,14 +21,11 @@ extern "C" {
#include "sph/sph_fugue.h"
#include "sph/sph_shabal.h"
#include "sph/sph_whirlpool.h"
}
#include "miner.h"
#include "cuda_helper.h"
}
// to test gpu hash on a null buffer
#define NULLTEST 0
// from cpu-miner.c
extern int device_map[8];
@ -92,8 +89,6 @@ extern void quark_compactTest_cpu_init(int thr_id, int threads); @@ -92,8 +89,6 @@ extern void quark_compactTest_cpu_init(int thr_id, int threads);
extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order);
extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
// X15 CPU Hash function
extern "C" void x15hash(void *output, const void *input)
{
@ -181,17 +176,6 @@ extern "C" void x15hash(void *output, const void *input) @@ -181,17 +176,6 @@ extern "C" void x15hash(void *output, const void *input)
memcpy(output, hash, 32);
}
#if NULLTEST
static void print_hash(unsigned char *hash)
{
for (int i=0; i < 32; i += 4) {
printf("%02x%02x%02x%02x ", hash[i], hash[i+1], hash[i+2], hash[i+3]);
}
}
#endif
extern bool opt_benchmark;
extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done)
@ -203,12 +187,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata, @@ -203,12 +187,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
uint32_t Htarg = ptarget[7];
if (opt_benchmark)
((uint32_t*)ptarget)[7] = Htarg = 0x0000ff;
#if NULLTEST
for (int k=0; k < 20; k++)
pdata[k] = 0;
#endif
((uint32_t*)ptarget)[7] = Htarg = 0x00FF;
if (!init[thr_id])
{
@ -259,12 +238,6 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata, @@ -259,12 +238,6 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
#if NULLTEST
uint32_t buf[8]; memset(buf, 0, sizeof buf);
CUDA_SAFE_CALL(cudaMemcpy(buf, d_hash[thr_id], sizeof buf, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaThreadSynchronize());
print_hash((unsigned char*)buf); printf("\n");
#endif
/* Scan with GPU */
uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

18
x17/x17.cu

@ -26,17 +26,15 @@ extern "C" @@ -26,17 +26,15 @@ extern "C"
#include "sph/sph_sha2.h"
#include "sph/sph_haval.h"
}
#include "miner.h"
}
#include "cuda_helper.h"
static uint32_t *d_hash[8];
// cpu-miner.c
// in cpu-miner.c
extern int device_map[8];
extern bool opt_benchmark;
extern void quark_blake512_cpu_init(int thr_id, int threads);
extern void quark_blake512_cpu_setBlock_80(void *pdata);
@ -204,20 +202,12 @@ extern "C" int scanhash_x17(int thr_id, uint32_t *pdata, @@ -204,20 +202,12 @@ extern "C" int scanhash_x17(int thr_id, uint32_t *pdata,
unsigned long *hashes_done)
{
const uint32_t first_nonce = pdata[19];
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;
const int throughput = 256*256*8;
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;
static bool init[8] = {0,0,0,0,0,0,0,0};
uint32_t Htarg = ptarget[7];
if (opt_benchmark)
((uint32_t*)ptarget)[7] = Htarg = 0x0000ff;
((uint32_t*)ptarget)[7] = Htarg = 0x00FF;
if (!init[thr_id])
{

Loading…
Cancel
Save