Browse Source

wildkeccak, basic stratum port of rpc 2.0

scratchpad delete fix and redownload, reduce rejects

(work in progress)
master
Tanguy Pruvot 8 years ago
parent
commit
2bbccc5ff4
  1. 1
      Makefile.am
  2. 12
      README.txt
  3. 2
      algos.h
  4. 1
      bench.cpp
  5. 107
      ccminer.cpp
  6. 13
      ccminer.vcxproj
  7. 24
      ccminer.vcxproj.filters
  8. 2
      compat/ccminer-config.h
  9. 2
      configure.ac
  10. 2
      configure.sh
  11. 573
      crypto/int128_c.h
  12. 180
      crypto/mman.c
  13. 63
      crypto/mman.h
  14. 371
      crypto/wildkeccak-cpu.cpp
  15. 369
      crypto/wildkeccak.cu
  16. 10
      crypto/wildkeccak.h
  17. 1257
      crypto/xmr-rpc.cpp
  18. 41
      crypto/xmr-rpc.h
  19. 6
      miner.h
  20. 4
      pools.cpp
  21. 8
      res/ccminer.rc
  22. 11
      util.cpp

1
Makefile.am

@ -39,6 +39,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ @@ -39,6 +39,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
Algo256/blake256.cu Algo256/decred.cu Algo256/vanilla.cu Algo256/keccak256.cu \
Algo256/blake2s.cu sph/blake2s.c \
Algo256/bmw.cu Algo256/cuda_bmw.cu \
crypto/xmr-rpc.cpp crypto/wildkeccak-cpu.cpp crypto/wildkeccak.cu \
JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \
JHA/cuda_jha_compactionTest.cu cuda_checkhash.cu \
quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \

12
README.txt

@ -1,5 +1,5 @@ @@ -1,5 +1,5 @@
ccminer 1.8.4 (Dec. 2016) "Winter boosts: sib, veltor, blake2s"
ccminer 2.0 (December 2016) "Boolberry's WildKeccak RPC 2.0"
---------------------------------------------------------------
***************************************************************
@ -51,6 +51,7 @@ Woodcoin (Double Skein) @@ -51,6 +51,7 @@ Woodcoin (Double Skein)
Vanilla (Blake256 8-rounds - double sha256)
Vertcoin Lyra2RE
Ziftrcoin (ZR5)
Boolberry (Wild Keccak)
where some of these coins have a VERY NOTABLE nVidia advantage
over competing AMD (OpenCL Only) implementations.
@ -110,6 +111,7 @@ its command line interface and options. @@ -110,6 +111,7 @@ its command line interface and options.
vanilla use to mine Vanilla (Blake256)
veltor use to mine VeltorCoin
whirlpool use to mine Joincoin
wildkeccak use to mine Boolberry (Stratum only)
zr5 use to mine ZiftrCoin
-d, --devices gives a comma separated list of CUDA device IDs
@ -184,6 +186,12 @@ Scrypt specific options: @@ -184,6 +186,12 @@ Scrypt specific options:
--no-autotune disable auto-tuning of kernel launch parameters
Boolberry specific:
-l, --launch-config gives the launch configuration for each kernel
in a comma separated list, one per device.
-k, --scratchpad url URL used to download the scratchpad cache.
>>> Examples <<<
@ -206,6 +214,8 @@ Example for Fuguecoin pool mining on dwarfpool.com with all your GPUs @@ -206,6 +214,8 @@ Example for Fuguecoin pool mining on dwarfpool.com with all your GPUs
Example for Groestlcoin solo mining
ccminer -q -s 1 -a groestl -o http://127.0.0.1:1441/ -u USERNAME -p PASSWORD
Example for Boolberry
ccminer -a wildkeccak -o stratum+tcp://bbr.suprnova.cc:7777 -u tpruvot.donate -p x -k http://bbr.suprnova.cc/scratchpad.bin -l 64x360
Example for Scrypt-N (2048) on Nicehash
ccminer -a scrypt:10 -o stratum+tcp://stratum.nicehash.com:3335 -u 3EujYFcoBzWvpUEvbe3obEG95mBuU88QBD -p x

2
algos.h

@ -48,6 +48,7 @@ enum sha_algos { @@ -48,6 +48,7 @@ enum sha_algos {
ALGO_WHIRLCOIN,
ALGO_WHIRLPOOL,
ALGO_WHIRLPOOLX,
ALGO_WILDKECCAK,
ALGO_ZR5,
ALGO_AUTO,
ALGO_COUNT
@ -99,6 +100,7 @@ static const char *algo_names[] = { @@ -99,6 +100,7 @@ static const char *algo_names[] = {
"whirlcoin",
"whirlpool",
"whirlpoolx",
"wildkeccak",
"zr5",
"auto", /* reserved for multi algo */
""

1
bench.cpp

@ -75,6 +75,7 @@ void algo_free_all(int thr_id) @@ -75,6 +75,7 @@ void algo_free_all(int thr_id)
free_veltor(thr_id);
free_whirl(thr_id);
//free_whirlx(thr_id);
free_wildkeccak(thr_id);
free_x11evo(thr_id);
free_x11(thr_id);
free_x13(thr_id);

107
ccminer.cpp

@ -43,6 +43,8 @@ @@ -43,6 +43,8 @@
#include "miner.h"
#include "algos.h"
#include "sia/sia-rpc.h"
#include "crypto/xmr-rpc.h"
#include <cuda_runtime.h>
#ifdef WIN32
@ -95,7 +97,7 @@ bool allow_mininginfo = true; @@ -95,7 +97,7 @@ bool allow_mininginfo = true;
bool check_dups = false;
bool check_stratum_jobs = false;
static bool submit_old = false;
bool submit_old = false;
bool use_syslog = false;
bool use_colors = true;
int use_pok = 0;
@ -160,6 +162,8 @@ volatile bool pool_is_switching = false; @@ -160,6 +162,8 @@ volatile bool pool_is_switching = false;
volatile int pool_switch_count = 0;
bool conditional_pool_rotate = false;
extern char* opt_scratchpad_url;
// current connection
char *rpc_user = NULL;
char *rpc_pass;
@ -257,6 +261,7 @@ Options:\n\ @@ -257,6 +261,7 @@ Options:\n\
x14 X14\n\
x15 X15\n\
x17 X17\n\
wildkeccak Boolberry\n\
zr5 ZR5 (ZiftrCoin)\n\
-d, --devices Comma separated list of CUDA devices to use.\n\
Device IDs start counting from 0! Alternatively takes\n\
@ -335,7 +340,7 @@ static char const short_options[] = @@ -335,7 +340,7 @@ static char const short_options[] =
#ifdef HAVE_SYSLOG_H
"S"
#endif
"a:Bc:i:Dhp:Px:f:m:nqr:R:s:t:T:o:u:O:Vd:N:b:l:L:";
"a:Bc:k:i:Dhp:Px:f:m:nqr:R:s:t:T:o:u:O:Vd:N:b:l:L:";
struct option options[] = {
{ "algo", 1, NULL, 'a' },
@ -360,9 +365,10 @@ struct option options[] = { @@ -360,9 +365,10 @@ struct option options[] = {
{ "no-stratum", 0, NULL, 1007 },
{ "no-autotune", 0, NULL, 1004 }, // scrypt
{ "interactive", 1, NULL, 1050 }, // scrypt
{ "launch-config", 1, NULL, 'l' }, // scrypt
{ "lookup-gap", 1, NULL, 'L' }, // scrypt
{ "texture-cache", 1, NULL, 1051 },// scrypt
{ "launch-config", 1, NULL, 'l' }, // scrypt & bbr
{ "scratchpad", 1, NULL, 'k' }, // bbr
{ "max-temp", 1, NULL, 1060 },
{ "max-diff", 1, NULL, 1061 },
{ "max-rate", 1, NULL, 1062 },
@ -431,6 +437,13 @@ Scrypt specific options:\n\ @@ -431,6 +437,13 @@ Scrypt specific options:\n\
--no-autotune disable auto-tuning of kernel launch parameters\n\
";
static char const xmr_usage[] = "\n\
CryptoNote specific options:\n\
-l, --launch-config gives the launch configuration for each kernel\n\
in a comma separated list, one per device.\n\
-k, --scratchpad url Url used to download the scratchpad cache.\n\
";
struct work _ALIGN(64) g_work;
volatile time_t g_work_time;
pthread_mutex_t g_work_lock;
@ -544,8 +557,7 @@ void proper_exit(int reason) @@ -544,8 +557,7 @@ void proper_exit(int reason)
exit(reason);
}
static bool jobj_binary(const json_t *obj, const char *key,
void *buf, size_t buflen)
bool jobj_binary(const json_t *obj, const char *key, void *buf, size_t buflen)
{
const char *hexstr;
json_t *tmp;
@ -608,6 +620,8 @@ static bool work_decode(const json_t *val, struct work *work) @@ -608,6 +620,8 @@ static bool work_decode(const json_t *val, struct work *work)
data_size = 80;
adata_sz = data_size / 4;
break;
case ALGO_WILDKECCAK:
return rpc2_job_decode(val, work);
default:
data_size = 128;
adata_sz = data_size / 4;
@ -780,6 +794,18 @@ static bool submit_upstream_work(CURL *curl, struct work *work) @@ -780,6 +794,18 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
bool stale_work = false;
int idnonce = 0;
if (pool->type & POOL_STRATUM && stratum.rpc2) {
struct work submit_work;
memcpy(&submit_work, work, sizeof(struct work));
bool sent = hashlog_already_submittted(submit_work.job_id, submit_work.nonces[0]);
if (sent) {
return true;
}
bool ret = rpc2_stratum_submit(pool, &submit_work);
hashlog_remember_submit(&submit_work, submit_work.nonces[0]);
return ret;
}
/* discard if a newer block was received */
stale_work = work->height && work->height < g_work.height;
if (have_stratum && !stale_work && opt_algo != ALGO_ZR5 && opt_algo != ALGO_SCRYPT_JANE) {
@ -1405,6 +1431,9 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) @@ -1405,6 +1431,9 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
uchar merkle_root[64] = { 0 };
int i;
if (sctx->rpc2)
return rpc2_stratum_gen_work(sctx, work);
if (!sctx->job.job_id) {
// applog(LOG_WARNING, "stratum_gen_work: job not yet retrieved");
return false;
@ -1733,10 +1762,16 @@ static void *miner_thread(void *userdata) @@ -1733,10 +1762,16 @@ static void *miner_thread(void *userdata)
uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
if (opt_algo == ALGO_WILDKECCAK) {
nonceptr = (uint32_t*) (((char*)work.data) + 1);
wcmpoft = 2;
wcmplen = 32;
}
if (have_stratum) {
uint32_t sleeptime = 0;
if (opt_algo == ALGO_DECRED)
if (opt_algo == ALGO_DECRED || stratum.rpc2)
work_done = true; // force "regen" hash
while (!work_done && time(NULL) >= (g_work_time + opt_scantime)) {
usleep(100*1000);
@ -1748,7 +1783,7 @@ static void *miner_thread(void *userdata) @@ -1748,7 +1783,7 @@ static void *miner_thread(void *userdata)
}
if (sleeptime && opt_debug && !opt_quiet)
applog(LOG_DEBUG, "sleeptime: %u ms", sleeptime*100);
nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
//nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
pthread_mutex_lock(&g_work_lock);
extrajob |= work_done;
@ -1835,6 +1870,8 @@ static void *miner_thread(void *userdata) @@ -1835,6 +1870,8 @@ static void *miner_thread(void *userdata)
nonceptr[1] += 1;
nonceptr[2] |= thr_id;
} else if (opt_algo == ALGO_WILDKECCAK) {
//nonceptr[1] += 1;
} else if (opt_algo == ALGO_SIA) {
// suprnova job_id check without data/target/height change...
if (have_stratum && strcmp(work.job_id, g_work.job_id)) {
@ -1879,6 +1916,11 @@ static void *miner_thread(void *userdata) @@ -1879,6 +1916,11 @@ static void *miner_thread(void *userdata)
gpulog(LOG_DEBUG, thr_id, "no data");
continue;
}
if (stratum.rpc2 && !scratchpad_size) {
sleep(1);
if (!thr_id) pools[cur_pooln].wait_time += 1;
continue;
}
/* conditional mining */
if (!wanna_mine(thr_id)) {
@ -2196,6 +2238,9 @@ static void *miner_thread(void *userdata) @@ -2196,6 +2238,9 @@ static void *miner_thread(void *userdata)
//case ALGO_WHIRLPOOLX:
// rc = scanhash_whirlx(thr_id, &work, max_nonce, &hashes_done);
// break;
case ALGO_WILDKECCAK:
rc = scanhash_wildkeccak(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_X11EVO:
rc = scanhash_x11evo(thr_id, &work, max_nonce, &hashes_done);
break;
@ -2242,6 +2287,7 @@ static void *miner_thread(void *userdata) @@ -2242,6 +2287,7 @@ static void *miner_thread(void *userdata)
case ALGO_LBRY:
case ALGO_SIA:
case ALGO_VELTOR:
case ALGO_WILDKECCAK:
// migrated algos
break;
case ALGO_ZR5:
@ -2254,6 +2300,13 @@ static void *miner_thread(void *userdata) @@ -2254,6 +2300,13 @@ static void *miner_thread(void *userdata)
work.nonces[1] = nonceptr[2];
}
if (stratum.rpc2 && rc == -EBUSY || work_restart[thr_id].restart) {
// bbr scratchpad download or stale result
sleep(1);
if (!thr_id) pools[cur_pooln].wait_time += 1;
continue;
}
if (rc > 0 && opt_debug)
applog(LOG_NOTICE, CL_CYN "found => %08x" CL_GRN " %08x", work.nonces[0], swab32(work.nonces[0]));
if (rc > 1 && opt_debug)
@ -2548,7 +2601,7 @@ static bool stratum_handle_response(char *buf) @@ -2548,7 +2601,7 @@ static bool stratum_handle_response(char *buf)
err_val = json_object_get(val, "error");
id_val = json_object_get(val, "id");
if (!id_val || json_is_null(id_val) || !res_val)
if (!id_val || json_is_null(id_val))
goto out;
// ignore late login answers
@ -2564,8 +2617,24 @@ static bool stratum_handle_response(char *buf) @@ -2564,8 +2617,24 @@ static bool stratum_handle_response(char *buf)
// store time required to the pool to answer to a submit
stratum.answer_msec = (1000 * diff.tv_sec) + (uint32_t) (0.001 * diff.tv_usec);
share_result(json_is_true(res_val), stratum.pooln, stratum.sharediff,
err_val ? json_string_value(json_array_get(err_val, 1)) : NULL);
if (stratum.rpc2) {
const char* reject_reason = err_val ? json_string_value(json_object_get(err_val, "message")) : NULL;
// {"id":4,"jsonrpc":"2.0","error":null,"result":{"status":"OK"}}
share_result(json_is_null(err_val), stratum.pooln, stratum.sharediff, reject_reason);
if (reject_reason) {
g_work_time = 0;
restart_threads();
}
} else {
if (!res_val)
goto out;
share_result(json_is_true(res_val), stratum.pooln, stratum.sharediff,
err_val ? json_string_value(json_array_get(err_val, 1)) : NULL);
}
ret = true;
out:
@ -2641,6 +2710,10 @@ wait_stratum_url: @@ -2641,6 +2710,10 @@ wait_stratum_url:
}
}
if (opt_algo == ALGO_WILDKECCAK) {
rpc2_stratum_thread_stuff(pool);
}
if (switchn != pool_switch_count) goto pool_switched;
if (stratum.job.job_id &&
@ -2733,6 +2806,9 @@ static void show_usage_and_exit(int status) @@ -2733,6 +2806,9 @@ static void show_usage_and_exit(int status)
if (opt_algo == ALGO_SCRYPT || opt_algo == ALGO_SCRYPT_JANE) {
printf(scrypt_usage);
}
if (opt_algo == ALGO_WILDKECCAK) {
printf(xmr_usage);
}
proper_exit(status);
}
@ -2814,6 +2890,9 @@ void parse_arg(int key, char *arg) @@ -2814,6 +2890,9 @@ void parse_arg(int key, char *arg)
}
break;
}
case 'k':
opt_scratchpad_url = strdup(arg);
break;
case 'i':
d = atof(arg);
v = (uint32_t) d;
@ -3011,7 +3090,7 @@ void parse_arg(int key, char *arg) @@ -3011,7 +3090,7 @@ void parse_arg(int key, char *arg)
case 1004:
opt_autotune = false;
break;
case 'l': /* scrypt --launch-config */
case 'l': /* --launch-config */
{
char *last = NULL, *pch = strtok(arg,",");
int n = 0;
@ -3582,6 +3661,12 @@ int main(int argc, char *argv[]) @@ -3582,6 +3661,12 @@ int main(int argc, char *argv[])
allow_mininginfo = false;
}
if (opt_algo == ALGO_WILDKECCAK) {
rpc2_init();
applog(LOG_INFO, "Using CryptoNote JSON-RPC 2.0");
GetScratchpad();
}
flags = !opt_benchmark && strncmp(rpc_url, "https:", 6)
? (CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL)
: CURL_GLOBAL_ALL;

13
ccminer.vcxproj

@ -39,7 +39,7 @@ @@ -39,7 +39,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings" Condition="'$(Platform)'=='Win32'">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 6.5.props" />
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.props" />
</ImportGroup>
<ImportGroup Label="ExtensionSettings" Condition="'$(Platform)'=='x64'">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 8.0.props" />
@ -230,6 +230,9 @@ @@ -230,6 +230,9 @@
<TreatWChar_tAsBuiltInType>false</TreatWChar_tAsBuiltInType>
<Optimization Condition="'$(Configuration)'=='Release'">Full</Optimization>
</ClCompile>
<ClCompile Include="crypto\mman.c" />
<ClCompile Include="crypto\wildkeccak-cpu.cpp" />
<ClCompile Include="crypto\xmr-rpc.cpp" />
<ClCompile Include="nvapi.cpp" />
<ClCompile Include="pools.cpp" />
<ClCompile Include="util.cpp" />
@ -248,6 +251,9 @@ @@ -248,6 +251,9 @@
<ClCompile Include="myriadgroestl.cpp" />
<ClCompile Include="lyra2\Lyra2.c" />
<ClCompile Include="lyra2\Sponge.c" />
<ClInclude Include="crypto\mman.h" />
<ClInclude Include="crypto\wildkeccak.h" />
<ClInclude Include="crypto\xmr-rpc.h" />
<ClInclude Include="lyra2\cuda_lyra2_sm2.cuh" />
<ClInclude Include="neoscrypt\neoscrypt.h" />
<ClCompile Include="neoscrypt\neoscrypt.cpp" />
@ -258,6 +264,9 @@ @@ -258,6 +264,9 @@
<CudaCompile Include="Algo256\cuda_bmw.cu">
<MaxRegCount>76</MaxRegCount>
</CudaCompile>
<CudaCompile Include="crypto\wildkeccak.cu">
<MaxRegCount Condition="'$(Configuration)|$(Platform)'=='Release|x64'">128</MaxRegCount>
</CudaCompile>
<CudaCompile Include="neoscrypt\cuda_neoscrypt.cu">
<MaxRegCount>160</MaxRegCount>
</CudaCompile>
@ -531,7 +540,7 @@ @@ -531,7 +540,7 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" Condition="'$(Platform)'=='Win32'">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 6.5.targets" />
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.targets" />
</ImportGroup>
<ImportGroup Label="ExtensionTargets" Condition="'$(Platform)'=='x64'">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 8.0.targets" />

24
ccminer.vcxproj.filters

@ -88,6 +88,9 @@ @@ -88,6 +88,9 @@
<Filter Include="Source Files\sia">
<UniqueIdentifier>{86a896c0-1688-4854-98e0-285d166069a3}</UniqueIdentifier>
</Filter>
<Filter Include="Source Files\crypto">
<UniqueIdentifier>{fea0fce3-c0fe-42f7-aa37-0cbba10b008a}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="compat\jansson\dump.c">
@ -279,6 +282,15 @@ @@ -279,6 +282,15 @@
<ClCompile Include="sia\sia-rpc.cpp">
<Filter>Source Files\sia</Filter>
</ClCompile>
<ClCompile Include="crypto\xmr-rpc.cpp">
<Filter>Source Files\crypto</Filter>
</ClCompile>
<ClCompile Include="crypto\mman.c">
<Filter>Source Files\crypto</Filter>
</ClCompile>
<ClCompile Include="crypto\wildkeccak-cpu.cpp">
<Filter>Source Files\crypto</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="algos.h">
@ -482,6 +494,15 @@ @@ -482,6 +494,15 @@
<ClInclude Include="sia\sia-rpc.h">
<Filter>Source Files\sia</Filter>
</ClInclude>
<ClInclude Include="crypto\mman.h">
<Filter>Source Files\crypto</Filter>
</ClInclude>
<ClInclude Include="crypto\wildkeccak.h">
<Filter>Source Files\crypto</Filter>
</ClInclude>
<ClInclude Include="crypto\xmr-rpc.h">
<Filter>Source Files\crypto</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<CudaCompile Include="cuda.cpp">
@ -760,6 +781,9 @@ @@ -760,6 +781,9 @@
<CudaCompile Include="sia\sia.cu">
<Filter>Source Files\sia</Filter>
</CudaCompile>
<CudaCompile Include="crypto\wildkeccak.cu">
<Filter>Source Files\crypto</Filter>
</CudaCompile>
</ItemGroup>
<ItemGroup>
<Image Include="res\ccminer.ico">

2
compat/ccminer-config.h

@ -164,7 +164,7 @@ @@ -164,7 +164,7 @@
#define PACKAGE_URL "http://github.com/tpruvot/ccminer"
/* Define to the version of this package. */
#define PACKAGE_VERSION "1.8.4"
#define PACKAGE_VERSION "2.0"
/* If using the C implementation of alloca, define if you know the
direction of stack growth for your system; otherwise it will be

2
configure.ac

@ -1,4 +1,4 @@ @@ -1,4 +1,4 @@
AC_INIT([ccminer], [1.8.4], [], [ccminer], [http://github.com/tpruvot/ccminer])
AC_INIT([ccminer], [2.0], [], [ccminer], [http://github.com/tpruvot/ccminer])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

2
configure.sh

@ -3,5 +3,5 @@ @@ -3,5 +3,5 @@
extracflags="-march=native -D_REENTRANT -falign-functions=16 -falign-jumps=16 -falign-labels=16"
CUDA_CFLAGS="-O3 -lineno -Xcompiler -Wall -D_FORCE_INLINES" \
./configure CXXFLAGS="-O3 $extracflags" --with-cuda=/usr/local/cuda --with-nvml=libnvidia-ml.so
./configure CXXFLAGS="-O3 $extracflags" --with-cuda=/usr/local/cuda-7.5 --with-nvml=libnvidia-ml.so

573
crypto/int128_c.h

@ -0,0 +1,573 @@ @@ -0,0 +1,573 @@
#ifndef UINT128_C_H
#define UINT128_C_H
/* Unsigned 128-bit integer kept as two 64-bit halves (Hi is the upper half). */
typedef struct __uint128 {
	uint64_t Hi;
	uint64_t Lo;
} uint128;
/* Adds one to *N in place, carrying from the low word into the high word. */
void Increment(uint128 * N)
{
	const uint64_t low = N->Lo;
	const uint64_t next = low + 1;

	/* bit 63 of (low ^ next) & low is set only when bit 63 went from 1 to 0,
	 * i.e. when the low word wrapped around */
	N->Hi += ((low ^ next) & low) >> 63;
	N->Lo = next;
}
/* Subtracts one from *N in place, borrowing from the high word when needed. */
void Decrement(uint128 * N)
{
	const uint64_t low = N->Lo;
	const uint64_t prev = low - 1;

	/* bit 63 of (prev ^ low) & prev is set only when bit 63 went from 0 to 1,
	 * i.e. when the low word wrapped below zero */
	N->Hi -= ((prev ^ low) & prev) >> 63;
	N->Lo = prev;
}
/* Stores N + M (modulo 2^128) into *Ans. */
void Add(uint128 * Ans, uint128 N, uint128 M)
{
	/* halved sum of the low words; its top bit is the carry out of N.Lo + M.Lo */
	const uint64_t both_odd = (N.Lo & M.Lo) & 1;
	const uint64_t half_sum = both_odd + (N.Lo >> 1) + (M.Lo >> 1);
	const uint64_t carry = half_sum >> 63;

	Ans->Hi = N.Hi + M.Hi + carry;
	Ans->Lo = N.Lo + M.Lo;
}
/* Stores N - M (modulo 2^128) into *Ans. */
void Subtract(uint128 * Ans, uint128 N, uint128 M)
{
	const uint64_t diff = N.Lo - M.Lo;
	/* a borrow occurred exactly when diff + M.Lo carries out of 64 bits */
	const uint64_t borrow = (((diff & M.Lo) & 1) + (M.Lo >> 1) + (diff >> 1)) >> 63;

	Ans->Lo = diff;
	Ans->Hi = N.Hi - (M.Hi + borrow);
}
/* Stores N + 1 (modulo 2^128) into *A. */
void inc128(uint128 N, uint128* A)
{
	const uint64_t next = N.Lo + 1;
	const uint64_t carry = ((N.Lo ^ next) & N.Lo) >> 63;

	A->Lo = next;
	A->Hi = N.Hi + carry;
}
/* Stores N - 1 (modulo 2^128) into *A. */
void dec128(uint128 N, uint128* A)
{
	const uint64_t prev = N.Lo - 1;
	const uint64_t borrow = ((prev ^ N.Lo) & prev) >> 63;

	A->Lo = prev;
	A->Hi = N.Hi - borrow;
}
/* Stores N + M (modulo 2^128) into *A. */
void add128(uint128 N, uint128 M, uint128* A)
{
	/* top bit of the halved low-word sum is the carry into the high word */
	const uint64_t both_odd = (N.Lo & M.Lo) & 1;
	const uint64_t carry = (both_odd + (N.Lo >> 1) + (M.Lo >> 1)) >> 63;

	A->Hi = N.Hi + M.Hi + carry;
	A->Lo = N.Lo + M.Lo;
}
/* Stores N - M (modulo 2^128) into *A. */
void sub128(uint128 N, uint128 M, uint128* A)
{
	const uint64_t diff = N.Lo - M.Lo;
	/* a borrow occurred exactly when diff + M.Lo carries out of 64 bits */
	const uint64_t borrow = (((diff & M.Lo) & 1) + (M.Lo >> 1) + (diff >> 1)) >> 63;

	A->Lo = diff;
	A->Hi = N.Hi - (M.Hi + borrow);
}
/*
 * Full 64x64 -> 128 bit unsigned multiply built from four 32x32 partial
 * products. The upper 64 bits of u*v go to *h, the lower 64 bits to *l.
 */
void mult64to128(uint64_t u, uint64_t v, uint64_t * h, uint64_t *l)
{
	const uint64_t mask32 = 0xffffffff;
	const uint64_t u_lo = u & mask32;
	const uint64_t u_hi = u >> 32;
	const uint64_t v_lo = v & mask32;
	const uint64_t v_hi = v >> 32;

	const uint64_t lo_lo = u_lo * v_lo;
	/* first cross product plus the carry out of the low partial product */
	const uint64_t mid_a = (u_hi * v_lo) + (lo_lo >> 32);
	/* second cross product plus the low half of the first one */
	const uint64_t mid_b = (u_lo * v_hi) + (mid_a & mask32);

	*h = (u_hi * v_hi) + (mid_a >> 32) + (mid_b >> 32);
	*l = (mid_b << 32) + (lo_lo & mask32);
}
/* Stores the low 128 bits of N * M into *Ans. */
void mult128(uint128 N, uint128 M, uint128 * Ans)
{
	/* cross terms only affect the upper word of the truncated product */
	const uint64_t cross = (N.Hi * M.Lo) + (N.Lo * M.Hi);

	mult64to128(N.Lo, M.Lo, &Ans->Hi, &Ans->Lo);
	Ans->Hi += cross;
}
/*
 * Full 128x128 -> 256 bit multiply. The upper 128 bits of N*M are stored in
 * *H and the lower 128 bits in *L.
 */
void mult128to256(uint128 N, uint128 M, uint128 * H, uint128 * L)
{
	/* operands of the two cross products: N.Hi*M.Lo, then N.Lo*M.Hi */
	const uint64_t cross_n[2] = { N.Hi, N.Lo };
	const uint64_t cross_m[2] = { M.Lo, M.Hi };
	int i;

	mult64to128(N.Hi, M.Hi, &H->Hi, &H->Lo);
	mult64to128(N.Lo, M.Lo, &L->Hi, &L->Lo);

	for (i = 0; i < 2; i++) {
		uint128 T;
		mult64to128(cross_n[i], cross_m[i], &T.Hi, &T.Lo);

		L->Hi += T.Lo;
		if (L->Hi < T.Lo) // if L->Hi overflowed
			Increment(H);

		H->Lo += T.Hi;
		if (H->Lo < T.Hi) // if H->Lo overflowed
			++H->Hi;
	}
}
/*
 * Squares a 64-bit value into a 128-bit result: upper 64 bits go to *h,
 * lower 64 bits to *l. The single cross product is reused for both middle terms.
 */
void sqr64to128(uint64_t r, uint64_t * h, uint64_t *l)
{
	const uint64_t mask32 = 0xffffffff;
	const uint64_t r_lo = r & mask32;
	const uint64_t r_hi = r >> 32;

	const uint64_t lo_sq = r_lo * r_lo;
	const uint64_t cross = r_hi * r_lo;
	const uint64_t mid_a = cross + (lo_sq >> 32);
	const uint64_t mid_b = cross + (mid_a & mask32);

	*h = (r_hi * r_hi) + (mid_a >> 32) + (mid_b >> 32);
	*l = (mid_b << 32) + (lo_sq & mask32);
}
/* Stores the low 128 bits of R * R into *Ans. */
void sqr128(uint128 R, uint128 * Ans)
{
	/* 2 * Hi * Lo, truncated to 64 bits, lands in the upper word */
	const uint64_t doubled_cross = (R.Hi * R.Lo) << 1;

	sqr64to128(R.Lo, &Ans->Hi, &Ans->Lo);
	Ans->Hi += doubled_cross;
}
/*
 * Full 128 -> 256 bit square. The upper 128 bits of R*R are stored in *H and
 * the lower 128 bits in *L.
 */
void sqr128to256(uint128 R, uint128 * H, uint128 * L)
{
	uint128 cross;

	sqr64to128(R.Hi, &H->Hi, &H->Lo);
	sqr64to128(R.Lo, &L->Hi, &L->Lo);
	mult64to128(R.Hi, R.Lo, &cross.Hi, &cross.Lo);

	/* double the cross product; the bit shifted out of its top goes into H->Hi */
	H->Hi += (cross.Hi >> 63);
	cross.Hi = (cross.Hi << 1) | (cross.Lo >> 63);
	cross.Lo <<= 1;

	L->Hi += cross.Lo;
	if (L->Hi < cross.Lo) // if L->Hi overflowed
		Increment(H);

	H->Lo += cross.Hi;
	if (H->Lo < cross.Hi) // if H->Lo overflowed
		++H->Hi;
}
/*
 * Stores N << (S mod 128) into *A without branching.
 *
 * Equivalent to:
 *   S == 0        : A = N
 *   1 <= S <= 63  : A.Hi = (N.Hi << S) | (N.Lo >> (64 - S)), A.Lo = N.Lo << S
 *   64 <= S <= 127: A.Hi = N.Lo << (S - 64),                  A.Lo = 0
 */
void shiftleft128(uint128 N, size_t S, uint128 * A)
{
	uint64_t M1, M2;
	size_t spill;

	S &= 127;
	M1 = ((((S + 127) | S) & 64) >> 6) - 1llu; /* all ones iff 1 <= S <= 63 */
	M2 = (S >> 6) - 1llu;                      /* all ones iff S < 64 */
	S &= 63;
	/* 64 - S would be 64 when S is 0, and a 64-bit shift by 64 is undefined;
	 * reduce it to 0 there (M1 discards that term anyway) */
	spill = (64 - S) & 63;

	A->Hi = (N.Lo << S) & (~M2);
	A->Lo = (N.Lo << S) & M2;
	A->Hi |= ((N.Hi << S) | ((N.Lo >> spill) & M1)) & M2;
}
/*
 * Stores N >> (S mod 128) into *A without branching.
 *
 * Equivalent to:
 *   S == 0        : A = N
 *   1 <= S <= 63  : A.Lo = (N.Lo >> S) | (N.Hi << (64 - S)), A.Hi = N.Hi >> S
 *   64 <= S <= 127: A.Lo = N.Hi >> (S - 64),                  A.Hi = 0
 */
void shiftright128(uint128 N, size_t S, uint128 * A)
{
	uint64_t M1, M2;
	size_t spill;

	S &= 127;
	M1 = ((((S + 127) | S) & 64) >> 6) - 1llu; /* all ones iff 1 <= S <= 63 */
	M2 = (S >> 6) - 1llu;                      /* all ones iff S < 64 */
	S &= 63;
	/* 64 - S would be 64 when S is 0, and a 64-bit shift by 64 is undefined;
	 * reduce it to 0 there (M1 discards that term anyway) */
	spill = (64 - S) & 63;

	A->Lo = (N.Hi >> S) & (~M2);
	A->Hi = (N.Hi >> S) & M2;
	A->Lo |= ((N.Lo >> S) | ((N.Hi << spill) & M1)) & M2;
}
/* Stores the bitwise complement of N into *A. */
void not128(uint128 N, uint128 * A)
{
	uint128 flipped;

	flipped.Hi = ~N.Hi;
	flipped.Lo = ~N.Lo;
	*A = flipped;
}
/* Stores the bitwise OR of N1 and N2 into *A. */
void or128(uint128 N1, uint128 N2, uint128 * A)
{
	uint128 merged;

	merged.Hi = N1.Hi | N2.Hi;
	merged.Lo = N1.Lo | N2.Lo;
	*A = merged;
}
/* Stores the bitwise AND of N1 and N2 into *A. */
void and128(uint128 N1, uint128 N2, uint128 * A)
{
	uint128 masked;

	masked.Hi = N1.Hi & N2.Hi;
	masked.Lo = N1.Lo & N2.Lo;
	*A = masked;
}
/* Stores the bitwise XOR of N1 and N2 into *A. */
void xor128(uint128 N1, uint128 N2, uint128 * A)
{
	uint128 mixed;

	mixed.Hi = N1.Hi ^ N2.Hi;
	mixed.Lo = N1.Lo ^ N2.Lo;
	*A = mixed;
}
/*
 * Number of leading zero bits in N (64 when N is zero), found by a binary
 * search over the widths 32, 16, 8, 4, 2 and 1.
 */
size_t nlz64(uint64_t N)
{
	size_t zeros = 0;
	unsigned width;

	if (N == 0)
		return 64;

	for (width = 32; width != 0; width >>= 1) {
		/* top `width` bits all clear: count them and move the rest up */
		if ((N >> (64 - width)) == 0) {
			zeros += width;
			N <<= width;
		}
	}
	return zeros;
}
/*
 * Number of trailing zero bits in N (64 when N is zero), found by a binary
 * search over the widths 32, 16, 8, 4, 2 and 1.
 */
size_t ntz64(uint64_t N)
{
	size_t zeros = 0;
	unsigned width;

	if (N == 0)
		return 64;

	for (width = 32; width != 0; width >>= 1) {
		/* bottom `width` bits all clear: count them and move the rest down */
		if ((N & ((1ULL << width) - 1)) == 0) {
			zeros += width;
			N >>= width;
		}
	}
	return zeros;
}
/* Counts the set bits in V using the parallel bit-summing technique. */
size_t popcnt64(uint64_t V)
{
	// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
	const uint64_t pairs   = 0x5555555555555555;
	const uint64_t nibbles = 0x3333333333333333;
	const uint64_t bytes   = 0xF0F0F0F0F0F0F0F;
	const uint64_t spread  = 0x101010101010101;

	V = V - ((V >> 1) & pairs);                 /* 2-bit sums */
	V = (V & nibbles) + ((V >> 2) & nibbles);   /* 4-bit sums */
	V = (V + (V >> 4)) & bytes;                 /* 8-bit sums */
	return (V * spread) >> 56;                  /* add the bytes into the top byte */
}
/* Counts the set bits across both halves of N. */
size_t popcnt128(uint128 N)
{
	size_t total = popcnt64(N.Hi);

	total += popcnt64(N.Lo);
	return total;
}
/* Number of leading zero bits in the 128-bit value N. */
size_t nlz128(uint128 N)
{
	if (N.Hi != 0)
		return nlz64(N.Hi);
	/* the upper word is empty: all 64 of its bits count, plus those of the lower word */
	return nlz64(N.Lo) + 64;
}
/* Number of trailing zero bits in the 128-bit value N. */
size_t ntz128(uint128 N)
{
	if (N.Lo != 0)
		return ntz64(N.Lo);
	/* the lower word is empty: all 64 of its bits count, plus those of the upper word */
	return ntz64(N.Hi) + 64;
}
/* Three-way unsigned compare: returns 1 if N1 > N2, -1 if N1 < N2, 0 if equal. */
int compare128(uint128 N1, uint128 N2)
{
	if (N1.Hi != N2.Hi)
		return (N1.Hi > N2.Hi) ? 1 : -1;
	if (N1.Lo != N2.Lo)
		return (N1.Lo > N2.Lo) ? 1 : -1;
	return 0;
}
/*
 * Bit-by-bit (shift and subtract) division of M by N.
 * Stores the quotient in *Q and the remainder in *R.
 *
 * N is first aligned with the top set bit of M, then one quotient bit is
 * produced per iteration. N == 0 is not rejected here; divmod128() filters
 * it out before calling.
 */
void bindivmod128(uint128 M, uint128 N, uint128 * Q, uint128 *R)
{
	size_t Shift;

	Q->Hi = Q->Lo = 0;

	/* When M < N the divisor may have more significant bits than the
	 * dividend; nlz128(N) - nlz128(M) would then wrap around as a size_t and
	 * the loop below would run for an astronomical number of iterations.
	 * The answer in that case is simply quotient 0, remainder M. */
	if (compare128(M, N) < 0)
	{
		R->Hi = M.Hi;
		R->Lo = M.Lo;
		return;
	}

	Shift = nlz128(N) - nlz128(M);
	shiftleft128(N, Shift, &N);
	do
	{
		shiftleft128(*Q, (size_t)1, Q);
		if(compare128(M, N) >= 0)
		{
			sub128(M, N, &M);
			Q->Lo |= 1;
		}
		shiftright128(N, 1, &N);
	}while(Shift-- != 0);

	R->Hi = M.Hi;
	R->Lo = M.Lo;
}
/*
 * Divides the 128-bit value u1:u0 (u1 is the upper word) by the 64-bit
 * value v using schoolbook long division on 32-bit digits.
 * The quotient is written to *q and the remainder to *r (*r is stored first).
 *
 * NOTE(review): the quotient only fits in 64 bits when u1 < v, and v == 0
 * leads to a shift by 64 and a division by zero below -- callers appear to
 * be responsible for both; confirm.
 */
void divmod128by64(const uint64_t u1, const uint64_t u0, uint64_t v, uint64_t * q, uint64_t * r)
{
/* b is the digit base, 2^32 */
const uint64_t b = 1ll << 32;
uint64_t un1, un0, vn1, vn0, q1, q0, un32, un21, un10, rhat, left, right;
size_t s;
/* normalize: shift the divisor left until its top bit is set */
s = nlz64(v);
v <<= s;
vn1 = v >> 32;
vn0 = v & 0xffffffff;
/* shift the dividend by the same amount; s == 0 is handled separately
 * because u0 >> 64 would be undefined */
if (s > 0)
{
un32 = (u1 << s) | (u0 >> (64 - s));
un10 = u0 << s;
}
else
{
un32 = u1;
un10 = u0;
}
un1 = un10 >> 32;
un0 = un10 & 0xffffffff;
/* estimate the upper quotient digit from the leading divisor digit */
q1 = un32 / vn1;
rhat = un32 % vn1;
left = q1 * vn0;
right = (rhat << 32) + un1;
/* lower the estimate while it is too large */
again1:
if ((q1 >= b) || (left > right))
{
--q1;
rhat += vn1;
if (rhat < b)
{
left -= vn0;
right = (rhat << 32) | un1;
goto again1;
}
}
/* partial remainder after removing q1 * v */
un21 = (un32 << 32) + (un1 - (q1 * v));
/* estimate and correct the lower quotient digit the same way */
q0 = un21 / vn1;
rhat = un21 % vn1;
left = q0 * vn0;
right = (rhat << 32) | un0;
again2:
if ((q0 >= b) || (left > right))
{
--q0;
rhat += vn1;
if (rhat < b)
{
left -= vn0;
right = (rhat << 32) | un0;
goto again2;
}
}
/* undo the normalization shift on the remainder */
*r = ((un21 << 32) + (un0 - (q0 * v))) >> s;
*q = (q1 << 32) | q0;
}
/*
 * Divides M by N using 128-by-64 long division as the building block.
 * Stores the quotient in *Q and the remainder in *R.
 *
 * N must be non-zero (divmod128() filters that case out before calling).
 */
static void divmod128by128(uint128 M, uint128 N, uint128 * Q, uint128 * R)
{
	if (N.Hi == 0)
	{
		if (M.Hi < N.Lo)
		{
			/* quotient fits in 64 bits: a single 128/64 division is enough */
			divmod128by64(M.Hi, M.Lo, N.Lo, &Q->Lo, &R->Lo);
			Q->Hi = 0;
			R->Hi = 0;
			return;
		}
		else
		{
			/* divide the upper word first, then carry its remainder into
			 * the division of the lower word */
			Q->Hi = M.Hi / N.Lo;
			R->Hi = M.Hi % N.Lo;
			divmod128by64(R->Hi, M.Lo, N.Lo, &Q->Lo, &R->Lo);
			R->Hi = 0;
			return;
		}
	}
	else
	{
		size_t n = nlz64(N.Hi);
		uint128 v1;
		uint128 u1;
		uint128 q1;
		uint64_t rem_unused;

		/* normalize the divisor so the top bit of its upper word is set */
		shiftleft128(N, n, &v1);
		/* halve the dividend so the 128/64 quotient cannot overflow */
		shiftright128(M, 1, &u1);

		/* The quotient estimate must land in q1.Lo. The previous code passed
		 * (&q1.Hi, &q1.Lo), which stored the quotient in q1.Hi and the
		 * remainder in q1.Lo, and then discarded the quotient by zeroing
		 * q1.Hi. */
		divmod128by64(u1.Hi, u1.Lo, v1.Hi, &q1.Lo, &rem_unused);
		q1.Hi = 0;
		/* undo the normalization and the halving */
		shiftright128(q1, 63 - n, &q1);

		/* make the estimate too small by at most one ... */
		if ((q1.Hi | q1.Lo) != 0)
		{
			dec128(q1, &q1);
		}
		Q->Hi = q1.Hi;
		Q->Lo = q1.Lo;

		/* ... then correct it using the actual remainder */
		mult128(q1, N, &q1);
		sub128(M, q1, R);
		if (compare128(*R, N) >= 0)
		{
			inc128(*Q, Q);
			sub128(*R, N, R);
		}
		return;
	}
}
/*
 * Divides M by N, storing the quotient in *Q and the remainder in *R.
 * Cheap special cases are handled first; the general case is dispatched to
 * divmod128by128() or bindivmod128() depending on how far apart the operand
 * sizes are.
 *
 * When N is zero the function returns without writing *Q or *R.
 */
void divmod128(uint128 M, uint128 N, uint128 * Q, uint128 * R)
{
	const size_t lead_n = nlz128(N);
	const size_t lead_m = nlz128(M);
	const size_t trail_n = ntz128(N);
	int order;

	/* N == 0: nothing sensible to compute */
	if (lead_n == 128)
		return;

	/* both operands fit in 64 bits: use native division */
	if ((M.Hi | N.Hi) == 0)
	{
		Q->Hi = R->Hi = 0;
		Q->Lo = M.Lo / N.Lo;
		R->Lo = M.Lo % N.Lo;
		return;
	}

	/* N == 1 */
	if (lead_n == 127)
	{
		*Q = M;
		R->Hi = R->Lo = 0;
		return;
	}

	/* N has a single set bit: shift for the quotient, mask for the remainder */
	if ((trail_n + lead_n) == 127)
	{
		shiftright128(M, trail_n, Q);
		dec128(N, &N);
		and128(N, M, R);
		return;
	}

	order = compare128(M, N);
	if (order < 0)
	{
		Q->Hi = Q->Lo = 0;
		*R = M;
		return;
	}
	if (order == 0)
	{
		Q->Hi = R->Hi = R->Lo = 0;
		Q->Lo = 1;
		return;
	}

	/* few quotient bits expected: the bitwise loop; otherwise long division */
	if ((lead_n - lead_m) > 5)
		divmod128by128(M, N, Q, R);
	else
		bindivmod128(M, N, Q, R);
}
#endif

180
crypto/mman.c

@ -0,0 +1,180 @@ @@ -0,0 +1,180 @@
#include <windows.h>
#include <errno.h>
#include <io.h>
#include "mman.h"
#ifndef FILE_MAP_EXECUTE
#define FILE_MAP_EXECUTE 0x0020
#endif /* FILE_MAP_EXECUTE */
/*
 * Converts a Win32 error code into the value stored in errno.
 * Currently a pass-through: the code is returned unchanged and deferr is
 * not used.
 */
static int __map_mman_error(const DWORD err, const int deferr)
{
	//TODO: implement
	return (err != 0) ? err : 0;
}
/* Maps PROT_* flags to the PAGE_* protection constant used for the mapping. */
static DWORD __map_mmap_prot_page(const int prot)
{
	const int writable = (prot & PROT_WRITE) != 0;

	if (prot == PROT_NONE)
		return 0;

	if ((prot & PROT_EXEC) != 0)
		return writable ? PAGE_EXECUTE_READWRITE : PAGE_EXECUTE_READ;

	return writable ? PAGE_READWRITE : PAGE_READONLY;
}
/* Maps PROT_* flags to the FILE_MAP_* access mask passed to MapViewOfFile. */
static DWORD __map_mmap_prot_file(const int prot)
{
	DWORD access = 0;

	if (prot != PROT_NONE)
	{
		if ((prot & PROT_READ) != 0)
			access |= FILE_MAP_READ;
		if ((prot & PROT_WRITE) != 0)
			access |= FILE_MAP_WRITE;
		if ((prot & PROT_EXEC) != 0)
			access |= FILE_MAP_EXECUTE;
	}
	return access;
}
/*
 * mmap() emulation on top of CreateFileMapping / MapViewOfFile.
 *
 * Returns the mapped address, or MAP_FAILED with errno set. errno is EINVAL
 * for len == 0, MAP_FIXED, or prot == PROT_EXEC; EBADF when a file mapping is
 * requested and fildes has no valid OS handle; otherwise the value returned
 * by __map_mman_error() for the failing Win32 call.
 *
 * addr is not used. With MAP_ANONYMOUS, INVALID_HANDLE_VALUE is passed to
 * CreateFileMapping instead of a file handle.
 */
void* mmap(void *addr, size_t len, int prot, int flags, int fildes, OffsetType off)
{
HANDLE fm, h;
void * map = MAP_FAILED;
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable: 4293)
#endif
/* split the offset and the end of the mapping (off + len) into 32-bit halves;
 * the sizeof tests avoid evaluating the 32-bit shift when OffsetType is 32 bits wide */
const DWORD dwFileOffsetLow = (sizeof(OffsetType) <= sizeof(DWORD)) ?
(DWORD)off : (DWORD)(off & 0xFFFFFFFFL);
const DWORD dwFileOffsetHigh = (sizeof(OffsetType) <= sizeof(DWORD)) ?
(DWORD)0 : (DWORD)((off >> 32) & 0xFFFFFFFFL);
const DWORD protect = __map_mmap_prot_page(prot);
const DWORD desiredAccess = __map_mmap_prot_file(prot);
const OffsetType maxSize = off + (OffsetType)len;
const DWORD dwMaxSizeLow = (sizeof(OffsetType) <= sizeof(DWORD)) ?
(DWORD)maxSize : (DWORD)(maxSize & 0xFFFFFFFFL);
const DWORD dwMaxSizeHigh = (sizeof(OffsetType) <= sizeof(DWORD)) ?
(DWORD)0 : (DWORD)((maxSize >> 32) & 0xFFFFFFFFL);
#ifdef _MSC_VER
#pragma warning(pop)
#endif
errno = 0;
if (len == 0
/* Unsupported flag combinations */
|| (flags & MAP_FIXED) != 0
/* Unsupported protection combinations */
|| prot == PROT_EXEC)
{
errno = EINVAL;
return MAP_FAILED;
}
/* file-backed mappings need the OS handle behind the C runtime descriptor */
h = ((flags & MAP_ANONYMOUS) == 0) ?
(HANDLE)_get_osfhandle(fildes) : INVALID_HANDLE_VALUE;
if ((flags & MAP_ANONYMOUS) == 0 && h == INVALID_HANDLE_VALUE)
{
errno = EBADF;
return MAP_FAILED;
}
fm = CreateFileMapping(h, NULL, protect, dwMaxSizeHigh, dwMaxSizeLow, NULL);
if (fm == NULL)
{
errno = __map_mman_error(GetLastError(), EPERM);
return MAP_FAILED;
}
map = MapViewOfFile(fm, desiredAccess, dwFileOffsetHigh, dwFileOffsetLow, len);
/* the mapping handle is closed right after the view is requested, on success and on failure */
CloseHandle(fm);
if (map == NULL)
{
errno = __map_mman_error(GetLastError(), EPERM);
return MAP_FAILED;
}
return map;
}
/*
 * munmap() emulation: unmaps the view starting at addr (len is not used).
 * Returns 0 on success, -1 with errno set on failure.
 */
int munmap(void *addr, size_t len)
{
	if (!UnmapViewOfFile(addr))
	{
		errno = __map_mman_error(GetLastError(), EPERM);
		return -1;
	}
	return 0;
}
/*
 * mprotect() emulation via VirtualProtect.
 * Returns 0 on success, -1 with errno set on failure.
 */
int _mprotect(void *addr, size_t len, int prot)
{
	DWORD previous = 0;
	const DWORD wanted = __map_mmap_prot_page(prot);

	if (!VirtualProtect(addr, len, wanted, &previous))
	{
		errno = __map_mman_error(GetLastError(), EPERM);
		return -1;
	}
	return 0;
}
/*
 * msync() emulation via FlushViewOfFile (flags is not used).
 * Returns 0 on success, -1 with errno set on failure.
 */
int msync(void *addr, size_t len, int flags)
{
	if (!FlushViewOfFile(addr, len))
	{
		errno = __map_mman_error(GetLastError(), EPERM);
		return -1;
	}
	return 0;
}
/*
 * mlock() emulation via VirtualLock.
 * Returns 0 on success, -1 with errno set on failure.
 */
int mlock(const void *addr, size_t len)
{
	if (!VirtualLock((LPVOID)addr, len))
	{
		errno = __map_mman_error(GetLastError(), EPERM);
		return -1;
	}
	return 0;
}
/*
 * munlock() emulation via VirtualUnlock.
 * Returns 0 on success, -1 with errno set on failure.
 */
int munlock(const void *addr, size_t len)
{
	if (!VirtualUnlock((LPVOID)addr, len))
	{
		errno = __map_mman_error(GetLastError(), EPERM);
		return -1;
	}
	return 0;
}

63
crypto/mman.h

@ -0,0 +1,63 @@ @@ -0,0 +1,63 @@
/*
* sys/mman.h
* mman-win32
*/
#ifndef _SYS_MMAN_H_
#define _SYS_MMAN_H_
#ifndef _WIN32_WINNT // Allow use of features specific to Windows XP or later.
#define _WIN32_WINNT 0x0501 // Change this to the appropriate value to target other versions of Windows.
#endif
/* All the headers include this file. */
#ifndef _MSC_VER
#include <_mingw.h>
#endif
/* Determine offset type */
#include <stdint.h>
#if defined(_WIN64)
typedef int64_t OffsetType;
#else
typedef uint32_t OffsetType;
#endif
#include <sys/types.h>
#ifdef __cplusplus
extern "C" {
#endif
#define PROT_NONE 0
#define PROT_READ 1
#define PROT_WRITE 2
#define PROT_EXEC 4
#define MAP_FILE 0
#define MAP_SHARED 1
#define MAP_PRIVATE 2
#define MAP_TYPE 0xf
#define MAP_FIXED 0x10
#define MAP_ANONYMOUS 0x20
#define MAP_ANON MAP_ANONYMOUS
#define MAP_FAILED ((void *)-1)
/* Flags for msync. */
#define MS_ASYNC 1
#define MS_SYNC 2
#define MS_INVALIDATE 4
void* mmap(void *addr, size_t len, int prot, int flags, int fildes, OffsetType off);
int munmap(void *addr, size_t len);
int _mprotect(void *addr, size_t len, int prot);
int msync(void *addr, size_t len, int flags);
int mlock(const void *addr, size_t len);
int munlock(const void *addr, size_t len);
#ifdef __cplusplus
}
#endif
#endif /* _SYS_MMAN_H_ */

371
crypto/wildkeccak-cpu.cpp

@ -0,0 +1,371 @@ @@ -0,0 +1,371 @@
// Memory-hard extension of keccak for PoW
// Copyright (c) 2012-2013 The Cryptonote developers
// Copyright (c) 2014 The Boolberry developers
// Distributed under the MIT/X11 software license, see the accompanying
// file COPYING or http://www.opensource.org/licenses/mit-license.php.
// Modified for CPUminer by Lucas Jones
// Adapted for ccminer by Tanguy Pruvot - 2016
#include <errno.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#ifdef _MSC_VER
#include <emmintrin.h>
#include <bignum.hpp>
#include "int128_c.h"
#else
#include <x86intrin.h>
#endif
#include <miner.h>
#include "xmr-rpc.h"
extern uint64_t* pscratchpad_buff;
// Precomputed constants that let a 64-bit division by a fixed divisor be
// replaced by one multiply and two shifts (see reciprocal_divide64()).
struct reciprocal_value64 {
// magic multiplier; only the high 64 bits of a * m are used
uint64_t m;
// sh1: first correction shift (0 or 1), sh2: final shift
uint8_t sh1, sh2;
};
/*
 * fls64 - find the last (most significant) set bit of a 64-bit word.
 *
 * Returns the 1-based position of the highest set bit (1..64), or 0 when
 * x is 0, on every platform.
 */
static inline int fls64(uint64_t x)
{
#if defined(_WIN64)
	unsigned long bitpos = 0;
	/* the intrinsic returns 0 and leaves the index undefined when x == 0 */
	if (!_BitScanReverse64(&bitpos, x))
		return 0;
	return (int) (bitpos + 1);
#elif defined(WIN32)
	unsigned long bitpos = 0;
	/*
	 * Test the intrinsic's return value, not the index it stores: an index
	 * of 0 is a valid hit (bit 0 set), so it cannot double as "not found".
	 */
	if (_BitScanReverse(&bitpos, (uint32_t) (x >> 32)))
		return (int) (bitpos + 33);
	if (_BitScanReverse(&bitpos, (uint32_t) x))
		return (int) (bitpos + 1);
	return 0;
#else
	/*
	 * Compiler builtin instead of x86-64-only "bsrq" inline assembly.
	 * __builtin_clzll(0) is undefined, so zero is handled explicitly.
	 */
	return x ? 64 - __builtin_clzll(x) : 0;
#endif
}
// Precompute the reciprocal constants for divisor d, so that later divisions
// by d can be done with reciprocal_divide64() (multiply + shifts).
// NOTE(review): d == 0 divides by zero below, and d > 2^63 makes l == 64 so
// that (1ULL << l) is an undefined shift - callers are assumed to pass a
// non-zero, sane divisor (the only visible caller passes scratchpad_size >> 2).
static inline struct reciprocal_value64 reciprocal_val64(uint64_t d)
{
struct reciprocal_value64 R;
int l;
// l = number of bits needed to represent d - 1, i.e. 2^l >= d
l = fls64(d - 1);
#ifdef _MSC_VER
// MSVC has no native 128-bit integer: same computation through the helpers
// of int128_c.h (presumably equivalent to the branch below - TODO confirm)
uint128 v1;
v1.Lo = (1ULL << l) - d;v1.Hi=0;
uint128 v2;
v2.Hi = 1; v2.Lo = 0;
uint128 v;
mult128(v1,v2,&v);
divmod128by64(v.Hi,v.Lo,d,&v.Hi,&v.Lo);
Increment(&v);
R.m = (uint64_t)v.Hi;
#else
// m = floor(2^64 * (2^l - d) / d) + 1, truncated to 64 bits
__uint128_t m;
m = (((__uint128_t)1 << 64) * ((1ULL << l) - d));
m /= d;
++m;
R.m = (uint64_t)m;
#endif
// sh1 is 0 only when l == 0 (d == 1), otherwise 1; sh2 is l - 1 clamped at 0
R.sh1 = min(l, 1);
R.sh2 = max(l - 1, 0);
return R;
}
// Divide a by the divisor that R was precomputed for, using the high half of
// the 128-bit product a * R.m followed by the two correction shifts.
static inline uint64_t reciprocal_divide64(uint64_t a, struct reciprocal_value64 R)
{
	uint64_t hi; // upper 64 bits of a * R.m
#ifdef _MSC_VER
	uint128 prod;
	mult64to128(a, R.m, &prod.Hi, &prod.Lo);
	hi = prod.Hi;
#else
	const __uint128_t prod = (__uint128_t) a * R.m;
	hi = (uint64_t) (prod >> 64);
#endif
	const uint64_t fixup = (a - hi) >> R.sh1;
	return (hi + fixup) >> R.sh2;
}
// Compute A mod B, where R holds the precomputed reciprocal of B.
// The quotient comes from reciprocal_divide64(); a single conditional
// subtraction corrects a remainder that is still >= B.
static inline uint64_t reciprocal_remainder64(uint64_t A, uint64_t B, struct reciprocal_value64 R)
{
	const uint64_t quot = reciprocal_divide64(A, R);
	const uint64_t rem = A - (uint64_t) (quot * B);
	return (rem >= B) ? rem - B : rem;
}
//#define UNROLL_SCR_MIX
// Rotate x left by exactly one bit.
static inline uint64_t rotl641(uint64_t x) { return((x << 1) | (x >> 63)); }
// Rotate x left by y bits. The count is reduced modulo 64 so that y == 0 or
// y >= 64 no longer produce an undefined 64-bit shift.
static inline uint64_t rotl64_1(uint64_t x, uint64_t y) { y &= 63; return((x << y) | (x >> ((64 - y) & 63))); }
// Rotate x left by 32 + y bits: swap the 32-bit halves, then rotate by y.
static inline uint64_t rotl64_2(uint64_t x, uint64_t y) { return(rotl64_1((x >> 32) | (x << 32), y)); }
// Bitwise select: for each bit, take b where c is 1 and a where c is 0.
static inline uint64_t bitselect(uint64_t a, uint64_t b, uint64_t c) { return(a ^ (c & (b ^ a))); }
// One full round of the modified Keccak permutation on the 25-word state s.
// The overall shape resembles a Keccak-f round, except that the column mix
// multiplies three of the five lanes instead of XOR-ing all of them, and the
// round constant is always 1.
static inline void keccakf_mul(uint64_t *s)
{
uint64_t bc[5], t[5];
uint64_t tmp1, tmp2;
int i;
// column mix: '*' binds tighter than '^', so each column value is
// s[i] ^ s[i+5] ^ (s[i+10] * s[i+15] * s[i+20])
for(i = 0; i < 5; i++)
t[i] = s[i + 0] ^ s[i + 5] ^ s[i + 10] * s[i + 15] * s[i + 20];
bc[0] = t[0] ^ rotl641(t[2]);
bc[1] = t[1] ^ rotl641(t[3]);
bc[2] = t[2] ^ rotl641(t[4]);
bc[3] = t[3] ^ rotl641(t[0]);
bc[4] = t[4] ^ rotl641(t[1]);
// s[1] is overwritten first in the chain below, so keep its mixed value for
// the last link (s[10])
tmp1 = s[1] ^ bc[0];
s[ 0] ^= bc[4];
// rotate-and-move chain: every assignment reads a word that has not been
// overwritten yet, so the statement order must not be changed
s[ 1] = rotl64_1(s[ 6] ^ bc[0], 44);
s[ 6] = rotl64_1(s[ 9] ^ bc[3], 20);
s[ 9] = rotl64_1(s[22] ^ bc[1], 61);
s[22] = rotl64_1(s[14] ^ bc[3], 39);
s[14] = rotl64_1(s[20] ^ bc[4], 18);
s[20] = rotl64_1(s[ 2] ^ bc[1], 62);
s[ 2] = rotl64_1(s[12] ^ bc[1], 43);
s[12] = rotl64_1(s[13] ^ bc[2], 25);
s[13] = rotl64_1(s[19] ^ bc[3], 8);
s[19] = rotl64_1(s[23] ^ bc[2], 56);
s[23] = rotl64_1(s[15] ^ bc[4], 41);
s[15] = rotl64_1(s[ 4] ^ bc[3], 27);
s[ 4] = rotl64_1(s[24] ^ bc[3], 14);
s[24] = rotl64_1(s[21] ^ bc[0], 2);
s[21] = rotl64_1(s[ 8] ^ bc[2], 55);
s[ 8] = rotl64_1(s[16] ^ bc[0], 45);
s[16] = rotl64_1(s[ 5] ^ bc[4], 36);
s[ 5] = rotl64_1(s[ 3] ^ bc[2], 28);
s[ 3] = rotl64_1(s[18] ^ bc[2], 21);
s[18] = rotl64_1(s[17] ^ bc[1], 15);
s[17] = rotl64_1(s[11] ^ bc[0], 10);
s[11] = rotl64_1(s[ 7] ^ bc[1], 6);
s[ 7] = rotl64_1(s[10] ^ bc[4], 3);
s[10] = rotl64_1(tmp1, 1);
// row-wise non-linear step, one line per row of five words:
// bitselect(a ^ c, a, b) evaluates to a ^ (~b & c); tmp1/tmp2 keep the
// original first two words of the row for the wrap-around terms
tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1);
tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1);
tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1);
tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1);
tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1);
// round constant: the same value (1) in every round
s[0] ^= 0x0000000000000001ULL;
}
// Shortened final round: performs the same column mix as keccakf_mul() but
// only updates s[0..4] and applies the row step to s[0..3], which are the
// only words the caller reads afterwards (32-byte digest). The rest of the
// state is left untouched.
static inline void keccakf_mul_last(uint64_t *s)
{
uint64_t bc[5], xormul[5];
uint64_t tmp1, tmp2;
int i;
// column mix: s[i] ^ s[i+5] ^ (s[i+10] * s[i+15] * s[i+20])
for(i = 0; i < 5; i++)
xormul[i] = s[i + 0] ^ s[i + 5] ^ s[i + 10] * s[i + 15] * s[i + 20];
bc[0] = xormul[0] ^ rotl641(xormul[2]);
bc[1] = xormul[1] ^ rotl641(xormul[3]);
bc[2] = xormul[2] ^ rotl641(xormul[4]);
bc[3] = xormul[3] ^ rotl641(xormul[0]);
bc[4] = xormul[4] ^ rotl641(xormul[1]);
s[0] ^= bc[4];
// rotl64_2(v, n) rotates by 32 + n, so these are rotations by 44 and 43,
// the same amounts keccakf_mul() uses for s[1] and s[2]
s[1] = rotl64_2(s[6] ^ bc[0], 12);
s[2] = rotl64_2(s[12] ^ bc[1], 11);
s[4] = rotl64_1(s[24] ^ bc[3], 14);
s[3] = rotl64_1(s[18] ^ bc[2], 21);
// row step restricted to the first four words; tmp2 is assigned but not
// needed because s[4] is not recomputed here
tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]);
s[0] ^= 0x0000000000000001ULL;
}
struct reciprocal_value64 cached_recip;
static uint64_t cached_scr_size = 0;
// Mix scratchpad data into the first 24 words of the state.
// Each state word st[k] selects one 4-word (32-byte) scratchpad entry at
// index (st[k] mod scr_size) * 4 in pscratchpad_buff. Every group of four
// state words is then XOR-ed with the four entries selected by that group.
// scr_size is the number of 4-word entries; recip must be the reciprocal
// precomputed for scr_size. Three implementations follow: plain 64-bit
// (MSVC), SSE2 (default) and AVX2 (UNROLL_SCR_MIX).
static inline void scr_mix(uint64_t *st, uint64_t scr_size, struct reciprocal_value64 recip)
{
#define KK_MIXIN_SIZE 24
uint64_t _ALIGN(128) idx[KK_MIXIN_SIZE];
#ifdef _MSC_VER
#define pscr pscratchpad_buff
int x;
// non-optimized 64bit operations
for (x = 0; x < KK_MIXIN_SIZE; x++) {
idx[x] = reciprocal_remainder64(st[x], scr_size, recip) << 2;
}
// sanity check on the computed index, bails out without mixing
// NOTE(review): only idx[7] is inspected and the comparison is '>' rather
// than '>=' - looks like a leftover debugging aid, confirm
if (idx[7] > scr_size*4) {
applog(LOG_WARNING, "Wrong remainder64 returned by the cpu hash %016llx > %016llx",
(unsigned long long) idx[7], (unsigned long long) scr_size*4);
return;
}
// st[x..x+3] ^= entry(idx[x]) ^ entry(idx[x+1]) ^ entry(idx[x+2]) ^ entry(idx[x+3])
for(x = 0; x < KK_MIXIN_SIZE; x += 4) {
st[x + 0] ^= pscr[idx[x] + 0] ^ pscr[idx[x + 1] + 0] ^ pscr[idx[x + 2] + 0] ^ pscr[idx[x + 3] + 0];
st[x + 1] ^= pscr[idx[x] + 1] ^ pscr[idx[x + 1] + 1] ^ pscr[idx[x + 2] + 1] ^ pscr[idx[x + 3] + 1];
st[x + 2] ^= pscr[idx[x] + 2] ^ pscr[idx[x + 1] + 2] ^ pscr[idx[x + 2] + 2] ^ pscr[idx[x + 3] + 2];
st[x + 3] ^= pscr[idx[x] + 3] ^ pscr[idx[x + 1] + 3] ^ pscr[idx[x + 2] + 3] ^ pscr[idx[x + 3] + 3];
}
return;
#elif !defined(UNROLL_SCR_MIX)
// SSE2 variant: three passes of eight state words, i.e. two groups of four
// per pass; each 32-byte entry is consumed as two 128-bit halves
#pragma GCC ivdep
for(int x = 0; x < 3; ++x)
{
__m128i *st0, *st1, *st2, *st3;
idx[0] = reciprocal_remainder64(st[(x << 3) + 0], scr_size, recip) << 2;
idx[1] = reciprocal_remainder64(st[(x << 3) + 1], scr_size, recip) << 2;
idx[2] = reciprocal_remainder64(st[(x << 3) + 2], scr_size, recip) << 2;
idx[3] = reciprocal_remainder64(st[(x << 3) + 3], scr_size, recip) << 2;
idx[4] = reciprocal_remainder64(st[(x << 3) + 4], scr_size, recip) << 2;
idx[5] = reciprocal_remainder64(st[(x << 3) + 5], scr_size, recip) << 2;
idx[6] = reciprocal_remainder64(st[(x << 3) + 6], scr_size, recip) << 2;
idx[7] = reciprocal_remainder64(st[(x << 3) + 7], scr_size, recip) << 2;
// all indices are computed before any state word is modified; prefetch the
// eight entries before they are read
for(int y = 0; y < 8; y++) _mm_prefetch((const char*) (&pscratchpad_buff[idx[y]]), _MM_HINT_T1);
st0 = (__m128i *)&st[(x << 3) + 0];
st1 = (__m128i *)&st[(x << 3) + 2];
st2 = (__m128i *)&st[(x << 3) + 4];
st3 = (__m128i *)&st[(x << 3) + 6];
// first group of four words: low halves (words 0-1) of entries 0..3 ...
*st0 = _mm_xor_si128(*st0, *((__m128i *)&pscratchpad_buff[idx[0]]));
*st0 = _mm_xor_si128(*st0, *((__m128i *)&pscratchpad_buff[idx[1]]));
*st0 = _mm_xor_si128(*st0, *((__m128i *)&pscratchpad_buff[idx[2]]));
*st0 = _mm_xor_si128(*st0, *((__m128i *)&pscratchpad_buff[idx[3]]));
// ... and high halves (words 2-3) of the same entries
*st1 = _mm_xor_si128(*st1, *((__m128i *)&pscratchpad_buff[idx[0] + 2]));
*st1 = _mm_xor_si128(*st1, *((__m128i *)&pscratchpad_buff[idx[1] + 2]));
*st1 = _mm_xor_si128(*st1, *((__m128i *)&pscratchpad_buff[idx[2] + 2]));
*st1 = _mm_xor_si128(*st1, *((__m128i *)&pscratchpad_buff[idx[3] + 2]));
// second group of four words: entries 4..7
*st2 = _mm_xor_si128(*st2, *((__m128i *)&pscratchpad_buff[idx[4]]));
*st2 = _mm_xor_si128(*st2, *((__m128i *)&pscratchpad_buff[idx[5]]));
*st2 = _mm_xor_si128(*st2, *((__m128i *)&pscratchpad_buff[idx[6]]));
*st2 = _mm_xor_si128(*st2, *((__m128i *)&pscratchpad_buff[idx[7]]));
*st3 = _mm_xor_si128(*st3, *((__m128i *)&pscratchpad_buff[idx[4] + 2]));
*st3 = _mm_xor_si128(*st3, *((__m128i *)&pscratchpad_buff[idx[5] + 2]));
*st3 = _mm_xor_si128(*st3, *((__m128i *)&pscratchpad_buff[idx[6] + 2]));
*st3 = _mm_xor_si128(*st3, *((__m128i *)&pscratchpad_buff[idx[7] + 2]));
}
#else
// AVX2 variant: all 24 indices are computed up front, then each group of
// four state words is XOR-ed with whole 32-byte entries
// NOTE(review): unlike the SSE2 branch, _mm_prefetch() is called here
// without the (const char*) cast - confirm this branch still compiles
#warning using AVX2 optimizations
idx[ 0] = reciprocal_remainder64(st[0], scr_size, recip) << 2;
idx[ 1] = reciprocal_remainder64(st[1], scr_size, recip) << 2;
idx[ 2] = reciprocal_remainder64(st[2], scr_size, recip) << 2;
idx[ 3] = reciprocal_remainder64(st[3], scr_size, recip) << 2;
idx[ 4] = reciprocal_remainder64(st[4], scr_size, recip) << 2;
idx[ 5] = reciprocal_remainder64(st[5], scr_size, recip) << 2;
idx[ 6] = reciprocal_remainder64(st[6], scr_size, recip) << 2;
idx[ 7] = reciprocal_remainder64(st[7], scr_size, recip) << 2;
for(int y = 0; y < 8; y++) _mm_prefetch(&pscratchpad_buff[idx[y]], _MM_HINT_T1);
idx[ 8] = reciprocal_remainder64(st[8], scr_size, recip) << 2;
idx[ 9] = reciprocal_remainder64(st[9], scr_size, recip) << 2;
idx[10] = reciprocal_remainder64(st[10], scr_size, recip) << 2;
idx[11] = reciprocal_remainder64(st[11], scr_size, recip) << 2;
idx[12] = reciprocal_remainder64(st[12], scr_size, recip) << 2;
idx[13] = reciprocal_remainder64(st[13], scr_size, recip) << 2;
idx[14] = reciprocal_remainder64(st[14], scr_size, recip) << 2;
idx[15] = reciprocal_remainder64(st[15], scr_size, recip) << 2;
for(int y = 8; y < 16; ++y) _mm_prefetch(&pscratchpad_buff[idx[y]], _MM_HINT_T1);
idx[16] = reciprocal_remainder64(st[16], scr_size, recip) << 2;
idx[17] = reciprocal_remainder64(st[17], scr_size, recip) << 2;
idx[18] = reciprocal_remainder64(st[18], scr_size, recip) << 2;
idx[19] = reciprocal_remainder64(st[19], scr_size, recip) << 2;
idx[20] = reciprocal_remainder64(st[20], scr_size, recip) << 2;
idx[21] = reciprocal_remainder64(st[21], scr_size, recip) << 2;
idx[22] = reciprocal_remainder64(st[22], scr_size, recip) << 2;
idx[23] = reciprocal_remainder64(st[23], scr_size, recip) << 2;
for(int y = 16; y < 24; ++y) _mm_prefetch(&pscratchpad_buff[idx[y]], _MM_HINT_T1);
__m256i *st0 = (__m256i *)&st[0];
// six groups of four state words, one 256-bit lane per group
for(int x = 0; x < 6; ++x)
{
*st0 = _mm256_xor_si256(*st0, *((__m256i *)&pscratchpad_buff[idx[(x << 2) + 0]]));
*st0 = _mm256_xor_si256(*st0, *((__m256i *)&pscratchpad_buff[idx[(x << 2) + 1]]));
*st0 = _mm256_xor_si256(*st0, *((__m256i *)&pscratchpad_buff[idx[(x << 2) + 2]]));
*st0 = _mm256_xor_si256(*st0, *((__m256i *)&pscratchpad_buff[idx[(x << 2) + 3]]));
++st0;
}
#endif
return;
}
static void wild_keccak_hash_dbl(uint8_t * __restrict md, const uint8_t * __restrict in)
{
uint64_t _ALIGN(32) st[25];
uint64_t scr_size, i;
struct reciprocal_value64 recip;
scr_size = scratchpad_size >> 2;
if (scr_size == cached_scr_size)
recip = cached_recip;
else {
cached_recip = recip = reciprocal_val64(scr_size);
cached_scr_size = scr_size;
}
// Wild Keccak #1
memcpy(st, in, 88);
st[10] = (st[10] & 0x00000000000000FFULL) | 0x0000000000000100ULL;
memset(&st[11], 0, 112);
st[16] |= 0x8000000000000000ULL;
for(i = 0; i < 23; i++) {
keccakf_mul(st);
scr_mix(st, scr_size, recip);
}
keccakf_mul_last(st);
// Wild Keccak #2
memset(&st[4], 0x00, 168);
st[ 4] = 0x0000000000000001ULL;
st[16] = 0x8000000000000000ULL;
for(i = 0; i < 23; i++) {
keccakf_mul(st);
scr_mix(st, scr_size, recip);
}
keccakf_mul_last(st);
memcpy(md, st, 32);
return;
}
// CPU reference hash entry point.
// output:     receives the 32-byte digest
// input:      block blob (88 bytes are read)
// scratchpad: when non-NULL, replaces the global scratchpad pointer
// ssize:      used as the global scratchpad size only if none is set yet
void wildkeccak_hash(void* output, const void* input, uint64_t* scratchpad, uint64_t ssize)
{
	if (scratchpad != NULL)
		pscratchpad_buff = scratchpad;

	if (scratchpad_size == 0)
		scratchpad_size = ssize;

	wild_keccak_hash_dbl((uint8_t*) output, (const uint8_t*) input);
}

369
crypto/wildkeccak.cu

@ -0,0 +1,369 @@ @@ -0,0 +1,369 @@
extern "C" {
#include <errno.h>
#include <stdio.h>
#include <unistd.h>
}
#include <miner.h>
#include <cuda_helper.h>
#include <cuda_vector_uint2x4.h> // todo
#include "wildkeccak.h"
extern char *device_config[MAX_GPUS]; // -l
extern uint64_t* pscratchpad_buff;
static uint64_t* d_input[MAX_GPUS];
static uint32_t* d_retnonce[MAX_GPUS];
static ulonglong4* d_scratchpad[MAX_GPUS];
static uint64_t* h_scratchpad[MAX_GPUS] = { 0 };
static cudaStream_t bufpad_stream[MAX_GPUS] = { 0 };
static cudaStream_t kernel_stream[MAX_GPUS] = { 0 };
uint64_t scratchpad_size = 0;
uint32_t WK_CUDABlocks = 64;
uint32_t WK_CUDAThreads = 256;
#define st0 vst0.x
#define st1 vst0.y
#define st2 vst0.z
#define st3 vst0.w
#define st4 vst4.x
#define st5 vst4.y
#define st6 vst4.z
#define st7 vst4.w
#define st8 vst8.x
#define st9 vst8.y
#define st10 vst8.z
#define st11 vst8.w
#define st12 vst12.x
#define st13 vst12.y
#define st14 vst12.z
#define st15 vst12.w
#define st16 vst16.x
#define st17 vst16.y
#define st18 vst16.z
#define st19 vst16.w
#define st20 vst20.x
#define st21 vst20.y
#define st22 vst20.z
#define st23 vst20.w
// 64-bit rotate-left by one bit.
// On compute capability >= 3.2 the value is split into its 32-bit halves and
// each half of the result is built with one shf.l.wrap.b32 instruction; this
// is intended to give the same result as the portable fallback below.
#if __CUDA_ARCH__ >= 320
__device__ __forceinline__ uint64_t cuda_rotl641(const uint64_t value)
{
uint2 result;
// result.x: operands are (hi, lo), shifted by 1
asm("shf.l.wrap.b32 %0, %1, %2, 1U;" : "=r"(result.x)
: "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))));
// result.y: operands are (lo, hi), shifted by 1
asm("shf.l.wrap.b32 %0, %1, %2, 1U;" : "=r"(result.y)
: "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))));
// reassemble with result.y as the high word and result.x as the low word
return __double_as_longlong(__hiloint2double(result.y, result.x));
}
#else
__noinline__ __device__ uint64_t cuda_rotl641(const uint64_t x) { return((x << 1) | (x >> 63)); }
#endif
// Bitwise select: for each bit, take b where c is 1 and a where c is 0.
__noinline__ __device__ uint64_t bitselect(const uint64_t a, const uint64_t b, const uint64_t c) { return(a ^ (c & (b ^ a))); }
#define ROTL641(x) (cuda_rotl641(x))
#define RND() \
bc[0] = st0 ^ st5 ^ st10 * st15 * st20 ^ ROTL641(st2 ^ st7 ^ st12 * st17 * st22); \
bc[1] = st1 ^ st6 ^ st11 * st16 * st21 ^ ROTL641(st3 ^ st8 ^ st13 * st18 * st23); \
bc[2] = st2 ^ st7 ^ st12 * st17 * st22 ^ ROTL641(st4 ^ st9 ^ st14 * st19 * st24); \
bc[3] = st3 ^ st8 ^ st13 * st18 * st23 ^ ROTL641(st0 ^ st5 ^ st10 * st15 * st20); \
bc[4] = st4 ^ st9 ^ st14 * st19 * st24 ^ ROTL641(st1 ^ st6 ^ st11 * st16 * st21); \
tmp1 = st1 ^ bc[0]; \
\
st0 ^= bc[4]; \
st1 = ROTL64(st6 ^ bc[0], 44); \
st6 = ROTL64(st9 ^ bc[3], 20); \
st9 = ROTL64(st22 ^ bc[1], 61); \
st22 = ROTL64(st14 ^ bc[3], 39); \
st14 = ROTL64(st20 ^ bc[4], 18); \
st20 = ROTL64(st2 ^ bc[1], 62); \
st2 = ROTL64(st12 ^ bc[1], 43); \
st12 = ROTL64(st13 ^ bc[2], 25); \
st13 = ROTL64(st19 ^ bc[3], 8); \
st19 = ROTL64(st23 ^ bc[2], 56); \
st23 = ROTL64(st15 ^ bc[4], 41); \
st15 = ROTL64(st4 ^ bc[3], 27); \
st4 = ROTL64(st24 ^ bc[3], 14); \
st24 = ROTL64(st21 ^ bc[0], 2); \
st21 = ROTL64(st8 ^ bc[2], 55); \
st8 = ROTL64(st16 ^ bc[0], 45); \
st16 = ROTL64(st5 ^ bc[4], 36); \
st5 = ROTL64(st3 ^ bc[2], 28); \
st3 = ROTL64(st18 ^ bc[2], 21); \
st18 = ROTL64(st17 ^ bc[1], 15); \
st17 = ROTL64(st11 ^ bc[0], 10); \
st11 = ROTL64(st7 ^ bc[1], 6); \
st7 = ROTL64(st10 ^ bc[4], 3); \
st10 = ROTL641(tmp1); \
\
tmp1 = st0; tmp2 = st1; st0 = bitselect(st0 ^ st2, st0, st1); st1 = bitselect(st1 ^ st3, st1, st2); st2 = bitselect(st2 ^ st4, st2, st3); st3 = bitselect(st3 ^ tmp1, st3, st4); st4 = bitselect(st4 ^ tmp2, st4, tmp1); \
tmp1 = st5; tmp2 = st6; st5 = bitselect(st5 ^ st7, st5, st6); st6 = bitselect(st6 ^ st8, st6, st7); st7 = bitselect(st7 ^ st9, st7, st8); st8 = bitselect(st8 ^ tmp1, st8, st9); st9 = bitselect(st9 ^ tmp2, st9, tmp1); \
tmp1 = st10; tmp2 = st11; st10 = bitselect(st10 ^ st12, st10, st11); st11 = bitselect(st11 ^ st13, st11, st12); st12 = bitselect(st12 ^ st14, st12, st13); st13 = bitselect(st13 ^ tmp1, st13, st14); st14 = bitselect(st14 ^ tmp2, st14, tmp1); \
tmp1 = st15; tmp2 = st16; st15 = bitselect(st15 ^ st17, st15, st16); st16 = bitselect(st16 ^ st18, st16, st17); st17 = bitselect(st17 ^ st19, st17, st18); st18 = bitselect(st18 ^ tmp1, st18, st19); st19 = bitselect(st19 ^ tmp2, st19, tmp1); \
tmp1 = st20; tmp2 = st21; st20 = bitselect(st20 ^ st22, st20, st21); st21 = bitselect(st21 ^ st23, st21, st22); st22 = bitselect(st22 ^ st24, st22, st23); st23 = bitselect(st23 ^ tmp1, st23, st24); st24 = bitselect(st24 ^ tmp2, st24, tmp1); \
st0 ^= 1;
#define LASTRND1() \
bc[0] = st0 ^ st5 ^ st10 * st15 * st20 ^ ROTL64(st2 ^ st7 ^ st12 * st17 * st22, 1); \
bc[1] = st1 ^ st6 ^ st11 * st16 * st21 ^ ROTL64(st3 ^ st8 ^ st13 * st18 * st23, 1); \
bc[2] = st2 ^ st7 ^ st12 * st17 * st22 ^ ROTL64(st4 ^ st9 ^ st14 * st19 * st24, 1); \
bc[3] = st3 ^ st8 ^ st13 * st18 * st23 ^ ROTL64(st0 ^ st5 ^ st10 * st15 * st20, 1); \
bc[4] = st4 ^ st9 ^ st14 * st19 * st24 ^ ROTL64(st1 ^ st6 ^ st11 * st16 * st21, 1); \
\
st0 ^= bc[4]; \
st1 = ROTL64(st6 ^ bc[0], 44); \
st2 = ROTL64(st12 ^ bc[1], 43); \
st4 = ROTL64(st24 ^ bc[3], 14); \
st3 = ROTL64(st18 ^ bc[2], 21); \
\
tmp1 = st0; st0 = bitselect(st0 ^ st2, st0, st1); st1 = bitselect(st1 ^ st3, st1, st2); st2 = bitselect(st2 ^ st4, st2, st3); st3 = bitselect(st3 ^ tmp1, st3, st4); \
st0 ^= 1;
#define LASTRND2() \
bc[2] = st2 ^ st7 ^ st12 * st17 * st22 ^ ROTL64(st4 ^ st9 ^ st14 * st19 * st24, 1); \
bc[3] = st3 ^ st8 ^ st13 * st18 * st23 ^ ROTL64(st0 ^ st5 ^ st10 * st15 * st20, 1); \
bc[4] = st4 ^ st9 ^ st14 * st19 * st24 ^ ROTL64(st1 ^ st6 ^ st11 * st16 * st21, 1); \
\
st0 ^= bc[4]; \
st4 = ROTL64(st24 ^ bc[3], 14); \
st3 = ROTL64(st18 ^ bc[2], 21); \
st3 = bitselect(st3 ^ st0, st3, st4);
// Component-wise XOR of two 4 x 64-bit vectors.
__device__ ulonglong4 operator^(const ulonglong4 &a, const ulonglong4 &b)
{
	ulonglong4 r;
	r.x = a.x ^ b.x;
	r.y = a.y ^ b.y;
	r.z = a.z ^ b.z;
	r.w = a.w ^ b.w;
	return r;
}
#define MIX(vst) vst = vst ^ scratchpad[vst.x % scr_size] ^ scratchpad[vst.y % scr_size] ^ scratchpad[vst.z % scr_size] ^ scratchpad[vst.w % scr_size];
#define MIX_ALL MIX(vst0); MIX(vst4); MIX(vst8); MIX(vst12); MIX(vst16); MIX(vst20);
// Wild Keccak search kernel: each thread hashes the input blob with its own
// nonce (startNonce + global thread index) and reports the nonce when the
// upper 32 bits of the fourth digest word are <= target.
//   retnonce   two 32-bit result slots (both receive the low 32 bits of the nonce)
//   input      block blob as 64-bit words (words 0..10 are read)
//   scratchpad scratchpad viewed as 4 x 64-bit entries
//   scr_size   number of scratchpad entries (modulus used by MIX)
// The 25-word state lives in six ulonglong4 values plus st24; the stN names
// are macros mapping onto their components.
__global__
void wk(uint32_t* __restrict__ retnonce, const uint64_t* __restrict__ input, const ulonglong4* __restrict__ scratchpad,
const uint32_t scr_size, const uint32_t target, uint64_t startNonce)
{
ulonglong4 vst0, vst4, vst8, vst12, vst16, vst20;
uint64_t bc[5];
uint64_t st24, tmp1, tmp2;
const uint64_t nonce = startNonce + (blockDim.x * blockIdx.x) + threadIdx.x;
// the nonce sits at byte offset 1 of the blob: byte 0 of word 0 is kept and
// the nonce fills the remaining seven bytes
// NOTE(review): the low byte of input[1] is forced to zero, so the top byte
// of the 64-bit nonce is effectively always 0 - presumably intentional, confirm
vst0 = make_ulonglong4((nonce << 8) + (input[0] & 0xFF), input[1] & 0xFFFFFFFFFFFFFF00ULL, input[2], input[3]);
vst4 = make_ulonglong4(input[4], input[5], input[6], input[7]);
// word 10: keep the last message byte, then the padding bit (0x100)
vst8 = make_ulonglong4(input[8], input[9], (input[10] & 0xFF) | 0x100, 0);
vst12 = make_ulonglong4(0, 0, 0, 0);
vst16 = make_ulonglong4(0x8000000000000000ULL, 0, 0, 0);
vst20 = make_ulonglong4(0, 0, 0, 0);
st24 = 0;
// first hash: 1 + 22 = 23 full rounds with scratchpad mixing, then the
// shortened last round
RND();
MIX_ALL;
for(int i = 0; i < 22; i++) {
RND();
MIX_ALL;
}
LASTRND1();
// second hash: vst0 (the first 32 digest bytes) is the message, the rest of
// the state is re-initialised with the padding
vst4 = make_ulonglong4(1, 0, 0, 0);
vst8 = make_ulonglong4(0, 0, 0, 0);
vst12 = make_ulonglong4(0, 0, 0, 0);
vst16 = make_ulonglong4(0x8000000000000000ULL, 0, 0, 0);
vst20 = make_ulonglong4(0, 0, 0, 0);
st24 = 0;
RND();
MIX_ALL;
#pragma unroll
for(int i = 0; i < 22; i++) {
RND();
MIX_ALL;
}
// the final round only computes st3, the word used for the target test
LASTRND2();
if((st3 >> 32) <= target) {
// no atomics: if several threads hit, the last writer wins
retnonce[0] = (uint32_t) nonce;
retnonce[1] = retnonce[0];
}
}
// Launch the Wild Keccak search kernel on the per-thread kernel stream and
// queue the copy of the two result slots back to the host.
//   thr_id      miner thread index, selects the per-GPU buffers and stream
//   threads     number of nonces to test; rounded up to whole blocks
//   startNounce first nonce of the range. Taken as 64-bit (the kernel and the
//               caller both work on 64-bit nonces) so that the upper half is
//               no longer silently truncated on the way to the device.
//   target      only target.y (high word) is used by the kernel
//   resNonces   host buffer of two uint32_t receiving the result slots
// The caller must synchronize kernel_stream[thr_id] before reading resNonces.
__host__
void wildkeccak_kernel(const int thr_id, const uint32_t threads, const uint64_t startNounce, const uint2 target, uint32_t *resNonces)
{
	// preset both result slots to 0xFFFFFFFF, the "nothing found" value
	CUDA_SAFE_CALL(cudaMemsetAsync(d_retnonce[thr_id], 0xff, 2 * sizeof(uint32_t), kernel_stream[thr_id]));

	const uint32_t threadsperblock = WK_CUDAThreads;
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);

	wk <<<grid, block, 0, kernel_stream[thr_id]>>> (d_retnonce[thr_id], d_input[thr_id], d_scratchpad[thr_id],
		(uint32_t)(scratchpad_size >> 2), target.y, startNounce);

	cudaMemcpyAsync(resNonces, d_retnonce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost, kernel_stream[thr_id]);
}
static bool init[MAX_GPUS] = { 0 };
// Scan a nonce range for the Wild Keccak algorithm on the GPU of thr_id.
// Returns 1 when a nonce validated by the CPU hash was found (stored as
// 64 bits in work->nonces), 0 when the range was exhausted or a restart was
// requested, and -EBUSY while no scratchpad is available yet.
// *hashes_done receives the number of nonces processed.
extern "C" int scanhash_wildkeccak(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
//uint32_t _ALIGN(64) endiandata[20];
uint32_t *ptarget = work->target;
uint32_t throughput = 0;
uint64_t n, nonce, first;
uint8_t *pdata = (uint8_t*) work->data;
// the 64-bit nonce is stored unaligned at byte offset 1 of the blob
memcpy(&first, &pdata[1], 8);
//memcpy(&n, &pdata[1], 4);;
n = nonce = first;
// pdata[5] = thr_id;
// memcpy(&nonce, &pdata[1], 8);
// nothing can be hashed until a scratchpad has been provided
if (!scratchpad_size || !h_scratchpad[thr_id]) {
if (h_scratchpad[thr_id])
applog(LOG_ERR, "Scratchpad size is not set!");
work->data[0] = 0; // invalidate
sleep(1);
return -EBUSY;
}
// one-time per-thread device setup
if (!init[thr_id]) {
if (device_config[thr_id]) {
// NOTE(review): the sscanf() result is not checked; a malformed -l value
// leaves the defaults or a partially parsed pair - confirm this is acceptable
sscanf(device_config[thr_id], "%ux%u", &WK_CUDABlocks, &WK_CUDAThreads);
gpulog(LOG_INFO, thr_id, "Using %u x %u threads kernel launch config", WK_CUDABlocks, WK_CUDAThreads);
} else {
throughput = cuda_default_throughput(thr_id, WK_CUDABlocks*WK_CUDAThreads);
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
}
cudaSetDevice(device_map[thr_id]);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage (linux)
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
CUDA_LOG_ERROR();
}
CUDA_SAFE_CALL(cudaMalloc(&d_input[thr_id], 88));
CUDA_SAFE_CALL(cudaMalloc(&d_retnonce[thr_id], 2*sizeof(uint32_t)));
// the device scratchpad buffer has a fixed size
int status = (int) cudaMalloc(&d_scratchpad[thr_id], WILD_KECCAK_SCRATCHPAD_BUFFSIZE);
if (status != cudaSuccess) {
gpulog(LOG_ERR, thr_id, "Unable to allocate device memory, %u MB, err %d",
(uint32_t) (WILD_KECCAK_SCRATCHPAD_BUFFSIZE/(1024*1024)), status);
exit(-ENOMEM);
}
cudaStreamCreate(&bufpad_stream[thr_id]);
cudaStreamCreate(&kernel_stream[thr_id]);
// initial scratchpad upload; scratchpad_size counts 64-bit words, hence << 3
// NOTE(review): the copy size is not checked against the fixed buffer size
// allocated above - presumably guaranteed by the scratchpad loader, verify
CUDA_SAFE_CALL(cudaMemcpyAsync(d_scratchpad[thr_id], h_scratchpad[thr_id], scratchpad_size << 3, cudaMemcpyHostToDevice, bufpad_stream[thr_id]));
init[thr_id] = true;
}
throughput = cuda_default_throughput(thr_id, WK_CUDABlocks*WK_CUDAThreads);
// upload the 88 bytes of block blob read by the kernel
cudaMemcpy(d_input[thr_id], pdata, 88, cudaMemcpyHostToDevice);
// cudaMemset(d_retnonce[thr_id], 0xFF, 2*sizeof(uint32_t));
// wait for any pending scratchpad upload before launching kernels
if (h_scratchpad[thr_id]) {
cudaStreamSynchronize(bufpad_stream[thr_id]);
}
do {
// const uint32_t blocks = WK_CUDABlocks, threads = WK_CUDAThreads;
// const dim3 block(blocks);
// const dim3 thread(threads);
uint32_t h_retnonce[2] = { UINT32_MAX, UINT32_MAX };
uint2 target = make_uint2(ptarget[6], ptarget[7]);
wildkeccak_kernel(thr_id, throughput, nonce, target, h_retnonce);
/*
wk <<<block, thread, 0, kernel_stream[thr_id]>>> (d_retnonce[thr_id], d_input[thr_id], d_scratchpad[thr_id],
(uint32_t)(scratchpad_size >> 2), nonce, ptarget[7]);
*/
*hashes_done = (unsigned long) (n - first + throughput);
cudaStreamSynchronize(kernel_stream[thr_id]);
// UINT32_MAX in slot 0 means no candidate was reported
if(h_retnonce[0] != UINT32_MAX) {
uint8_t _ALIGN(64) cpuhash[32];
uint32_t* vhash = (uint32_t*) cpuhash;
uint64_t nonce64;
// only the low 32 bits of the nonce come back from the GPU; the upper
// four bytes already present in the blob are kept
memcpy(&pdata[1], &h_retnonce[0], sizeof(uint32_t));
memcpy(&nonce64, &pdata[1], 8);
// re-hash on the CPU to validate the candidate before reporting it
wildkeccak_hash(cpuhash, pdata, pscratchpad_buff, scratchpad_size);
if (!cpuhash[31] && vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
work_set_target_ratio(work, vhash);
//applog_hex(pdata, 84);
//applog_hex(cpuhash, 32);
//applog_hex(ptarget, 32);
memcpy(work->nonces, &nonce64, 8);
if (n + throughput > max_nonce) {
*hashes_done = (unsigned long) (max_nonce - first);
}
return 1;
}
}
if (n + throughput >= max_nonce) {
n = max_nonce;
break;
}
n += throughput;
nonce += throughput;
} while(!work_restart[thr_id].restart);
*hashes_done = (unsigned long) (n - first + 1);
return 0;
}
// Publish a new host scratchpad to every miner thread.
// The pointer is recorded for all threads; threads that are already
// initialised additionally get an asynchronous re-upload queued on their
// scratchpad stream and are flagged to restart their current work.
void wildkeccak_scratchpad_need_update(uint64_t* pscratchpad_buff)
{
	for (int thr = 0; thr < opt_n_threads; ++thr) {
		h_scratchpad[thr] = pscratchpad_buff;

		// not set up yet: the thread uploads the scratchpad itself during init
		if (!init[thr])
			continue;

		gpulog(LOG_DEBUG, thr, "Starting scratchpad update...");
		cudaMemcpyAsync(d_scratchpad[thr], h_scratchpad[thr], scratchpad_size << 3,
			cudaMemcpyHostToDevice, bufpad_stream[thr]);
		work_restart[thr].restart = true;
	}
}
// Release the CUDA resources owned by miner thread thr_id.
// Does nothing when the thread was never initialised. After the call the
// thread is marked uninitialised, so the next scan re-allocates everything.
void free_wildkeccak(int thr_id)
{
	if (!init[thr_id])
		return;

	// wait for pending work (cudaThreadSynchronize() is deprecated)
	cudaDeviceSynchronize();

	cudaFree(d_scratchpad[thr_id]);
	cudaFree(d_input[thr_id]);
	cudaFree(d_retnonce[thr_id]);

	// do not keep dangling device pointers around
	d_scratchpad[thr_id] = NULL;
	d_input[thr_id] = NULL;
	d_retnonce[thr_id] = NULL;

	cudaStreamDestroy(bufpad_stream[thr_id]);
	cudaStreamDestroy(kernel_stream[thr_id]);

	cudaDeviceSynchronize();

	init[thr_id] = false;
}

10
crypto/wildkeccak.h

@ -0,0 +1,10 @@ @@ -0,0 +1,10 @@
/* Size in bytes of the scratchpad buffer (256 MiB). Parenthesized so that the
 * macro stays correct inside larger expressions such as SIZE/(1024*1024). */
#define WILD_KECCAK_SCRATCHPAD_BUFFSIZE (1ULL << 28)
#define WILD_KECCAK_ADDENDUMS_ARRAY_SIZE 10
extern uint64_t scratchpad_size;
extern uint32_t WK_CUDABlocks, WK_CUDAThreads;
void wildkeccak_scratchpad_need_update(uint64_t* pscratchpad_buff);

1257
crypto/xmr-rpc.cpp

File diff suppressed because it is too large Load Diff

41
crypto/xmr-rpc.h

@ -0,0 +1,41 @@ @@ -0,0 +1,41 @@
#include <jansson.h>
#include "wildkeccak.h"
#ifdef WIN32
#define _PACKED _ALIGN(4)
#else
#define _PACKED __attribute__((__packed__))
#endif
// Hash + height pair (40 bytes of payload).
// NOTE(review): presumably identifies the block a scratchpad state belongs
// to - confirm against xmr-rpc.cpp.
struct _PACKED scratchpad_hi {
// 32-byte hash
unsigned char prevhash[32];
// block height
uint64_t height;
};
// One entry of the addendum history kept in the scratchpad file header.
struct _PACKED addendums_array_entry {
// hash/height pair recorded for this entry
struct scratchpad_hi prev_hi;
// size of the addendum (unit not visible here - TODO confirm)
uint64_t add_size;
};
// Header layout of the scratchpad file.
// NOTE(review): on WIN32 _PACKED expands to _ALIGN(4) rather than a packing
// attribute; the members are all 8-byte multiples, so the layout presumably
// matches the packed one - confirm before adding fields.
struct _PACKED scratchpad_file_header {
// current hash/height pair
struct scratchpad_hi current_hi;
// fixed-size array of WILD_KECCAK_ADDENDUMS_ARRAY_SIZE addendum entries
struct addendums_array_entry add_arr[WILD_KECCAK_ADDENDUMS_ARRAY_SIZE];
// scratchpad size (unit not visible here - TODO confirm)
uint64_t scratchpad_size;
};
bool rpc2_job_decode(const json_t *job, struct work *work);
bool rpc2_stratum_job(struct stratum_ctx *sctx, json_t *id, json_t *params);
bool rpc2_stratum_gen_work(struct stratum_ctx *sctx, struct work *work);
bool rpc2_stratum_submit(struct pool_infos *pool, struct work *work);
int rpc2_stratum_thread_stuff(struct pool_infos* pool);
bool rpc2_login_decode(const json_t *val);
void rpc2_init();
void GetScratchpad();

6
miner.h

@ -293,6 +293,7 @@ extern int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, unsign @@ -293,6 +293,7 @@ extern int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, unsign
extern int scanhash_vanilla(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int8_t blake_rounds);
extern int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_wildkeccak(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_x11evo(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@ -340,6 +341,7 @@ extern void free_s3(int thr_id); @@ -340,6 +341,7 @@ extern void free_s3(int thr_id);
extern void free_vanilla(int thr_id);
extern void free_veltor(int thr_id);
extern void free_whirl(int thr_id);
extern void free_wildkeccak(int thr_id);
extern void free_x11evo(int thr_id);
extern void free_x11(int thr_id);
extern void free_x13(int thr_id);
@ -622,6 +624,7 @@ struct stratum_ctx { @@ -622,6 +624,7 @@ struct stratum_ctx {
int pooln;
time_t tm_connected;
int rpc2;
int srvtime_diff;
};
@ -743,6 +746,8 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p @@ -743,6 +746,8 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p
bool stratum_handle_method(struct stratum_ctx *sctx, const char *s);
void stratum_free_job(struct stratum_ctx *sctx);
bool rpc2_stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass);
void hashlog_remember_submit(struct work* work, uint32_t nonce);
void hashlog_remember_scan_range(struct work* work);
uint32_t hashlog_already_submittted(char* jobid, uint32_t nounce);
@ -831,6 +836,7 @@ void x13hash(void *output, const void *input); @@ -831,6 +836,7 @@ void x13hash(void *output, const void *input);
void x14hash(void *output, const void *input);
void x15hash(void *output, const void *input);
void x17hash(void *output, const void *input);
void wildkeccak_hash(void *output, const void *input, uint64_t* scratchpad, uint64_t ssize);
void zr5hash(void *output, const void *input);
void zr5hash_pok(void *output, uint32_t *pdata);

4
pools.cpp

@ -254,6 +254,7 @@ bool pool_switch(int thr_id, int pooln) @@ -254,6 +254,7 @@ bool pool_switch(int thr_id, int pooln)
// temporary... until stratum code cleanup
stratum = p->stratum;
stratum.pooln = cur_pooln;
stratum.rpc2 = (p->algo == ALGO_WILDKECCAK);
// unlock the stratum thread
tq_push(thr_info[stratum_thr_id].q, strdup(rpc_url));
@ -275,6 +276,9 @@ bool pool_switch(int thr_id, int pooln) @@ -275,6 +276,9 @@ bool pool_switch(int thr_id, int pooln)
}
}
stratum.rpc2 = (p->algo == ALGO_WILDKECCAK);
return true;
}

8
res/ccminer.rc

@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico" @@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico"
//
VS_VERSION_INFO VERSIONINFO
FILEVERSION 1,8,4,0
PRODUCTVERSION 1,8,4,0
FILEVERSION 2,0,0,0
PRODUCTVERSION 2,0,0,0
FILEFLAGSMASK 0x3fL
#ifdef _DEBUG
FILEFLAGS 0x21L
@ -76,10 +76,10 @@ BEGIN @@ -76,10 +76,10 @@ BEGIN
BEGIN
BLOCK "040904e4"
BEGIN
VALUE "FileVersion", "1.8.4"
VALUE "FileVersion", "2.0"
VALUE "LegalCopyright", "Copyright (C) 2016"
VALUE "ProductName", "ccminer"
VALUE "ProductVersion", "1.8.4"
VALUE "ProductVersion", "2.0"
END
END
BLOCK "VarFileInfo"

11
util.cpp

@ -36,6 +36,8 @@ @@ -36,6 +36,8 @@
#include "miner.h"
#include "elist.h"
#include "crypto/xmr-rpc.h"
extern pthread_mutex_t stratum_sock_lock;
extern pthread_mutex_t stratum_work_lock;
extern bool opt_debug_diff;
@ -1216,6 +1218,8 @@ bool stratum_subscribe(struct stratum_ctx *sctx) @@ -1216,6 +1218,8 @@ bool stratum_subscribe(struct stratum_ctx *sctx)
json_error_t err;
bool ret = false, retry = false;
if (sctx->rpc2) return true;
start:
s = (char*)malloc(128 + (sctx->session_id ? strlen(sctx->session_id) : 0));
if (retry)
@ -1307,6 +1311,9 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p @@ -1307,6 +1311,9 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p
json_error_t err;
bool ret = false;
if (sctx->rpc2)
return rpc2_stratum_authorize(sctx, user, pass);
s = (char*)malloc(80 + strlen(user) + strlen(pass));
sprintf(s, "{\"id\": 2, \"method\": \"mining.authorize\", \"params\": [\"%s\", \"%s\"]}",
user, pass);
@ -1849,6 +1856,10 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s) @@ -1849,6 +1856,10 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s)
ret = stratum_show_message(sctx, id, params);
goto out;
}
if (sctx->rpc2 && !strcasecmp(method, "job")) { // cryptonote
ret = rpc2_stratum_job(sctx, id, params);
goto out;
}
if (!ret) {
// don't fail = disconnect stratum on unknown (and optional?) methods

Loading…
Cancel
Save